update #5

Open
mdebray wants to merge 231 commits from update into main
12 changed files with 296 additions and 28 deletions
Showing only changes of commit 58618b3a21 - Show all commits

View file

@ -2,6 +2,7 @@ Translation:
- changed-files: - changed-files:
- any-glob-to-any-file: 'src/main/resources/messages_*_*.properties' - any-glob-to-any-file: 'src/main/resources/messages_*_*.properties'
- any-glob-to-any-file: 'scripts/ignore_translation.toml' - any-glob-to-any-file: 'scripts/ignore_translation.toml'
- any-glob-to-any-file: 'src/main/resources/templates/fragments/languages.html'
Front End: Front End:
- changed-files: - changed-files:

View file

@ -39,16 +39,16 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
libreoffice \ libreoffice \
# pdftohtml # pdftohtml
poppler-utils \ poppler-utils \
# OCR MY PDF (unpaper for descew and other advanced featues) # OCR MY PDF (unpaper for descew and other advanced features)
ocrmypdf \ ocrmypdf \
tesseract-ocr-data-eng \ tesseract-ocr-data-eng \
# CV # CV
py3-opencv \ py3-opencv \
# python3/pip # python3/pip
python3 \ python3 \
py3-pip && \ py3-pip && \
# uno unoconv and HTML # uno unoconv and HTML
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \ pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \
mv /usr/share/tessdata /usr/share/tessdata-original && \ mv /usr/share/tessdata /usr/share/tessdata-original && \
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
fc-cache -f -v && \ fc-cache -f -v && \

View file

@ -64,7 +64,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
python3 \ python3 \
py3-pip && \ py3-pip && \
# uno unoconv and HTML # uno unoconv and HTML
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \ pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \
mv /usr/share/tessdata /usr/share/tessdata-original && \ mv /usr/share/tessdata /usr/share/tessdata-original && \
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
fc-cache -f -v && \ fc-cache -f -v && \

View file

@ -15,7 +15,7 @@
| file-to-pdf | | ✔️ | | | ✔️ | | | ✔️ | | | | | file-to-pdf | | ✔️ | | | ✔️ | | | ✔️ | | | |
| img-to-pdf | | ✔️ | | | | | | | | ✔️ | | | img-to-pdf | | ✔️ | | | | | | | | ✔️ | |
| pdf-to-html | | ✔️ | | | ✔️ | | | ✔️ | | | | | pdf-to-html | | ✔️ | | | ✔️ | | | ✔️ | | | |
| pdf-to-img | | ✔️ | | | | | | | | ✔️ | | | pdf-to-img | | ✔️ | | | | ✔️ | | | | ✔️ | |
| pdf-to-pdfa | | ✔️ | | | ✔️ | | | | ✔️ | | | | pdf-to-pdfa | | ✔️ | | | ✔️ | | | | ✔️ | | |
| pdf-to-markdown | | ✔️ | | | | | | | | ✔️ | | | pdf-to-markdown | | ✔️ | | | | | | | | ✔️ | |
| pdf-to-presentation | | ✔️ | | | ✔️ | | | ✔️ | | | | | pdf-to-presentation | | ✔️ | | | ✔️ | | | ✔️ | | | |

174
scripts/png_to_webp.py Normal file
View file

@ -0,0 +1,174 @@
"""
Author: Ludy87
Description: This script converts a PDF file to WebP images. It includes functionality to resize images if they exceed specified dimensions and handle conversion of PDF pages to WebP format.
Example
-------
To convert a PDF file to WebP images with each page as a separate WebP file:
python script.py input.pdf output_directory
To convert a PDF file to a single WebP image:
python script.py input.pdf output_directory --single
To adjust the DPI resolution for rendering PDF pages:
python script.py input.pdf output_directory --dpi 150
"""
import argparse
import os
from pdf2image import convert_from_path
from PIL import Image
def resize_image(input_image_path, output_image_path, max_size=(16383, 16383)):
"""
Resize the image if its dimensions exceed the maximum allowed size and save it as WebP.
Parameters
----------
input_image_path : str
Path to the input image file.
output_image_path : str
Path where the output WebP image will be saved.
max_size : tuple of int, optional
Maximum allowed dimensions for the image (width, height). Default is (16383, 16383).
Returns
-------
None
"""
try:
# Open the image
image = Image.open(input_image_path)
width, height = image.size
max_width, max_height = max_size
# Check if the image dimensions exceed the maximum allowed dimensions
if width > max_width or height > max_height:
# Calculate the scaling ratio
ratio = min(max_width / width, max_height / height)
new_width = int(width * ratio)
new_height = int(height * ratio)
# Resize the image
resized_image = image.resize((new_width, new_height), Image.LANCZOS)
resized_image.save(output_image_path, format="WEBP", quality=100)
print(
f"The image was successfully resized to ({new_width}, {new_height}) and saved as WebP: {output_image_path}"
)
else:
# If dimensions are within the allowed limits, save the image directly
image.save(output_image_path, format="WEBP", quality=100)
print(f"The image was successfully saved as WebP: {output_image_path}")
except Exception as e:
print(f"An error occurred: {e}")
def convert_image_to_webp(input_image, output_file):
"""
Convert an image to WebP format, resizing it if it exceeds the maximum dimensions.
Parameters
----------
input_image : str
Path to the input image file.
output_file : str
Path where the output WebP image will be saved.
Returns
-------
None
"""
# Resize the image if it exceeds the maximum dimensions
resize_image(input_image, output_file, max_size=(16383, 16383))
def pdf_to_webp(pdf_path, output_dir, dpi=300):
"""
Convert each page of a PDF file to WebP images.
Parameters
----------
pdf_path : str
Path to the input PDF file.
output_dir : str
Directory where the WebP images will be saved.
dpi : int, optional
DPI resolution for rendering PDF pages. Default is 300.
Returns
-------
None
"""
# Convert the PDF to a list of images
images = convert_from_path(pdf_path, dpi=dpi)
for page_number, image in enumerate(images):
# Define temporary PNG path
temp_png_path = os.path.join(output_dir, f"temp_page_{page_number + 1}.png")
image.save(temp_png_path, format="PNG")
# Define the output path for WebP
output_path = os.path.join(output_dir, f"page_{page_number + 1}.webp")
# Convert PNG to WebP
convert_image_to_webp(temp_png_path, output_path)
# Delete the temporary PNG file
os.remove(temp_png_path)
def main(pdf_image_path, output_dir, dpi=300, single_images_flag=False):
"""
Main function to handle conversion from PDF to WebP images.
Parameters
----------
pdf_image_path : str
Path to the input PDF file or image.
output_dir : str
Directory where the WebP images will be saved.
dpi : int, optional
DPI resolution for rendering PDF pages. Default is 300.
single_images_flag : bool, optional
If True, combine all pages into a single WebP image. Default is False.
Returns
-------
None
"""
if single_images_flag:
# Combine all pages into a single WebP image
output_path = os.path.join(output_dir, "combined_image.webp")
convert_image_to_webp(pdf_image_path, output_path)
else:
# Convert each PDF page to a separate WebP image
pdf_to_webp(pdf_image_path, output_dir, dpi)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert a PDF file to WebP images.")
parser.add_argument("pdf_path", help="The path to the input PDF file.")
parser.add_argument(
"output_dir", help="The directory where the WebP images should be saved."
)
parser.add_argument(
"--dpi",
type=int,
default=300,
help="The DPI resolution for rendering the PDF pages (default: 300).",
)
parser.add_argument(
"--single",
action="store_true",
help="Combine all pages into a single WebP image.",
)
args = parser.parse_args()
os.makedirs(args.output_dir, exist_ok=True)
main(
args.pdf_path,
args.output_dir,
dpi=args.dpi,
single_images_flag=args.single,
)

View file

@ -166,6 +166,7 @@ public class EndpointConfiguration {
addEndpointToGroup("Python", REMOVE_BLANKS); addEndpointToGroup("Python", REMOVE_BLANKS);
addEndpointToGroup("Python", "html-to-pdf"); addEndpointToGroup("Python", "html-to-pdf");
addEndpointToGroup("Python", "url-to-pdf"); addEndpointToGroup("Python", "url-to-pdf");
addEndpointToGroup("Python", "pdf-to-img");
// openCV // openCV
addEndpointToGroup("OpenCV", "extract-image-scans"); addEndpointToGroup("OpenCV", "extract-image-scans");

View file

@ -1,11 +1,23 @@
package stirling.software.SPDF.controller.api.converters; package stirling.software.SPDF.controller.api.converters;
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException; import java.io.IOException;
import java.net.URLConnection; import java.net.URLConnection;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import org.apache.commons.io.FileUtils;
import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.ImageType;
import org.slf4j.Logger; import org.slf4j.Logger;
import org.slf4j.LoggerFactory; import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -21,6 +33,8 @@ import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.converters.ConvertToImageRequest; import stirling.software.SPDF.model.api.converters.ConvertToImageRequest;
import stirling.software.SPDF.model.api.converters.ConvertToPdfRequest; import stirling.software.SPDF.model.api.converters.ConvertToPdfRequest;
import stirling.software.SPDF.utils.PdfUtils; import stirling.software.SPDF.utils.PdfUtils;
import stirling.software.SPDF.utils.ProcessExecutor;
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
@RestController @RestController
@ -60,15 +74,92 @@ public class ConvertImgPDFController {
result = result =
PdfUtils.convertFromPdf( PdfUtils.convertFromPdf(
pdfBytes, pdfBytes,
imageFormat.toUpperCase(), imageFormat.equalsIgnoreCase("webp") ? "png" : imageFormat.toUpperCase(),
colorTypeResult, colorTypeResult,
singleImage, singleImage,
Integer.valueOf(dpi), Integer.valueOf(dpi),
filename); filename);
if (result == null || result.length == 0) { if (result == null || result.length == 0) {
logger.error("resultant bytes for {} is null, error converting ", filename); logger.error("resultant bytes for {} is null, error converting ", filename);
} }
if (imageFormat.equalsIgnoreCase("webp")) {
// Write the output stream to a temp file
Path tempFile = Files.createTempFile("temp_png", ".png");
try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) {
fos.write(result);
fos.flush();
}
String pythonVersion = "python3";
try {
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
.runCommandWithOutputHandling(Arrays.asList("python3", "--version"));
} catch (IOException e) {
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
.runCommandWithOutputHandling(Arrays.asList("python", "--version"));
pythonVersion = "python";
}
List<String> command = new ArrayList<>();
command.add(pythonVersion);
command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion
// Create a temporary directory for the output WebP files
Path tempOutputDir = Files.createTempDirectory("webp_output");
if (singleImage) {
// Run the Python script to convert PNG to WebP
command.add(tempFile.toString());
command.add(tempOutputDir.toString());
command.add("--single");
} else {
// Save the uploaded PDF to a temporary file
Path tempPdfPath = Files.createTempFile("temp_pdf", ".pdf");
file.transferTo(tempPdfPath.toFile());
// Run the Python script to convert PDF to WebP
command.add(tempPdfPath.toString());
command.add(tempOutputDir.toString());
}
command.add("--dpi");
command.add(dpi);
ProcessExecutorResult resultProcess =
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
.runCommandWithOutputHandling(command);
// Find all WebP files in the output directory
List<Path> webpFiles =
Files.walk(tempOutputDir)
.filter(path -> path.toString().endsWith(".webp"))
.collect(Collectors.toList());
if (webpFiles.isEmpty()) {
logger.error("No WebP files were created in: {}", tempOutputDir.toString());
throw new IOException("No WebP files were created. " + resultProcess.getMessages());
}
byte[] bodyBytes = new byte[0];
if (webpFiles.size() == 1) {
// Return the single WebP file directly
Path webpFilePath = webpFiles.get(0);
bodyBytes = Files.readAllBytes(webpFilePath);
} else {
// Create a ZIP file containing all WebP images
ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream();
try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) {
for (Path webpFile : webpFiles) {
zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString()));
Files.copy(webpFile, zos);
zos.closeEntry();
}
}
bodyBytes = zipOutputStream.toByteArray();
}
// Clean up the temporary files
Files.deleteIfExists(tempFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
result = bodyBytes;
}
if (singleImage) { if (singleImage) {
String docName = filename + "." + imageFormat; String docName = filename + "." + imageFormat;
MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat)); MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat));

View file

@ -21,14 +21,6 @@ public class ConverterWebController {
return "convert/book-to-pdf"; return "convert/book-to-pdf";
} }
@ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}")
@GetMapping("/pdf-to-book")
@Hidden
public String convertPdfToBookForm(Model model) {
model.addAttribute("currentPage", "pdf-to-book");
return "convert/pdf-to-book";
}
@GetMapping("/img-to-pdf") @GetMapping("/img-to-pdf")
@Hidden @Hidden
public String convertImgToPdfForm(Model model) { public String convertImgToPdfForm(Model model) {
@ -57,13 +49,6 @@ public class ConverterWebController {
return "convert/url-to-pdf"; return "convert/url-to-pdf";
} }
@GetMapping("/pdf-to-img")
@Hidden
public String pdfToimgForm(Model model) {
model.addAttribute("currentPage", "pdf-to-img");
return "convert/pdf-to-img";
}
@GetMapping("/file-to-pdf") @GetMapping("/file-to-pdf")
@Hidden @Hidden
public String convertToPdfForm(Model model) { public String convertToPdfForm(Model model) {
@ -73,6 +58,21 @@ public class ConverterWebController {
// PDF TO...... // PDF TO......
@ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}")
@GetMapping("/pdf-to-book")
@Hidden
public String convertPdfToBookForm(Model model) {
model.addAttribute("currentPage", "pdf-to-book");
return "convert/pdf-to-book";
}
@GetMapping("/pdf-to-img")
@Hidden
public String pdfToimgForm(Model model) {
model.addAttribute("currentPage", "pdf-to-img");
return "convert/pdf-to-img";
}
@GetMapping("/pdf-to-html") @GetMapping("/pdf-to-html")
@Hidden @Hidden
public ModelAndView pdfToHTML() { public ModelAndView pdfToHTML() {

View file

@ -12,7 +12,7 @@ public class ConvertToImageRequest extends PDFFile {
@Schema( @Schema(
description = "The output image format", description = "The output image format",
allowableValues = {"png", "jpeg", "jpg", "gif"}) allowableValues = {"png", "jpeg", "jpg", "gif", "webp"})
private String imageFormat; private String imageFormat;
@Schema( @Schema(

View file

@ -28,6 +28,7 @@
<option value="gif">GIF</option> <option value="gif">GIF</option>
<option value="tiff">TIFF</option> <option value="tiff">TIFF</option>
<option value="bmp">BMP</option> <option value="bmp">BMP</option>
<option value="webp">WEPB</option>
</select> </select>
</div> </div>
<div class="mb-3"> <div class="mb-3">