Add: Convert PDF to WebP (#1666)
* Add PDF to WebP * add swagger param * back * creates a custom image for Docker from pymupdf * Converting with pdf2image and Pillow instead of pymupdf * webp remove to pdf-to-img * remove mupdf
This commit is contained in:
parent
4a4c7faf47
commit
58618b3a21
12 changed files with 296 additions and 28 deletions
1
.github/labeler-config.yml
vendored
1
.github/labeler-config.yml
vendored
|
@ -2,6 +2,7 @@ Translation:
|
|||
- changed-files:
|
||||
- any-glob-to-any-file: 'src/main/resources/messages_*_*.properties'
|
||||
- any-glob-to-any-file: 'scripts/ignore_translation.toml'
|
||||
- any-glob-to-any-file: 'src/main/resources/templates/fragments/languages.html'
|
||||
|
||||
Front End:
|
||||
- changed-files:
|
||||
|
|
|
@ -39,16 +39,16 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
|
|||
libreoffice \
|
||||
# pdftohtml
|
||||
poppler-utils \
|
||||
# OCR MY PDF (unpaper for descew and other advanced featues)
|
||||
# OCR MY PDF (unpaper for deskew and other advanced features)
|
||||
ocrmypdf \
|
||||
tesseract-ocr-data-eng \
|
||||
# CV
|
||||
py3-opencv \
|
||||
# python3/pip
|
||||
python3 \
|
||||
py3-pip && \
|
||||
py3-pip && \
|
||||
# uno unoconv and HTML
|
||||
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
|
||||
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \
|
||||
mv /usr/share/tessdata /usr/share/tessdata-original && \
|
||||
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
|
||||
fc-cache -f -v && \
|
||||
|
|
|
@ -9,7 +9,7 @@ COPY . .
|
|||
|
||||
# Build the application with DOCKER_ENABLE_SECURITY=true
|
||||
RUN DOCKER_ENABLE_SECURITY=true \
|
||||
./gradlew clean build
|
||||
./gradlew clean build
|
||||
|
||||
# Main stage
|
||||
FROM alpine:3.20.2
|
||||
|
@ -32,7 +32,7 @@ ENV DOCKER_ENABLE_SECURITY=false \
|
|||
UMASK=022 \
|
||||
FAT_DOCKER=true \
|
||||
INSTALL_BOOK_AND_ADVANCED_HTML_OPS=false
|
||||
|
||||
|
||||
|
||||
# JDK for app
|
||||
RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
|
||||
|
@ -64,7 +64,7 @@ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /et
|
|||
python3 \
|
||||
py3-pip && \
|
||||
# uno unoconv and HTML
|
||||
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
|
||||
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint pdf2image pillow && \
|
||||
mv /usr/share/tessdata /usr/share/tessdata-original && \
|
||||
mkdir -p $HOME /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
|
||||
fc-cache -f -v && \
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
| file-to-pdf | | ✔️ | | | ✔️ | | | ✔️ | | | |
|
||||
| img-to-pdf | | ✔️ | | | | | | | | ✔️ | |
|
||||
| pdf-to-html | | ✔️ | | | ✔️ | | | ✔️ | | | |
|
||||
| pdf-to-img | | ✔️ | | | | | | | | ✔️ | |
|
||||
| pdf-to-img | | ✔️ | | | | ✔️ | | | | ✔️ | |
|
||||
| pdf-to-pdfa | | ✔️ | | | ✔️ | | | | ✔️ | | |
|
||||
| pdf-to-markdown | | ✔️ | | | | | | | | ✔️ | |
|
||||
| pdf-to-presentation | | ✔️ | | | ✔️ | | | ✔️ | | | |
|
||||
|
@ -44,4 +44,4 @@
|
|||
| remove-blanks | | | | ✔️ | ✔️ | ✔️ | ✔️ | | | | |
|
||||
| repair | | | | ✔️ | ✔️ | | | ✔️ | | | |
|
||||
| show-javascript | | | | ✔️ | | | | | | | ✔️ |
|
||||
| sign | | | | ✔️ | | | | | | | ✔️ |
|
||||
| sign | | | | ✔️ | | | | | | | ✔️ |
|
||||
|
|
174
scripts/png_to_webp.py
Normal file
174
scripts/png_to_webp.py
Normal file
|
@ -0,0 +1,174 @@
|
|||
"""
|
||||
Author: Ludy87
|
||||
Description: This script converts a PDF file to WebP images. It includes functionality to resize images if they exceed specified dimensions and handle conversion of PDF pages to WebP format.
|
||||
|
||||
Example
|
||||
-------
|
||||
To convert a PDF file to WebP images with each page as a separate WebP file:
|
||||
python script.py input.pdf output_directory
|
||||
|
||||
To convert an already rendered image (for example a PNG of the combined pages) to a single WebP image:
|
||||
python script.py combined.png output_directory --single
|
||||
|
||||
To adjust the DPI resolution for rendering PDF pages:
|
||||
python script.py input.pdf output_directory --dpi 150
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
from pdf2image import convert_from_path
|
||||
from PIL import Image
|
||||
|
||||
|
||||
def resize_image(input_image_path, output_image_path, max_size=(16383, 16383)):
    """
    Save an image as WebP, downscaling it first if it exceeds ``max_size``.

    The aspect ratio is preserved when the image has to be shrunk. Any error
    raised while opening, resizing or saving is caught and reported on stdout
    instead of being propagated, so callers must check for the output file
    themselves if they need to know whether the conversion succeeded.

    Parameters
    ----------
    input_image_path : str
        Path to the input image file.
    output_image_path : str
        Path where the output WebP image will be saved.
    max_size : tuple of int, optional
        Maximum allowed dimensions for the image (width, height). Default is (16383, 16383).

    Returns
    -------
    None
    """
    try:
        # The context manager releases the underlying file handle even when
        # resizing or saving fails.
        with Image.open(input_image_path) as image:
            width, height = image.size
            max_width, max_height = max_size

            if width > max_width or height > max_height:
                # Use the tighter of the two ratios so both sides fit.
                ratio = min(max_width / width, max_height / height)
                # Clamp to one pixel: int() truncation could otherwise yield
                # a zero-sized side for extremely elongated images.
                new_width = max(1, int(width * ratio))
                new_height = max(1, int(height * ratio))

                resized_image = image.resize((new_width, new_height), Image.LANCZOS)
                resized_image.save(output_image_path, format="WEBP", quality=100)
                print(
                    f"The image was successfully resized to ({new_width}, {new_height}) and saved as WebP: {output_image_path}"
                )
            else:
                # Dimensions are within the limits, so save without resampling.
                image.save(output_image_path, format="WEBP", quality=100)
                print(f"The image was successfully saved as WebP: {output_image_path}")
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller go on.
        print(f"An error occurred: {e}")
|
||||
|
||||
|
||||
def convert_image_to_webp(input_image, output_file):
    """
    Write ``input_image`` to ``output_file`` in WebP format.

    The work is delegated to ``resize_image`` with a fixed size limit of
    16383 x 16383 pixels, so larger images are scaled down before saving.

    Parameters
    ----------
    input_image : str
        Path to the input image file.
    output_file : str
        Path where the output WebP image will be saved.

    Returns
    -------
    None
    """
    # Upper bound (width, height) handed to resize_image
    dimension_limit = (16383, 16383)
    resize_image(input_image, output_file, max_size=dimension_limit)
|
||||
|
||||
|
||||
def pdf_to_webp(pdf_path, output_dir, dpi=300):
    """
    Render every page of a PDF and store each one as ``page_<n>.webp``.

    Each page is first written to a temporary ``temp_page_<n>.png`` inside
    ``output_dir``, converted with ``convert_image_to_webp``, and the temporary
    PNG is removed afterwards, even if the conversion raises. Page numbers
    start at 1.

    Parameters
    ----------
    pdf_path : str
        Path to the input PDF file.
    output_dir : str
        Directory where the WebP images will be saved.
    dpi : int, optional
        DPI resolution for rendering PDF pages. Default is 300.

    Returns
    -------
    None
    """
    # Convert the PDF to a list of images, one per page
    images = convert_from_path(pdf_path, dpi=dpi)

    for page_number, image in enumerate(images, start=1):
        temp_png_path = os.path.join(output_dir, f"temp_page_{page_number}.png")
        output_path = os.path.join(output_dir, f"page_{page_number}.webp")

        try:
            # Intermediate PNG that convert_image_to_webp reads back from disk
            image.save(temp_png_path, format="PNG")
            convert_image_to_webp(temp_png_path, output_path)
        finally:
            # Never leave the intermediate PNG behind. The existence check
            # keeps a failed save from masking the original exception.
            if os.path.exists(temp_png_path):
                os.remove(temp_png_path)
|
||||
|
||||
|
||||
def main(pdf_image_path, output_dir, dpi=300, single_images_flag=False):
    """
    Dispatch between the per-page and the single-image conversion.

    Without ``single_images_flag`` the input is passed to ``pdf_to_webp``,
    which writes one WebP file per page. With the flag set, the input path is
    handed directly to ``convert_image_to_webp`` and written to
    ``combined_image.webp`` inside ``output_dir``; ``dpi`` is not used on
    that path.

    Parameters
    ----------
    pdf_image_path : str
        Path to the input PDF file or image.
    output_dir : str
        Directory where the WebP images will be saved.
    dpi : int, optional
        DPI resolution for rendering PDF pages. Default is 300.
    single_images_flag : bool, optional
        If True, produce a single WebP image. Default is False.

    Returns
    -------
    None
    """
    if not single_images_flag:
        # One WebP file per PDF page
        pdf_to_webp(pdf_image_path, output_dir, dpi)
        return

    # Single output file with a fixed name
    combined_target = os.path.join(output_dir, "combined_image.webp")
    convert_image_to_webp(pdf_image_path, combined_target)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the arguments, make sure the output
    # directory exists, then hand over to main().
    cli = argparse.ArgumentParser(description="Convert a PDF file to WebP images.")
    cli.add_argument("pdf_path", help="The path to the input PDF file.")
    cli.add_argument(
        "output_dir",
        help="The directory where the WebP images should be saved.",
    )
    cli.add_argument(
        "--dpi",
        default=300,
        type=int,
        help="The DPI resolution for rendering the PDF pages (default: 300).",
    )
    cli.add_argument(
        "--single",
        action="store_true",
        help="Combine all pages into a single WebP image.",
    )
    options = cli.parse_args()

    # Create the destination up front so the converters can write into it
    os.makedirs(options.output_dir, exist_ok=True)
    main(
        options.pdf_path,
        options.output_dir,
        dpi=options.dpi,
        single_images_flag=options.single,
    )
|
|
@ -166,6 +166,7 @@ public class EndpointConfiguration {
|
|||
addEndpointToGroup("Python", REMOVE_BLANKS);
|
||||
addEndpointToGroup("Python", "html-to-pdf");
|
||||
addEndpointToGroup("Python", "url-to-pdf");
|
||||
addEndpointToGroup("Python", "pdf-to-img");
|
||||
|
||||
// openCV
|
||||
addEndpointToGroup("OpenCV", "extract-image-scans");
|
||||
|
|
|
@ -1,11 +1,23 @@
|
|||
package stirling.software.SPDF.controller.api.converters;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.URLConnection;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.http.HttpHeaders;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
|
@ -21,6 +33,8 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
|||
import stirling.software.SPDF.model.api.converters.ConvertToImageRequest;
|
||||
import stirling.software.SPDF.model.api.converters.ConvertToPdfRequest;
|
||||
import stirling.software.SPDF.utils.PdfUtils;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
|
@ -60,15 +74,92 @@ public class ConvertImgPDFController {
|
|||
result =
|
||||
PdfUtils.convertFromPdf(
|
||||
pdfBytes,
|
||||
imageFormat.toUpperCase(),
|
||||
imageFormat.equalsIgnoreCase("webp") ? "png" : imageFormat.toUpperCase(),
|
||||
colorTypeResult,
|
||||
singleImage,
|
||||
Integer.valueOf(dpi),
|
||||
filename);
|
||||
|
||||
if (result == null || result.length == 0) {
|
||||
logger.error("resultant bytes for {} is null, error converting ", filename);
|
||||
}
|
||||
if (imageFormat.equalsIgnoreCase("webp")) {
|
||||
// Write the output stream to a temp file
|
||||
Path tempFile = Files.createTempFile("temp_png", ".png");
|
||||
try (FileOutputStream fos = new FileOutputStream(tempFile.toFile())) {
|
||||
fos.write(result);
|
||||
fos.flush();
|
||||
}
|
||||
|
||||
String pythonVersion = "python3";
|
||||
try {
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
|
||||
.runCommandWithOutputHandling(Arrays.asList("python3", "--version"));
|
||||
} catch (IOException e) {
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
|
||||
.runCommandWithOutputHandling(Arrays.asList("python", "--version"));
|
||||
pythonVersion = "python";
|
||||
}
|
||||
|
||||
List<String> command = new ArrayList<>();
|
||||
command.add(pythonVersion);
|
||||
command.add("./scripts/png_to_webp.py"); // Python script to handle the conversion
|
||||
|
||||
// Create a temporary directory for the output WebP files
|
||||
Path tempOutputDir = Files.createTempDirectory("webp_output");
|
||||
if (singleImage) {
|
||||
// Run the Python script to convert PNG to WebP
|
||||
command.add(tempFile.toString());
|
||||
command.add(tempOutputDir.toString());
|
||||
command.add("--single");
|
||||
} else {
|
||||
// Save the uploaded PDF to a temporary file
|
||||
Path tempPdfPath = Files.createTempFile("temp_pdf", ".pdf");
|
||||
file.transferTo(tempPdfPath.toFile());
|
||||
// Run the Python script to convert PDF to WebP
|
||||
command.add(tempPdfPath.toString());
|
||||
command.add(tempOutputDir.toString());
|
||||
}
|
||||
command.add("--dpi");
|
||||
command.add(dpi);
|
||||
ProcessExecutorResult resultProcess =
|
||||
ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV)
|
||||
.runCommandWithOutputHandling(command);
|
||||
|
||||
// Find all WebP files in the output directory
|
||||
List<Path> webpFiles =
|
||||
Files.walk(tempOutputDir)
|
||||
.filter(path -> path.toString().endsWith(".webp"))
|
||||
.collect(Collectors.toList());
|
||||
|
||||
if (webpFiles.isEmpty()) {
|
||||
logger.error("No WebP files were created in: {}", tempOutputDir.toString());
|
||||
throw new IOException("No WebP files were created. " + resultProcess.getMessages());
|
||||
}
|
||||
|
||||
byte[] bodyBytes = new byte[0];
|
||||
|
||||
if (webpFiles.size() == 1) {
|
||||
// Return the single WebP file directly
|
||||
Path webpFilePath = webpFiles.get(0);
|
||||
bodyBytes = Files.readAllBytes(webpFilePath);
|
||||
} else {
|
||||
// Create a ZIP file containing all WebP images
|
||||
ByteArrayOutputStream zipOutputStream = new ByteArrayOutputStream();
|
||||
try (ZipOutputStream zos = new ZipOutputStream(zipOutputStream)) {
|
||||
for (Path webpFile : webpFiles) {
|
||||
zos.putNextEntry(new ZipEntry(webpFile.getFileName().toString()));
|
||||
Files.copy(webpFile, zos);
|
||||
zos.closeEntry();
|
||||
}
|
||||
}
|
||||
bodyBytes = zipOutputStream.toByteArray();
|
||||
}
|
||||
// Clean up the temporary files
|
||||
Files.deleteIfExists(tempFile);
|
||||
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
|
||||
result = bodyBytes;
|
||||
}
|
||||
|
||||
if (singleImage) {
|
||||
String docName = filename + "." + imageFormat;
|
||||
MediaType mediaType = MediaType.parseMediaType(getMediaType(imageFormat));
|
||||
|
|
|
@ -21,14 +21,6 @@ public class ConverterWebController {
|
|||
return "convert/book-to-pdf";
|
||||
}
|
||||
|
||||
@ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}")
|
||||
@GetMapping("/pdf-to-book")
|
||||
@Hidden
|
||||
public String convertPdfToBookForm(Model model) {
|
||||
model.addAttribute("currentPage", "pdf-to-book");
|
||||
return "convert/pdf-to-book";
|
||||
}
|
||||
|
||||
@GetMapping("/img-to-pdf")
|
||||
@Hidden
|
||||
public String convertImgToPdfForm(Model model) {
|
||||
|
@ -57,13 +49,6 @@ public class ConverterWebController {
|
|||
return "convert/url-to-pdf";
|
||||
}
|
||||
|
||||
@GetMapping("/pdf-to-img")
|
||||
@Hidden
|
||||
public String pdfToimgForm(Model model) {
|
||||
model.addAttribute("currentPage", "pdf-to-img");
|
||||
return "convert/pdf-to-img";
|
||||
}
|
||||
|
||||
@GetMapping("/file-to-pdf")
|
||||
@Hidden
|
||||
public String convertToPdfForm(Model model) {
|
||||
|
@ -73,6 +58,21 @@ public class ConverterWebController {
|
|||
|
||||
// PDF TO......
|
||||
|
||||
@ConditionalOnExpression("#{bookAndHtmlFormatsInstalled}")
|
||||
@GetMapping("/pdf-to-book")
|
||||
@Hidden
|
||||
public String convertPdfToBookForm(Model model) {
|
||||
model.addAttribute("currentPage", "pdf-to-book");
|
||||
return "convert/pdf-to-book";
|
||||
}
|
||||
|
||||
@GetMapping("/pdf-to-img")
|
||||
@Hidden
|
||||
public String pdfToimgForm(Model model) {
|
||||
model.addAttribute("currentPage", "pdf-to-img");
|
||||
return "convert/pdf-to-img";
|
||||
}
|
||||
|
||||
@GetMapping("/pdf-to-html")
|
||||
@Hidden
|
||||
public ModelAndView pdfToHTML() {
|
||||
|
|
|
@ -12,7 +12,7 @@ public class ConvertToImageRequest extends PDFFile {
|
|||
|
||||
@Schema(
|
||||
description = "The output image format",
|
||||
allowableValues = {"png", "jpeg", "jpg", "gif"})
|
||||
allowableValues = {"png", "jpeg", "jpg", "gif", "webp"})
|
||||
private String imageFormat;
|
||||
|
||||
@Schema(
|
||||
|
|
|
@ -1147,4 +1147,4 @@ error.discordSubmit=Discord - Submit Support post
|
|||
removeImage.title=Remove image
|
||||
removeImage.header=Remove image
|
||||
removeImage.removeImage=Remove image
|
||||
removeImage.submit=Remove image
|
||||
removeImage.submit=Remove image
|
||||
|
|
|
@ -28,6 +28,7 @@
|
|||
<option value="gif">GIF</option>
|
||||
<option value="tiff">TIFF</option>
|
||||
<option value="bmp">BMP</option>
|
||||
<option value="webp">WEBP</option>
|
||||
</select>
|
||||
</div>
|
||||
<div class="mb-3">
|
||||
|
|
|
@ -335,7 +335,7 @@
|
|||
</div>
|
||||
</div>
|
||||
</li>
|
||||
|
||||
|
||||
<li class="nav-item dropdown">
|
||||
<a class="nav-link" href="#" id="searchDropdown" role="button" data-bs-toggle="dropdown" aria-haspopup="true" aria-expanded="false">
|
||||
<span class="material-symbols-rounded">
|
||||
|
|
Loading…
Reference in a new issue