html and url to pdf init

2023-07-22 16:57:40 +01:00 · 2023-07-22 16:57:40 +01:00 · 4367ae7934
commit 4367ae7934
parent 749461334d
5 changed files with 228 additions and 69 deletions
--- a/2
+++ b/2
@ -29,7 +29,7 @@ RUN apt-get update && \
        libjpeg-dev && \
    pip install --upgrade pip && \
    pip install --no-cache-dir \
-        opencv-python-headless && \
+        opencv-python-headless WeasyPrint && \
    rm -rf /var/lib/apt/lists/*
 # Final stage: Copy necessary files from the previous stage
--- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java
@ -1,11 +1,12 @@
 package stirling.software.SPDF.controller.api.converters;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.ArrayList;
+import java.util.zip.ZipEntry;
-import java.util.List;
+import java.util.zip.ZipInputStream;
-
+import java.util.*;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.PostMapping;
 import org.springframework.web.bind.annotation.RequestPart;
@ -13,8 +14,8 @@ import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Parameter;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import stirling.software.SPDF.utils.GeneralUtils;
 import stirling.software.SPDF.utils.ProcessExecutor;
 import stirling.software.SPDF.utils.WebResponseUtils;
@ -22,45 +23,81 @@ import stirling.software.SPDF.utils.WebResponseUtils;
@Tag(name = "Convert", description = "Convert APIs")
 public class ConvertHtmlToPDF {
 	@PostMapping(consumes = "multipart/form-data", value = "/pdf-to-pdfa")
 	@Operation(
 	    summary = "Convert a PDF to a PDF/A",
 	    description = "This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO"
 	)
 	public ResponseEntity<byte[]> pdfToPdfA(
 	    @RequestPart(required = true, value = "fileInput")
 	    @Parameter(description = "The input PDF file to be converted to a PDF/A file", required = true)
 	        MultipartFile inputFile) throws IOException, InterruptedException {
-        // Save the uploaded file to a temporary location
+	 @PostMapping(consumes = "multipart/form-data", value = "/convert-to-pdf")
-        Path tempInputFile = Files.createTempFile("input_", ".pdf");
+	    @Operation(
-        inputFile.transferTo(tempInputFile.toFile());
+	        summary = "Convert an HTML or ZIP (containing HTML and CSS) to PDF",
 	        description = "This endpoint takes an HTML or ZIP file input and converts it to a PDF format."
 	    )
 	    public ResponseEntity<byte[]> HtmlToPdf(
 	            @RequestPart(required = true, value = "fileInput") MultipartFile fileInput) throws IOException, InterruptedException {
-        // Prepare the output file path
+	        if (fileInput == null) {
-        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
+	            throw new IllegalArgumentException("Please provide an HTML or ZIP file for conversion.");
 	        }
-        // Prepare the OCRmyPDF command
+	        String originalFilename = fileInput.getOriginalFilename();
-        List<String> command = new ArrayList<>();
+	        if (originalFilename == null || (!originalFilename.endsWith(".html") && !originalFilename.endsWith(".zip"))) {
-        command.add("ocrmypdf");
+	            throw new IllegalArgumentException("File must be either .html or .zip format.");
-        command.add("--skip-text");
+	        }
        command.add("--tesseract-timeout=0");
        command.add("--output-type");
        command.add("pdfa");
        command.add(tempInputFile.toString());
        command.add(tempOutputFile.toString());
-        int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
+	        Path tempOutputFile = Files.createTempFile("output_", ".pdf");
 	        Path tempInputFile;
-        // Read the optimized PDF file
+	        if (originalFilename.endsWith(".html")) {
-        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
+	            tempInputFile = Files.createTempFile("input_", ".html");
 	            Files.write(tempInputFile, fileInput.getBytes());
 	        } else {
 	            tempInputFile = unzipAndGetMainHtml(fileInput);
 	        }
-        // Clean up the temporary files
+	        List<String> command = new ArrayList<>();
-        Files.delete(tempInputFile);
+	        command.add("weasyprint");
-        Files.delete(tempOutputFile);
+	        command.add(tempInputFile.toString()); 
 	        command.add(tempOutputFile.toString());
 	        int returnCode = 0;
 	        if (originalFilename.endsWith(".zip")) {	        	
 	        	returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
                .runCommandWithOutputHandling(command, tempInputFile.getParent().toFile());
 	        } else {
-        // Return the optimized PDF as a response
+	        	returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
-        String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
+	                                        .runCommandWithOutputHandling(command);
-        return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
+	        }
 	        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
 	        // Clean up temporary files
 	        Files.delete(tempOutputFile);
 	        Files.delete(tempInputFile);
 	        if (originalFilename.endsWith(".zip")) {
 	        	GeneralUtils.deleteDirectory(tempInputFile.getParent());
 	        }
 	        String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf";  // Remove file extension and append .pdf
 	        return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
 	    }
    /**
     * Extracts an uploaded ZIP archive into a fresh temporary directory and returns
     * the path of the {@code index.html} expected at the root of that directory.
     *
     * <p>The archive is untrusted input, so every entry name is resolved against the
     * extraction root and rejected if it would land outside of it ("Zip Slip").
     * Parent directories are created on demand because archives frequently omit
     * explicit directory entries. If extraction fails, the temporary directory is
     * removed before the exception is rethrown.
     *
     * <p>The existence of {@code index.html} is not verified here.
     *
     * @param zipFile the uploaded ZIP archive
     * @return the path {@code <temp directory>/index.html}; the caller owns the
     *         returned file's parent directory and must delete it
     * @throws IOException if the archive cannot be read, an entry escapes the
     *                     extraction directory, or a file cannot be written
     */
    private Path unzipAndGetMainHtml(MultipartFile zipFile) throws IOException {
        Path tempDirectory = Files.createTempDirectory("unzipped_");
        // Normalised absolute root used only for the containment check below.
        Path extractionRoot = tempDirectory.toAbsolutePath().normalize();
        try (ZipInputStream zipIn = new ZipInputStream(new ByteArrayInputStream(zipFile.getBytes()))) {
            ZipEntry entry;
            while ((entry = zipIn.getNextEntry()) != null) {
                Path filePath = extractionRoot.resolve(entry.getName()).normalize();
                if (!filePath.startsWith(extractionRoot)) {
                    throw new IOException("ZIP entry is outside of the target directory: " + entry.getName());
                }
                if (entry.isDirectory()) {
                    Files.createDirectories(filePath);
                } else {
                    // The archive may not contain an entry for the enclosing folder.
                    Files.createDirectories(filePath.getParent());
                    Files.copy(zipIn, filePath);
                }
                zipIn.closeEntry();
            }
        } catch (IOException | RuntimeException e) {
            // Do not leave a half-extracted archive behind.
            try {
                GeneralUtils.deleteDirectory(tempDirectory);
            } catch (IOException cleanupFailure) {
                e.addSuppressed(cleanupFailure);
            }
            throw e;
        }
        return tempDirectory.resolve("index.html");
    }
 }
--- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertWebsiteToPDF.java
@ -0,0 +1,73 @@
 package stirling.software.SPDF.controller.api.converters;
 import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.PostMapping;
 import org.springframework.web.bind.annotation.RequestPart;
 import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Parameter;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import stirling.software.SPDF.utils.GeneralUtils;
 import stirling.software.SPDF.utils.ProcessExecutor;
 import stirling.software.SPDF.utils.WebResponseUtils;
/**
 * REST controller that converts a remote web page to PDF by invoking the
 * external {@code weasyprint} command.
 */
@RestController
@Tag(name = "Convert", description = "Convert APIs")
 public class ConvertWebsiteToPDF {

 	/**
 	 * Fetches the page at the given URL with {@code weasyprint} and returns the
 	 * rendered PDF.
 	 *
 	 * @param URL the address to render; must start with {@code http://} or
 	 *            {@code https://} and be parseable as a URL
 	 * @return the PDF bytes wrapped in a web response whose filename is derived from the URL
 	 * @throws IllegalArgumentException if the URL is not a well-formed http(s) URL
 	 * @throws IOException              if the temporary file cannot be created, read or deleted
 	 * @throws InterruptedException     if the thread is interrupted while running the command
 	 */
 	@PostMapping(consumes = "multipart/form-data", value = "/url-to-pdf")
 	@Operation(
 	    summary = "Convert a URL to a PDF",
 	    description = "This endpoint fetches content from a URL and converts it to a PDF format."
 	)
 	public ResponseEntity<byte[]> urlToPdf(
 	    @RequestPart(required = true, value = "urlInput")
 	    @Parameter(description = "The input URL to be converted to a PDF file", required = true)
 	        String URL) throws IOException, InterruptedException {
 	    // The value is passed straight to an external command, so it must be rejected
 	    // unless it is BOTH an http(s) address AND a parseable URL. Anything else
 	    // (local paths, other schemes, option-like strings) never reaches weasyprint.
 	    if (!URL.matches("^https?://.*") || !GeneralUtils.isValidURL(URL)) {
 	        throw new IllegalArgumentException("Invalid URL format provided.");
 	    }

 	    // weasyprint writes its result here; removed in the finally block on every path.
 	    Path tempOutputFile = Files.createTempFile("output_", ".pdf");
 	    try {
 	        // Command line: weasyprint <url> <output.pdf>
 	        List<String> command = new ArrayList<>();
 	        command.add("weasyprint");
 	        command.add(URL);
 	        command.add(tempOutputFile.toString());

 	        // NOTE(review): the exit code is not inspected here (it was not in the
 	        // original either) - confirm ProcessExecutor reports failures itself.
 	        ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT).runCommandWithOutputHandling(command);

 	        byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
 	        String outputFilename = convertURLToFileName(URL);
 	        return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
 	    } finally {
 	        Files.deleteIfExists(tempOutputFile);
 	    }
 	}

 	/**
 	 * Derives a filesystem-safe PDF filename from a URL: every character other
 	 * than ASCII letters and digits becomes {@code _}, the result is capped at
 	 * 50 characters, and {@code .pdf} is appended.
 	 *
 	 * @param url the URL to derive the name from
 	 * @return the sanitised filename ending in {@code .pdf}
 	 */
 	private String convertURLToFileName(String url) {
 	    // Shared sanitiser applies the same replace-and-truncate rule.
 	    return GeneralUtils.convertToFileName(url) + ".pdf";
 	}
 }
--- a/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
+++ b/src/main/java/stirling/software/SPDF/utils/GeneralUtils.java
@ -1,14 +1,54 @@
 package stirling.software.SPDF.utils;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.nio.file.FileVisitResult;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.SimpleFileVisitor;
 import java.nio.file.attribute.BasicFileAttributes;
 import java.util.ArrayList;
 import java.util.List;
 public class GeneralUtils {
 	/**
 	 * Recursively deletes a directory tree: every file first, then each directory
 	 * once its contents have been visited.
 	 *
 	 * @param path root of the tree to delete
 	 * @throws IOException if a file or directory cannot be deleted, or if iterating
 	 *                     a directory failed part-way through
 	 */
 	public static void deleteDirectory(Path path) throws IOException {
        Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
            @Override
            public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
                Files.delete(file);
                return FileVisitResult.CONTINUE;
            }
            @Override
            public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
                // A non-null exc means listing this directory was cut short; surface
                // that error instead of attempting to delete the directory anyway.
                if (exc != null) {
                    throw exc;
                }
                Files.delete(dir);
                return FileVisitResult.CONTINUE;
            }
        });
    }
 	/**
 	 * Turns an arbitrary string into a filename-safe token.
 	 *
 	 * <p>Every character that is not an ASCII letter or digit is replaced by an
 	 * underscore, and the result is cut to at most 50 characters.
 	 *
 	 * @param name the text to sanitise
 	 * @return the sanitised text, never longer than 50 characters
 	 */
 	public static String convertToFileName(String name) {
        String sanitized = name.replaceAll("[^a-zA-Z0-9]", "_");
        return sanitized.length() > 50 ? sanitized.substring(0, 50) : sanitized;
    }
 	/**
 	 * Reports whether the given text can be parsed by {@link URL}.
 	 *
 	 * @param urlStr the candidate URL text
 	 * @return {@code true} if constructing a {@code URL} from the text succeeds,
 	 *         {@code false} if it raises {@link MalformedURLException}
 	 */
 	public static boolean isValidURL(String urlStr) {
 	    boolean parsed;
 	    try {
 	        new URL(urlStr);
 	        parsed = true;
 	    } catch (MalformedURLException malformed) {
 	        parsed = false;
 	    }
 	    return parsed;
 	}
 	public static Long convertSizeToBytes(String sizeStr) {
 	    if (sizeStr == null) {
 	        return null;
--- a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
+++ b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
@ -1,6 +1,7 @@
 package stirling.software.SPDF.utils;
 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
@ -13,7 +14,7 @@ import java.util.concurrent.Semaphore;
 public class ProcessExecutor {
    public enum Processes {
-        LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT
+        LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT, WEASYPRINT
    }
    private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
@ -25,6 +26,7 @@ public class ProcessExecutor {
            case OCR_MY_PDF -> 2;
            case PYTHON_OPENCV -> 8;
            case GHOSTSCRIPT -> 16;
            case WEASYPRINT -> 16;
            };
            return new ProcessExecutor(semaphoreLimit);
        });
@ -35,14 +37,21 @@ public class ProcessExecutor {
    /**
     * Creates an executor whose semaphore is initialised with the given number of permits.
     *
     * @param semaphoreLimit number of permits handed to the semaphore
     */
    private ProcessExecutor(int semaphoreLimit) {
        this.semaphore = new Semaphore(semaphoreLimit);
    }
    /**
     * Runs the given command without specifying a working directory.
     *
     * <p>Delegates to the two-argument overload with a {@code null} working directory.
     *
     * @param command the command and its arguments
     * @return the value returned by the two-argument overload
     * @throws IOException          propagated from the two-argument overload
     * @throws InterruptedException propagated from the two-argument overload
     */
    public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
        // null means "no explicit working directory" for the overload.
        final File workingDirectory = null;
        return runCommandWithOutputHandling(command, workingDirectory);
    }
    public int runCommandWithOutputHandling(List<String> command, File workingDirectory) throws IOException, InterruptedException {
        int exitCode = 1;
        semaphore.acquire();
        try {
            System.out.print("Running command: " + String.join(" ", command));
            ProcessBuilder processBuilder = new ProcessBuilder(command);
            // Use the working directory if it's set
            if (workingDirectory != null) {
                processBuilder.directory(workingDirectory);
            }
            Process process = processBuilder.start();
            // Read the error stream and standard output stream concurrently