html and url to pdf init
This commit is contained in:
parent
749461334d
commit
4367ae7934
5 changed files with 228 additions and 69 deletions
|
@ -29,7 +29,7 @@ RUN apt-get update && \
|
||||||
libjpeg-dev && \
|
libjpeg-dev && \
|
||||||
pip install --upgrade pip && \
|
pip install --upgrade pip && \
|
||||||
pip install --no-cache-dir \
|
pip install --no-cache-dir \
|
||||||
opencv-python-headless && \
|
opencv-python-headless WeasyPrint && \
|
||||||
rm -rf /var/lib/apt/lists/*
|
rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Final stage: Copy necessary files from the previous stage
|
# Final stage: Copy necessary files from the previous stage
|
||||||
|
|
|
@ -1,11 +1,12 @@
|
||||||
package stirling.software.SPDF.controller.api.converters;
|
package stirling.software.SPDF.controller.api.converters;
|
||||||
|
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.zip.ZipEntry;
|
||||||
import java.util.List;
|
import java.util.zip.ZipInputStream;
|
||||||
|
import java.util.*;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.web.bind.annotation.PostMapping;
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
import org.springframework.web.bind.annotation.RequestPart;
|
import org.springframework.web.bind.annotation.RequestPart;
|
||||||
|
@ -13,8 +14,8 @@ import org.springframework.web.bind.annotation.RestController;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import io.swagger.v3.oas.annotations.Operation;
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
import io.swagger.v3.oas.annotations.Parameter;
|
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
import stirling.software.SPDF.utils.GeneralUtils;
|
||||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||||
|
|
||||||
|
@ -22,45 +23,81 @@ import stirling.software.SPDF.utils.WebResponseUtils;
|
||||||
@Tag(name = "Convert", description = "Convert APIs")
|
@Tag(name = "Convert", description = "Convert APIs")
|
||||||
public class ConvertHtmlToPDF {
|
public class ConvertHtmlToPDF {
|
||||||
|
|
||||||
@PostMapping(consumes = "multipart/form-data", value = "/pdf-to-pdfa")
|
|
||||||
|
@PostMapping(consumes = "multipart/form-data", value = "/convert-to-pdf")
|
||||||
@Operation(
|
@Operation(
|
||||||
summary = "Convert a PDF to a PDF/A",
|
summary = "Convert an HTML or ZIP (containing HTML and CSS) to PDF",
|
||||||
description = "This endpoint converts a PDF file to a PDF/A file. PDF/A is a format designed for long-term archiving of digital documents. Input:PDF Output:PDF Type:SISO"
|
description = "This endpoint takes an HTML or ZIP file input and converts it to a PDF format."
|
||||||
)
|
)
|
||||||
public ResponseEntity<byte[]> pdfToPdfA(
|
public ResponseEntity<byte[]> HtmlToPdf(
|
||||||
@RequestPart(required = true, value = "fileInput")
|
@RequestPart(required = true, value = "fileInput") MultipartFile fileInput) throws IOException, InterruptedException {
|
||||||
@Parameter(description = "The input PDF file to be converted to a PDF/A file", required = true)
|
|
||||||
MultipartFile inputFile) throws IOException, InterruptedException {
|
|
||||||
|
|
||||||
// Save the uploaded file to a temporary location
|
if (fileInput == null) {
|
||||||
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
throw new IllegalArgumentException("Please provide an HTML or ZIP file for conversion.");
|
||||||
inputFile.transferTo(tempInputFile.toFile());
|
}
|
||||||
|
|
||||||
|
String originalFilename = fileInput.getOriginalFilename();
|
||||||
|
if (originalFilename == null || (!originalFilename.endsWith(".html") && !originalFilename.endsWith(".zip"))) {
|
||||||
|
throw new IllegalArgumentException("File must be either .html or .zip format.");
|
||||||
|
}
|
||||||
|
|
||||||
// Prepare the output file path
|
|
||||||
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||||
|
Path tempInputFile;
|
||||||
|
|
||||||
|
if (originalFilename.endsWith(".html")) {
|
||||||
|
tempInputFile = Files.createTempFile("input_", ".html");
|
||||||
|
Files.write(tempInputFile, fileInput.getBytes());
|
||||||
|
} else {
|
||||||
|
tempInputFile = unzipAndGetMainHtml(fileInput);
|
||||||
|
}
|
||||||
|
|
||||||
// Prepare the OCRmyPDF command
|
|
||||||
List<String> command = new ArrayList<>();
|
List<String> command = new ArrayList<>();
|
||||||
command.add("ocrmypdf");
|
command.add("weasyprint");
|
||||||
command.add("--skip-text");
|
|
||||||
command.add("--tesseract-timeout=0");
|
|
||||||
command.add("--output-type");
|
|
||||||
command.add("pdfa");
|
|
||||||
command.add(tempInputFile.toString());
|
command.add(tempInputFile.toString());
|
||||||
command.add(tempOutputFile.toString());
|
command.add(tempOutputFile.toString());
|
||||||
|
int returnCode = 0;
|
||||||
|
if (originalFilename.endsWith(".zip")) {
|
||||||
|
returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
|
||||||
|
.runCommandWithOutputHandling(command, tempInputFile.getParent().toFile());
|
||||||
|
} else {
|
||||||
|
|
||||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT)
|
||||||
|
.runCommandWithOutputHandling(command);
|
||||||
|
}
|
||||||
|
|
||||||
// Read the optimized PDF file
|
|
||||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||||
|
|
||||||
// Clean up the temporary files
|
// Clean up temporary files
|
||||||
Files.delete(tempInputFile);
|
|
||||||
Files.delete(tempOutputFile);
|
Files.delete(tempOutputFile);
|
||||||
|
Files.delete(tempInputFile);
|
||||||
|
if (originalFilename.endsWith(".zip")) {
|
||||||
|
GeneralUtils.deleteDirectory(tempInputFile.getParent());
|
||||||
|
}
|
||||||
|
|
||||||
// Return the optimized PDF as a response
|
String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf"; // Remove file extension and append .pdf
|
||||||
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
|
|
||||||
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
private Path unzipAndGetMainHtml(MultipartFile zipFile) throws IOException {
|
||||||
|
Path tempDirectory = Files.createTempDirectory("unzipped_");
|
||||||
|
try (ZipInputStream zipIn = new ZipInputStream(new ByteArrayInputStream(zipFile.getBytes()))) {
|
||||||
|
ZipEntry entry = zipIn.getNextEntry();
|
||||||
|
while (entry != null) {
|
||||||
|
Path filePath = tempDirectory.resolve(entry.getName());
|
||||||
|
if (!entry.isDirectory()) {
|
||||||
|
Files.copy(zipIn, filePath);
|
||||||
|
}
|
||||||
|
zipIn.closeEntry();
|
||||||
|
entry = zipIn.getNextEntry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return tempDirectory.resolve("index.html");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,73 @@
|
||||||
|
package stirling.software.SPDF.controller.api.converters;
|
||||||
|
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.List;
|
||||||
|
|
||||||
|
import org.springframework.http.ResponseEntity;
|
||||||
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestPart;
|
||||||
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
|
import io.swagger.v3.oas.annotations.Parameter;
|
||||||
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
|
import stirling.software.SPDF.utils.GeneralUtils;
|
||||||
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
|
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||||
|
|
||||||
|
@RestController
|
||||||
|
@Tag(name = "Convert", description = "Convert APIs")
|
||||||
|
public class ConvertWebsiteToPDF {
|
||||||
|
|
||||||
|
@PostMapping(consumes = "multipart/form-data", value = "/url-to-pdf")
|
||||||
|
@Operation(
|
||||||
|
summary = "Convert a URL to a PDF",
|
||||||
|
description = "This endpoint fetches content from a URL and converts it to a PDF format."
|
||||||
|
)
|
||||||
|
public ResponseEntity<byte[]> urlToPdf(
|
||||||
|
@RequestPart(required = true, value = "urlInput")
|
||||||
|
@Parameter(description = "The input URL to be converted to a PDF file", required = true)
|
||||||
|
String URL) throws IOException, InterruptedException {
|
||||||
|
|
||||||
|
// Validate the URL format
|
||||||
|
if(!URL.matches("^https?://.*") && GeneralUtils.isValidURL(URL)) {
|
||||||
|
throw new IllegalArgumentException("Invalid URL format provided.");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Prepare the output file path
|
||||||
|
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
||||||
|
|
||||||
|
// Prepare the OCRmyPDF command
|
||||||
|
List<String> command = new ArrayList<>();
|
||||||
|
command.add("weasyprint");
|
||||||
|
command.add(URL);
|
||||||
|
command.add(tempOutputFile.toString());
|
||||||
|
|
||||||
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
// Read the optimized PDF file
|
||||||
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||||
|
|
||||||
|
// Clean up the temporary files
|
||||||
|
Files.delete(tempOutputFile);
|
||||||
|
|
||||||
|
// Convert URL to a safe filename
|
||||||
|
String outputFilename = convertURLToFileName(URL);
|
||||||
|
|
||||||
|
return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||||
|
}
|
||||||
|
|
||||||
|
private String convertURLToFileName(String url) {
|
||||||
|
String safeName = url.replaceAll("[^a-zA-Z0-9]", "_");
|
||||||
|
if(safeName.length() > 50) {
|
||||||
|
safeName = safeName.substring(0, 50); // restrict to 50 characters
|
||||||
|
}
|
||||||
|
return safeName + ".pdf";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -1,14 +1,54 @@
|
||||||
package stirling.software.SPDF.utils;
|
package stirling.software.SPDF.utils;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.net.MalformedURLException;
|
||||||
|
import java.net.URL;
|
||||||
|
import java.nio.file.FileVisitResult;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.nio.file.Paths;
|
import java.nio.file.Paths;
|
||||||
|
import java.nio.file.SimpleFileVisitor;
|
||||||
|
import java.nio.file.attribute.BasicFileAttributes;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
public class GeneralUtils {
|
public class GeneralUtils {
|
||||||
|
|
||||||
|
public static void deleteDirectory(Path path) throws IOException {
|
||||||
|
Files.walkFileTree(path, new SimpleFileVisitor<Path>() {
|
||||||
|
@Override
|
||||||
|
public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException {
|
||||||
|
Files.delete(file);
|
||||||
|
return FileVisitResult.CONTINUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException {
|
||||||
|
Files.delete(dir);
|
||||||
|
return FileVisitResult.CONTINUE;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
public static String convertToFileName(String name) {
|
||||||
|
String safeName = name.replaceAll("[^a-zA-Z0-9]", "_");
|
||||||
|
if (safeName.length() > 50) {
|
||||||
|
safeName = safeName.substring(0, 50);
|
||||||
|
}
|
||||||
|
return safeName;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static boolean isValidURL(String urlStr) {
|
||||||
|
try {
|
||||||
|
new URL(urlStr);
|
||||||
|
return true;
|
||||||
|
} catch (MalformedURLException e) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
public static Long convertSizeToBytes(String sizeStr) {
|
public static Long convertSizeToBytes(String sizeStr) {
|
||||||
if (sizeStr == null) {
|
if (sizeStr == null) {
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -1,6 +1,7 @@
|
||||||
package stirling.software.SPDF.utils;
|
package stirling.software.SPDF.utils;
|
||||||
|
|
||||||
import java.io.BufferedReader;
|
import java.io.BufferedReader;
|
||||||
|
import java.io.File;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStreamReader;
|
import java.io.InputStreamReader;
|
||||||
import java.nio.charset.StandardCharsets;
|
import java.nio.charset.StandardCharsets;
|
||||||
|
@ -13,7 +14,7 @@ import java.util.concurrent.Semaphore;
|
||||||
public class ProcessExecutor {
|
public class ProcessExecutor {
|
||||||
|
|
||||||
public enum Processes {
|
public enum Processes {
|
||||||
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT
|
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT, WEASYPRINT
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
||||||
|
@ -25,6 +26,7 @@ public class ProcessExecutor {
|
||||||
case OCR_MY_PDF -> 2;
|
case OCR_MY_PDF -> 2;
|
||||||
case PYTHON_OPENCV -> 8;
|
case PYTHON_OPENCV -> 8;
|
||||||
case GHOSTSCRIPT -> 16;
|
case GHOSTSCRIPT -> 16;
|
||||||
|
case WEASYPRINT -> 16;
|
||||||
};
|
};
|
||||||
return new ProcessExecutor(semaphoreLimit);
|
return new ProcessExecutor(semaphoreLimit);
|
||||||
});
|
});
|
||||||
|
@ -35,14 +37,21 @@ public class ProcessExecutor {
|
||||||
private ProcessExecutor(int semaphoreLimit) {
|
private ProcessExecutor(int semaphoreLimit) {
|
||||||
this.semaphore = new Semaphore(semaphoreLimit);
|
this.semaphore = new Semaphore(semaphoreLimit);
|
||||||
}
|
}
|
||||||
|
|
||||||
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
public int runCommandWithOutputHandling(List<String> command) throws IOException, InterruptedException {
|
||||||
|
return runCommandWithOutputHandling(command, null);
|
||||||
|
}
|
||||||
|
public int runCommandWithOutputHandling(List<String> command, File workingDirectory) throws IOException, InterruptedException {
|
||||||
int exitCode = 1;
|
int exitCode = 1;
|
||||||
semaphore.acquire();
|
semaphore.acquire();
|
||||||
try {
|
try {
|
||||||
|
|
||||||
System.out.print("Running command: " + String.join(" ", command));
|
System.out.print("Running command: " + String.join(" ", command));
|
||||||
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
ProcessBuilder processBuilder = new ProcessBuilder(command);
|
||||||
|
|
||||||
|
// Use the working directory if it's set
|
||||||
|
if (workingDirectory != null) {
|
||||||
|
processBuilder.directory(workingDirectory);
|
||||||
|
}
|
||||||
Process process = processBuilder.start();
|
Process process = processBuilder.start();
|
||||||
|
|
||||||
// Read the error stream and standard output stream concurrently
|
// Read the error stream and standard output stream concurrently
|
||||||
|
|
Loading…
Reference in a new issue