From 15ad46fe1cb5cb14f83d4a9624db6f224550db9c Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Fri, 9 Feb 2024 23:24:25 +0000 Subject: [PATCH] book htmk --- .../SPDF/config/PostStartupProcesses.java | 27 +---- .../api/converters/ConvertPDFToOffice.java | 20 +++- .../software/SPDF/utils/FileToPdf.java | 104 ++++-------------- .../templates/convert/html-to-pdf.html | 59 +--------- .../templates/convert/pdf-to-text.html | 1 + 5 files changed, 45 insertions(+), 166 deletions(-) diff --git a/src/main/java/stirling/software/SPDF/config/PostStartupProcesses.java b/src/main/java/stirling/software/SPDF/config/PostStartupProcesses.java index 862e5f9e..e7b684ff 100644 --- a/src/main/java/stirling/software/SPDF/config/PostStartupProcesses.java +++ b/src/main/java/stirling/software/SPDF/config/PostStartupProcesses.java @@ -41,32 +41,13 @@ public class PostStartupProcesses { // Checking for DOCKER_INSTALL_BOOK_FORMATS environment variable if (bookFormatsInstalled) { List tmpList = new ArrayList<>(); - // Set up the timezone configuration commands - tmpList.addAll( - Arrays.asList( - "sh", - "-c", - "echo 'tzdata tzdata/Areas select Europe' | debconf-set-selections; " - + "echo 'tzdata tzdata/Zones/Europe select Berlin' | debconf-set-selections")); - commands.add(tmpList); - // Install calibre with DEBIAN_FRONTEND set to noninteractive tmpList = new ArrayList<>(); - tmpList.addAll( - Arrays.asList( - "sh", - "-c", - "DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends calibre")); + tmpList.addAll(Arrays.asList("whoami")); commands.add(tmpList); - } - // Checking for DOCKER_INSTALL_HTML_FORMATS environment variable - if (htmlFormatsInstalled) { - List tmpList = new ArrayList<>(); - // Add -y flag for automatic yes to prompts and --no-install-recommends to reduce size - tmpList.addAll( - Arrays.asList( - "apt-get", "install", "wkhtmltopdf", "-y", "--no-install-recommends")); + tmpList = new ArrayList<>(); + tmpList.addAll(Arrays.asList("id")); commands.add(tmpList); } @@ -74,8 +55,6 @@ public class PostStartupProcesses { // Run the command if (runningInDocker) { List tmpList = new ArrayList<>(); - tmpList.addAll(Arrays.asList("apt-get", "update")); - commands.add(0, tmpList); for (List list : commands) { ProcessExecutorResult returnCode = diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java index 74b292b5..94e33982 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertPDFToOffice.java @@ -2,13 +2,18 @@ package stirling.software.SPDF.controller.api.converters; import java.io.IOException; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.pdmodel.PDDocument; +import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; +import org.apache.pdfbox.text.PDFTextStripper; +import io.github.pixee.security.Filenames; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; @@ -17,6 +22,7 @@ import stirling.software.SPDF.model.api.converters.PdfToPresentationRequest; import stirling.software.SPDF.model.api.converters.PdfToTextOrRTFRequest; import stirling.software.SPDF.model.api.converters.PdfToWordRequest; import stirling.software.SPDF.utils.PDFToFile; +import stirling.software.SPDF.utils.WebResponseUtils; @RestController @RequestMapping("/api/v1/convert") @@ -59,9 +65,17 @@ public class ConvertPDFToOffice { throws IOException, InterruptedException { MultipartFile inputFile = request.getFileInput(); String outputFormat = request.getOutputFormat(); - - PDFToFile pdfToFile = new PDFToFile(); - return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import"); + if ("txt".equals(request.getOutputFormat())) { + try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { + PDFTextStripper stripper = new PDFTextStripper(); + String text = stripper.getText(document); + return WebResponseUtils.bytesToWebResponse(text.getBytes(), Filenames.toSimpleFileName(inputFile.getOriginalFilename()).replaceFirst("[.][^.]+$", "") + + ".txt" , MediaType.TEXT_PLAIN); + } + } else { + PDFToFile pdfToFile = new PDFToFile(); + return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import"); + } } @PostMapping(consumes = "multipart/form-data", value = "/pdf/word") diff --git a/src/main/java/stirling/software/SPDF/utils/FileToPdf.java b/src/main/java/stirling/software/SPDF/utils/FileToPdf.java index 90f7dbc3..84f2596a 100644 --- a/src/main/java/stirling/software/SPDF/utils/FileToPdf.java +++ b/src/main/java/stirling/software/SPDF/utils/FileToPdf.java @@ -1,8 +1,9 @@ package stirling.software.SPDF.utils; import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.FileWriter; import java.io.IOException; -import java.nio.charset.StandardCharsets; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; @@ -34,95 +35,40 @@ public class FileToPdf { tempInputFile = Files.createTempFile("input_", ".html"); Files.write(tempInputFile, fileBytes); } else { - tempInputFile = unzipAndGetMainHtml(fileBytes); + tempInputFile = Files.createTempFile("input_", ".zip"); + Files.write(tempInputFile, fileBytes); } List command = new ArrayList<>(); if (!htmlFormatsInstalled) { command.add("weasyprint"); - } else { - command.add("wkhtmltopdf"); - command.add("--enable-local-file-access"); - command.add("--load-error-handling"); - command.add("ignore"); - command.add("--load-media-error-handling"); - command.add("ignore"); - command.add("--zoom"); - command.add(String.valueOf(request.getZoom())); + command.add(tempInputFile.toString()); + command.add(tempOutputFile.toString()); + + } else { + command.add("ebook-convert"); + command.add(tempInputFile.toString()); + command.add(tempOutputFile.toString()); + command.add("--paper-size"); + command.add("a4"); - // if custom zoom add zoom style direct to html - // https://github.com/wkhtmltopdf/wkhtmltopdf/issues/4900 if (request.getZoom() != 1.0) { - String htmlContent = new String(Files.readAllBytes(tempInputFile)); - - String zoomStyle = ""; - // Check for tag, add style tag to associated tag - if (htmlContent.contains("")) { - htmlContent = htmlContent.replace("", "" + zoomStyle); - } else if (htmlContent.contains("")) { - // If no tag, but tag exists - htmlContent = htmlContent.replace("", "" + zoomStyle); - } else { - // If neither nor tags exist - htmlContent = zoomStyle + htmlContent; + // Create a temporary CSS file + File tempCssFile = File.createTempFile("customStyle", ".css"); + try (FileWriter writer = new FileWriter(tempCssFile)) { + // Write the CSS rule to the file + writer.write("body { zoom: " + request.getZoom() + "; }"); } - // rewrite new html to file - Files.write(tempInputFile, htmlContent.getBytes(StandardCharsets.UTF_8)); - } - - if (request.getPageWidth() != null) { - command.add("--page-width"); - command.add(request.getPageWidth() + "cm"); - } - - if (request.getPageHeight() != null) { - command.add("--page-height"); - command.add(request.getPageHeight() + "cm"); - } - - if (request.getMarginTop() != null) { - command.add("--margin-top"); - command.add(request.getMarginTop() + "mm"); - } - - // Repeat similar pattern for marginBottom, marginLeft, marginRight - - if ("Yes".equalsIgnoreCase(request.getPrintBackground())) { - command.add("--background"); - } else { - command.add("--no-background"); - } - - if ("Yes".equalsIgnoreCase(request.getDefaultHeader())) { - command.add("--default-header"); - } - - if ("print".equalsIgnoreCase(request.getCssMediaType())) { - command.add("--print-media-type"); - } else if ("screen".equalsIgnoreCase(request.getCssMediaType())) { - command.add("--no-print-media-type"); + command.add("--extra-css"); + command.add(tempCssFile.getAbsolutePath()); } } - command.add(tempInputFile.toString()); - command.add(tempOutputFile.toString()); ProcessExecutorResult returnCode; - if (fileName.endsWith(".zip")) { - if (htmlFormatsInstalled) { - // command.add(1, "--allow"); - // command.add(2, tempInputFile.getParent().toString()); - } - returnCode = - ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT) - .runCommandWithOutputHandling( - command, tempInputFile.getParent().toFile()); - } else { - - returnCode = - ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT) - .runCommandWithOutputHandling(command); - } + returnCode = + ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT) + .runCommandWithOutputHandling(command); pdfBytes = Files.readAllBytes(tempOutputFile); } catch (IOException e) { @@ -135,10 +81,6 @@ public class FileToPdf { // Clean up temporary files Files.delete(tempOutputFile); Files.delete(tempInputFile); - - if (fileName.endsWith(".zip")) { - GeneralUtils.deleteDirectory(tempInputFile.getParent()); - } } return pdfBytes; diff --git a/src/main/resources/templates/convert/html-to-pdf.html b/src/main/resources/templates/convert/html-to-pdf.html index cbb22b99..534bed5f 100644 --- a/src/main/resources/templates/convert/html-to-pdf.html +++ b/src/main/resources/templates/convert/html-to-pdf.html @@ -19,64 +19,7 @@ - -
- - -
- -
- - -
- -
- - -
- -
- - -
- -
- - -
- -
- - -
- -
- - -
- -
- - -
- - -
- - -
- - +
diff --git a/src/main/resources/templates/convert/pdf-to-text.html b/src/main/resources/templates/convert/pdf-to-text.html index 75b71d11..de649025 100644 --- a/src/main/resources/templates/convert/pdf-to-text.html +++ b/src/main/resources/templates/convert/pdf-to-text.html @@ -19,6 +19,7 @@