From cd0e1a3962c7dc4089f610a9cf4da5db8b3a1e86 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Thu, 24 Aug 2023 23:23:25 +0100 Subject: [PATCH] redact --- README.md | 6 +- .../api/security/RedactController.java | 106 ++++++++++++++++++ .../controller/web/SecurityWebController.java | 6 + .../stirling/software/SPDF/model/PDFText.java | 42 +++++++ .../software/SPDF/pdf/TextFinder.java | 91 +++++++++++++++ src/main/resources/messages_en_GB.properties | 4 + .../templates/security/auto-redact.html | 56 +++++++++ 7 files changed, 310 insertions(+), 1 deletion(-) create mode 100644 src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java create mode 100644 src/main/java/stirling/software/SPDF/model/PDFText.java create mode 100644 src/main/java/stirling/software/SPDF/pdf/TextFinder.java create mode 100644 src/main/resources/templates/security/auto-redact.html diff --git a/README.md b/README.md index a0a0dd50..54b654a1 100644 --- a/README.md +++ b/README.md @@ -208,7 +208,11 @@ For API usage you must provide a header with 'X-API-Key' and the associated API - Progress bar/Tracking - Full custom logic pipelines to combine multiple operations together. - Folder support with auto scanning to perform operations on -- Redact sections of pages +- Redact text (not au +- Add Forms +- Annotations +- Multi page layout (Stitch PDF pages together) support x rows y columns and custom page sizing +- Fill forms manual and automatic ### Q2: Why is my application downloading .htm files? This is a issue caused commonly by your NGINX congifuration. The default file upload size for NGINX is 1MB, you need to add the following in your Nginx sites-available file. client_max_body_size SIZE; Where "SIZE" is 50M for example for 50MB files. 
diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java b/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
new file mode 100644
index 00000000..863c31cd
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
@@ -0,0 +1,142 @@
+package stirling.software.SPDF.controller.api.security;
+
+import java.awt.Color;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
+import io.swagger.v3.oas.annotations.media.Schema;
+import io.swagger.v3.oas.annotations.tags.Tag;
+import stirling.software.SPDF.model.PDFText;
+import stirling.software.SPDF.pdf.TextFinder;
+import stirling.software.SPDF.utils.WebResponseUtils;
+
+/**
+ * REST endpoint that blacks out every occurrence of the requested text in an uploaded PDF.
+ */
+@RestController
+@Tag(name = "Security", description = "Security APIs")
+public class RedactController {
+
+    private static final Logger logger = LoggerFactory.getLogger(RedactController.class);
+
+    /** Resolution used when the redacted pages are flattened to images. */
+    private static final int RENDER_DPI = 300;
+
+    /**
+     * Redacts every occurrence of the supplied search terms in the uploaded PDF.
+     *
+     * @param file PDF to redact
+     * @param listOfTextString search terms, one per line; blank lines are ignored
+     * @param useRegex treat each term as a regular expression
+     * @param wholeWordSearchBool passed to {@link TextFinder} to restrict matches to whole words
+     * @param customPadding extra padding added above and below each redaction box; 0 when omitted
+     * @param convertPDFToImage replace every page by a rendered image of itself after redaction
+     * @return the redacted PDF, named after the upload with a {@code _redacted.pdf} suffix
+     * @throws Exception if the PDF cannot be read, searched or written
+     */
+    @PostMapping(value = "/auto-redact", consumes = "multipart/form-data")
+    @Operation(summary = "Redacts listOfText in a PDF document",
+            description = "This operation takes an input PDF file and redacts the provided listOfText. Input:PDF, Output:PDF, Type:SISO")
+    public ResponseEntity<byte[]> redactPdf(
+            @Parameter(description = "The input PDF file", required = true) @RequestParam("fileInput") MultipartFile file,
+            @Parameter(description = "List of listOfText to redact from the PDF", required = true, schema = @Schema(type = "string")) @RequestParam("listOfText") String listOfTextString,
+            @RequestParam(value = "useRegex", required = false) boolean useRegex,
+            @RequestParam(value = "wholeWordSearch", required = false) boolean wholeWordSearchBool,
+            // A primitive float cannot receive null, so an omitted parameter needs an explicit default.
+            @RequestParam(value = "customPadding", required = false, defaultValue = "0") float customPadding,
+            @RequestParam(value = "convertPDFToImage", required = false) boolean convertPDFToImage) throws Exception {
+
+        byte[] pdfContent;
+        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(file.getBytes()))) {
+            for (String line : listOfTextString.split("\n")) {
+                String text = line.trim();
+                if (text.isEmpty()) {
+                    // Blank lines carry no search term; an empty pattern would match at every position.
+                    continue;
+                }
+                TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
+                List<PDFText> foundTexts = textFinder.getTextLocations(document);
+                logger.debug("Redacting {} occurrence(s) of a search term", foundTexts.size());
+                redactFoundText(document, foundTexts, customPadding);
+            }
+            pdfContent = convertPDFToImage ? renderAsImagePdf(document) : toBytes(document);
+        }
+
+        String originalName = file.getOriginalFilename();
+        String baseName = originalName == null ? "document" : originalName.replaceFirst("[.][^.]+$", "");
+        return WebResponseUtils.bytesToWebResponse(pdfContent, baseName + "_redacted.pdf");
+    }
+
+    /** Renders every page of {@code document} to an RGB image and returns a PDF built from those images. */
+    private byte[] renderAsImagePdf(PDDocument document) throws IOException {
+        try (PDDocument imageDocument = new PDDocument()) {
+            PDFRenderer pdfRenderer = new PDFRenderer(document);
+            for (int page = 0; page < document.getNumberOfPages(); ++page) {
+                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, RENDER_DPI, ImageType.RGB);
+                // NOTE(review): the new page is sized in image pixels, so it is larger than the source page.
+                PDPage newPage = new PDPage(new PDRectangle(bim.getWidth(), bim.getHeight()));
+                imageDocument.addPage(newPage);
+                PDImageXObject pdImage = LosslessFactory.createFromImage(imageDocument, bim);
+                try (PDPageContentStream contentStream = new PDPageContentStream(imageDocument, newPage)) {
+                    contentStream.drawImage(pdImage, 0, 0);
+                }
+            }
+            return toBytes(imageDocument);
+        }
+    }
+
+    /** Serialises {@code document} into a byte array; the caller remains responsible for closing it. */
+    private static byte[] toBytes(PDDocument document) throws IOException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        document.save(baos);
+        return baos.toByteArray();
+    }
+
+    /**
+     * Paints a filled black rectangle over every located text block.
+     *
+     * @param document document the blocks were found in
+     * @param blocks text locations to cover
+     * @param customPadding padding added on top of the default 30% of the block height
+     * @throws IOException if a page content stream cannot be written
+     */
+    private void redactFoundText(PDDocument document, List<PDFText> blocks, float customPadding) throws IOException {
+        var allPages = document.getDocumentCatalog().getPages();
+
+        for (PDFText block : blocks) {
+            var page = allPages.get(block.getPageIndex());
+            float blockHeight = block.getY2() - block.getY1();
+            float padding = blockHeight * 0.3f + customPadding;
+            PDRectangle pageBox = page.getBBox();
+            // Append to the page so the rectangle is added to the existing content instead of replacing it.
+            try (PDPageContentStream contentStream = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
+                contentStream.setNonStrokingColor(Color.BLACK);
+                // NOTE(review): Y1 is subtracted from the page height, so the finder's Y values are presumably measured from the top of the page - confirm.
+                contentStream.addRect(block.getX1(), pageBox.getHeight() - block.getY1() - padding, block.getX2() - block.getX1(), blockHeight + 2 * padding);
+                contentStream.fill();
+            }
+        }
+    }
+
+}
diff --git a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java index 3857cc9e..2cbf245f 100644 --- a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java +++ b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java @@ -11,6 +11,12 @@ import io.swagger.v3.oas.annotations.tags.Tag; @Tag(name = "Security", description = "Security APIs") public class SecurityWebController { + @GetMapping("/auto-redact") + @Hidden + public String autoRedactForm(Model model) { + model.addAttribute("currentPage", "auto-redact"); + return "security/auto-redact"; + } @GetMapping("/add-password") @Hidden diff --git a/src/main/java/stirling/software/SPDF/model/PDFText.java b/src/main/java/stirling/software/SPDF/model/PDFText.java new file mode 
100644
index 00000000..9a4909d0
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/model/PDFText.java
@@ -0,0 +1,42 @@
+package stirling.software.SPDF.model;
+public class PDFText {
+    private final int pageIndex;
+    private final float x1;
+    private final float y1;
+    private final float x2;
+    private final float y2;
+    private final String text;
+
+    public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
+        this.pageIndex = pageIndex;
+        this.x1 = x1;
+        this.y1 = y1;
+        this.x2 = x2;
+        this.y2 = y2;
+        this.text = text;
+    }
+
+    public int getPageIndex() {
+        return pageIndex;
+    }
+
+    public float getX1() {
+        return x1;
+    }
+
+    public float getY1() {
+        return y1;
+    }
+
+    public float getX2() {
+        return x2;
+    }
+
+    public float getY2() {
+        return y2;
+    }
+
+    public String getText() {
+        return text;
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/stirling/software/SPDF/pdf/TextFinder.java b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
new file mode 100644
index 00000000..ee84f177
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
@@ -0,0 +1,95 @@
+package stirling.software.SPDF.pdf;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+
+import stirling.software.SPDF.model.PDFText;
+
+/**
+ * Text stripper that records the bounding box of every occurrence of a search term.
+ */
+public class TextFinder extends PDFTextStripper {
+
+    /** Delimiters accepted on either side of a whole-word match. */
+    private static final String WORD_EDGE = "(?:\\b|_|\\.)";
+
+    private final Pattern pattern;
+    /** Group holding the term itself: 1 when wrapped by word edges, otherwise 0 (the whole match). */
+    private final int termGroup;
+    private final List<PDFText> textOccurrences = new ArrayList<>();
+
+    /**
+     * Creates a finder for a single, case-insensitive search term.
+     *
+     * @param searchText literal text, or a regular expression when {@code useRegex} is set
+     * @param useRegex interpret {@code searchText} as a regular expression
+     * @param wholeWordSearch only accept matches delimited by a word boundary, '_' or '.'
+     * @throws IOException declared because the superclass constructor is invoked
+     */
+    public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) throws IOException {
+        String term = useRegex ? searchText : Pattern.quote(searchText);
+        // Case is ignored through pattern flags; lower-casing the expression itself would turn
+        // classes such as \D or \S into their opposites.
+        int flags = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;
+        if (wholeWordSearch) {
+            this.pattern = Pattern.compile(WORD_EDGE + "(" + term + ")" + WORD_EDGE, flags);
+            this.termGroup = 1;
+        } else {
+            this.pattern = Pattern.compile(term, flags);
+            this.termGroup = 0;
+        }
+        setSortByPosition(true);
+    }
+
+    /**
+     * Records one {@link PDFText} per match found in the chunk of text handed over by the stripper.
+     */
+    @Override
+    protected void writeString(String text, List<TextPosition> textPositions) {
+        Matcher matcher = pattern.matcher(text);
+        while (matcher.find()) {
+            int start = matcher.start(termGroup);
+            int end = matcher.end(termGroup);
+            // assumes one TextPosition per character of text - TODO confirm; empty matches and
+            // matches that do not fit inside the position list are skipped.
+            if (start >= end || end > textPositions.size()) {
+                continue;
+            }
+
+            float minX = Float.POSITIVE_INFINITY;
+            float minY = Float.POSITIVE_INFINITY;
+            float maxX = Float.NEGATIVE_INFINITY;
+            float maxY = Float.NEGATIVE_INFINITY;
+            for (TextPosition position : textPositions.subList(start, end)) {
+                minX = Math.min(minX, position.getX());
+                minY = Math.min(minY, position.getY());
+                maxX = Math.max(maxX, position.getX() + position.getWidth());
+                maxY = Math.max(maxY, position.getY() + position.getHeight());
+            }
+
+            textOccurrences.add(new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
+        }
+    }
+
+    /**
+     * Scans {@code document} and returns the location of every match.
+     *
+     * @param document PDF to search
+     * @return a fresh list with one entry per occurrence; empty when nothing matched
+     * @throws IOException if the document text cannot be extracted
+     */
+    public List<PDFText> getTextLocations(PDDocument document) throws IOException {
+        // Reset so that a reused finder does not report hits from an earlier scan.
+        textOccurrences.clear();
+        getText(document);
+        return new ArrayList<>(textOccurrences);
+    }
+
+}
\ No newline at end of file
diff --git 
a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index 5ca0bd74..c9fad5e4 100644 --- a/src/main/resources/messages_en_GB.properties +++ b/src/main/resources/messages_en_GB.properties @@ -304,6 +304,10 @@ showJS.tags=JS # WEB PAGES # # # ########################### +#auto-redact +autoRedact.title=Auto Redact +autoRedact.header=Auto Redact + #showJS showJS.title=Show Javascript showJS.header=Show Javascript diff --git a/src/main/resources/templates/security/auto-redact.html b/src/main/resources/templates/security/auto-redact.html new file mode 100644 index 00000000..132ccb3e --- /dev/null +++ b/src/main/resources/templates/security/auto-redact.html @@ -0,0 +1,56 @@ + + + + + + + +
+
+
+

+
+
+
+

+ +
+
+ +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ +
+ + +
+ + +
+
+
+
+
+
+
+ + \ No newline at end of file