redact

2023-08-24 23:23:25 +01:00 · 2023-08-24 23:23:25 +01:00 · cd0e1a3962
commit cd0e1a3962
parent 7c2f482b3b
7 changed files with 310 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -208,7 +208,11 @@ For API usage you must provide a header with 'X-API-Key' and the associated API
 - Progress bar/Tracking
 - Full custom logic pipelines to combine multiple operations together.
 - Folder support with auto scanning to perform operations on
- Redact sections of pages
+- Redact text (not automatic)
+- Add Forms
+- Annotations
+- Multi-page layout (stitch PDF pages together) with support for x rows, y columns and custom page sizing
+- Fill forms manually and automatically

 ### Q2: Why is my application downloading .htm files?
 This is an issue commonly caused by your NGINX configuration. The default file upload size for NGINX is 1MB, so you need to add the following to your Nginx sites-available file: client_max_body_size SIZE; where "SIZE" is, for example, 50M for 50MB files.
--- a/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
@ -0,0 +1,106 @@
+package stirling.software.SPDF.controller.api.security;
+
+import java.awt.Color;
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageContentStream;
+import org.apache.pdfbox.pdmodel.common.PDRectangle;
+import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+import org.apache.pdfbox.rendering.ImageType;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.Parameter;
+import io.swagger.v3.oas.annotations.media.Schema;
+import io.swagger.v3.oas.annotations.tags.Tag;
+import stirling.software.SPDF.model.PDFText;
+import stirling.software.SPDF.pdf.TextFinder;
+import stirling.software.SPDF.utils.WebResponseUtils;
+/**
+ * REST endpoint that covers occurrences of user-supplied text in an uploaded PDF with black
+ * rectangles.
+ *
+ * <p>The rectangles are drawn on top of the existing page content; the text underneath is only
+ * removed from the output when the caller asks for the pages to be flattened to images.
+ */
+@RestController
+@Tag(name = "Security", description = "Security APIs")
+public class RedactController {
+
+    private static final Logger logger = LoggerFactory.getLogger(RedactController.class);
+
+    /** Resolution used when flattening the redacted pages to images. */
+    private static final int RENDER_DPI = 300;
+
+    /** PDF user-space units (points) per inch, used to map rendered pixels back to page size. */
+    private static final float POINTS_PER_INCH = 72f;
+
+    /**
+     * Redacts every occurrence of each search term in the uploaded PDF.
+     *
+     * @param file the PDF to redact
+     * @param listOfTextString search terms, one per line; blank lines are ignored
+     * @param useRegex treat each term as a regular expression instead of literal text
+     * @param wholeWordSearchBool only match terms delimited as whole words
+     * @param customPadding extra padding added around each covered area; defaults to 0 when the
+     *        parameter is missing or empty
+     * @param convertPDFToImage replace every page by a rendered image of itself after redaction
+     * @return the redacted PDF as a download named {@code <original name>_redacted.pdf}
+     * @throws Exception if the PDF cannot be read, searched, rendered or written
+     */
+    @PostMapping(value = "/auto-redact", consumes = "multipart/form-data")
+    @Operation(summary = "Redacts listOfText in a PDF document", 
+               description = "This operation takes an input PDF file and redacts the provided listOfText. Input:PDF, Output:PDF, Type:SISO")
+    public ResponseEntity<byte[]> redactPdf(
+            @Parameter(description = "The input PDF file", required = true) @RequestParam("fileInput") MultipartFile file,
+            @Parameter(description = "List of listOfText to redact from the PDF", required = true, schema = @Schema(type = "string")) @RequestParam("listOfText") String listOfTextString,
+            @RequestParam(value = "useRegex", required = false, defaultValue = "false") boolean useRegex,
+            @RequestParam(value = "wholeWordSearch", required = false, defaultValue = "false") boolean wholeWordSearchBool,
+            @RequestParam(value = "customPadding", required = false, defaultValue = "0") float customPadding,
+            @RequestParam(value = "convertPDFToImage", required = false, defaultValue = "false") boolean convertPDFToImage) throws Exception {
+
+        byte[] pdfContent;
+        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(file.getBytes()))) {
+            for (String line : listOfTextString.split("\n")) {
+                String text = line.trim();
+                if (text.isEmpty()) {
+                    // An empty term matches at every position and redacts nothing useful.
+                    continue;
+                }
+                TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
+                List<PDFText> foundTexts = textFinder.getTextLocations(document);
+                // The terms themselves are what the user wants hidden, so only the count is logged.
+                logger.debug("Redacting {} occurrence(s) of a search term", foundTexts.size());
+                redactFoundText(document, foundTexts, customPadding);
+            }
+
+            if (convertPDFToImage) {
+                try (PDDocument imageDocument = renderAsImageDocument(document)) {
+                    pdfContent = toByteArray(imageDocument);
+                }
+            } else {
+                pdfContent = toByteArray(document);
+            }
+        }
+
+        String originalName = file.getOriginalFilename();
+        String baseName = originalName == null ? "document" : originalName.replaceFirst("[.][^.]+$", "");
+        return WebResponseUtils.bytesToWebResponse(pdfContent, baseName + "_redacted.pdf");
+    }
+
+    /**
+     * Builds a new document in which every page of {@code source} is replaced by a rendered image.
+     *
+     * <p>Each new page is sized so that the image, rendered at {@link #RENDER_DPI}, occupies the
+     * same number of points as it does pixels scaled by 72 / DPI.
+     *
+     * @param source the document to render; it is not closed by this method
+     * @return a new document owned by the caller, who must close it
+     * @throws IOException if rendering or writing a page fails
+     */
+    private static PDDocument renderAsImageDocument(PDDocument source) throws IOException {
+        PDDocument imageDocument = new PDDocument();
+        try {
+            PDFRenderer pdfRenderer = new PDFRenderer(source);
+            float pointsPerPixel = POINTS_PER_INCH / RENDER_DPI;
+            for (int page = 0; page < source.getNumberOfPages(); ++page) {
+                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, RENDER_DPI, ImageType.RGB);
+                float width = bim.getWidth() * pointsPerPixel;
+                float height = bim.getHeight() * pointsPerPixel;
+                PDPage newPage = new PDPage(new PDRectangle(width, height));
+                imageDocument.addPage(newPage);
+                PDImageXObject pdImage = LosslessFactory.createFromImage(imageDocument, bim);
+                try (PDPageContentStream contentStream = new PDPageContentStream(imageDocument, newPage)) {
+                    contentStream.drawImage(pdImage, 0, 0, width, height);
+                }
+            }
+            return imageDocument;
+        } catch (IOException | RuntimeException e) {
+            // The caller never receives the half-built document, so release it here.
+            try {
+                imageDocument.close();
+            } catch (IOException closeFailure) {
+                e.addSuppressed(closeFailure);
+            }
+            throw e;
+        }
+    }
+
+    /** Serialises {@code document} into a byte array without closing it. */
+    private static byte[] toByteArray(PDDocument document) throws IOException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        document.save(baos);
+        return baos.toByteArray();
+    }
+
+    /**
+     * Draws a filled black rectangle over each found text block.
+     *
+     * <p>The padding applied above and below a block is 30% of the block height plus
+     * {@code customPadding}. The block's Y value is subtracted from the page box height to place
+     * the rectangle.
+     *
+     * @param document the document being modified
+     * @param blocks the locations to cover
+     * @param customPadding additional padding supplied by the caller
+     * @throws IOException if a page content stream cannot be written
+     */
+    private void redactFoundText(PDDocument document, List<PDFText> blocks, float customPadding) throws IOException {
+        var allPages = document.getDocumentCatalog().getPages();
+
+        for (PDFText block : blocks) {
+            var page = allPages.get(block.getPageIndex());
+            float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding;
+            PDRectangle pageBox = page.getBBox();
+            try (PDPageContentStream contentStream = new PDPageContentStream(
+                    document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
+                contentStream.setNonStrokingColor(Color.BLACK);
+                contentStream.addRect(block.getX1(), pageBox.getHeight() - block.getY1() - padding, block.getX2() - block.getX1(), block.getY2() - block.getY1() + 2 * padding);
+                contentStream.fill();
+            }
+        }
+    }
+
+}
--- a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java
+++ b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java
@ -11,6 +11,12 @@ import io.swagger.v3.oas.annotations.tags.Tag;
@Tag(name = "Security", description = "Security APIs")
 public class SecurityWebController {
 	
+	/**
+	 * Handles GET requests for {@code /auto-redact}.
+	 *
+	 * @param model view model; receives the attribute {@code currentPage} set to {@code auto-redact}
+	 * @return the view name {@code security/auto-redact}
+	 */
+	@GetMapping("/auto-redact")
+    @Hidden
+    public String autoRedactForm(Model model) {
+        final String pageId = "auto-redact";
+        final String viewName = "security/auto-redact";
+        model.addAttribute("currentPage", pageId);
+        return viewName;
+    }
 	
    @GetMapping("/add-password")
    @Hidden
--- a/src/main/java/stirling/software/SPDF/model/PDFText.java
+++ b/src/main/java/stirling/software/SPDF/model/PDFText.java
@ -0,0 +1,42 @@
+package stirling.software.SPDF.model;
+/**
+ * Immutable description of a piece of text located on a PDF page.
+ *
+ * <p>{@code (x1, y1)} and {@code (x2, y2)} are two corners of the bounding box. As populated by
+ * {@code TextFinder}, they hold the minimum and maximum X/Y extents of the matched characters.
+ */
+public class PDFText {
+    /** Zero-based index of the page the text was found on. */
+    private final int pageIndex;
+    /** Bounding-box corners. */
+    private final float x1, y1, x2, y2;
+    /** Text associated with this location. */
+    private final String text;
+
+    /**
+     * Creates a text location.
+     *
+     * @param pageIndex index of the page containing the text
+     * @param x1 X of the first corner
+     * @param y1 Y of the first corner
+     * @param x2 X of the second corner
+     * @param y2 Y of the second corner
+     * @param text the text associated with the location
+     */
+    public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
+        this.pageIndex = pageIndex;
+        this.text = text;
+        this.x1 = x1;
+        this.x2 = x2;
+        this.y1 = y1;
+        this.y2 = y2;
+    }
+
+    /** @return the index of the page containing the text */
+    public int getPageIndex() {
+        return this.pageIndex;
+    }
+
+    /** @return X of the first corner */
+    public float getX1() {
+        return this.x1;
+    }
+
+    /** @return Y of the first corner */
+    public float getY1() {
+        return this.y1;
+    }
+
+    /** @return X of the second corner */
+    public float getX2() {
+        return this.x2;
+    }
+
+    /** @return Y of the second corner */
+    public float getY2() {
+        return this.y2;
+    }
+
+    /** @return the text associated with the location */
+    public String getText() {
+        return this.text;
+    }
+}
--- a/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
+++ b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
@ -0,0 +1,91 @@
+package stirling.software.SPDF.pdf;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.apache.pdfbox.text.TextPosition;
+import org.springframework.http.ResponseEntity;
+
+import stirling.software.SPDF.model.PDFText;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * {@link PDFTextStripper} that records where a search term occurs in a document.
+ *
+ * <p>Matching is case-insensitive. The term is either literal text or, with {@code useRegex}, a
+ * regular expression. With {@code wholeWordSearch} the term must be delimited on both sides by a
+ * word boundary, an underscore or a full stop; a delimiting underscore or full stop is part of
+ * the match and therefore part of the reported bounding box.
+ *
+ * <p>The finder does not print or log the search term.
+ */
+public class TextFinder extends PDFTextStripper {
+
+	/** Delimiter accepted on either side of a whole-word match. */
+	private static final String WORD_DELIMITER = "(?:\\b|_|\\.)";
+
+	private final Pattern searchPattern;
+	private final List<PDFText> textOccurrences = new ArrayList<>();
+
+	/**
+	 * Creates a finder for one search term.
+	 *
+	 * @param searchText the text or regular expression to look for
+	 * @param useRegex {@code true} to interpret {@code searchText} as a regular expression
+	 * @param wholeWordSearch {@code true} to require word delimiters around the term
+	 * @throws IOException declared by the {@link PDFTextStripper} constructor
+	 * @throws java.util.regex.PatternSyntaxException if {@code useRegex} is set and the expression is invalid
+	 */
+	public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) throws IOException {
+	    // Compiled once here instead of once per text chunk.
+	    this.searchPattern = buildPattern(searchText, useRegex, wholeWordSearch);
+	    setSortByPosition(true);
+	}
+
+	/**
+	 * Builds the case-insensitive pattern for the search term.
+	 *
+	 * <p>Case is handled with pattern flags rather than by lower-casing the term, because
+	 * lower-casing a regular expression changes its meaning (for example {@code \D} becomes
+	 * {@code \d}).
+	 */
+	private static Pattern buildPattern(String searchText, boolean useRegex, boolean wholeWordSearch) {
+	    String expression = useRegex ? searchText : Pattern.quote(searchText);
+	    if (wholeWordSearch) {
+	        // Group the term so a top-level alternation in a user regex stays inside the delimiters;
+	        // non-capturing groups keep the user's own group numbering intact.
+	        expression = WORD_DELIMITER + "(?:" + expression + ")" + WORD_DELIMITER;
+	    }
+	    return Pattern.compile(expression, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
+	}
+
+	/**
+	 * Records the bounding box of every match of the search pattern in {@code text}.
+	 *
+	 * <p>Assumes character {@code i} of {@code text} corresponds to {@code textPositions.get(i)}
+	 * (TODO confirm against PDFBox); matches that extend beyond the available positions are skipped.
+	 *
+	 * @param text the text chunk handed over by the stripper
+	 * @param textPositions position data for the characters of {@code text}
+	 */
+	@Override
+	protected void writeString(String text, List<TextPosition> textPositions) {
+	    Matcher matcher = searchPattern.matcher(text);
+	    while (matcher.find()) {
+	        int start = matcher.start();
+	        int end = matcher.end();
+	        // Empty matches (empty term, or a regex such as "a*") cover no characters.
+	        if (end == start || end > textPositions.size()) {
+	            continue;
+	        }
+
+	        // Seed the box with the first matched character, then widen it over the rest.
+	        TextPosition first = textPositions.get(start);
+	        float minX = first.getX();
+	        float minY = first.getY();
+	        float maxX = first.getX() + first.getWidth();
+	        float maxY = first.getY() + first.getHeight();
+
+	        for (int i = start + 1; i < end; i++) {
+	            TextPosition position = textPositions.get(i);
+	            minX = Math.min(minX, position.getX());
+	            minY = Math.min(minY, position.getY());
+	            maxX = Math.max(maxX, position.getX() + position.getWidth());
+	            maxY = Math.max(maxY, position.getY() + position.getHeight());
+	        }
+
+	        textOccurrences.add(new PDFText(
+	                getCurrentPageNo() - 1,
+	                minX,
+	                minY,
+	                maxX,
+	                maxY,
+	                text
+	        ));
+	    }
+	}
+
+	/**
+	 * Scans {@code document} and returns the locations of all matches.
+	 *
+	 * <p>Results of an earlier scan with the same finder are discarded first, so repeated calls do
+	 * not accumulate duplicates.
+	 *
+	 * @param document the document to search
+	 * @return a new list with one entry per match
+	 * @throws IOException if the document text cannot be extracted
+	 */
+	public List<PDFText> getTextLocations(PDDocument document) throws IOException {
+		textOccurrences.clear();
+		this.getText(document);
+		return new ArrayList<>(textOccurrences);
+	}
+
+}
--- a/src/main/resources/messages_en_GB.properties
+++ b/src/main/resources/messages_en_GB.properties
@ -304,6 +304,10 @@ showJS.tags=JS
 #       WEB PAGES         #
 #                         #
 ###########################
+#auto-redact
+autoRedact.title=Auto Redact
+autoRedact.header=Auto Redact
+
 #showJS
 showJS.title=Show Javascript
 showJS.header=Show Javascript
--- a/src/main/resources/templates/security/auto-redact.html
+++ b/src/main/resources/templates/security/auto-redact.html
@ -0,0 +1,56 @@
+<!DOCTYPE html>
+<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
+
+<th:block th:insert="~{fragments/common :: head(title=#{autoRedact.title})}"></th:block>
+
+
+<body>
+  <div id="page-container">
+    <div id="content-wrap">
+      <div th:insert="~{fragments/navbar.html :: navbar}"></div>
+      <br> <br>
+      <div class="container">
+        <div class="row justify-content-center">
+          <div class="col-md-6">
+            <h2 th:text="#{autoRedact.header}"></h2>
+
+            <!-- Posts to the /auto-redact endpoint; input names match its request parameters. -->
+            <form action="/auto-redact" method="post" enctype="multipart/form-data">
+		        <div class="mb-3">
+		            <input type="file" class="form-control" id="fileInput" name="fileInput" required accept="application/pdf">
+		        </div>
+		        
+		        <div class="mb-3">
+		            <label for="listOfText" class="form-label">Texts to Redact (line-separated)</label>
+		            <!-- One term per line: the endpoint splits this value on newlines, so the example uses a line break. -->
+		            <textarea class="form-control" id="listOfText" name="listOfText" rows="4" required placeholder="e.g. Confidential&#10;Top-Secret"></textarea>
+		        </div>
+		        
+		        <div class="mb-3 form-check">
+		            <input type="checkbox" class="form-check-input" id="useRegex" name="useRegex">
+		            <label class="form-check-label" for="useRegex">Use Regex</label>
+		        </div>
+		
+		        <div class="mb-3 form-check">
+		            <input type="checkbox" class="form-check-input" id="wholeWordSearch" name="wholeWordSearch">
+		            <label class="form-check-label" for="wholeWordSearch">Whole Word Search</label>
+		        </div>
+		        
+		        <div class="mb-3">
+		            <label for="customPadding" class="form-label">Custom Padding</label>
+		            <input type="number" step="0.1" class="form-control" id="customPadding" name="customPadding" placeholder="0.0" value="0.1">
+		        </div>
+		
+		        <div class="mb-3 form-check">
+		            <input type="checkbox" class="form-check-input" id="convertPDFToImage" name="convertPDFToImage" checked>
+		            <label class="form-check-label" for="convertPDFToImage">Convert PDF to Image</label>
+		        </div>
+		        
+		        <button type="submit" class="btn btn-primary">Submit</button>
+		    </form>
+          </div>
+        </div>
+      </div>
+    </div>
+    <div th:insert="~{fragments/footer.html :: footer}"></div>
+  </div>
+</body>
+</html>