redact

2023-08-24 23:23:25 +01:00 · 2023-08-24 23:23:25 +01:00 · cd0e1a3962
commit cd0e1a3962
parent 7c2f482b3b
7 changed files with 310 additions and 1 deletions
--- a/README.md
+++ b/README.md
@ -208,7 +208,11 @@ For API usage you must provide a header with 'X-API-Key' and the associated API
 - Progress bar/Tracking
 - Full custom logic pipelines to combine multiple operations together.
 - Folder support with auto scanning to perform operations on
- Redact sections of pages
+- Redact text (not au
 - Add Forms
 - Annotations
 - Multi page layout (Stitch PDF pages together) support x rows y columns and custom page sizing
 - Fill forms manually and automatically
 ### Q2: Why is my application downloading .htm files?
 This is an issue commonly caused by your NGINX configuration. The default file upload size for NGINX is 1MB, so you need to add the following to your Nginx sites-available file: `client_max_body_size SIZE;` where "SIZE" is, for example, 50M for 50MB files.
--- a/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/security/RedactController.java
@ -0,0 +1,106 @@
 package stirling.software.SPDF.controller.api.security;
 import java.awt.Color;
 import java.awt.image.BufferedImage;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.List;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.PDPageContentStream;
 import org.apache.pdfbox.pdmodel.common.PDRectangle;
 import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.apache.pdfbox.rendering.ImageType;
 import org.apache.pdfbox.rendering.PDFRenderer;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.springframework.http.ResponseEntity;
 import org.springframework.web.bind.annotation.PostMapping;
 import org.springframework.web.bind.annotation.RequestParam;
 import org.springframework.web.bind.annotation.RestController;
 import org.springframework.web.multipart.MultipartFile;
 import io.swagger.v3.oas.annotations.Operation;
 import io.swagger.v3.oas.annotations.Parameter;
 import io.swagger.v3.oas.annotations.media.Schema;
 import io.swagger.v3.oas.annotations.tags.Tag;
 import stirling.software.SPDF.model.PDFText;
 import stirling.software.SPDF.pdf.TextFinder;
 import stirling.software.SPDF.utils.WebResponseUtils;
@RestController
@Tag(name = "Security", description = "Security APIs")
 /**
  * REST endpoint that blacks out occurrences of user-supplied text in an uploaded PDF.
  *
  * <p>Each matching text run is covered with a filled black rectangle. The rectangle is only
  * appended on top of the page content; the underlying text is not removed by this class. When
  * {@code convertPDFToImage} is set, every page is re-rendered as an image so that the result no
  * longer contains the original page content streams.
  */
 public class RedactController {

    private static final Logger logger = LoggerFactory.getLogger(RedactController.class);

    /** Resolution, in dots per inch, used when pages are flattened to images. */
    private static final int RENDER_DPI = 300;

    /**
     * Redacts every line of {@code listOfText} from the uploaded PDF and returns the result.
     *
     * @param file              the uploaded PDF
     * @param listOfTextString  newline-separated search terms; blank lines are ignored
     * @param useRegex          treat each term as a regular expression instead of a literal
     * @param wholeWordSearchBool only match terms delimited by word boundaries, {@code _} or {@code .}
     * @param customPadding     extra padding added around every redaction box; defaults to 0 when
     *                          the parameter is missing or empty
     * @param convertPDFToImage re-render every page as an image after redacting
     * @return the redacted PDF as a download named {@code <original>_redacted.pdf}
     * @throws Exception if the PDF cannot be read, searched, rendered or saved
     */
    @PostMapping(value = "/auto-redact", consumes = "multipart/form-data")
    @Operation(summary = "Redacts listOfText in a PDF document", 
               description = "This operation takes an input PDF file and redacts the provided listOfText. Input:PDF, Output:PDF, Type:SISO")
    public ResponseEntity<byte[]> redactPdf(
            @Parameter(description = "The input PDF file", required = true) @RequestParam("fileInput") MultipartFile file,
            @Parameter(description = "List of listOfText to redact from the PDF", required = true, schema = @Schema(type = "string")) @RequestParam("listOfText") String listOfTextString,
            @RequestParam(value = "useRegex", required = false) boolean useRegex,
            @RequestParam(value = "wholeWordSearch", required = false) boolean wholeWordSearchBool,
            // A primitive float cannot receive null, so an absent/empty value must fall back to a default.
            @RequestParam(value = "customPadding", required = false, defaultValue = "0") float customPadding,
            @RequestParam(value = "convertPDFToImage", required = false) boolean convertPDFToImage) throws Exception {

        String[] listOfText = listOfTextString.split("\n");
        // Deliberately log only the count: the terms themselves are the data being redacted.
        logger.debug("Auto-redact requested with {} search line(s)", listOfText.length);

        byte[] pdfContent;
        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(file.getBytes()))) {
            for (String text : listOfText) {
                // trim() also drops the '\r' left behind when the form submits CRLF line endings.
                text = text.trim();
                if (text.isEmpty()) {
                    // An empty term would match at every position; skip blank lines instead.
                    continue;
                }
                TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
                List<PDFText> foundTexts = textFinder.getTextLocations(document);
                redactFoundText(document, foundTexts, customPadding);
            }
            pdfContent = convertPDFToImage ? saveAsImagePdf(document) : saveToBytes(document);
        }

        String originalName = file.getOriginalFilename();
        String baseName = (originalName == null ? "" : originalName).replaceFirst("[.][^.]+$", "");
        return WebResponseUtils.bytesToWebResponse(pdfContent, baseName + "_redacted.pdf");
    }

    /** Serialises {@code document} into a byte array. */
    private byte[] saveToBytes(PDDocument document) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        document.save(baos);
        return baos.toByteArray();
    }

    /**
     * Renders every page of {@code document} to an RGB image and returns a new PDF made only of
     * those images. Each new page is sized to the rendered image's pixel dimensions.
     */
    private byte[] saveAsImagePdf(PDDocument document) throws IOException {
        PDFRenderer pdfRenderer = new PDFRenderer(document);
        try (PDDocument imageDocument = new PDDocument()) {
            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, RENDER_DPI, ImageType.RGB);
                PDPage newPage = new PDPage(new PDRectangle(bim.getWidth(), bim.getHeight()));
                imageDocument.addPage(newPage);
                PDImageXObject pdImage = LosslessFactory.createFromImage(imageDocument, bim);
                try (PDPageContentStream contentStream = new PDPageContentStream(imageDocument, newPage)) {
                    contentStream.drawImage(pdImage, 0, 0);
                }
            }
            return saveToBytes(imageDocument);
        }
    }

    /**
     * Draws a filled black rectangle over each located text block.
     *
     * @param document      the document being modified in place
     * @param blocks        text locations to cover; an empty list is a no-op
     * @param customPadding extra padding added on top of the 30%-of-height default
     * @throws IOException if a page content stream cannot be written
     */
    private void redactFoundText(PDDocument document, List<PDFText> blocks, float customPadding) throws IOException {
        var allPages = document.getDocumentCatalog().getPages();
        for (PDFText block : blocks) {
            var page = allPages.get(block.getPageIndex());
            float blockHeight = block.getY2() - block.getY1();
            float padding = blockHeight * 0.3f + customPadding;
            PDRectangle pageBox = page.getBBox();
            // try-with-resources guarantees the stream is closed even if drawing fails.
            try (PDPageContentStream contentStream = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
                contentStream.setNonStrokingColor(Color.BLACK);
                // Block Y values are subtracted from the page height to obtain the rectangle's lower edge.
                contentStream.addRect(
                        block.getX1(),
                        pageBox.getHeight() - block.getY1() - padding,
                        block.getX2() - block.getX1(),
                        blockHeight + 2 * padding);
                contentStream.fill();
            }
        }
    }
 }
--- a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java
+++ b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java
@ -11,6 +11,12 @@ import io.swagger.v3.oas.annotations.tags.Tag;
@Tag(name = "Security", description = "Security APIs")
 public class SecurityWebController {
 	@GetMapping("/auto-redact")
    @Hidden
    public String autoRedactForm(Model model) {
        // Record which page is being shown, then hand back the name of the view to render.
        final String pageId = "auto-redact";
        final String viewName = "security/auto-redact";
        model.addAttribute("currentPage", pageId);
        return viewName;
    }
    @GetMapping("/add-password")
    @Hidden
--- a/src/main/java/stirling/software/SPDF/model/PDFText.java
+++ b/src/main/java/stirling/software/SPDF/model/PDFText.java
@ -0,0 +1,42 @@
 package stirling.software.SPDF.model;
 /**
  * Immutable description of one piece of text located in a PDF: the page it is on, the two
  * corner coordinates of its bounding box, and the text itself.
  */
 public class PDFText {

    /** Zero-based index of the page containing the text. */
    private final int pageIndex;

    /** First corner of the bounding box. */
    private final float x1;
    private final float y1;

    /** Second corner of the bounding box. */
    private final float x2;
    private final float y2;

    /** The text associated with this location. */
    private final String text;

    /**
     * Creates a text location.
     *
     * @param pageIndex index of the page the text was found on
     * @param x1        X coordinate of the first corner
     * @param y1        Y coordinate of the first corner
     * @param x2        X coordinate of the second corner
     * @param y2        Y coordinate of the second corner
     * @param text      the text associated with this location
     */
    public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
        this.text = text;
        this.pageIndex = pageIndex;
        this.x1 = x1;
        this.x2 = x2;
        this.y1 = y1;
        this.y2 = y2;
    }

    /** @return index of the page the text was found on */
    public int getPageIndex() { return this.pageIndex; }

    /** @return X coordinate of the first corner */
    public float getX1() { return this.x1; }

    /** @return Y coordinate of the first corner */
    public float getY1() { return this.y1; }

    /** @return X coordinate of the second corner */
    public float getX2() { return this.x2; }

    /** @return Y coordinate of the second corner */
    public float getY2() { return this.y2; }

    /** @return the text associated with this location */
    public String getText() { return this.text; }
 }
--- a/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
+++ b/src/main/java/stirling/software/SPDF/pdf/TextFinder.java
@ -0,0 +1,91 @@
 package stirling.software.SPDF.pdf;
 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.text.PDFTextStripper;
 import org.apache.pdfbox.text.TextPosition;
 import org.springframework.http.ResponseEntity;
 import stirling.software.SPDF.model.PDFText;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 /**
  * A {@link PDFTextStripper} that records where a search term occurs while the document text is
  * being extracted.
  *
  * <p>Matching is case-insensitive. The term is either a literal or, when {@code useRegex} is set,
  * a regular expression. With {@code wholeWordSearch} a hit must start and end at a word boundary
  * or next to a {@code _} or {@code .} character.
  */
 public class TextFinder extends PDFTextStripper {

    /** Compiled once per finder instead of once per extracted text run. */
    private final Pattern searchPattern;

    private final List<PDFText> textOccurrences = new ArrayList<>();

    /**
     * @param searchText      literal text or regular expression to look for
     * @param useRegex        interpret {@code searchText} as a regular expression
     * @param wholeWordSearch only accept hits delimited by word boundaries, {@code _} or {@code .}
     * @throws IOException if the underlying text stripper cannot be initialised
     * @throws java.util.regex.PatternSyntaxException if {@code useRegex} is set and the expression is invalid
     */
    public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) throws IOException {
        this.searchPattern = buildPattern(searchText, useRegex, wholeWordSearch);
        setSortByPosition(true);
    }

    /** Builds the case-insensitive pattern for the requested search mode. */
    private static Pattern buildPattern(String searchText, boolean useRegex, boolean wholeWordSearch) {
        // Case-insensitivity comes from flags, not from lower-casing the expression: lower-casing
        // a regex would silently turn escapes such as \D or \S into \d and \s.
        String expression = useRegex ? searchText : Pattern.quote(searchText);
        if (wholeWordSearch) {
            // Lookarounds keep the delimiters out of the match, so the reported range covers only
            // the term itself and neighbouring hits can share a delimiter. The non-capturing group
            // keeps a user alternation (a|b) inside the word-boundary checks.
            expression = "(?:\\b|(?<=[_.]))(?:" + expression + ")(?:\\b|(?=[_.]))";
        }
        return Pattern.compile(expression, Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    }

    /**
     * Called by the stripper for each run of text; records a bounding box for every hit in it.
     *
     * <p>NOTE(review): character offsets in {@code text} are used as indexes into
     * {@code textPositions}, which assumes one position per character — confirm for ligatures.
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) {
        Matcher matcher = searchPattern.matcher(text);
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            // Ignore empty matches (possible with regexes such as "a*") and any range that does
            // not fit inside the position list.
            if (end <= start || end > textPositions.size()) {
                continue;
            }

            // Seed the box with the first glyph, then grow it over the rest of the match.
            TextPosition first = textPositions.get(start);
            float minX = first.getX();
            float minY = first.getY();
            float maxX = first.getX() + first.getWidth();
            float maxY = first.getY() + first.getHeight();
            for (int i = start + 1; i < end; i++) {
                TextPosition position = textPositions.get(i);
                minX = Math.min(minX, position.getX());
                minY = Math.min(minY, position.getY());
                maxX = Math.max(maxX, position.getX() + position.getWidth());
                maxY = Math.max(maxY, position.getY() + position.getHeight());
            }

            textOccurrences.add(new PDFText(
                    getCurrentPageNo() - 1,
                    minX,
                    minY,
                    maxX,
                    maxY,
                    text
            ));
        }
    }

    /**
     * Scans {@code document} and returns the location of every hit.
     *
     * @param document the document to search
     * @return a new list with one entry per hit, possibly empty; results of earlier calls are not included
     * @throws Exception if text extraction fails
     */
    public List<PDFText> getTextLocations(PDDocument document) throws Exception {
        textOccurrences.clear();
        this.getText(document);
        return new ArrayList<>(textOccurrences);
    }
 }
--- a/src/main/resources/messages_en_GB.properties
+++ b/src/main/resources/messages_en_GB.properties
@ -304,6 +304,10 @@ showJS.tags=JS
 #       WEB PAGES         #
 #                         #
 ###########################
 #auto-redact
 autoRedact.title=Auto Redact
 autoRedact.header=Auto Redact
 #showJS
 showJS.title=Show Javascript
 showJS.header=Show Javascript
--- a/src/main/resources/templates/security/auto-redact.html
+++ b/src/main/resources/templates/security/auto-redact.html
@ -0,0 +1,56 @@
 <!DOCTYPE html>
 <html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
 <th:block th:insert="~{fragments/common :: head(title=#{autoRedact.title})}"></th:block>
 <body>
  <div id="page-container">
    <div id="content-wrap">
      <div th:insert="~{fragments/navbar.html :: navbar}"></div>
      <br> <br>
      <div class="container">
        <div class="row justify-content-center">
          <div class="col-md-6">
            <h2 th:text="#{autoRedact.header}"></h2>
            <!-- Auto-redact form: posts the PDF and the redaction options to the /auto-redact endpoint. -->
            <form action="/auto-redact" method="post" enctype="multipart/form-data">
                <div class="mb-3">
                    <input type="file" class="form-control" id="fileInput" name="fileInput" required accept="application/pdf">
                </div>
                <div class="mb-3">
                    <label for="listOfText" class="form-label">Texts to Redact (line-separated)</label>
                    <!-- Terms are entered one per line, so the hint shows two lines (&#10; is a line feed). -->
                    <textarea class="form-control" id="listOfText" name="listOfText" rows="4" required placeholder="e.g. Confidential&#10;Top-Secret"></textarea>
                </div>
                <div class="mb-3 form-check">
                    <input type="checkbox" class="form-check-input" id="useRegex" name="useRegex">
                    <label class="form-check-label" for="useRegex">Use Regex</label>
                </div>
                <div class="mb-3 form-check">
                    <input type="checkbox" class="form-check-input" id="wholeWordSearch" name="wholeWordSearch">
                    <label class="form-check-label" for="wholeWordSearch">Whole Word Search</label>
                </div>
                <div class="mb-3">
                    <label for="customPadding" class="form-label">Custom Padding</label>
                    <input type="number" step="0.1" class="form-control" id="customPadding" name="customPadding" placeholder="0.0" value="0.1">
                </div>
                <div class="mb-3 form-check">
                    <input type="checkbox" class="form-check-input" id="convertPDFToImage" name="convertPDFToImage" checked>
                    <label class="form-check-label" for="convertPDFToImage">Convert PDF to Image</label>
                </div>
                <button type="submit" class="btn btn-primary">Submit</button>
            </form>
          </div>
        </div>
      </div>
    </div>
    <div th:insert="~{fragments/footer.html :: footer}"></div>
  </div>
 </body>
 </html>