redact
This commit is contained in:
parent
7c2f482b3b
commit
cd0e1a3962
7 changed files with 310 additions and 1 deletions
|
@ -208,7 +208,11 @@ For API usage you must provide a header with 'X-API-Key' and the associated API
|
|||
- Progress bar/Tracking
|
||||
- Full custom logic pipelines to combine multiple operations together.
|
||||
- Folder support with auto scanning to perform operations on
|
||||
- Redact sections of pages
|
||||
- Redact text (not au
|
||||
- Add Forms
|
||||
- Annotations
|
||||
- Multi page layout (Stitch PDF pages together) support x rows y columns and custom page sizing
|
||||
- Fill forms manually and automatically
|
||||
|
||||
### Q2: Why is my application downloading .htm files?
|
||||
This is an issue commonly caused by your NGINX configuration. The default file upload size for NGINX is 1MB, so you need to add the following to your Nginx sites-available file: client_max_body_size SIZE; where "SIZE" is, for example, 50M for 50MB files.
|
||||
|
|
|
@ -0,0 +1,106 @@
|
|||
package stirling.software.SPDF.controller.api.security;
|
||||
|
||||
import java.awt.Color;
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageContentStream;
|
||||
import org.apache.pdfbox.pdmodel.common.PDRectangle;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
import org.apache.pdfbox.rendering.ImageType;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.Parameter;
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
import stirling.software.SPDF.model.PDFText;
|
||||
import stirling.software.SPDF.pdf.TextFinder;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
@RestController
|
||||
@Tag(name = "Security", description = "Security APIs")
|
||||
public class RedactController {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(RedactController.class);
|
||||
|
||||
|
||||
@PostMapping(value = "/auto-redact", consumes = "multipart/form-data")
|
||||
@Operation(summary = "Redacts listOfText in a PDF document",
|
||||
description = "This operation takes an input PDF file and redacts the provided listOfText. Input:PDF, Output:PDF, Type:SISO")
|
||||
public ResponseEntity<byte[]> redactPdf(
|
||||
@Parameter(description = "The input PDF file", required = true) @RequestParam("fileInput") MultipartFile file,
|
||||
@Parameter(description = "List of listOfText to redact from the PDF", required = true, schema = @Schema(type = "string")) @RequestParam("listOfText") String listOfTextString,
|
||||
@RequestParam(value = "useRegex", required = false) boolean useRegex,
|
||||
@RequestParam(value = "wholeWordSearch", required = false) boolean wholeWordSearchBool,
|
||||
@RequestParam(value = "customPadding", required = false) float customPadding,
|
||||
@RequestParam(value = "convertPDFToImage", required = false) boolean convertPDFToImage) throws Exception {
|
||||
|
||||
System.out.println(listOfTextString);
|
||||
String[] listOfText = listOfTextString.split("\n");
|
||||
byte[] bytes = file.getBytes();
|
||||
PDDocument document = PDDocument.load(new ByteArrayInputStream(bytes));
|
||||
for (String text : listOfText) {
|
||||
text = text.trim();
|
||||
System.out.println(text);
|
||||
TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
|
||||
List<PDFText> foundTexts = textFinder.getTextLocations(document);
|
||||
redactFoundText(document, foundTexts, customPadding);
|
||||
}
|
||||
|
||||
|
||||
|
||||
if (convertPDFToImage) {
|
||||
PDDocument imageDocument = new PDDocument();
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
for (int page = 0; page < document.getNumberOfPages(); ++page) {
|
||||
BufferedImage bim = pdfRenderer.renderImageWithDPI(page, 300, ImageType.RGB);
|
||||
PDPage newPage = new PDPage(new PDRectangle(bim.getWidth(), bim.getHeight()));
|
||||
imageDocument.addPage(newPage);
|
||||
PDImageXObject pdImage = LosslessFactory.createFromImage(imageDocument, bim);
|
||||
PDPageContentStream contentStream = new PDPageContentStream(imageDocument, newPage);
|
||||
contentStream.drawImage(pdImage, 0, 0);
|
||||
contentStream.close();
|
||||
}
|
||||
document.close();
|
||||
document = imageDocument;
|
||||
}
|
||||
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
document.save(baos);
|
||||
document.close();
|
||||
|
||||
byte[] pdfContent = baos.toByteArray();
|
||||
return WebResponseUtils.bytesToWebResponse(pdfContent,
|
||||
file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_redacted.pdf");
|
||||
}
|
||||
|
||||
|
||||
private void redactFoundText(PDDocument document, List<PDFText> blocks, float customPadding) throws IOException {
|
||||
var allPages = document.getDocumentCatalog().getPages();
|
||||
|
||||
for (PDFText block : blocks) {
|
||||
var page = allPages.get(block.getPageIndex());
|
||||
PDPageContentStream contentStream = new PDPageContentStream(document, page, PDPageContentStream.AppendMode.APPEND, true, true);
|
||||
contentStream.setNonStrokingColor(Color.BLACK);
|
||||
float padding = (block.getY2() - block.getY1()) * 0.3f + customPadding;
|
||||
PDRectangle pageBox = page.getBBox();
|
||||
contentStream.addRect(block.getX1(), pageBox.getHeight() - block.getY1() - padding, block.getX2() - block.getX1(), block.getY2() - block.getY1() + 2 * padding);
|
||||
contentStream.fill();
|
||||
contentStream.close();
|
||||
}
|
||||
}
|
||||
|
||||
}
|
|
@ -11,6 +11,12 @@ import io.swagger.v3.oas.annotations.tags.Tag;
|
|||
@Tag(name = "Security", description = "Security APIs")
|
||||
public class SecurityWebController {
|
||||
|
||||
@GetMapping("/auto-redact")
|
||||
@Hidden
|
||||
public String autoRedactForm(Model model) {
|
||||
model.addAttribute("currentPage", "auto-redact");
|
||||
return "security/auto-redact";
|
||||
}
|
||||
|
||||
@GetMapping("/add-password")
|
||||
@Hidden
|
||||
|
|
42
src/main/java/stirling/software/SPDF/model/PDFText.java
Normal file
42
src/main/java/stirling/software/SPDF/model/PDFText.java
Normal file
|
@ -0,0 +1,42 @@
|
|||
package stirling.software.SPDF.model;
|
||||
/**
 * Immutable record of a piece of text found in a PDF: the page it is on, the two
 * corner points of its bounding box, and the text itself.
 */
public class PDFText {

    /** Zero-based index of the page. */
    private final int pageIndex;

    /** First corner of the bounding box. */
    private final float x1;
    private final float y1;

    /** Second corner of the bounding box. */
    private final float x2;
    private final float y2;

    /** Text associated with the bounding box. */
    private final String text;

    /**
     * Creates a text location.
     *
     * @param pageIndex index of the page holding the text
     * @param x1 x coordinate of the first corner
     * @param y1 y coordinate of the first corner
     * @param x2 x coordinate of the second corner
     * @param y2 y coordinate of the second corner
     * @param text the text associated with the box
     */
    public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
        this.text = text;
        this.pageIndex = pageIndex;
        this.x1 = x1;
        this.x2 = x2;
        this.y1 = y1;
        this.y2 = y2;
    }

    /** @return the index of the page holding the text */
    public int getPageIndex() {
        return this.pageIndex;
    }

    /** @return x coordinate of the first corner */
    public float getX1() {
        return this.x1;
    }

    /** @return y coordinate of the first corner */
    public float getY1() {
        return this.y1;
    }

    /** @return x coordinate of the second corner */
    public float getX2() {
        return this.x2;
    }

    /** @return y coordinate of the second corner */
    public float getY2() {
        return this.y2;
    }

    /** @return the text associated with the box */
    public String getText() {
        return this.text;
    }
}
|
91
src/main/java/stirling/software/SPDF/pdf/TextFinder.java
Normal file
91
src/main/java/stirling/software/SPDF/pdf/TextFinder.java
Normal file
|
@ -0,0 +1,91 @@
|
|||
package stirling.software.SPDF.pdf;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.apache.pdfbox.text.TextPosition;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
|
||||
import stirling.software.SPDF.model.PDFText;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public class TextFinder extends PDFTextStripper {
|
||||
|
||||
private final String searchText;
|
||||
private final boolean useRegex;
|
||||
private final boolean wholeWordSearch;
|
||||
private final List<PDFText> textOccurrences = new ArrayList<>();
|
||||
|
||||
public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) throws IOException {
|
||||
this.searchText = searchText.toLowerCase();
|
||||
this.useRegex = useRegex;
|
||||
this.wholeWordSearch = wholeWordSearch;
|
||||
setSortByPosition(true);
|
||||
}
|
||||
|
||||
private List<Integer> findOccurrencesInText(String searchText, String content) {
|
||||
List<Integer> indexes = new ArrayList<>();
|
||||
Pattern pattern;
|
||||
|
||||
if (useRegex) {
|
||||
// Use regex-based search
|
||||
pattern = wholeWordSearch
|
||||
? Pattern.compile("(\\b|_|\\.)" + searchText + "(\\b|_|\\.)")
|
||||
: Pattern.compile(searchText);
|
||||
} else {
|
||||
// Use normal text search
|
||||
pattern = wholeWordSearch
|
||||
? Pattern.compile("(\\b|_|\\.)" + Pattern.quote(searchText) + "(\\b|_|\\.)")
|
||||
: Pattern.compile(Pattern.quote(searchText));
|
||||
}
|
||||
|
||||
Matcher matcher = pattern.matcher(content);
|
||||
while (matcher.find()) {
|
||||
indexes.add(matcher.start());
|
||||
}
|
||||
return indexes;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void writeString(String text, List<TextPosition> textPositions) {
|
||||
for (Integer index : findOccurrencesInText(searchText, text.toLowerCase())) {
|
||||
if (index + searchText.length() <= textPositions.size()) {
|
||||
// Initial values based on the first character
|
||||
TextPosition first = textPositions.get(index);
|
||||
float minX = first.getX();
|
||||
float minY = first.getY();
|
||||
float maxX = first.getX() + first.getWidth();
|
||||
float maxY = first.getY() + first.getHeight();
|
||||
|
||||
// Loop over the rest of the characters and adjust bounding box values
|
||||
for (int i = index; i < index + searchText.length(); i++) {
|
||||
TextPosition position = textPositions.get(i);
|
||||
minX = Math.min(minX, position.getX());
|
||||
minY = Math.min(minY, position.getY());
|
||||
maxX = Math.max(maxX, position.getX() + position.getWidth());
|
||||
maxY = Math.max(maxY, position.getY() + position.getHeight());
|
||||
}
|
||||
|
||||
textOccurrences.add(new PDFText(
|
||||
getCurrentPageNo() - 1,
|
||||
minX,
|
||||
minY,
|
||||
maxX,
|
||||
maxY,
|
||||
text
|
||||
));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public List<PDFText> getTextLocations(PDDocument document) throws Exception {
|
||||
this.getText(document);
|
||||
System.out.println("Found " + textOccurrences.size() + " occurrences of '" + searchText + "' in the document.");
|
||||
|
||||
return textOccurrences;
|
||||
}
|
||||
|
||||
}
|
|
@ -304,6 +304,10 @@ showJS.tags=JS
|
|||
# WEB PAGES #
|
||||
# #
|
||||
###########################
|
||||
#auto-redact
|
||||
autoRedact.title=Auto Redact
|
||||
autoRedact.header=Auto Redact
|
||||
|
||||
#showJS
|
||||
showJS.title=Show Javascript
|
||||
showJS.header=Show Javascript
|
||||
|
|
56
src/main/resources/templates/security/auto-redact.html
Normal file
56
src/main/resources/templates/security/auto-redact.html
Normal file
|
@ -0,0 +1,56 @@
|
|||
<!DOCTYPE html>
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">

<th:block th:insert="~{fragments/common :: head(title=#{autoRedact.title})}"></th:block>

<!--/* Auto-redact form: posts the PDF and the redaction options to /auto-redact as multipart form data. */-->
<body>
    <div id="page-container">
        <div id="content-wrap">
            <div th:insert="~{fragments/navbar.html :: navbar}"></div>
            <br> <br>
            <div class="container">
                <div class="row justify-content-center">
                    <div class="col-md-6">
                        <h2 th:text="#{autoRedact.header}"></h2>

                        <form action="/auto-redact" method="post" enctype="multipart/form-data">
                            <div class="mb-3">
                                <input type="file" class="form-control" id="fileInput" name="fileInput" required accept="application/pdf">
                            </div>

                            <!--/* One search term per line; the placeholder must not suggest comma separation. */-->
                            <div class="mb-3">
                                <label for="listOfText" class="form-label">Texts to Redact (line-separated)</label>
                                <textarea class="form-control" id="listOfText" name="listOfText" rows="4" required placeholder="One entry per line, e.g. Confidential"></textarea>
                            </div>

                            <div class="mb-3 form-check">
                                <input type="checkbox" class="form-check-input" id="useRegex" name="useRegex">
                                <label class="form-check-label" for="useRegex">Use Regex</label>
                            </div>

                            <div class="mb-3 form-check">
                                <input type="checkbox" class="form-check-input" id="wholeWordSearch" name="wholeWordSearch">
                                <label class="form-check-label" for="wholeWordSearch">Whole Word Search</label>
                            </div>

                            <div class="mb-3">
                                <label for="customPadding" class="form-label">Custom Padding</label>
                                <input type="number" step="0.1" class="form-control" id="customPadding" name="customPadding" placeholder="0.0" value="0.1">
                            </div>

                            <div class="mb-3 form-check">
                                <input type="checkbox" class="form-check-input" id="convertPDFToImage" name="convertPDFToImage" checked>
                                <label class="form-check-label" for="convertPDFToImage">Convert PDF to Image</label>
                            </div>

                            <button type="submit" class="btn btn-primary">Submit</button>
                        </form>
                    </div>
                </div>
            </div>
        </div>
        <div th:insert="~{fragments/footer.html :: footer}"></div>
    </div>
</body>
</html>
|
Loading…
Reference in a new issue