This commit is contained in:
Anthony Stirling 2023-08-24 23:23:25 +01:00
parent 7c2f482b3b
commit cd0e1a3962
7 changed files with 310 additions and 1 deletions

View file

@ -208,7 +208,11 @@ For API usage you must provide a header with 'X-API-Key' and the associated API
- Progress bar/Tracking
- Full custom logic pipelines to combine multiple operations together.
- Folder support with auto scanning to perform operations on
- Redact sections of pages
- Redact text (not automatic)
- Add Forms
- Annotations
- Multi page layout (Stitch PDF pages together) support x rows y columns and custom page sizing
- Fill forms manually and automatically
### Q2: Why is my application downloading .htm files?
This is an issue commonly caused by your NGINX configuration. The default file upload size for NGINX is 1MB, so you need to add the following to your Nginx sites-available file: `client_max_body_size SIZE;` where "SIZE" is, for example, 50M for 50MB files.

View file

@ -0,0 +1,106 @@
package stirling.software.SPDF.controller.api.security;
import java.awt.Color;
import java.awt.image.BufferedImage;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import org.apache.pdfbox.rendering.ImageType;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.Parameter;
import io.swagger.v3.oas.annotations.media.Schema;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.PDFText;
import stirling.software.SPDF.pdf.TextFinder;
import stirling.software.SPDF.utils.WebResponseUtils;
/**
 * REST endpoint that blacks out every occurrence of user-supplied search terms in an uploaded
 * PDF and optionally flattens the result into page images.
 */
@RestController
@Tag(name = "Security", description = "Security APIs")
public class RedactController {

    private static final Logger logger = LoggerFactory.getLogger(RedactController.class);

    /** Resolution used when the redacted pages are rendered to images. */
    private static final int RENDER_DPI = 300;

    /**
     * Redacts each non-blank line of {@code listOfText} in the uploaded PDF.
     *
     * @param file the PDF to redact
     * @param listOfTextString newline-separated search terms; blank lines are ignored
     * @param useRegex whether each term is a regular expression
     * @param wholeWordSearchBool whether terms must match as whole words
     * @param customPadding extra padding added around each redaction box; defaults to 0 when the
     *     parameter is absent or empty
     * @param convertPDFToImage whether the redacted pages are re-rendered as images
     * @return the redacted PDF as a download named {@code <original>_redacted.pdf}
     * @throws Exception if the PDF cannot be read, searched, rendered or saved
     */
    @PostMapping(value = "/auto-redact", consumes = "multipart/form-data")
    @Operation(summary = "Redacts listOfText in a PDF document",
            description = "This operation takes an input PDF file and redacts the provided listOfText. Input:PDF, Output:PDF, Type:SISO")
    public ResponseEntity<byte[]> redactPdf(
            @Parameter(description = "The input PDF file", required = true) @RequestParam("fileInput") MultipartFile file,
            @Parameter(description = "List of listOfText to redact from the PDF", required = true, schema = @Schema(type = "string")) @RequestParam("listOfText") String listOfTextString,
            @RequestParam(value = "useRegex", required = false) boolean useRegex,
            @RequestParam(value = "wholeWordSearch", required = false) boolean wholeWordSearchBool,
            // A primitive float cannot receive null, so an absent/empty value needs an explicit default.
            @RequestParam(value = "customPadding", required = false, defaultValue = "0") float customPadding,
            @RequestParam(value = "convertPDFToImage", required = false) boolean convertPDFToImage) throws Exception {

        String[] listOfText = listOfTextString.split("\n");
        byte[] pdfContent;

        // try-with-resources guarantees the source document is released even when a step fails.
        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(file.getBytes()))) {
            for (String rawText : listOfText) {
                // trim() also strips the '\r' left over from CRLF line endings sent by browsers.
                String text = rawText.trim();
                if (text.isEmpty()) {
                    // Blank lines (e.g. a trailing newline) are not search terms.
                    continue;
                }
                TextFinder textFinder = new TextFinder(text, useRegex, wholeWordSearchBool);
                List<PDFText> foundTexts = textFinder.getTextLocations(document);
                // The term itself is deliberately not logged: it is the data being redacted.
                logger.debug("Redacting {} occurrence(s) of a search term", foundTexts.size());
                redactFoundText(document, foundTexts, customPadding);
            }
            pdfContent = convertPDFToImage ? renderToImagePdf(document) : toBytes(document);
        }

        return WebResponseUtils.bytesToWebResponse(pdfContent,
                file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_redacted.pdf");
    }

    /**
     * Renders every page of {@code document} to an RGB image and returns a new PDF, serialized to
     * bytes, whose pages each contain one of those images.
     */
    private byte[] renderToImagePdf(PDDocument document) throws IOException {
        try (PDDocument imageDocument = new PDDocument()) {
            PDFRenderer pdfRenderer = new PDFRenderer(document);
            for (int page = 0; page < document.getNumberOfPages(); ++page) {
                BufferedImage bim = pdfRenderer.renderImageWithDPI(page, RENDER_DPI, ImageType.RGB);
                // The new page is sized to the rendered image's pixel dimensions.
                PDPage newPage = new PDPage(new PDRectangle(bim.getWidth(), bim.getHeight()));
                imageDocument.addPage(newPage);
                PDImageXObject pdImage = LosslessFactory.createFromImage(imageDocument, bim);
                try (PDPageContentStream contentStream = new PDPageContentStream(imageDocument, newPage)) {
                    contentStream.drawImage(pdImage, 0, 0);
                }
            }
            return toBytes(imageDocument);
        }
    }

    /** Serializes {@code document} into a byte array. */
    private byte[] toBytes(PDDocument document) throws IOException {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        document.save(baos);
        return baos.toByteArray();
    }

    /**
     * Appends a filled black rectangle over each found text block.
     *
     * @param document the document being modified in place
     * @param blocks locations to cover; each carries a page index and bounding coordinates
     * @param customPadding extra padding added on top of 30% of the block height
     * @throws IOException if a page content stream cannot be written
     */
    private void redactFoundText(PDDocument document, List<PDFText> blocks, float customPadding) throws IOException {
        var allPages = document.getDocumentCatalog().getPages();
        for (PDFText block : blocks) {
            var page = allPages.get(block.getPageIndex());
            try (PDPageContentStream contentStream = new PDPageContentStream(
                    document, page, PDPageContentStream.AppendMode.APPEND, true, true)) {
                contentStream.setNonStrokingColor(Color.BLACK);
                float blockHeight = block.getY2() - block.getY1();
                float padding = blockHeight * 0.3f + customPadding;
                PDRectangle pageBox = page.getBBox();
                // Block Y values are subtracted from the page height, i.e. they are treated as
                // distances from the top of the page while the rectangle is placed from the bottom.
                contentStream.addRect(
                        block.getX1(),
                        pageBox.getHeight() - block.getY1() - padding,
                        block.getX2() - block.getX1(),
                        blockHeight + 2 * padding);
                contentStream.fill();
            }
        }
    }
}

View file

@ -11,6 +11,12 @@ import io.swagger.v3.oas.annotations.tags.Tag;
@Tag(name = "Security", description = "Security APIs")
public class SecurityWebController {
/**
 * Handles GET requests for the auto-redact page.
 *
 * @param model view model; receives the {@code currentPage} attribute set to {@code auto-redact}
 * @return the view name {@code security/auto-redact}
 */
@GetMapping("/auto-redact")
@Hidden
public String autoRedactForm(Model model) {
    final String pageName = "auto-redact";
    model.addAttribute("currentPage", pageName);
    return "security/" + pageName;
}
@GetMapping("/add-password")
@Hidden

View file

@ -0,0 +1,42 @@
package stirling.software.SPDF.model;
/**
 * Immutable description of a piece of text located in a PDF: the zero-or-more based page index
 * supplied by the creator, two coordinate pairs bounding the text, and an associated string.
 */
public class PDFText {

    /** Index of the page the text was found on, as supplied to the constructor. */
    private final int pageIndex;

    /** First X coordinate of the bounding box. */
    private final float x1;

    /** First Y coordinate of the bounding box. */
    private final float y1;

    /** Second X coordinate of the bounding box. */
    private final float x2;

    /** Second Y coordinate of the bounding box. */
    private final float y2;

    /** Text associated with this location. */
    private final String text;

    /**
     * Creates a text location; all values are stored as given, without validation.
     *
     * @param pageIndex index of the page containing the text
     * @param x1 first X coordinate
     * @param y1 first Y coordinate
     * @param x2 second X coordinate
     * @param y2 second Y coordinate
     * @param text text associated with the location
     */
    public PDFText(int pageIndex, float x1, float y1, float x2, float y2, String text) {
        this.text = text;
        this.pageIndex = pageIndex;
        this.x1 = x1;
        this.x2 = x2;
        this.y1 = y1;
        this.y2 = y2;
    }

    /** @return the page index given at construction */
    public int getPageIndex() {
        return this.pageIndex;
    }

    /** @return the first X coordinate */
    public float getX1() {
        return this.x1;
    }

    /** @return the first Y coordinate */
    public float getY1() {
        return this.y1;
    }

    /** @return the second X coordinate */
    public float getX2() {
        return this.x2;
    }

    /** @return the second Y coordinate */
    public float getY2() {
        return this.y2;
    }

    /** @return the text given at construction */
    public String getText() {
        return this.text;
    }
}

View file

@ -0,0 +1,91 @@
package stirling.software.SPDF.pdf;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.http.ResponseEntity;
import stirling.software.SPDF.model.PDFText;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A {@link PDFTextStripper} that records where a search term occurs while the document text is
 * being extracted.
 *
 * <p>Matching is case-insensitive. The term is either a literal string or a regular expression,
 * and can optionally be restricted to whole words, where a word may also be delimited by
 * {@code _} or {@code .}.
 */
public class TextFinder extends PDFTextStripper {

    /** Case-insensitive matching that does not require rewriting the pattern or the content. */
    private static final int MATCH_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Zero-width "start of word": a word boundary, or a preceding '_' or '.'. */
    private static final String WORD_START = "(?:\\b|(?<=[_.]))";

    /** Zero-width "end of word": a word boundary, or a following '_' or '.'. */
    private static final String WORD_END = "(?:\\b|(?=[_.]))";

    private final Pattern searchPattern;
    private final List<PDFText> textOccurrences = new ArrayList<>();

    /**
     * Creates a finder for one search term.
     *
     * @param searchText literal text or regular expression to look for; must not be null
     * @param useRegex {@code true} to interpret {@code searchText} as a regular expression
     * @param wholeWordSearch {@code true} to match only whole words
     * @throws IOException declared by the {@link PDFTextStripper} constructor
     * @throws java.util.regex.PatternSyntaxException if {@code useRegex} is set and the
     *     expression is invalid
     */
    public TextFinder(String searchText, boolean useRegex, boolean wholeWordSearch) throws IOException {
        if (searchText == null) {
            throw new NullPointerException("searchText");
        }
        // Compiled once here instead of on every writeString call.
        this.searchPattern = buildPattern(searchText, useRegex, wholeWordSearch);
        setSortByPosition(true);
    }

    /**
     * Builds the pattern for the term. Word delimiters are expressed as zero-width assertions so
     * that a match spans exactly the term, and so that a delimiter shared by two adjacent
     * occurrences (e.g. {@code foo_foo}) does not hide the second one.
     */
    private static Pattern buildPattern(String searchText, boolean useRegex, boolean wholeWordSearch) {
        String term = useRegex ? searchText : Pattern.quote(searchText);
        if (!wholeWordSearch) {
            return Pattern.compile(term, MATCH_FLAGS);
        }
        // The non-capturing group keeps alternations in a user regex inside the word delimiters
        // without shifting the numbering of the user's own capture groups.
        return Pattern.compile(WORD_START + "(?:" + term + ")" + WORD_END, MATCH_FLAGS);
    }

    /**
     * Records a bounding box for every match of the search pattern inside {@code text}.
     *
     * <p>Match offsets in {@code text} are used as indexes into {@code textPositions}; matches
     * whose end lies beyond the list are skipped, as are empty matches.
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) {
        Matcher matcher = searchPattern.matcher(text);
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            if (start == end || end > textPositions.size()) {
                // Nothing to cover (empty match) or no position data for the matched range.
                continue;
            }

            // Seed the box with the first matched character, then grow it over the rest.
            TextPosition first = textPositions.get(start);
            float minX = first.getX();
            float minY = first.getY();
            float maxX = first.getX() + first.getWidth();
            float maxY = first.getY() + first.getHeight();
            for (int i = start + 1; i < end; i++) {
                TextPosition position = textPositions.get(i);
                minX = Math.min(minX, position.getX());
                minY = Math.min(minY, position.getY());
                maxX = Math.max(maxX, position.getX() + position.getWidth());
                maxY = Math.max(maxY, position.getY() + position.getHeight());
            }

            // getCurrentPageNo() is 1-based; PDFText stores a 0-based page index.
            // The stored text is the whole string passed to this call, not just the match.
            textOccurrences.add(new PDFText(getCurrentPageNo() - 1, minX, minY, maxX, maxY, text));
        }
    }

    /**
     * Extracts the text of {@code document} and returns the locations of all matches.
     *
     * @param document the document to search
     * @return a new list of matches found in this run; empty when there are none
     * @throws Exception if text extraction fails
     */
    public List<PDFText> getTextLocations(PDDocument document) throws Exception {
        // Start from a clean slate so repeated calls do not report earlier matches again.
        textOccurrences.clear();
        this.getText(document);
        return new ArrayList<>(textOccurrences);
    }
}

View file

@ -304,6 +304,10 @@ showJS.tags=JS
# WEB PAGES #
# #
###########################
#auto-redact
autoRedact.title=Auto Redact
autoRedact.header=Auto Redact
#showJS
showJS.title=Show Javascript
showJS.header=Show Javascript

View file

@ -0,0 +1,56 @@
<!DOCTYPE html>
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
<th:block th:insert="~{fragments/common :: head(title=#{autoRedact.title})}"></th:block>
<body>
  <div id="page-container">
    <div id="content-wrap">
      <div th:insert="~{fragments/navbar.html :: navbar}"></div>
      <br> <br>
      <div class="container">
        <div class="row justify-content-center">
          <div class="col-md-6">
            <h2 th:text="#{autoRedact.header}"></h2>
            <!-- Posts to the /auto-redact API endpoint; field names must match its request parameters. -->
            <form action="/auto-redact" method="post" enctype="multipart/form-data">
              <div class="mb-3">
                <input type="file" class="form-control" id="fileInput" name="fileInput" required accept="application/pdf">
              </div>
              <div class="mb-3">
                <label for="listOfText" class="form-label">Texts to Redact (line-separated)</label>
                <!-- One term per line: the placeholder uses &#10; line breaks to show the expected format. -->
                <textarea class="form-control" id="listOfText" name="listOfText" rows="4" required placeholder="e.g.&#10;Confidential&#10;Top-Secret"></textarea>
              </div>
              <div class="mb-3 form-check">
                <input type="checkbox" class="form-check-input" id="useRegex" name="useRegex">
                <label class="form-check-label" for="useRegex">Use Regex</label>
              </div>
              <div class="mb-3 form-check">
                <input type="checkbox" class="form-check-input" id="wholeWordSearch" name="wholeWordSearch">
                <label class="form-check-label" for="wholeWordSearch">Whole Word Search</label>
              </div>
              <div class="mb-3">
                <label for="customPadding" class="form-label">Custom Padding</label>
                <input type="number" step="0.1" class="form-control" id="customPadding" name="customPadding" placeholder="0.0" value="0.1">
              </div>
              <div class="mb-3 form-check">
                <input type="checkbox" class="form-check-input" id="convertPDFToImage" name="convertPDFToImage" checked>
                <label class="form-check-label" for="convertPDFToImage">Convert PDF to Image</label>
              </div>
              <button type="submit" class="btn btn-primary">Submit</button>
            </form>
          </div>
        </div>
      </div>
    </div>
    <div th:insert="~{fragments/footer.html :: footer}"></div>
  </div>
</body>
</html>