diff --git a/README.md b/README.md
index a2bc9003..9c7d3e0c 100644
--- a/README.md
+++ b/README.md
@@ -40,6 +40,8 @@ Feel free to request any features or bug fixes either in github issues or our [D
 - Parallel file processing and downloads
 - API for integration with external scripts
 
+Hosted instance/demo of the app can be seen [here](https://pdf.adminforge.de/) hosted by the team at adminforge.de
+
 ## Technologies used
 - Spring Boot + Thymeleaf
 - PDFBox
diff --git a/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java b/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java
new file mode 100644
index 00000000..0934418c
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java
@@ -0,0 +1,115 @@
+package stirling.software.SPDF.controller.api.other;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.PDPageTree;
+import org.apache.pdfbox.text.PDFTextStripper;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestPart;
+import org.springframework.web.bind.annotation.RestController;
+import org.springframework.web.multipart.MultipartFile;
+import stirling.software.SPDF.utils.ImageFinder;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+/**
+ * REST endpoint that removes blank pages from an uploaded PDF.
+ */
+@RestController
+public class BlankPageController {
+
+    /**
+     * Removes blank pages from the uploaded PDF and returns the filtered document.
+     *
+     * <p>{@code processType} selects what counts as blank: {@code 0} removes pages without
+     * extractable text; {@code 1} removes pages that have neither extractable text nor
+     * images; {@code 2} invokes OCR first and then behaves like {@code 0}. Any other value
+     * keeps every page.
+     *
+     * @param inputFile   the uploaded PDF
+     * @param processType the blank-page detection mode described above
+     * @return the filtered PDF bytes with status 200, or an empty 500 response when an
+     *         {@link IOException} occurs while reading, analysing or saving the document
+     * @throws IOException          may propagate from the OCR step ({@code processType == 2})
+     * @throws InterruptedException may propagate from the OCR step ({@code processType == 2})
+     */
+    @PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
+    public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
+            @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException {
+        boolean removeNoText = false;
+        boolean removeNoTextOrImages = false;
+
+        if (processType == 0) {
+            removeNoText = true;
+        } else if (processType == 1) {
+            removeNoTextOrImages = true;
+        } else if (processType == 2) {
+            // run OCR
+            // NOTE(review): the OCR result is discarded and the original upload is loaded
+            // below, so recognised text does not appear to reach the filter - confirm.
+            OCRController ocr = new OCRController();
+            ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
+
+            removeNoText = true;
+        }
+
+        // try-with-resources closes both documents even when analysis or saving fails.
+        // outputDocument is declared last so it is still closed before the source document.
+        try (PDDocument document = PDDocument.load(inputFile.getInputStream());
+                PDDocument outputDocument = new PDDocument()) {
+            PDPageTree pages = document.getDocumentCatalog().getPages();
+            PDFTextStripper textStripper = new PDFTextStripper();
+
+            List<PDPage> pagesToKeep = new ArrayList<>();
+            int pageIndex = 0;
+
+            for (PDPage page : pages) {
+                pageIndex++;
+                // Restrict extraction to the current page only.
+                textStripper.setStartPage(pageIndex);
+                textStripper.setEndPage(pageIndex);
+                String pageText = textStripper.getText(document);
+                boolean hasText = !pageText.trim().isEmpty();
+
+                boolean keepPage;
+                if (removeNoTextOrImages) {
+                    // Blank means neither text nor images, so either one keeps the page.
+                    keepPage = hasText || hasImagesOnPage(page);
+                } else if (removeNoText) {
+                    keepPage = hasText;
+                } else {
+                    keepPage = true;
+                }
+
+                if (keepPage) {
+                    pagesToKeep.add(page);
+                }
+            }
+
+            for (PDPage page : pagesToKeep) {
+                outputDocument.addPage(page);
+            }
+
+            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+            outputDocument.save(outputStream);
+
+            return new ResponseEntity<>(outputStream.toByteArray(), HttpStatus.OK);
+        } catch (IOException e) {
+            e.printStackTrace();
+            return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
+        }
+    }
+
+    /**
+     * Reports whether {@link ImageFinder} detects at least one image on the page.
+     */
+    private static boolean hasImagesOnPage(PDPage page) throws IOException {
+        ImageFinder imageFinder = new ImageFinder(page);
+        imageFinder.processPage(page);
+        return imageFinder.hasImages();
+    }
+}
diff --git a/src/main/java/stirling/software/SPDF/utils/ImageFinder.java b/src/main/java/stirling/software/SPDF/utils/ImageFinder.java
new file mode 100644
index 00000000..b8eb4f59
--- /dev/null
+++ b/src/main/java/stirling/software/SPDF/utils/ImageFinder.java
@@ -0,0 +1,125 @@
+package stirling.software.SPDF.utils;
+
+import org.apache.pdfbox.contentstream.operator.Operator;
+import org.apache.pdfbox.contentstream.operator.OperatorName;
+import org.apache.pdfbox.cos.COSBase;
+import org.apache.pdfbox.cos.COSName;
+import org.apache.pdfbox.pdmodel.PDPage;
+import org.apache.pdfbox.pdmodel.graphics.PDXObject;
+import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
+import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
+
+import java.awt.geom.Point2D;
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * Content-stream engine that records whether a page draws at least one image.
+ *
+ * <p>All path and painting callbacks are deliberate no-ops; only image drawing matters.
+ */
+public class ImageFinder extends org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine {
+    private boolean hasImages = false;
+
+    public ImageFinder(PDPage page) {
+        super(page);
+    }
+
+    /**
+     * Returns {@code true} once an image has been encountered while processing the page.
+     */
+    public boolean hasImages() {
+        return hasImages;
+    }
+
+    /**
+     * Flags image XObjects referenced by the {@code Do} operator, then lets the superclass
+     * handle the operator as usual.
+     */
+    @Override
+    protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
+        // Guard against a malformed Do operator that carries no operand.
+        if (OperatorName.DRAW_OBJECT.equals(operator.getName()) && !operands.isEmpty()) {
+            COSBase base = operands.get(0);
+            if (base instanceof COSName) {
+                PDXObject xobject = getResources().getXObject((COSName) base);
+                if (xobject instanceof PDImageXObject) {
+                    hasImages = true;
+                }
+                // Form XObjects need no extra work here: the superclass call below runs the
+                // form's own content stream through this engine, so nested images reach this
+                // method as well. Re-processing the page with a nested ImageFinder instead would
+                // recurse endlessly on any page that draws a form.
+            }
+        }
+        super.processOperator(operator, operands);
+    }
+
+    @Override
+    public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
+        // No-op: path geometry is irrelevant to image detection.
+    }
+
+    @Override
+    public void drawImage(PDImage pdImage) throws IOException {
+        // Also covers inline images, which are not referenced through a Do operand.
+        hasImages = true;
+    }
+
+    @Override
+    public void clip(int windingRule) throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void moveTo(float x, float y) throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void lineTo(float x, float y) throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public Point2D getCurrentPoint() throws IOException {
+        // No path is tracked; return a dummy point rather than null so callers never dereference null.
+        return new Point2D.Float(0, 0);
+    }
+
+    @Override
+    public void closePath() throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void endPath() throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void strokePath() throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void fillPath(int windingRule) throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void fillAndStrokePath(int windingRule) throws IOException {
+        // No-op.
+    }
+
+    @Override
+    public void shadingFill(COSName shadingName) throws IOException {
+        // No-op.
+    }
+}