blank page test

This commit is contained in:
Anthony Stirling 2023-05-07 12:56:38 +01:00
parent 07e81a117b
commit cc919ea614
2 changed files with 228 additions and 0 deletions

View file

@ -0,0 +1,98 @@
package stirling.software.SPDF.controller.api.other;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.utils.ImageFinder;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
/**
 * REST controller exposing an endpoint that removes blank pages from an uploaded PDF.
 */
@RestController
public class BlankPageController {

    /**
     * Removes blank pages from the uploaded PDF and returns the resulting document.
     *
     * <p>The meaning of "blank" is selected by {@code processType}:
     * <ul>
     *   <li>{@code 0} - a page is blank when it has no extractable text;</li>
     *   <li>{@code 1} - a page is blank when it has neither extractable text nor images;</li>
     *   <li>{@code 2} - OCR is invoked first, then the same rule as {@code 0} is applied;</li>
     *   <li>any other value - every page is kept.</li>
     * </ul>
     *
     * @param inputFile   the uploaded PDF (multipart part {@code fileInput})
     * @param processType the blank-detection mode described above (multipart part {@code processType})
     * @return the filtered PDF bytes with status 200, or an empty body with status 500 when
     *         reading or writing the PDF fails with an {@link IOException}
     * @throws IOException          declared for the OCR call made before the PDF is loaded
     * @throws InterruptedException declared for the OCR call made before the PDF is loaded
     */
    // NOTE(review): processType is a simple value bound with @RequestPart; plain form fields are
    // normally bound with @RequestParam - confirm that clients can actually submit this value.
    @PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
    public ResponseEntity<byte[]> removeBlankPages(
            @RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
            @RequestPart(required = true, value = "processType") int processType)
            throws IOException, InterruptedException {
        boolean removeNoText = false;
        boolean removeNoTextOrImages = false;
        if (processType == 0) {
            removeNoText = true;
        } else if (processType == 1) {
            removeNoTextOrImages = true;
        } else if (processType == 2) {
            // Run OCR before applying the text-only rule.
            // NOTE(review): the value returned by processPdfWithOCR is not used and the original
            // upload is what gets loaded below, so the OCR output does not appear to reach the
            // text check. The controller is also instantiated directly rather than injected.
            // Confirm against OCRController before relying on this mode.
            OCRController ocr = new OCRController();
            ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
            removeNoText = true;
        }

        // try-with-resources guarantees both documents are closed even when processing fails.
        // Resources close in reverse order: the output document first, then the source document.
        try (PDDocument document = PDDocument.load(inputFile.getInputStream());
                PDDocument outputDocument = new PDDocument()) {
            PDPageTree pages = document.getDocumentCatalog().getPages();
            PDFTextStripper textStripper = new PDFTextStripper();
            List<PDPage> pagesToKeep = new ArrayList<>();

            int pageIndex = 0;
            for (PDPage page : pages) {
                pageIndex++; // the text stripper uses 1-based page numbers
                textStripper.setStartPage(pageIndex);
                textStripper.setEndPage(pageIndex);
                String pageText = textStripper.getText(document);
                boolean hasText = !pageText.trim().isEmpty();

                if (shouldKeepPage(page, hasText, removeNoText, removeNoTextOrImages)) {
                    pagesToKeep.add(page);
                }
            }

            // Pages are attached to the output only after all text extraction on the source
            // document has finished.
            for (PDPage page : pagesToKeep) {
                outputDocument.addPage(page);
            }

            ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
            outputDocument.save(outputStream);
            return new ResponseEntity<>(outputStream.toByteArray(), HttpStatus.OK);
        } catch (IOException e) {
            e.printStackTrace();
            return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
        }
    }

    /**
     * Decides whether a page survives blank-page removal.
     *
     * <p>The image scan is only performed when the text check alone cannot decide the outcome.
     *
     * @param page                 the page being examined
     * @param hasText              whether text extraction produced non-whitespace text for the page
     * @param removeNoText         drop pages without text
     * @param removeNoTextOrImages drop pages that have neither text nor images
     * @return {@code true} when the page must be kept
     * @throws IOException if scanning the page for images fails
     */
    private static boolean shouldKeepPage(PDPage page, boolean hasText, boolean removeNoText,
            boolean removeNoTextOrImages) throws IOException {
        if (removeNoTextOrImages) {
            // A page is blank only when it has neither text nor images, so either one keeps it.
            return hasText || hasImagesOnPage(page);
        }
        if (removeNoText) {
            return hasText;
        }
        return true;
    }

    /**
     * Reports whether an {@link ImageFinder} run over the page found any image.
     *
     * @param page the page to scan
     * @return {@code true} when the finder reports at least one image
     * @throws IOException if processing the page fails
     */
    private static boolean hasImagesOnPage(PDPage page) throws IOException {
        ImageFinder imageFinder = new ImageFinder(page);
        imageFinder.processPage(page);
        return imageFinder.hasImages();
    }
}

View file

@ -0,0 +1,130 @@
package stirling.software.SPDF.utils;
import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.contentstream.operator.OperatorName;
import org.apache.pdfbox.cos.COSBase;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import java.awt.geom.Point2D;
import java.io.IOException;
import java.util.List;
public class ImageFinder extends org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine {
private boolean hasImages = false;
public ImageFinder(PDPage page) {
super(page);
}
public boolean hasImages() {
return hasImages;
}
@Override
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
String operation = operator.getName();
if (operation.equals(OperatorName.DRAW_OBJECT)) {
COSBase base = operands.get(0);
if (base instanceof COSName) {
COSName objectName = (COSName) base;
PDXObject xobject = getResources().getXObject(objectName);
if (xobject instanceof PDImageXObject) {
hasImages = true;
} else if (xobject instanceof PDFormXObject) {
PDFormXObject form = (PDFormXObject) xobject;
ImageFinder innerFinder = new ImageFinder(getPage());
innerFinder.processPage(getPage());
if (innerFinder.hasImages()) {
hasImages = true;
}
}
}
}
super.processOperator(operator, operands);
}
@Override
public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
// TODO Auto-generated method stub
}
@Override
public void drawImage(PDImage pdImage) throws IOException {
// TODO Auto-generated method stub
}
@Override
public void clip(int windingRule) throws IOException {
// TODO Auto-generated method stub
}
@Override
public void moveTo(float x, float y) throws IOException {
// TODO Auto-generated method stub
}
@Override
public void lineTo(float x, float y) throws IOException {
// TODO Auto-generated method stub
}
@Override
public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
// TODO Auto-generated method stub
}
@Override
public Point2D getCurrentPoint() throws IOException {
// TODO Auto-generated method stub
return null;
}
@Override
public void closePath() throws IOException {
// TODO Auto-generated method stub
}
@Override
public void endPath() throws IOException {
// TODO Auto-generated method stub
}
@Override
public void strokePath() throws IOException {
// TODO Auto-generated method stub
}
@Override
public void fillPath(int windingRule) throws IOException {
// TODO Auto-generated method stub
}
@Override
public void fillAndStrokePath(int windingRule) throws IOException {
// TODO Auto-generated method stub
}
@Override
public void shadingFill(COSName shadingName) throws IOException {
// TODO Auto-generated method stub
}
// ... rest of the overridden methods
}