Merge branch 'cleanups' of https://github.com/Frooodle/Stirling-PDF into cleanups
This commit is contained in:
commit
1d55ee7f93
3 changed files with 230 additions and 0 deletions
|
@ -40,6 +40,8 @@ Feel free to request any features or bug fixes either in github issues or our [D
|
|||
- Parallel file processing and downloads
|
||||
- API for integration with external scripts
|
||||
|
||||
A hosted instance/demo of the app is available [here](https://pdf.adminforge.de/), provided by the team at adminforge.de
|
||||
|
||||
## Technologies used
|
||||
- Spring Boot + Thymeleaf
|
||||
- PDFBox
|
||||
|
|
|
@ -0,0 +1,98 @@
|
|||
package stirling.software.SPDF.controller.api.other;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestPart;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import stirling.software.SPDF.utils.ImageFinder;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
@RestController
|
||||
public class BlankPageController {
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException {
|
||||
boolean removeNoText = false;
|
||||
boolean removeNoTextOrImages = false;
|
||||
|
||||
if(processType == 0) {
|
||||
removeNoText = true;
|
||||
} else if (processType == 1) {
|
||||
removeNoTextOrImages = true;
|
||||
} else if (processType == 2) {
|
||||
//run OCR
|
||||
OCRController ocr = new OCRController();
|
||||
ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
|
||||
|
||||
removeNoText = true;
|
||||
}
|
||||
|
||||
try {
|
||||
PDDocument document = PDDocument.load(inputFile.getInputStream());
|
||||
PDPageTree pages = document.getDocumentCatalog().getPages();
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
|
||||
List<PDPage> pagesToKeep = new ArrayList<>();
|
||||
int pageIndex = 0;
|
||||
|
||||
for (PDPage page : pages) {
|
||||
pageIndex++;
|
||||
textStripper.setStartPage(pageIndex);
|
||||
textStripper.setEndPage(pageIndex);
|
||||
String pageText = textStripper.getText(document);
|
||||
boolean hasText = !pageText.trim().isEmpty();
|
||||
boolean hasImages = hasImagesOnPage(page);
|
||||
|
||||
if (removeNoText && removeNoTextOrImages) {
|
||||
if (hasText || hasImages) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else if (removeNoText) {
|
||||
if (hasText) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else if (removeNoTextOrImages) {
|
||||
if (hasText && hasImages) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
}
|
||||
|
||||
PDDocument outputDocument = new PDDocument();
|
||||
for (PDPage page : pagesToKeep) {
|
||||
outputDocument.addPage(page);
|
||||
}
|
||||
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
outputDocument.save(outputStream);
|
||||
outputDocument.close();
|
||||
document.close();
|
||||
|
||||
return new ResponseEntity<>(outputStream.toByteArray(), HttpStatus.OK);
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
private static boolean hasImagesOnPage(PDPage page) throws IOException {
|
||||
ImageFinder imageFinder = new ImageFinder(page);
|
||||
imageFinder.processPage(page);
|
||||
return imageFinder.hasImages();
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ... rest of the code (ImageFinder class and hasImagesOnPage method)
|
||||
}
|
130
src/main/java/stirling/software/SPDF/utils/ImageFinder.java
Normal file
130
src/main/java/stirling/software/SPDF/utils/ImageFinder.java
Normal file
|
@ -0,0 +1,130 @@
|
|||
package stirling.software.SPDF.utils;
|
||||
|
||||
import org.apache.pdfbox.contentstream.operator.Operator;
|
||||
import org.apache.pdfbox.contentstream.operator.OperatorName;
|
||||
import org.apache.pdfbox.cos.COSBase;
|
||||
import org.apache.pdfbox.cos.COSName;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.PDXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImage;
|
||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||
|
||||
import java.awt.geom.Point2D;
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
|
||||
public class ImageFinder extends org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine {
|
||||
private boolean hasImages = false;
|
||||
|
||||
public ImageFinder(PDPage page) {
|
||||
super(page);
|
||||
}
|
||||
|
||||
public boolean hasImages() {
|
||||
return hasImages;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void processOperator(Operator operator, List<COSBase> operands) throws IOException {
|
||||
String operation = operator.getName();
|
||||
if (operation.equals(OperatorName.DRAW_OBJECT)) {
|
||||
COSBase base = operands.get(0);
|
||||
if (base instanceof COSName) {
|
||||
COSName objectName = (COSName) base;
|
||||
PDXObject xobject = getResources().getXObject(objectName);
|
||||
if (xobject instanceof PDImageXObject) {
|
||||
hasImages = true;
|
||||
} else if (xobject instanceof PDFormXObject) {
|
||||
PDFormXObject form = (PDFormXObject) xobject;
|
||||
ImageFinder innerFinder = new ImageFinder(getPage());
|
||||
innerFinder.processPage(getPage());
|
||||
if (innerFinder.hasImages()) {
|
||||
hasImages = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
super.processOperator(operator, operands);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void appendRectangle(Point2D p0, Point2D p1, Point2D p2, Point2D p3) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void drawImage(PDImage pdImage) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clip(int windingRule) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void moveTo(float x, float y) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void lineTo(float x, float y) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void curveTo(float x1, float y1, float x2, float y2, float x3, float y3) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public Point2D getCurrentPoint() throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void closePath() throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void endPath() throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void strokePath() throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fillPath(int windingRule) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void fillAndStrokePath(int windingRule) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shadingFill(COSName shadingName) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
// ... rest of the overridden methods
|
||||
}
|
Loading…
Reference in a new issue