remove blanks

This commit is contained in:
Anthony Stirling 2023-05-08 12:18:48 +01:00
parent cc919ea614
commit 0b1cdf6a68
5 changed files with 116 additions and 35 deletions

View file

@ -0,0 +1,39 @@
import cv2
import numpy as np
import sys
def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5, blank_percentage=99):
    """Decide whether the image at ``image_path`` is a blank (near-white) page.

    The image is loaded as grayscale, smoothed with a Gaussian blur and then
    binarised: pixels brighter than ``white_value - threshold`` are set to
    ``white_value`` and all others to 0. The page counts as blank when more
    than ``blank_percentage`` percent of the pixels end up white.

    Args:
        image_path: Path of the image file to inspect.
        threshold: How far below ``white_value`` a pixel may be and still
            count as white.
        white_value: Grayscale value treated as pure white.
        blur_size: Side length of the square Gaussian kernel.
            NOTE(review): OpenCV expects an odd, positive kernel size --
            confirm before passing other values.
        blank_percentage: Percentage of white pixels that must be exceeded
            for the image to be reported as blank.

    Returns:
        True when the image is considered blank. False otherwise, including
        when the file cannot be read (an error message is printed then).
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        print(f"Error: Unable to read the image file: {image_path}")
        return False

    # Apply Gaussian blur to reduce noise (isolated specks, scanner grain)
    # before thresholding.
    blurred_image = cv2.GaussianBlur(image, (blur_size, blur_size), 0)
    _, thresholded_image = cv2.threshold(
        blurred_image, white_value - threshold, white_value, cv2.THRESH_BINARY
    )

    # Calculate the percentage of white pixels in the thresholded image.
    white_pixels = np.count_nonzero(thresholded_image == white_value)
    total_pixels = thresholded_image.size
    white_pixel_percentage = (white_pixels / total_pixels) * 100
    return white_pixel_percentage > blank_percentage
if __name__ == "__main__":
    # Command-line entry point. The result is reported through the exit code:
    #   0 -> the image is not considered blank
    #   1 -> the image is considered blank
    #   2 -> the script was invoked incorrectly
    # Usage errors get their own code so they cannot be mistaken for a
    # "blank page" verdict.
    if len(sys.argv) != 2:
        # Use the actual script path so the hint matches however it was invoked.
        print(f"Usage: python {sys.argv[0]} <image_path>", file=sys.stderr)
        sys.exit(2)

    image_path = sys.argv[1]
    blank = is_blank_image(image_path)
    sys.exit(1 if blank else 0)

View file

@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.other;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree; import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper; import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.http.HttpStatus; import org.springframework.http.HttpStatus;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
@ -10,38 +11,30 @@ import org.springframework.web.bind.annotation.RequestPart;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
import stirling.software.SPDF.utils.ImageFinder; import stirling.software.SPDF.utils.ImageFinder;
import stirling.software.SPDF.utils.ProcessExecutor;
import java.awt.image.BufferedImage;
import java.io.ByteArrayOutputStream; import java.io.ByteArrayOutputStream;
import java.io.IOException; import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import javax.imageio.ImageIO;
@RestController @RestController
public class BlankPageController { public class BlankPageController {
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks") @PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException { public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
boolean removeNoText = false;
boolean removeNoTextOrImages = false;
if(processType == 0) {
removeNoText = true;
} else if (processType == 1) {
removeNoTextOrImages = true;
} else if (processType == 2) {
//run OCR
OCRController ocr = new OCRController();
ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
removeNoText = true;
}
try { try {
PDDocument document = PDDocument.load(inputFile.getInputStream()); PDDocument document = PDDocument.load(inputFile.getInputStream());
PDPageTree pages = document.getDocumentCatalog().getPages(); PDPageTree pages = document.getDocumentCatalog().getPages();
PDFTextStripper textStripper = new PDFTextStripper(); PDFTextStripper textStripper = new PDFTextStripper();
List<PDPage> pagesToKeep = new ArrayList<>(); List<Integer> pagesToKeepIndex = new ArrayList<>();
int pageIndex = 0; int pageIndex = 0;
for (PDPage page : pages) { for (PDPage page : pages) {
@ -50,28 +43,40 @@ public class BlankPageController {
textStripper.setEndPage(pageIndex); textStripper.setEndPage(pageIndex);
String pageText = textStripper.getText(document); String pageText = textStripper.getText(document);
boolean hasText = !pageText.trim().isEmpty(); boolean hasText = !pageText.trim().isEmpty();
if (hasText) {
pagesToKeepIndex.add(pageIndex);
System.out.print("page " + pageIndex + " has text");
continue;
}
boolean hasImages = hasImagesOnPage(page); boolean hasImages = hasImagesOnPage(page);
if (hasImages) {
if (removeNoText && removeNoTextOrImages) { pagesToKeepIndex.add(pageIndex);
if (hasText || hasImages) { System.out.print("page " + pageIndex + " has image");
pagesToKeep.add(page); continue;
}
} else if (removeNoText) {
if (hasText) {
pagesToKeep.add(page);
}
} else if (removeNoTextOrImages) {
if (hasText && hasImages) {
pagesToKeep.add(page);
}
} else {
pagesToKeep.add(page);
} }
} }
System.out.print(pagesToKeepIndex.size());
PDDocument outputDocument = new PDDocument(); PDDocument outputDocument = new PDDocument();
for (PDPage page : pagesToKeep) { PDFRenderer pdfRenderer = new PDFRenderer(document);
outputDocument.addPage(page); for (Integer i : pagesToKeepIndex) {
// Create temp file to save the image
Path tempFile = Files.createTempFile("image_", ".png");
// Render image and save as temp file
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
ImageIO.write(image, "png", tempFile.toFile());
List<String> command = new ArrayList<>(Arrays.asList("python3", "/scripts/detect-blank-pages.py", tempFile.toString()));
// Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
//does contain data
if(returnCode ==0) {
outputDocument.addPage(document.getPage(i - 1));
} else {
System.out.print("Found blank page skipping, page #" + i);
}
} }
ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); ByteArrayOutputStream outputStream = new ByteArrayOutputStream();

View file

@ -95,4 +95,11 @@ public class OtherWebController {
return "other/repair"; return "other/repair";
} }
/**
 * Handles GET requests for {@code /remove-blanks}.
 *
 * <p>Stores {@code "remove-blanks"} in the model under the
 * {@code currentPage} attribute and returns the view name for the form.
 *
 * @param model model that receives the {@code currentPage} attribute
 * @return the view name {@code other/remove-blanks}
 */
@GetMapping("/remove-blanks")
@Hidden
public String removeBlanksForm(Model model) {
    final String viewName = "other/remove-blanks";
    model.addAttribute("currentPage", "remove-blanks");
    return viewName;
}
} }

View file

@ -112,6 +112,8 @@ filter: invert(0.2) sepia(2) saturate(50) hue-rotate(190deg);
<div th:replace="~{fragments/card :: card(cardTitle=#{home.flatten.title}, cardText=#{home.flatten.desc}, cardLink='flatten', svgPath='images/flatten.svg')}"></div> <div th:replace="~{fragments/card :: card(cardTitle=#{home.flatten.title}, cardText=#{home.flatten.desc}, cardLink='flatten', svgPath='images/flatten.svg')}"></div>
<div th:replace="~{fragments/card :: card(cardTitle=#{home.repair.title}, cardText=#{home.repair.desc}, cardLink='repair', svgPath='images/wrench.svg')}"></div> <div th:replace="~{fragments/card :: card(cardTitle=#{home.repair.title}, cardText=#{home.repair.desc}, cardLink='repair', svgPath='images/wrench.svg')}"></div>
<div th:replace="~{fragments/card :: card(cardTitle=#{home.removeBlanks.title}, cardText=#{home.removeBlanks.desc}, cardLink='remove-blanks', svgPath='images/wrench.svg')}"></div>
<div th:replace="~{fragments/card :: card(cardTitle=#{home.compare.title}, cardText=#{home.compare.desc}, cardLink='compare', svgPath='images/wrench.svg')}"></div>
</div> </div>
</div> </div>
<div th:insert="~{fragments/footer.html :: footer}"></div> <div th:insert="~{fragments/footer.html :: footer}"></div>

View file

@ -0,0 +1,28 @@
<!DOCTYPE html>
<!--/* Remove-blanks page: a single-file PDF upload form that posts to the remove-blanks endpoint. */-->
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
<!--/* Shared <head> fragment; the page title comes from the removeBlanks.title message key. */-->
<th:block th:insert="~{fragments/common :: head(title=#{removeBlanks.title})}"></th:block>
<body>
<div id="page-container">
<div id="content-wrap">
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
<br> <br>
<div class="container">
<div class="row justify-content-center">
<div class="col-md-6">
<h2 th:text="#{removeBlanks.header}"></h2>
<!--/* Multipart POST to remove-blanks; the PDF is sent as the "fileInput" part. */-->
<form id="multiPdfForm" th:action="@{remove-blanks}" method="post" enctype="multipart/form-data">
<!--/* Shared file selector fragment, limited here to one PDF file. */-->
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{removeBlanks.submit}"></button>
</form>
</div>
</div>
</div>
</div>
<div th:insert="~{fragments/footer.html :: footer}"></div>
</div>
</body>
</html>