From a647347e1089a5f64db555cee24940ec1dbfbad4 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Thu, 11 May 2023 23:05:33 +0100 Subject: [PATCH] Changes to blank detection, WIP for % --- scripts/detect-blank-pages.py | 19 ++-- .../api/other/BlankPageController.java | 94 +++++++++++-------- .../other/ExtractImageScansController.java | 14 ++- .../templates/other/remove-blanks.html | 10 ++ 4 files changed, 85 insertions(+), 52 deletions(-) diff --git a/scripts/detect-blank-pages.py b/scripts/detect-blank-pages.py index 1fb80187..05a358ad 100644 --- a/scripts/detect-blank-pages.py +++ b/scripts/detect-blank-pages.py @@ -1,8 +1,9 @@ import cv2 import numpy as np import sys +import argparse -def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5): +def is_blank_image(image_path, threshold=10, white_percent=99, white_value=255, blur_size=5): image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE) if image is None: @@ -19,21 +20,21 @@ def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5): total_pixels = thresholded_image.size white_pixel_percentage = (white_pixels / total_pixels) * 100 - return white_pixel_percentage > 99 - + return white_pixel_percentage > white_percent if __name__ == "__main__": - if len(sys.argv) != 2: - print("Usage: python detect_blank_page.py ") - sys.exit(1) + parser = argparse.ArgumentParser(description='Detect if an image is considered blank or not.') + parser.add_argument('image_path', help='The path to the image file.') + parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for determining white pixels. The default value is 10.') + parser.add_argument('-w', '--white_percent', type=int, default=99, help='The percentage of white pixels for an image to be considered blank. 
The default value is 99.') + args = parser.parse_args() - image_path = sys.argv[1] - blank = is_blank_image(image_path) + blank = is_blank_image(args.image_path, args.threshold, args.white_percent) if blank: # Return code 1: The image is considered blank. sys.exit(1) else: # Return code 0: The image is not considered blank. - sys.exit(0) \ No newline at end of file + sys.exit(0) diff --git a/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java b/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java index bf29c774..5e538281 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/other/BlankPageController.java @@ -1,11 +1,15 @@ package stirling.software.SPDF.controller.api.other; + import java.awt.image.BufferedImage; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.IntStream; import javax.imageio.ImageIO; @@ -17,6 +21,7 @@ import org.apache.pdfbox.text.PDFTextStripper; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RequestPart; import org.springframework.web.bind.annotation.RestController; import org.springframework.web.multipart.MultipartFile; @@ -29,7 +34,9 @@ import stirling.software.SPDF.utils.ProcessExecutor; public class BlankPageController { @PostMapping(consumes = "multipart/form-data", value = "/remove-blanks") - public ResponseEntity removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException { + public ResponseEntity 
removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, + @RequestParam(defaultValue = "10", name = "threshold") int threshold, + @RequestParam(defaultValue = "99", name = "whitePercent") int whitePercent) throws IOException, InterruptedException { PDDocument document = null; try { document = PDDocument.load(inputFile.getInputStream()); @@ -38,62 +45,67 @@ public class BlankPageController { List pagesToKeepIndex = new ArrayList<>(); int pageIndex = 0; + PDFRenderer pdfRenderer = new PDFRenderer(document); for (PDPage page : pages) { - pageIndex++; - textStripper.setStartPage(pageIndex); - textStripper.setEndPage(pageIndex); + System.out.println("checking page " + pageIndex); + textStripper.setStartPage(pageIndex + 1); + textStripper.setEndPage(pageIndex + 1); String pageText = textStripper.getText(document); boolean hasText = !pageText.trim().isEmpty(); if (hasText) { - pagesToKeepIndex.add(pageIndex); - System.out.println("page " + pageIndex + " has text"); - continue; - } - boolean hasImages = hasImagesOnPage(page); - if (hasImages) { - pagesToKeepIndex.add(pageIndex); - System.out.println("page " + pageIndex + " has image"); - continue; - } - } - System.out.print(pagesToKeepIndex.size()); - PDDocument outputDocument = new PDDocument(); - PDFRenderer pdfRenderer = new PDFRenderer(document); - for (Integer i : pagesToKeepIndex) { - // Create temp file to save the image - Path tempFile = Files.createTempFile("image_", ".png"); - - // Render image and save as temp file - BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300); - ImageIO.write(image, "png", tempFile.toFile()); - - List command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString())); - - // Run CLI command - int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command); - - //does contain data - if(returnCode ==0) { - 
outputDocument.addPage(document.getPage(i - 1)); + pagesToKeepIndex.add(pageIndex); + System.out.println("page " + pageIndex + " has text"); } else { - System.out.print("Found blank page skipping, page #" + i); + boolean hasImages = hasImagesOnPage(page); + if (hasImages) { + System.out.println("page " + pageIndex + " has image"); + + Path tempFile = Files.createTempFile("image_", ".png"); + + // Render image and save as temp file + BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 300); + ImageIO.write(image, "png", tempFile.toFile()); + + List command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString() ,"--threshold", String.valueOf(threshold), "--white_percent", String.valueOf(whitePercent))); + + // Run CLI command + int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command); + + // does contain data + if (returnCode == 0) { + System.out.println("page " + pageIndex + " has image which is not blank"); + pagesToKeepIndex.add(pageIndex); + } else { + System.out.println("Skipping, Image was blank for page #" + pageIndex); + } + } + } + pageIndex++; + + } + System.out.print("pagesToKeep=" + pagesToKeepIndex.size()); + + // Remove pages not present in pagesToKeepIndex + List pageIndices = IntStream.range(0, pages.getCount()).boxed().collect(Collectors.toList()); + Collections.reverse(pageIndices); // Reverse to prevent index shifting during removal + for (Integer i : pageIndices) { + if (!pagesToKeepIndex.contains(i)) { + pages.remove(i); } } - - - - return PdfUtils.pdfDocToWebResponse(outputDocument, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf"); + return PdfUtils.pdfDocToWebResponse(document, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf"); } catch (IOException e) { e.printStackTrace(); return new 
ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR); } finally { - if(document != null) + if (document != null) document.close(); } } - + + private static boolean hasImagesOnPage(PDPage page) throws IOException { ImageFinder imageFinder = new ImageFinder(page); imageFinder.processPage(page); diff --git a/src/main/java/stirling/software/SPDF/controller/api/other/ExtractImageScansController.java b/src/main/java/stirling/software/SPDF/controller/api/other/ExtractImageScansController.java index 89ca7064..1f6d1c5a 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/other/ExtractImageScansController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/other/ExtractImageScansController.java @@ -101,8 +101,18 @@ public class ExtractImageScansController { for (int i = 0; i < images.size(); i++) { Path tempDir = Files.createTempDirectory("openCV_output"); - List command = new ArrayList<>(Arrays.asList("python3", "./scripts/split_photos.py", images.get(i), tempDir.toString(), String.valueOf(angleThreshold), - String.valueOf(tolerance), String.valueOf(minArea), String.valueOf(minContourArea), String.valueOf(borderSize))); + List command = new ArrayList<>(Arrays.asList( + "python3", + "./scripts/split_photos.py", + images.get(i), + tempDir.toString(), + "--angle_threshold", String.valueOf(angleThreshold), + "--tolerance", String.valueOf(tolerance), + "--min_area", String.valueOf(minArea), + "--min_contour_area", String.valueOf(minContourArea), + "--border_size", String.valueOf(borderSize) + )); + // Run CLI command int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command); diff --git a/src/main/resources/templates/other/remove-blanks.html b/src/main/resources/templates/other/remove-blanks.html index e6d99d4b..2118755a 100644 --- a/src/main/resources/templates/other/remove-blanks.html +++ b/src/main/resources/templates/other/remove-blanks.html @@ -16,6 +16,16 @@
+
+ + + Threshold for determining how white a white pixel must be +
+
+ + + Percent of page that must be white to be removed +