Changes to blank detection, WIP for configurable white percentage (%) threshold
This commit is contained in:
parent
2d42ae9a36
commit
a647347e10
4 changed files with 85 additions and 52 deletions
|
@ -1,8 +1,9 @@
|
|||
import cv2
|
||||
import numpy as np
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5):
|
||||
def is_blank_image(image_path, threshold=10, white_percent=99, white_value=255, blur_size=5):
|
||||
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
|
||||
|
||||
if image is None:
|
||||
|
@ -19,17 +20,17 @@ def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5):
|
|||
total_pixels = thresholded_image.size
|
||||
white_pixel_percentage = (white_pixels / total_pixels) * 100
|
||||
|
||||
return white_pixel_percentage > 99
|
||||
|
||||
return white_pixel_percentage > white_percent
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print("Usage: python detect_blank_page.py <image_path>")
|
||||
sys.exit(1)
|
||||
parser = argparse.ArgumentParser(description='Detect if an image is considered blank or not.')
|
||||
parser.add_argument('image_path', help='The path to the image file.')
|
||||
parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for determining white pixels. The default value is 10.')
|
||||
parser.add_argument('-w', '--white_percent', type=int, default=99, help='The percentage of white pixels for an image to be considered blank. The default value is 99.')
|
||||
args = parser.parse_args()
|
||||
|
||||
image_path = sys.argv[1]
|
||||
blank = is_blank_image(image_path)
|
||||
blank = is_blank_image(args.image_path, args.threshold, args.white_percent)
|
||||
|
||||
if blank:
|
||||
# Return code 1: The image is considered blank.
|
||||
|
|
|
@ -1,11 +1,15 @@
|
|||
package stirling.software.SPDF.controller.api.other;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
|
@ -17,6 +21,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
|||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestParam;
|
||||
import org.springframework.web.bind.annotation.RequestPart;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
@ -29,7 +34,9 @@ import stirling.software.SPDF.utils.ProcessExecutor;
|
|||
public class BlankPageController {
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
|
||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
|
||||
@RequestParam(defaultValue = "10", name = "threshold") int threshold,
|
||||
@RequestParam(defaultValue = "99", name = "whitePercent") int whitePercent) throws IOException, InterruptedException {
|
||||
PDDocument document = null;
|
||||
try {
|
||||
document = PDDocument.load(inputFile.getInputStream());
|
||||
|
@ -38,62 +45,67 @@ public class BlankPageController {
|
|||
|
||||
List<Integer> pagesToKeepIndex = new ArrayList<>();
|
||||
int pageIndex = 0;
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
|
||||
for (PDPage page : pages) {
|
||||
pageIndex++;
|
||||
textStripper.setStartPage(pageIndex);
|
||||
textStripper.setEndPage(pageIndex);
|
||||
System.out.println("checking page " + pageIndex);
|
||||
textStripper.setStartPage(pageIndex + 1);
|
||||
textStripper.setEndPage(pageIndex + 1);
|
||||
String pageText = textStripper.getText(document);
|
||||
boolean hasText = !pageText.trim().isEmpty();
|
||||
if (hasText) {
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
System.out.println("page " + pageIndex + " has text");
|
||||
continue;
|
||||
}
|
||||
boolean hasImages = hasImagesOnPage(page);
|
||||
if (hasImages) {
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
System.out.println("page " + pageIndex + " has image");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
System.out.print(pagesToKeepIndex.size());
|
||||
PDDocument outputDocument = new PDDocument();
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
for (Integer i : pagesToKeepIndex) {
|
||||
// Create temp file to save the image
|
||||
Path tempFile = Files.createTempFile("image_", ".png");
|
||||
|
||||
// Render image and save as temp file
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
|
||||
ImageIO.write(image, "png", tempFile.toFile());
|
||||
|
||||
List<String> command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString()));
|
||||
|
||||
// Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||
|
||||
//does contain data
|
||||
if(returnCode ==0) {
|
||||
outputDocument.addPage(document.getPage(i - 1));
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
System.out.println("page " + pageIndex + " has text");
|
||||
} else {
|
||||
System.out.print("Found blank page skipping, page #" + i);
|
||||
boolean hasImages = hasImagesOnPage(page);
|
||||
if (hasImages) {
|
||||
System.out.println("page " + pageIndex + " has image");
|
||||
|
||||
Path tempFile = Files.createTempFile("image_", ".png");
|
||||
|
||||
// Render image and save as temp file
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 300);
|
||||
ImageIO.write(image, "png", tempFile.toFile());
|
||||
|
||||
List<String> command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString() ,"--threshold", String.valueOf(threshold), "--white_percent", String.valueOf(whitePercent)));
|
||||
|
||||
// Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||
|
||||
// does contain data
|
||||
if (returnCode == 0) {
|
||||
System.out.println("page " + pageIndex + " has image which is not blank");
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
} else {
|
||||
System.out.println("Skipping, Image was blank for page #" + pageIndex);
|
||||
}
|
||||
}
|
||||
}
|
||||
pageIndex++;
|
||||
|
||||
}
|
||||
System.out.print("pagesToKeep=" + pagesToKeepIndex.size());
|
||||
|
||||
// Remove pages not present in pagesToKeepIndex
|
||||
List<Integer> pageIndices = IntStream.range(0, pages.getCount()).boxed().collect(Collectors.toList());
|
||||
Collections.reverse(pageIndices); // Reverse to prevent index shifting during removal
|
||||
for (Integer i : pageIndices) {
|
||||
if (!pagesToKeepIndex.contains(i)) {
|
||||
pages.remove(i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
return PdfUtils.pdfDocToWebResponse(outputDocument, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf");
|
||||
return PdfUtils.pdfDocToWebResponse(document, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf");
|
||||
} catch (IOException e) {
|
||||
e.printStackTrace();
|
||||
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
|
||||
} finally {
|
||||
if(document != null)
|
||||
if (document != null)
|
||||
document.close();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static boolean hasImagesOnPage(PDPage page) throws IOException {
|
||||
ImageFinder imageFinder = new ImageFinder(page);
|
||||
imageFinder.processPage(page);
|
||||
|
|
|
@ -101,8 +101,18 @@ public class ExtractImageScansController {
|
|||
for (int i = 0; i < images.size(); i++) {
|
||||
|
||||
Path tempDir = Files.createTempDirectory("openCV_output");
|
||||
List<String> command = new ArrayList<>(Arrays.asList("python3", "./scripts/split_photos.py", images.get(i), tempDir.toString(), String.valueOf(angleThreshold),
|
||||
String.valueOf(tolerance), String.valueOf(minArea), String.valueOf(minContourArea), String.valueOf(borderSize)));
|
||||
List<String> command = new ArrayList<>(Arrays.asList(
|
||||
"python3",
|
||||
"./scripts/split_photos.py",
|
||||
images.get(i),
|
||||
tempDir.toString(),
|
||||
"--angle_threshold", String.valueOf(angleThreshold),
|
||||
"--tolerance", String.valueOf(tolerance),
|
||||
"--min_area", String.valueOf(minArea),
|
||||
"--min_contour_area", String.valueOf(minContourArea),
|
||||
"--border_size", String.valueOf(borderSize)
|
||||
));
|
||||
|
||||
|
||||
// Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||
|
|
|
@ -16,6 +16,16 @@
|
|||
|
||||
<form id="multiPdfForm" th:action="@{remove-blanks}" method="post" enctype="multipart/form-data">
|
||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||
<div class="form-group">
|
||||
<label for="threshold">Threshold:</label>
|
||||
<input type="number" class="form-control" id="threshold" name="threshold" value="10">
|
||||
<small id="thresholdHelp" class="form-text text-muted">Threshold for determining how white a white pixel must be</small>
|
||||
</div>
|
||||
<div class="form-group">
|
||||
<label for="whitePercent">White Percent (%):</label>
|
||||
<input type="number" class="form-control" id="whitePercent" name="whitePercent" value="99">
|
||||
<small id="whitePercentHelp" class="form-text text-muted">Percent of page that must be white to be removed</small>
|
||||
</div>
|
||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{removeBlanks.submit}"></button>
|
||||
</form>
|
||||
</div>
|
||||
|
|
Loading…
Reference in a new issue