Changes to blank detection, WIP for configurable white-percentage threshold
This commit is contained in:
parent
2d42ae9a36
commit
a647347e10
4 changed files with 85 additions and 52 deletions
|
@ -1,8 +1,9 @@
|
||||||
import cv2
|
import cv2
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import sys
|
import sys
|
||||||
|
import argparse
|
||||||
|
|
||||||
def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5):
|
def is_blank_image(image_path, threshold=10, white_percent=99, white_value=255, blur_size=5):
|
||||||
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
|
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
|
||||||
|
|
||||||
if image is None:
|
if image is None:
|
||||||
|
@ -19,17 +20,17 @@ def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5):
|
||||||
total_pixels = thresholded_image.size
|
total_pixels = thresholded_image.size
|
||||||
white_pixel_percentage = (white_pixels / total_pixels) * 100
|
white_pixel_percentage = (white_pixels / total_pixels) * 100
|
||||||
|
|
||||||
return white_pixel_percentage > 99
|
return white_pixel_percentage > white_percent
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
if len(sys.argv) != 2:
|
parser = argparse.ArgumentParser(description='Detect if an image is considered blank or not.')
|
||||||
print("Usage: python detect_blank_page.py <image_path>")
|
parser.add_argument('image_path', help='The path to the image file.')
|
||||||
sys.exit(1)
|
parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for determining white pixels. The default value is 10.')
|
||||||
|
parser.add_argument('-w', '--white_percent', type=int, default=99, help='The percentage of white pixels for an image to be considered blank. The default value is 99.')
|
||||||
|
args = parser.parse_args()
|
||||||
|
|
||||||
image_path = sys.argv[1]
|
blank = is_blank_image(args.image_path, args.threshold, args.white_percent)
|
||||||
blank = is_blank_image(image_path)
|
|
||||||
|
|
||||||
if blank:
|
if blank:
|
||||||
# Return code 1: The image is considered blank.
|
# Return code 1: The image is considered blank.
|
||||||
|
|
|
@ -1,11 +1,15 @@
|
||||||
package stirling.software.SPDF.controller.api.other;
|
package stirling.software.SPDF.controller.api.other;
|
||||||
|
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.nio.file.Path;
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
|
import java.util.Collections;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.stream.IntStream;
|
||||||
|
|
||||||
import javax.imageio.ImageIO;
|
import javax.imageio.ImageIO;
|
||||||
|
|
||||||
|
@ -17,6 +21,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.springframework.http.HttpStatus;
|
import org.springframework.http.HttpStatus;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.web.bind.annotation.PostMapping;
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
import org.springframework.web.bind.annotation.RequestPart;
|
import org.springframework.web.bind.annotation.RequestPart;
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
@ -29,7 +34,9 @@ import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
public class BlankPageController {
|
public class BlankPageController {
|
||||||
|
|
||||||
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
||||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
|
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
|
||||||
|
@RequestParam(defaultValue = "10", name = "threshold") int threshold,
|
||||||
|
@RequestParam(defaultValue = "99", name = "whitePercent") int whitePercent) throws IOException, InterruptedException {
|
||||||
PDDocument document = null;
|
PDDocument document = null;
|
||||||
try {
|
try {
|
||||||
document = PDDocument.load(inputFile.getInputStream());
|
document = PDDocument.load(inputFile.getInputStream());
|
||||||
|
@ -38,62 +45,67 @@ public class BlankPageController {
|
||||||
|
|
||||||
List<Integer> pagesToKeepIndex = new ArrayList<>();
|
List<Integer> pagesToKeepIndex = new ArrayList<>();
|
||||||
int pageIndex = 0;
|
int pageIndex = 0;
|
||||||
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
|
|
||||||
for (PDPage page : pages) {
|
for (PDPage page : pages) {
|
||||||
pageIndex++;
|
System.out.println("checking page " + pageIndex);
|
||||||
textStripper.setStartPage(pageIndex);
|
textStripper.setStartPage(pageIndex + 1);
|
||||||
textStripper.setEndPage(pageIndex);
|
textStripper.setEndPage(pageIndex + 1);
|
||||||
String pageText = textStripper.getText(document);
|
String pageText = textStripper.getText(document);
|
||||||
boolean hasText = !pageText.trim().isEmpty();
|
boolean hasText = !pageText.trim().isEmpty();
|
||||||
if (hasText) {
|
if (hasText) {
|
||||||
pagesToKeepIndex.add(pageIndex);
|
pagesToKeepIndex.add(pageIndex);
|
||||||
System.out.println("page " + pageIndex + " has text");
|
System.out.println("page " + pageIndex + " has text");
|
||||||
continue;
|
|
||||||
}
|
|
||||||
boolean hasImages = hasImagesOnPage(page);
|
|
||||||
if (hasImages) {
|
|
||||||
pagesToKeepIndex.add(pageIndex);
|
|
||||||
System.out.println("page " + pageIndex + " has image");
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
System.out.print(pagesToKeepIndex.size());
|
|
||||||
PDDocument outputDocument = new PDDocument();
|
|
||||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
|
||||||
for (Integer i : pagesToKeepIndex) {
|
|
||||||
// Create temp file to save the image
|
|
||||||
Path tempFile = Files.createTempFile("image_", ".png");
|
|
||||||
|
|
||||||
// Render image and save as temp file
|
|
||||||
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
|
|
||||||
ImageIO.write(image, "png", tempFile.toFile());
|
|
||||||
|
|
||||||
List<String> command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString()));
|
|
||||||
|
|
||||||
// Run CLI command
|
|
||||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
|
||||||
|
|
||||||
//does contain data
|
|
||||||
if(returnCode ==0) {
|
|
||||||
outputDocument.addPage(document.getPage(i - 1));
|
|
||||||
} else {
|
} else {
|
||||||
System.out.print("Found blank page skipping, page #" + i);
|
boolean hasImages = hasImagesOnPage(page);
|
||||||
|
if (hasImages) {
|
||||||
|
System.out.println("page " + pageIndex + " has image");
|
||||||
|
|
||||||
|
Path tempFile = Files.createTempFile("image_", ".png");
|
||||||
|
|
||||||
|
// Render image and save as temp file
|
||||||
|
BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 300);
|
||||||
|
ImageIO.write(image, "png", tempFile.toFile());
|
||||||
|
|
||||||
|
List<String> command = new ArrayList<>(Arrays.asList("python3", System.getProperty("user.dir") + "scripts/detect-blank-pages.py", tempFile.toString() ,"--threshold", String.valueOf(threshold), "--white_percent", String.valueOf(whitePercent)));
|
||||||
|
|
||||||
|
// Run CLI command
|
||||||
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
// does contain data
|
||||||
|
if (returnCode == 0) {
|
||||||
|
System.out.println("page " + pageIndex + " has image which is not blank");
|
||||||
|
pagesToKeepIndex.add(pageIndex);
|
||||||
|
} else {
|
||||||
|
System.out.println("Skipping, Image was blank for page #" + pageIndex);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageIndex++;
|
||||||
|
|
||||||
|
}
|
||||||
|
System.out.print("pagesToKeep=" + pagesToKeepIndex.size());
|
||||||
|
|
||||||
|
// Remove pages not present in pagesToKeepIndex
|
||||||
|
List<Integer> pageIndices = IntStream.range(0, pages.getCount()).boxed().collect(Collectors.toList());
|
||||||
|
Collections.reverse(pageIndices); // Reverse to prevent index shifting during removal
|
||||||
|
for (Integer i : pageIndices) {
|
||||||
|
if (!pagesToKeepIndex.contains(i)) {
|
||||||
|
pages.remove(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return PdfUtils.pdfDocToWebResponse(document, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf");
|
||||||
|
|
||||||
|
|
||||||
return PdfUtils.pdfDocToWebResponse(outputDocument, inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_blanksRemoved.pdf");
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
e.printStackTrace();
|
e.printStackTrace();
|
||||||
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
|
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
|
||||||
} finally {
|
} finally {
|
||||||
if(document != null)
|
if (document != null)
|
||||||
document.close();
|
document.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private static boolean hasImagesOnPage(PDPage page) throws IOException {
|
private static boolean hasImagesOnPage(PDPage page) throws IOException {
|
||||||
ImageFinder imageFinder = new ImageFinder(page);
|
ImageFinder imageFinder = new ImageFinder(page);
|
||||||
imageFinder.processPage(page);
|
imageFinder.processPage(page);
|
||||||
|
|
|
@ -101,8 +101,18 @@ public class ExtractImageScansController {
|
||||||
for (int i = 0; i < images.size(); i++) {
|
for (int i = 0; i < images.size(); i++) {
|
||||||
|
|
||||||
Path tempDir = Files.createTempDirectory("openCV_output");
|
Path tempDir = Files.createTempDirectory("openCV_output");
|
||||||
List<String> command = new ArrayList<>(Arrays.asList("python3", "./scripts/split_photos.py", images.get(i), tempDir.toString(), String.valueOf(angleThreshold),
|
List<String> command = new ArrayList<>(Arrays.asList(
|
||||||
String.valueOf(tolerance), String.valueOf(minArea), String.valueOf(minContourArea), String.valueOf(borderSize)));
|
"python3",
|
||||||
|
"./scripts/split_photos.py",
|
||||||
|
images.get(i),
|
||||||
|
tempDir.toString(),
|
||||||
|
"--angle_threshold", String.valueOf(angleThreshold),
|
||||||
|
"--tolerance", String.valueOf(tolerance),
|
||||||
|
"--min_area", String.valueOf(minArea),
|
||||||
|
"--min_contour_area", String.valueOf(minContourArea),
|
||||||
|
"--border_size", String.valueOf(borderSize)
|
||||||
|
));
|
||||||
|
|
||||||
|
|
||||||
// Run CLI command
|
// Run CLI command
|
||||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||||
|
|
|
@ -16,6 +16,16 @@
|
||||||
|
|
||||||
<form id="multiPdfForm" th:action="@{remove-blanks}" method="post" enctype="multipart/form-data">
|
<form id="multiPdfForm" th:action="@{remove-blanks}" method="post" enctype="multipart/form-data">
|
||||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="threshold">Threshold:</label>
|
||||||
|
<input type="number" class="form-control" id="threshold" name="threshold" value="10">
|
||||||
|
<small id="thresholdHelp" class="form-text text-muted">Threshold for determining how white a white pixel must be</small>
|
||||||
|
</div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="whitePercent">White Percent (%):</label>
|
||||||
|
<input type="number" class="form-control" id="whitePercent" name="whitePercent" value="99">
|
||||||
|
<small id="whitePercentHelp" class="form-text text-muted">Percent of page that must be white to be removed</small>
|
||||||
|
</div>
|
||||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{removeBlanks.submit}"></button>
|
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{removeBlanks.submit}"></button>
|
||||||
</form>
|
</form>
|
||||||
</div>
|
</div>
|
||||||
|
|
Loading…
Reference in a new issue