remove blanks
This commit is contained in:
parent
cc919ea614
commit
0b1cdf6a68
5 changed files with 116 additions and 35 deletions
39
scripts/detect-blank-pages.sh
Normal file
39
scripts/detect-blank-pages.sh
Normal file
|
@ -0,0 +1,39 @@
|
|||
import cv2
|
||||
import numpy as np
|
||||
import sys
|
||||
|
||||
def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5, blank_percentage=99):
    """Decide whether the image at ``image_path`` is a blank (almost all white) page.

    The image is read as grayscale, smoothed with a Gaussian blur and then
    binarized: pixels brighter than ``white_value - threshold`` become
    ``white_value`` and all others become 0. The page counts as blank when
    more than ``blank_percentage`` percent of its pixels end up white.

    Args:
        image_path: Path of the image file to inspect.
        threshold: How far below ``white_value`` a pixel may be and still be
            counted as white.
        white_value: Grayscale value treated as pure white.
        blur_size: Side length of the square Gaussian kernel. OpenCV requires
            this to be a positive odd number.
        blank_percentage: Percentage of white pixels that must be exceeded for
            the image to be reported as blank.

    Returns:
        True when the image is considered blank. False otherwise, including
        when the file cannot be read (an error line is printed in that case).
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    # cv2.imread signals an unreadable file by returning None instead of raising.
    if image is None:
        print(f"Error: Unable to read the image file: {image_path}")
        return False

    # Apply Gaussian blur to reduce noise so isolated specks do not count as content.
    blurred_image = cv2.GaussianBlur(image, (blur_size, blur_size), 0)

    _, thresholded_image = cv2.threshold(blurred_image, white_value - threshold, white_value, cv2.THRESH_BINARY)

    # Calculate the percentage of white pixels in the thresholded image.
    white_pixels = np.count_nonzero(thresholded_image == white_value)
    white_pixel_percentage = (white_pixels / thresholded_image.size) * 100

    return white_pixel_percentage > blank_percentage
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit-status contract of this script:
    #   0 - the image is not considered blank
    #   1 - the image is considered blank
    #   2 - wrong number of command-line arguments
    # A usage error gets its own status so it cannot be mistaken for a
    # "blank page" result.
    if len(sys.argv) != 2:
        # Report the name the script was actually started with rather than a
        # hard-coded file name that can drift from the real one.
        print(f"Usage: python {sys.argv[0]} <image_path>")
        sys.exit(2)

    sys.exit(1 if is_blank_image(sys.argv[1]) else 0)
|
|
@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.other;
|
|||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||
import org.apache.pdfbox.text.PDFTextStripper;
|
||||
import org.springframework.http.HttpStatus;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
|
@ -10,38 +11,30 @@ import org.springframework.web.bind.annotation.RequestPart;
|
|||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
import stirling.software.SPDF.utils.ImageFinder;
|
||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||
|
||||
import java.awt.image.BufferedImage;
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.List;
|
||||
|
||||
import javax.imageio.ImageIO;
|
||||
|
||||
@RestController
|
||||
public class BlankPageController {
|
||||
|
||||
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException {
|
||||
boolean removeNoText = false;
|
||||
boolean removeNoTextOrImages = false;
|
||||
|
||||
if(processType == 0) {
|
||||
removeNoText = true;
|
||||
} else if (processType == 1) {
|
||||
removeNoTextOrImages = true;
|
||||
} else if (processType == 2) {
|
||||
//run OCR
|
||||
OCRController ocr = new OCRController();
|
||||
ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
|
||||
|
||||
removeNoText = true;
|
||||
}
|
||||
|
||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
|
||||
try {
|
||||
PDDocument document = PDDocument.load(inputFile.getInputStream());
|
||||
PDPageTree pages = document.getDocumentCatalog().getPages();
|
||||
PDFTextStripper textStripper = new PDFTextStripper();
|
||||
|
||||
List<PDPage> pagesToKeep = new ArrayList<>();
|
||||
List<Integer> pagesToKeepIndex = new ArrayList<>();
|
||||
int pageIndex = 0;
|
||||
|
||||
for (PDPage page : pages) {
|
||||
|
@ -50,28 +43,40 @@ public class BlankPageController {
|
|||
textStripper.setEndPage(pageIndex);
|
||||
String pageText = textStripper.getText(document);
|
||||
boolean hasText = !pageText.trim().isEmpty();
|
||||
boolean hasImages = hasImagesOnPage(page);
|
||||
|
||||
if (removeNoText && removeNoTextOrImages) {
|
||||
if (hasText || hasImages) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else if (removeNoText) {
|
||||
if (hasText) {
|
||||
pagesToKeep.add(page);
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
System.out.print("page " + pageIndex + " has text");
|
||||
continue;
|
||||
}
|
||||
} else if (removeNoTextOrImages) {
|
||||
if (hasText && hasImages) {
|
||||
pagesToKeep.add(page);
|
||||
}
|
||||
} else {
|
||||
pagesToKeep.add(page);
|
||||
boolean hasImages = hasImagesOnPage(page);
|
||||
if (hasImages) {
|
||||
pagesToKeepIndex.add(pageIndex);
|
||||
System.out.print("page " + pageIndex + " has image");
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
System.out.print(pagesToKeepIndex.size());
|
||||
PDDocument outputDocument = new PDDocument();
|
||||
for (PDPage page : pagesToKeep) {
|
||||
outputDocument.addPage(page);
|
||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||
for (Integer i : pagesToKeepIndex) {
|
||||
// Create temp file to save the image
|
||||
Path tempFile = Files.createTempFile("image_", ".png");
|
||||
|
||||
// Render image and save as temp file
|
||||
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
|
||||
ImageIO.write(image, "png", tempFile.toFile());
|
||||
|
||||
List<String> command = new ArrayList<>(Arrays.asList("python3", "/scripts/detect-blank-pages.py", tempFile.toString()));
|
||||
|
||||
// Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||
|
||||
//does contain data
|
||||
if(returnCode ==0) {
|
||||
outputDocument.addPage(document.getPage(i - 1));
|
||||
} else {
|
||||
System.out.print("Found blank page skipping, page #" + i);
|
||||
}
|
||||
}
|
||||
|
||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||
|
|
|
@ -95,4 +95,11 @@ public class OtherWebController {
|
|||
return "other/repair";
|
||||
}
|
||||
|
||||
@GetMapping("/remove-blanks")
|
||||
@Hidden
|
||||
public String removeBlanksForm(Model model) {
|
||||
model.addAttribute("currentPage", "remove-blanks");
|
||||
return "other/remove-blanks";
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -112,6 +112,8 @@ filter: invert(0.2) sepia(2) saturate(50) hue-rotate(190deg);
|
|||
<div th:replace="~{fragments/card :: card(cardTitle=#{home.flatten.title}, cardText=#{home.flatten.desc}, cardLink='flatten', svgPath='images/flatten.svg')}"></div>
|
||||
|
||||
<div th:replace="~{fragments/card :: card(cardTitle=#{home.repair.title}, cardText=#{home.repair.desc}, cardLink='repair', svgPath='images/wrench.svg')}"></div>
|
||||
<div th:replace="~{fragments/card :: card(cardTitle=#{home.removeBlanks.title}, cardText=#{home.removeBlanks.desc}, cardLink='remove-blanks', svgPath='images/wrench.svg')}"></div>
|
||||
<div th:replace="~{fragments/card :: card(cardTitle=#{home.compare.title}, cardText=#{home.compare.desc}, cardLink='compare', svgPath='images/wrench.svg')}"></div>
|
||||
</div>
|
||||
</div>
|
||||
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||
|
|
28
src/main/resources/templates/other/remove-blanks.html
Normal file
28
src/main/resources/templates/other/remove-blanks.html
Normal file
|
@ -0,0 +1,28 @@
|
|||
<!DOCTYPE html>
|
||||
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
|
||||
|
||||
<th:block th:insert="~{fragments/common :: head(title=#{removeBlanks.title})}"></th:block>
|
||||
|
||||
|
||||
<body>
|
||||
<div id="page-container">
|
||||
<div id="content-wrap">
|
||||
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
|
||||
<br> <br>
|
||||
<div class="container">
|
||||
<div class="row justify-content-center">
|
||||
<div class="col-md-6">
|
||||
<h2 th:text="#{removeBlanks.header}"></h2>
|
||||
|
||||
<form id="multiPdfForm" th:action="@{remove-blanks}" method="post" enctype="multipart/form-data">
|
||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{removeBlanks.submit}"></button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
Loading…
Reference in a new issue