remove blanks
This commit is contained in:
parent
cc919ea614
commit
0b1cdf6a68
5 changed files with 116 additions and 35 deletions
39
scripts/detect-blank-pages.sh
Normal file
39
scripts/detect-blank-pages.sh
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import sys
|
||||||
|
|
||||||
|
def is_blank_image(image_path, threshold=10, white_value=255, blur_size=5, min_white_percent=99):
    """Report whether the image at ``image_path`` is almost entirely white.

    The image is read as grayscale, optionally smoothed with a Gaussian blur,
    and binarised so that every pixel brighter than ``white_value - threshold``
    becomes ``white_value``. The image counts as blank when more than
    ``min_white_percent`` percent of its pixels end up white.

    Args:
        image_path: Path of the image file to inspect.
        threshold: How far below ``white_value`` a pixel may fall and still be
            counted as white.
        white_value: Gray level treated as pure white.
        blur_size: Side length of the square Gaussian kernel. OpenCV only
            accepts positive odd kernel sizes, so an even value is rounded up
            to the next odd one and values below 2 skip the blur entirely
            (a 1x1 kernel would leave the image unchanged anyway).
        min_white_percent: Percentage of white pixels that must be exceeded
            for the image to be reported as blank.

    Returns:
        True if the image is considered blank, False otherwise. False is also
        returned when the file cannot be read.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        # An unreadable file is reported as "not blank".
        print(f"Error: Unable to read the image file: {image_path}")
        return False

    # Apply Gaussian blur to reduce noise. The kernel size is forced to be odd
    # because cv2.GaussianBlur rejects even or non-positive sizes.
    if blur_size > 1:
        kernel_size = int(blur_size) | 1
        image = cv2.GaussianBlur(image, (kernel_size, kernel_size), 0)

    _, thresholded_image = cv2.threshold(image, white_value - threshold, white_value, cv2.THRESH_BINARY)

    # Calculate the percentage of white pixels in the thresholded image
    white_pixels = np.count_nonzero(thresholded_image == white_value)
    total_pixels = thresholded_image.size
    white_pixel_percentage = (white_pixels / total_pixels) * 100

    return bool(white_pixel_percentage > min_white_percent)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line entry point: expects exactly one argument, the image path.
    if len(sys.argv) != 2:
        # Exit code 1 is reserved for "image is blank" below, so a usage error
        # gets its own code and is reported on stderr.
        print(f"Usage: python {sys.argv[0]} <image_path>", file=sys.stderr)
        sys.exit(2)

    image_path = sys.argv[1]

    # Exit status contract:
    #   1 -> the image is considered blank
    #   0 -> the image is not considered blank
    sys.exit(1 if is_blank_image(image_path) else 0)
|
|
@ -2,6 +2,7 @@ package stirling.software.SPDF.controller.api.other;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
import org.apache.pdfbox.pdmodel.PDPage;
|
import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.PDPageTree;
|
import org.apache.pdfbox.pdmodel.PDPageTree;
|
||||||
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
import org.apache.pdfbox.text.PDFTextStripper;
|
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.springframework.http.HttpStatus;
|
import org.springframework.http.HttpStatus;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
|
@ -10,38 +11,30 @@ import org.springframework.web.bind.annotation.RequestPart;
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
import stirling.software.SPDF.utils.ImageFinder;
|
import stirling.software.SPDF.utils.ImageFinder;
|
||||||
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
|
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
import java.io.ByteArrayOutputStream;
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
|
||||||
@RestController
|
@RestController
|
||||||
public class BlankPageController {
|
public class BlankPageController {
|
||||||
|
|
||||||
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
@PostMapping(consumes = "multipart/form-data", value = "/remove-blanks")
|
||||||
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @RequestPart(required = true, value = "processType") int processType) throws IOException, InterruptedException {
|
public ResponseEntity<byte[]> removeBlankPages(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile) throws IOException, InterruptedException {
|
||||||
boolean removeNoText = false;
|
|
||||||
boolean removeNoTextOrImages = false;
|
|
||||||
|
|
||||||
if(processType == 0) {
|
|
||||||
removeNoText = true;
|
|
||||||
} else if (processType == 1) {
|
|
||||||
removeNoTextOrImages = true;
|
|
||||||
} else if (processType == 2) {
|
|
||||||
//run OCR
|
|
||||||
OCRController ocr = new OCRController();
|
|
||||||
ocr.processPdfWithOCR(inputFile, Arrays.asList("eng"), false, false, true, false, "type", "hocr", false);
|
|
||||||
|
|
||||||
removeNoText = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
PDDocument document = PDDocument.load(inputFile.getInputStream());
|
PDDocument document = PDDocument.load(inputFile.getInputStream());
|
||||||
PDPageTree pages = document.getDocumentCatalog().getPages();
|
PDPageTree pages = document.getDocumentCatalog().getPages();
|
||||||
PDFTextStripper textStripper = new PDFTextStripper();
|
PDFTextStripper textStripper = new PDFTextStripper();
|
||||||
|
|
||||||
List<PDPage> pagesToKeep = new ArrayList<>();
|
List<Integer> pagesToKeepIndex = new ArrayList<>();
|
||||||
int pageIndex = 0;
|
int pageIndex = 0;
|
||||||
|
|
||||||
for (PDPage page : pages) {
|
for (PDPage page : pages) {
|
||||||
|
@ -50,28 +43,40 @@ public class BlankPageController {
|
||||||
textStripper.setEndPage(pageIndex);
|
textStripper.setEndPage(pageIndex);
|
||||||
String pageText = textStripper.getText(document);
|
String pageText = textStripper.getText(document);
|
||||||
boolean hasText = !pageText.trim().isEmpty();
|
boolean hasText = !pageText.trim().isEmpty();
|
||||||
|
if (hasText) {
|
||||||
|
pagesToKeepIndex.add(pageIndex);
|
||||||
|
System.out.print("page " + pageIndex + " has text");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
boolean hasImages = hasImagesOnPage(page);
|
boolean hasImages = hasImagesOnPage(page);
|
||||||
|
if (hasImages) {
|
||||||
if (removeNoText && removeNoTextOrImages) {
|
pagesToKeepIndex.add(pageIndex);
|
||||||
if (hasText || hasImages) {
|
System.out.print("page " + pageIndex + " has image");
|
||||||
pagesToKeep.add(page);
|
continue;
|
||||||
}
|
|
||||||
} else if (removeNoText) {
|
|
||||||
if (hasText) {
|
|
||||||
pagesToKeep.add(page);
|
|
||||||
}
|
|
||||||
} else if (removeNoTextOrImages) {
|
|
||||||
if (hasText && hasImages) {
|
|
||||||
pagesToKeep.add(page);
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
pagesToKeep.add(page);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
System.out.print(pagesToKeepIndex.size());
|
||||||
PDDocument outputDocument = new PDDocument();
|
PDDocument outputDocument = new PDDocument();
|
||||||
for (PDPage page : pagesToKeep) {
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
outputDocument.addPage(page);
|
for (Integer i : pagesToKeepIndex) {
|
||||||
|
// Create temp file to save the image
|
||||||
|
Path tempFile = Files.createTempFile("image_", ".png");
|
||||||
|
|
||||||
|
// Render image and save as temp file
|
||||||
|
BufferedImage image = pdfRenderer.renderImageWithDPI(i - 1, 300);
|
||||||
|
ImageIO.write(image, "png", tempFile.toFile());
|
||||||
|
|
||||||
|
List<String> command = new ArrayList<>(Arrays.asList("python3", "/scripts/detect-blank-pages.py", tempFile.toString()));
|
||||||
|
|
||||||
|
// Run CLI command
|
||||||
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
//does contain data
|
||||||
|
if(returnCode ==0) {
|
||||||
|
outputDocument.addPage(document.getPage(i - 1));
|
||||||
|
} else {
|
||||||
|
System.out.print("Found blank page skipping, page #" + i);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
|
||||||
|
|
|
@ -95,4 +95,11 @@ public class OtherWebController {
|
||||||
return "other/repair";
|
return "other/repair";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@GetMapping("/remove-blanks")
|
||||||
|
@Hidden
|
||||||
|
public String removeBlanksForm(Model model) {
|
||||||
|
model.addAttribute("currentPage", "remove-blanks");
|
||||||
|
return "other/remove-blanks";
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -112,6 +112,8 @@ filter: invert(0.2) sepia(2) saturate(50) hue-rotate(190deg);
|
||||||
<div th:replace="~{fragments/card :: card(cardTitle=#{home.flatten.title}, cardText=#{home.flatten.desc}, cardLink='flatten', svgPath='images/flatten.svg')}"></div>
|
<div th:replace="~{fragments/card :: card(cardTitle=#{home.flatten.title}, cardText=#{home.flatten.desc}, cardLink='flatten', svgPath='images/flatten.svg')}"></div>
|
||||||
|
|
||||||
<div th:replace="~{fragments/card :: card(cardTitle=#{home.repair.title}, cardText=#{home.repair.desc}, cardLink='repair', svgPath='images/wrench.svg')}"></div>
|
<div th:replace="~{fragments/card :: card(cardTitle=#{home.repair.title}, cardText=#{home.repair.desc}, cardLink='repair', svgPath='images/wrench.svg')}"></div>
|
||||||
|
<div th:replace="~{fragments/card :: card(cardTitle=#{home.removeBlanks.title}, cardText=#{home.removeBlanks.desc}, cardLink='remove-blanks', svgPath='images/wrench.svg')}"></div>
|
||||||
|
<div th:replace="~{fragments/card :: card(cardTitle=#{home.compare.title}, cardText=#{home.compare.desc}, cardLink='compare', svgPath='images/wrench.svg')}"></div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||||
|
|
28
src/main/resources/templates/other/remove-blanks.html
Normal file
28
src/main/resources/templates/other/remove-blanks.html
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
<!DOCTYPE html>
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">

<!--/* Shared <head> fragment, titled with the removeBlanks.title message */-->
<th:block th:insert="~{fragments/common :: head(title=#{removeBlanks.title})}"></th:block>


<body>
  <div id="page-container">
    <div id="content-wrap">
      <div th:insert="~{fragments/navbar.html :: navbar}"></div>
      <br> <br>
      <div class="container">
        <div class="row justify-content-center">
          <div class="col-md-6">
            <h2 th:text="#{removeBlanks.header}"></h2>

            <!--/* Posts the selected PDF as multipart field 'fileInput' to remove-blanks */-->
            <form id="multiPdfForm" th:action="@{remove-blanks}" method="post" enctype="multipart/form-data">
              <div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
              <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{removeBlanks.submit}"></button>
            </form>
          </div>
        </div>
      </div>
    </div>
    <div th:insert="~{fragments/footer.html :: footer}"></div>
  </div>
</body>
</html>
|
Loading…
Reference in a new issue