separated blank and non-blank pages and created unified ZIP archive Co-authored-by: mannam <101550345+ManoharMannam@users.noreply.github.com> Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
This commit is contained in:
parent
191e79da18
commit
ea6d4a293e
1 changed files with 53 additions and 29 deletions
|
@ -1,12 +1,12 @@
|
||||||
package stirling.software.SPDF.controller.api.misc;
|
package stirling.software.SPDF.controller.api.misc;
|
||||||
|
|
||||||
import java.awt.image.BufferedImage;
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collections;
|
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.stream.Collectors;
|
import java.util.zip.ZipEntry;
|
||||||
import java.util.stream.IntStream;
|
import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
@ -17,6 +17,7 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.http.HttpStatus;
|
import org.springframework.http.HttpStatus;
|
||||||
|
import org.springframework.http.MediaType;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||||
import org.springframework.web.bind.annotation.PostMapping;
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
@ -50,31 +51,31 @@ public class BlankPageController {
|
||||||
int threshold = request.getThreshold();
|
int threshold = request.getThreshold();
|
||||||
float whitePercent = request.getWhitePercent();
|
float whitePercent = request.getWhitePercent();
|
||||||
|
|
||||||
PDDocument document = null;
|
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
|
||||||
try {
|
|
||||||
document = Loader.loadPDF(inputFile.getBytes());
|
|
||||||
PDPageTree pages = document.getDocumentCatalog().getPages();
|
PDPageTree pages = document.getDocumentCatalog().getPages();
|
||||||
PDFTextStripper textStripper = new PDFTextStripper();
|
PDFTextStripper textStripper = new PDFTextStripper();
|
||||||
|
|
||||||
List<Integer> pagesToKeepIndex = new ArrayList<>();
|
List<PDPage> nonBlankPages = new ArrayList<>();
|
||||||
|
List<PDPage> blankPages = new ArrayList<>();
|
||||||
int pageIndex = 0;
|
int pageIndex = 0;
|
||||||
|
|
||||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
pdfRenderer.setSubsamplingAllowed(true);
|
pdfRenderer.setSubsamplingAllowed(true);
|
||||||
for (PDPage page : pages) {
|
for (PDPage page : pages) {
|
||||||
logger.info("checking page " + pageIndex);
|
logger.info("checking page {}", pageIndex);
|
||||||
textStripper.setStartPage(pageIndex + 1);
|
textStripper.setStartPage(pageIndex + 1);
|
||||||
textStripper.setEndPage(pageIndex + 1);
|
textStripper.setEndPage(pageIndex + 1);
|
||||||
String pageText = textStripper.getText(document);
|
String pageText = textStripper.getText(document);
|
||||||
boolean hasText = !pageText.trim().isEmpty();
|
boolean hasText = !pageText.trim().isEmpty();
|
||||||
|
|
||||||
Boolean blank = true;
|
boolean blank = true;
|
||||||
if (hasText) {
|
if (hasText) {
|
||||||
logger.info("page " + pageIndex + " has text, not blank");
|
logger.info("page {} has text, not blank", pageIndex);
|
||||||
blank = false;
|
blank = false;
|
||||||
} else {
|
} else {
|
||||||
boolean hasImages = PdfUtils.hasImagesOnPage(page);
|
boolean hasImages = PdfUtils.hasImagesOnPage(page);
|
||||||
if (hasImages) {
|
if (hasImages) {
|
||||||
logger.info("page " + pageIndex + " has image, running blank detection");
|
logger.info("page {} has image, running blank detection", pageIndex);
|
||||||
// Render image and save as temp file
|
// Render image and save as temp file
|
||||||
BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 30);
|
BufferedImage image = pdfRenderer.renderImageWithDPI(pageIndex, 30);
|
||||||
blank = isBlankImage(image, threshold, whitePercent, threshold);
|
blank = isBlankImage(image, threshold, whitePercent, threshold);
|
||||||
|
@ -82,34 +83,57 @@ public class BlankPageController {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (blank) {
|
if (blank) {
|
||||||
logger.info("Skipping, Image was blank for page #" + pageIndex);
|
logger.info("Skipping, Image was blank for page #{}", pageIndex);
|
||||||
|
blankPages.add(page);
|
||||||
} else {
|
} else {
|
||||||
logger.info("page " + pageIndex + " has image which is not blank");
|
logger.info("page {} has image which is not blank", pageIndex);
|
||||||
pagesToKeepIndex.add(pageIndex);
|
nonBlankPages.add(page);
|
||||||
}
|
}
|
||||||
|
|
||||||
pageIndex++;
|
pageIndex++;
|
||||||
}
|
}
|
||||||
// Remove pages not present in pagesToKeepIndex
|
|
||||||
List<Integer> pageIndices =
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
IntStream.range(0, pages.getCount()).boxed().collect(Collectors.toList());
|
ZipOutputStream zos = new ZipOutputStream(baos);
|
||||||
Collections.reverse(pageIndices); // Reverse to prevent index shifting during removal
|
|
||||||
for (Integer i : pageIndices) {
|
String filename =
|
||||||
if (!pagesToKeepIndex.contains(i)) {
|
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
||||||
pages.remove(i);
|
.replaceFirst("[.][^.]+$", "");
|
||||||
}
|
|
||||||
|
if (!nonBlankPages.isEmpty()) {
|
||||||
|
createZipEntry(zos, nonBlankPages, filename + "_nonBlankPages.pdf");
|
||||||
|
} else {
|
||||||
|
createZipEntry(zos, blankPages, filename + "_allBlankPages.pdf");
|
||||||
}
|
}
|
||||||
|
|
||||||
return WebResponseUtils.pdfDocToWebResponse(
|
if (!nonBlankPages.isEmpty() && !blankPages.isEmpty()) {
|
||||||
document,
|
createZipEntry(zos, blankPages, filename + "_blankPages.pdf");
|
||||||
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
|
}
|
||||||
.replaceFirst("[.][^.]+$", "")
|
|
||||||
+ "_blanksRemoved.pdf");
|
zos.close();
|
||||||
|
|
||||||
|
logger.info("Returning ZIP file: {}", filename + "_processed.zip");
|
||||||
|
return WebResponseUtils.boasToWebResponse(
|
||||||
|
baos, filename + "_processed.zip", MediaType.APPLICATION_OCTET_STREAM);
|
||||||
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
logger.error("exception", e);
|
logger.error("exception", e);
|
||||||
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
|
return new ResponseEntity<>(HttpStatus.INTERNAL_SERVER_ERROR);
|
||||||
} finally {
|
}
|
||||||
if (document != null) document.close();
|
}
|
||||||
|
|
||||||
|
public void createZipEntry(ZipOutputStream zos, List<PDPage> pages, String entryName)
|
||||||
|
throws IOException {
|
||||||
|
try (PDDocument document = new PDDocument()) {
|
||||||
|
|
||||||
|
for (PDPage page : pages) {
|
||||||
|
document.addPage(page);
|
||||||
|
}
|
||||||
|
|
||||||
|
ZipEntry zipEntry = new ZipEntry(entryName);
|
||||||
|
zos.putNextEntry(zipEntry);
|
||||||
|
document.save(zos);
|
||||||
|
zos.closeEntry();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue