ocr remove images

This commit is contained in:
Anthony Stirling 2023-05-01 21:53:10 +01:00
parent 30c56a0ec9
commit 9af537c985
10 changed files with 76 additions and 4 deletions

View file

@ -47,7 +47,8 @@ public class OCRController {
@RequestParam("languages") List<String> selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar, @RequestParam("languages") List<String> selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar,
@RequestParam(name = "deskew", required = false) Boolean deskew, @RequestParam(name = "clean", required = false) Boolean clean, @RequestParam(name = "deskew", required = false) Boolean deskew, @RequestParam(name = "clean", required = false) Boolean clean,
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal, @RequestParam(name = "ocrType", required = false) String ocrType, @RequestParam(name = "clean-final", required = false) Boolean cleanFinal, @RequestParam(name = "ocrType", required = false) String ocrType,
@RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType) @RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType,
@RequestParam(name = "removeImagesAfter", required = false) Boolean removeImagesAfter)
throws IOException, InterruptedException { throws IOException, InterruptedException {
// --output-type pdfa // --output-type pdfa
@ -114,11 +115,24 @@ public class OCRController {
// Run CLI command // Run CLI command
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command); int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
// Remove images from the OCR processed PDF if the flag is set to true
if (removeImagesAfter != null && removeImagesAfter) {
Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
List<String> gsCommand = Arrays.asList("gs", "-sDEVICE=pdfwrite", "-dFILTERIMAGE", "-o", tempPdfWithoutImages.toString(), tempOutputFile.toString());
int gsReturnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(gsCommand);
tempOutputFile = tempPdfWithoutImages;
}
// Read the OCR processed PDF file // Read the OCR processed PDF file
byte[] pdfBytes = Files.readAllBytes(tempOutputFile); byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
// Clean up the temporary files // Clean up the temporary files
Files.delete(tempInputFile); Files.delete(tempInputFile);
// Return the OCR processed PDF as a response // Return the OCR processed PDF as a response
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf"; String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";

View file

@ -72,4 +72,13 @@ public class OtherWebController {
model.addAttribute("currentPage", "add-image"); model.addAttribute("currentPage", "add-image");
return "other/add-image"; return "other/add-image";
} }
@GetMapping("/adjust-contrast")
@Hidden
public String contrast(Model model) {
model.addAttribute("currentPage", "adjust-contrast");
return "other/adjust-contrast";
}
} }

View file

@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore;
public class ProcessExecutor { public class ProcessExecutor {
public enum Processes { public enum Processes {
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT
} }
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>(); private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
@ -24,6 +24,7 @@ public class ProcessExecutor {
case LIBRE_OFFICE -> 1; case LIBRE_OFFICE -> 1;
case OCR_MY_PDF -> 2; case OCR_MY_PDF -> 2;
case PYTHON_OPENCV -> 8; case PYTHON_OPENCV -> 8;
case GHOSTSCRIPT -> 16;
}; };
return new ProcessExecutor(semaphoreLimit); return new ProcessExecutor(semaphoreLimit);
}); });

View file

@ -149,6 +149,8 @@ ocr.selectText.7=\u0641\u0631\u0636 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\
ocr.selectText.8=\u0639\u0627\u062F\u064A (\u062E\u0637\u0623 \u0625\u0630\u0627 \u0643\u0627\u0646 PDF \u064A\u062D\u062A\u0648\u064A \u0639\u0644\u0649 \u0646\u0635) ocr.selectText.8=\u0639\u0627\u062F\u064A (\u062E\u0637\u0623 \u0625\u0630\u0627 \u0643\u0627\u0646 PDF \u064A\u062D\u062A\u0648\u064A \u0639\u0644\u0649 \u0646\u0635)
ocr.selectText.9=\u0625\u0639\u062F\u0627\u062F\u0627\u062A \u0625\u0636\u0627\u0641\u064A\u0629 ocr.selectText.9=\u0625\u0639\u062F\u0627\u062F\u0627\u062A \u0625\u0636\u0627\u0641\u064A\u0629
ocr.selectText.10=\u0648\u0636\u0639 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641 ocr.selectText.10=\u0648\u0636\u0639 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641
ocr.selectText.11 = إزالة الصور بعد التعرف الضوئي على الحروف (يزيل كل الصور ، يكون مفيدًا فقط إذا كان جزءًا من خطوة التحويل)
ocr.selectText.12 = نوع العرض (متقدم)
ocr.help=\u064A\u0631\u062C\u0649 \u0642\u0631\u0627\u0621\u0629 \u0647\u0630\u0647 \u0627\u0644\u0648\u062B\u0627\u0626\u0642 \u062D\u0648\u0644 \u0643\u064A\u0641\u064A\u0629 \u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0647\u0630\u0627 \u0644\u0644\u063A\u0627\u062A \u0623\u062E\u0631\u0649 \u0648 / \u0623\u0648 \u0627\u0644\u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0644\u064A\u0633 \u0641\u064A \u0639\u0627\u0645\u0644 \u0627\u0644\u0625\u0631\u0633\u0627\u0621 ocr.help=\u064A\u0631\u062C\u0649 \u0642\u0631\u0627\u0621\u0629 \u0647\u0630\u0647 \u0627\u0644\u0648\u062B\u0627\u0626\u0642 \u062D\u0648\u0644 \u0643\u064A\u0641\u064A\u0629 \u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0647\u0630\u0627 \u0644\u0644\u063A\u0627\u062A \u0623\u062E\u0631\u0649 \u0648 / \u0623\u0648 \u0627\u0644\u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0644\u064A\u0633 \u0641\u064A \u0639\u0627\u0645\u0644 \u0627\u0644\u0625\u0631\u0633\u0627\u0621
ocr.credit=\u062A\u0633\u062A\u062E\u062F\u0645 \u0647\u0630\u0647 \u0627\u0644\u062E\u062F\u0645\u0629 OCRmyPDF \u0648 Tesseract \u0644 OCR. ocr.credit=\u062A\u0633\u062A\u062E\u062F\u0645 \u0647\u0630\u0647 \u0627\u0644\u062E\u062F\u0645\u0629 OCRmyPDF \u0648 Tesseract \u0644 OCR.
ocr.submit=\u0645\u0639\u0627\u0644\u062C\u0629 PDF \u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645 OCR ocr.submit=\u0645\u0639\u0627\u0644\u062C\u0629 PDF \u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645 OCR

View file

@ -142,6 +142,8 @@ ocr.selectText.7=OCR erzwingen, OCR wird jede Seite entfernen und alle ursprüng
ocr.selectText.8=Normal (Fehler, wenn PDF Text enthält) ocr.selectText.8=Normal (Fehler, wenn PDF Text enthält)
ocr.selectText.9=Zusätzliche Einstellungen ocr.selectText.9=Zusätzliche Einstellungen
ocr.selectText.10=OCR-Modus ocr.selectText.10=OCR-Modus
ocr.selectText.11=Bilder nach OCR entfernen (Entfernt ALLE Bilder, nur sinnvoll, wenn Teil des Konvertierungsschritts)
ocr.selectText.12=Rendertyp (Erweitert)
ocr.help=Bitte lesen Sie diese Dokumentation, um zu erfahren, wie Sie dies für andere Sprachen verwenden und/oder nicht in Docker verwenden können ocr.help=Bitte lesen Sie diese Dokumentation, um zu erfahren, wie Sie dies für andere Sprachen verwenden und/oder nicht in Docker verwenden können
ocr.credit=Dieser Dienst verwendet OCRmyPDF und Tesseract für OCR. ocr.credit=Dieser Dienst verwendet OCRmyPDF und Tesseract für OCR.
ocr.submit=PDF mit OCR verarbeiten ocr.submit=PDF mit OCR verarbeiten

View file

@ -148,6 +148,8 @@ ocr.selectText.7=Force OCR, will OCR Every page removing all original text eleme
ocr.selectText.8=Normal (Will error if PDF contains text) ocr.selectText.8=Normal (Will error if PDF contains text)
ocr.selectText.9=Additional Settings ocr.selectText.9=Additional Settings
ocr.selectText.10=OCR Mode ocr.selectText.10=OCR Mode
ocr.selectText.11=Remove images after OCR (Removes ALL images, only useful if part of conversion step)
ocr.selectText.12=Render Type (Advanced)
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR. ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
ocr.submit=Process PDF with OCR ocr.submit=Process PDF with OCR

View file

@ -144,6 +144,8 @@ ocr.selectText.7=Fuerza OCR, OCR eliminará en cada página todo el texto origin
ocr.selectText.8=Normal (Se producirá un error si el PDF contiene texto) ocr.selectText.8=Normal (Se producirá un error si el PDF contiene texto)
ocr.selectText.9=Ajustes Adicionales ocr.selectText.9=Ajustes Adicionales
ocr.selectText.10=Modo OCR ocr.selectText.10=Modo OCR
ocr.selectText.11=Eliminar imágenes después de OCR (Elimina TODAS las imágenes, solo es útil si es parte del paso de conversión)
ocr.selectText.12=Tipo de procesamiento (avanzado)
ocr.help=Lea esta documentación sobre cómo usar esto para otros idiomas y/o no usarlo en docker ocr.help=Lea esta documentación sobre cómo usar esto para otros idiomas y/o no usarlo en docker
ocr.credit=Este servicio utiliza OCRmyPDF y Tesseract para OCR. ocr.credit=Este servicio utiliza OCRmyPDF y Tesseract para OCR.
ocr.submit=Procesa PDF con OCR ocr.submit=Procesa PDF con OCR

View file

@ -148,6 +148,8 @@ ocr.selectText.7=Forcer l'OCR, OCR chaque page supprimera tous les éléments de
ocr.selectText.8=Normal (Erreur si le PDF contient du texte) ocr.selectText.8=Normal (Erreur si le PDF contient du texte)
ocr.selectText.9=Paramètres supplémentaires ocr.selectText.9=Paramètres supplémentaires
ocr.selectText.10=Mode ROC ocr.selectText.10=Mode ROC
ocr.selectText.11=Supprimer les images après l'OCR (Supprime TOUTES les images, utile uniquement si elles font partie de l'étape de conversion)
ocr.selectText.12=Type de rendu (avancé)
ocr.help=Veuillez lire cette documentation pour savoir comment l'utiliser pour d'autres langues et/ou une utilisation non dans docker ocr.help=Veuillez lire cette documentation pour savoir comment l'utiliser pour d'autres langues et/ou une utilisation non dans docker
ocr.credit=Ce service utilise OCRmyPDF et Tesseract pour l'OCR. ocr.credit=Ce service utilise OCRmyPDF et Tesseract pour l'OCR.
ocr.submit=Traiter PDF avec OCR ocr.submit=Traiter PDF avec OCR

View file

@ -0,0 +1,32 @@
<!DOCTYPE html>
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
<th:block th:insert="~{fragments/common :: head(title=#{extractImages.title})}"></th:block>
<body>
<div id="page-container">
<div id="content-wrap">
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
<br> <br>
<div class="container">
<div class="row justify-content-center">
<div class="col-md-6">
<h2 th:text="#{extractImages.header}"></h2>
<form id="multiPdfForm" th:action="@{adjust-contrast}" method="post" enctype="multipart/form-data">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<div class="form-group">
<label for="contrastRange">Contrast</label>
<input name="contrastRange" type="range" class="form-control-range" id="contrastRange" min="-100" max="100" value="0" step="1">
</div>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{extractImages.submit}"></button>
</form>
</div>
</div>
</div>
</div>
<div th:insert="~{fragments/footer.html :: footer}"></div>
</div>
</body>
</html>

View file

@ -53,8 +53,14 @@
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" /> <input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label> <label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
</div> </div>
<div class="form-check">
<input type="checkbox" class="form-check-input" name="removeImagesAfter" id="removeImagesAfter" />
<label class="form-check-label" for="removeImagesAfter" th:text="#{ocr.selectText.11}"></label>
</div>
<div class="form-group"> <div class="form-group">
<label>Render Type (Advanced)</label> <label th:text="#{ocr.selectText.12}"></label>
<select class="form-control" name="ocrRenderType"> <select class="form-control" name="ocrRenderType">
<option value="hocr">HOCR (Latin/Roman alphabet only)</option> <option value="hocr">HOCR (Latin/Roman alphabet only)</option>
<option value="sandwich">Sandwich</option> <option value="sandwich">Sandwich</option>