ocr remove images
This commit is contained in:
parent
30c56a0ec9
commit
9af537c985
10 changed files with 76 additions and 4 deletions
|
@ -47,7 +47,8 @@ public class OCRController {
|
||||||
@RequestParam("languages") List<String> selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar,
|
@RequestParam("languages") List<String> selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar,
|
||||||
@RequestParam(name = "deskew", required = false) Boolean deskew, @RequestParam(name = "clean", required = false) Boolean clean,
|
@RequestParam(name = "deskew", required = false) Boolean deskew, @RequestParam(name = "clean", required = false) Boolean clean,
|
||||||
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal, @RequestParam(name = "ocrType", required = false) String ocrType,
|
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal, @RequestParam(name = "ocrType", required = false) String ocrType,
|
||||||
@RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType)
|
@RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType,
|
||||||
|
@RequestParam(name = "removeImagesAfter", required = false) Boolean removeImagesAfter)
|
||||||
throws IOException, InterruptedException {
|
throws IOException, InterruptedException {
|
||||||
|
|
||||||
// --output-type pdfa
|
// --output-type pdfa
|
||||||
|
@ -114,11 +115,24 @@ public class OCRController {
|
||||||
// Run CLI command
|
// Run CLI command
|
||||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Remove images from the OCR processed PDF if the flag is set to true
|
||||||
|
if (removeImagesAfter != null && removeImagesAfter) {
|
||||||
|
Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
|
||||||
|
|
||||||
|
List<String> gsCommand = Arrays.asList("gs", "-sDEVICE=pdfwrite", "-dFILTERIMAGE", "-o", tempPdfWithoutImages.toString(), tempOutputFile.toString());
|
||||||
|
|
||||||
|
int gsReturnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(gsCommand);
|
||||||
|
tempOutputFile = tempPdfWithoutImages;
|
||||||
|
}
|
||||||
// Read the OCR processed PDF file
|
// Read the OCR processed PDF file
|
||||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||||
|
|
||||||
// Clean up the temporary files
|
// Clean up the temporary files
|
||||||
Files.delete(tempInputFile);
|
Files.delete(tempInputFile);
|
||||||
|
|
||||||
// Return the OCR processed PDF as a response
|
// Return the OCR processed PDF as a response
|
||||||
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
||||||
|
|
||||||
|
|
|
@ -72,4 +72,13 @@ public class OtherWebController {
|
||||||
model.addAttribute("currentPage", "add-image");
|
model.addAttribute("currentPage", "add-image");
|
||||||
return "other/add-image";
|
return "other/add-image";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Handles GET requests for {@code /adjust-contrast}.
 *
 * Stores {@code "adjust-contrast"} under the {@code currentPage} model attribute
 * and returns the view name {@code other/adjust-contrast}.
 *
 * @param model model that receives the {@code currentPage} attribute
 * @return the view name {@code other/adjust-contrast}
 */
@GetMapping("/adjust-contrast")
|
||||||
|
@Hidden
|
||||||
|
public String contrast(Model model) {
|
||||||
|
model.addAttribute("currentPage", "adjust-contrast");
|
||||||
|
return "other/adjust-contrast";
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore;
|
||||||
public class ProcessExecutor {
|
public class ProcessExecutor {
|
||||||
|
|
||||||
public enum Processes {
|
public enum Processes {
|
||||||
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV
|
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
||||||
|
@ -24,6 +24,7 @@ public class ProcessExecutor {
|
||||||
case LIBRE_OFFICE -> 1;
|
case LIBRE_OFFICE -> 1;
|
||||||
case OCR_MY_PDF -> 2;
|
case OCR_MY_PDF -> 2;
|
||||||
case PYTHON_OPENCV -> 8;
|
case PYTHON_OPENCV -> 8;
|
||||||
|
case GHOSTSCRIPT -> 16;
|
||||||
};
|
};
|
||||||
return new ProcessExecutor(semaphoreLimit);
|
return new ProcessExecutor(semaphoreLimit);
|
||||||
});
|
});
|
||||||
|
|
|
@ -149,6 +149,8 @@ ocr.selectText.7=\u0641\u0631\u0636 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\
|
||||||
ocr.selectText.8=\u0639\u0627\u062F\u064A (\u062E\u0637\u0623 \u0625\u0630\u0627 \u0643\u0627\u0646 PDF \u064A\u062D\u062A\u0648\u064A \u0639\u0644\u0649 \u0646\u0635)
|
ocr.selectText.8=\u0639\u0627\u062F\u064A (\u062E\u0637\u0623 \u0625\u0630\u0627 \u0643\u0627\u0646 PDF \u064A\u062D\u062A\u0648\u064A \u0639\u0644\u0649 \u0646\u0635)
|
||||||
ocr.selectText.9=\u0625\u0639\u062F\u0627\u062F\u0627\u062A \u0625\u0636\u0627\u0641\u064A\u0629
|
ocr.selectText.9=\u0625\u0639\u062F\u0627\u062F\u0627\u062A \u0625\u0636\u0627\u0641\u064A\u0629
|
||||||
ocr.selectText.10=\u0648\u0636\u0639 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641
|
ocr.selectText.10=\u0648\u0636\u0639 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641
|
||||||
|
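ocr.selectText.11=\u0625\u0632\u0627\u0644\u0629 \u0627\u0644\u0635\u0648\u0631 \u0628\u0639\u062F \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641 (\u064A\u0632\u064A\u0644 \u0643\u0644 \u0627\u0644\u0635\u0648\u0631 \u060C \u064A\u0643\u0648\u0646 \u0645\u0641\u064A\u062F\u064B\u0627 \u0641\u0642\u0637 \u0625\u0630\u0627 \u0643\u0627\u0646 \u062C\u0632\u0621\u064B\u0627 \u0645\u0646 \u062E\u0637\u0648\u0629 \u0627\u0644\u062A\u062D\u0648\u064A\u0644)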
|
||||||
|
ocr.selectText.12=\u0646\u0648\u0639 \u0627\u0644\u0639\u0631\u0636 (\u0645\u062A\u0642\u062F\u0645)
|
||||||
ocr.help=\u064A\u0631\u062C\u0649 \u0642\u0631\u0627\u0621\u0629 \u0647\u0630\u0647 \u0627\u0644\u0648\u062B\u0627\u0626\u0642 \u062D\u0648\u0644 \u0643\u064A\u0641\u064A\u0629 \u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0647\u0630\u0627 \u0644\u0644\u063A\u0627\u062A \u0623\u062E\u0631\u0649 \u0648 / \u0623\u0648 \u0627\u0644\u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0644\u064A\u0633 \u0641\u064A \u0639\u0627\u0645\u0644 \u0627\u0644\u0625\u0631\u0633\u0627\u0621
|
ocr.help=\u064A\u0631\u062C\u0649 \u0642\u0631\u0627\u0621\u0629 \u0647\u0630\u0647 \u0627\u0644\u0648\u062B\u0627\u0626\u0642 \u062D\u0648\u0644 \u0643\u064A\u0641\u064A\u0629 \u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0647\u0630\u0627 \u0644\u0644\u063A\u0627\u062A \u0623\u062E\u0631\u0649 \u0648 / \u0623\u0648 \u0627\u0644\u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0644\u064A\u0633 \u0641\u064A \u0639\u0627\u0645\u0644 \u0627\u0644\u0625\u0631\u0633\u0627\u0621
|
||||||
ocr.credit=\u062A\u0633\u062A\u062E\u062F\u0645 \u0647\u0630\u0647 \u0627\u0644\u062E\u062F\u0645\u0629 OCRmyPDF \u0648 Tesseract \u0644 OCR.
|
ocr.credit=\u062A\u0633\u062A\u062E\u062F\u0645 \u0647\u0630\u0647 \u0627\u0644\u062E\u062F\u0645\u0629 OCRmyPDF \u0648 Tesseract \u0644 OCR.
|
||||||
ocr.submit=\u0645\u0639\u0627\u0644\u062C\u0629 PDF \u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645 OCR
|
ocr.submit=\u0645\u0639\u0627\u0644\u062C\u0629 PDF \u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645 OCR
|
||||||
|
|
|
@ -142,6 +142,8 @@ ocr.selectText.7=OCR erzwingen, OCR wird jede Seite entfernen und alle ursprüng
|
||||||
ocr.selectText.8=Normal (Fehler, wenn PDF Text enthält)
|
ocr.selectText.8=Normal (Fehler, wenn PDF Text enthält)
|
||||||
ocr.selectText.9=Zusätzliche Einstellungen
|
ocr.selectText.9=Zusätzliche Einstellungen
|
||||||
ocr.selectText.10=OCR-Modus
|
ocr.selectText.10=OCR-Modus
|
||||||
|
ocr.selectText.11=Bilder nach OCR entfernen (Entfernt ALLE Bilder, nur sinnvoll, wenn Teil des Konvertierungsschritts)
|
||||||
|
ocr.selectText.12=Rendertyp (Erweitert)
|
||||||
ocr.help=Bitte lesen Sie diese Dokumentation, um zu erfahren, wie Sie dies für andere Sprachen verwenden und/oder nicht in Docker verwenden können
|
ocr.help=Bitte lesen Sie diese Dokumentation, um zu erfahren, wie Sie dies für andere Sprachen verwenden und/oder nicht in Docker verwenden können
|
||||||
ocr.credit=Dieser Dienst verwendet OCRmyPDF und Tesseract für OCR.
|
ocr.credit=Dieser Dienst verwendet OCRmyPDF und Tesseract für OCR.
|
||||||
ocr.submit=PDF mit OCR verarbeiten
|
ocr.submit=PDF mit OCR verarbeiten
|
||||||
|
|
|
@ -148,6 +148,8 @@ ocr.selectText.7=Force OCR, will OCR Every page removing all original text eleme
|
||||||
ocr.selectText.8=Normal (Will error if PDF contains text)
|
ocr.selectText.8=Normal (Will error if PDF contains text)
|
||||||
ocr.selectText.9=Additional Settings
|
ocr.selectText.9=Additional Settings
|
||||||
ocr.selectText.10=OCR Mode
|
ocr.selectText.10=OCR Mode
|
||||||
|
ocr.selectText.11=Remove images after OCR (Removes ALL images, only useful if part of conversion step)
|
||||||
|
ocr.selectText.12=Render Type (Advanced)
|
||||||
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
|
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
|
||||||
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
|
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
|
||||||
ocr.submit=Process PDF with OCR
|
ocr.submit=Process PDF with OCR
|
||||||
|
|
|
@ -144,6 +144,8 @@ ocr.selectText.7=Fuerza OCR, OCR eliminará en cada página todo el texto origin
|
||||||
ocr.selectText.8=Normal (Se producirá un error si el PDF contiene texto)
|
ocr.selectText.8=Normal (Se producirá un error si el PDF contiene texto)
|
||||||
ocr.selectText.9=Ajustes Adicionales
|
ocr.selectText.9=Ajustes Adicionales
|
||||||
ocr.selectText.10=Modo OCR
|
ocr.selectText.10=Modo OCR
|
||||||
|
ocr.selectText.11=Eliminar imágenes después de OCR (Elimina TODAS las imágenes, solo es útil si es parte del paso de conversión)
|
||||||
|
ocr.selectText.12=Tipo de procesamiento (avanzado)
|
||||||
ocr.help=Lea esta documentación sobre cómo usar esto para otros idiomas y/o no usarlo en docker
|
ocr.help=Lea esta documentación sobre cómo usar esto para otros idiomas y/o no usarlo en docker
|
||||||
ocr.credit=Este servicio utiliza OCRmyPDF y Tesseract para OCR.
|
ocr.credit=Este servicio utiliza OCRmyPDF y Tesseract para OCR.
|
||||||
ocr.submit=Procesa PDF con OCR
|
ocr.submit=Procesa PDF con OCR
|
||||||
|
|
|
@ -148,6 +148,8 @@ ocr.selectText.7=Forcer l'OCR, OCR chaque page supprimera tous les éléments de
|
||||||
ocr.selectText.8=Normal (Erreur si le PDF contient du texte)
|
ocr.selectText.8=Normal (Erreur si le PDF contient du texte)
|
||||||
ocr.selectText.9=Paramètres supplémentaires
|
ocr.selectText.9=Paramètres supplémentaires
|
||||||
ocr.selectText.10=Mode ROC
|
ocr.selectText.10=Mode ROC
|
||||||
|
ocr.selectText.11=Supprimer les images après l'OCR (Supprime TOUTES les images, utile uniquement si elles font partie de l'étape de conversion)
|
||||||
|
ocr.selectText.12=Type de rendu (avancé)
|
||||||
ocr.help=Veuillez lire cette documentation pour savoir comment l'utiliser pour d'autres langues et/ou une utilisation non dans docker
|
ocr.help=Veuillez lire cette documentation pour savoir comment l'utiliser pour d'autres langues et/ou une utilisation non dans docker
|
||||||
ocr.credit=Ce service utilise OCRmyPDF et Tesseract pour l'OCR.
|
ocr.credit=Ce service utilise OCRmyPDF et Tesseract pour l'OCR.
|
||||||
ocr.submit=Traiter PDF avec OCR
|
ocr.submit=Traiter PDF avec OCR
|
||||||
|
|
32
src/main/resources/templates/other/adjust-contrast.html
Normal file
32
src/main/resources/templates/other/adjust-contrast.html
Normal file
|
@ -0,0 +1,32 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
|
||||||
|
|
||||||
|
<th:block th:insert="~{fragments/common :: head(title=#{extractImages.title})}"></th:block>
|
||||||
|
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<div id="page-container">
|
||||||
|
<div id="content-wrap">
|
||||||
|
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
|
||||||
|
<br> <br>
|
||||||
|
<div class="container">
|
||||||
|
<div class="row justify-content-center">
|
||||||
|
<div class="col-md-6">
|
||||||
|
<h2 th:text="#{extractImages.header}"></h2>
|
||||||
|
|
||||||
|
<form id="multiPdfForm" th:action="@{adjust-contrast}" method="post" enctype="multipart/form-data">
|
||||||
|
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||||
|
<div class="form-group">
|
||||||
|
<label for="contrastRange">Contrast</label>
|
||||||
|
<input name="contrastRange" type="range" class="form-control-range" id="contrastRange" min="-100" max="100" value="0" step="1">
|
||||||
|
</div>
|
||||||
|
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{extractImages.submit}"></button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
|
@ -53,8 +53,14 @@
|
||||||
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
||||||
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
|
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
|
||||||
</div>
|
</div>
|
||||||
|
<div class="form-check">
|
||||||
|
<input type="checkbox" class="form-check-input" name="removeImagesAfter" id="removeImagesAfter" />
|
||||||
|
<label class="form-check-label" for="removeImagesAfter" th:text="#{ocr.selectText.11}"></label>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
|
||||||
<div class="form-group">
|
<div class="form-group">
|
||||||
<label>Render Type (Advanced)</label>
|
<label th:text="#{ocr.selectText.12}"></label>
|
||||||
<select class="form-control" name="ocrRenderType">
|
<select class="form-control" name="ocrRenderType">
|
||||||
<option value="hocr">HOCR (Latin/Roman alphabet only)</option>
|
<option value="hocr">HOCR (Latin/Roman alphabet only)</option>
|
||||||
<option value="sandwich">Sandwich</option>
|
<option value="sandwich">Sandwich</option>
|
||||||
|
|
Loading…
Reference in a new issue