ocr remove images
This commit is contained in:
parent
30c56a0ec9
commit
9af537c985
10 changed files with 76 additions and 4 deletions
|
@ -47,7 +47,8 @@ public class OCRController {
|
|||
@RequestParam("languages") List<String> selectedLanguages, @RequestParam(name = "sidecar", required = false) Boolean sidecar,
|
||||
@RequestParam(name = "deskew", required = false) Boolean deskew, @RequestParam(name = "clean", required = false) Boolean clean,
|
||||
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal, @RequestParam(name = "ocrType", required = false) String ocrType,
|
||||
@RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType)
|
||||
@RequestParam(name = "ocrRenderType", required = false, defaultValue = "hocr") String ocrRenderType,
|
||||
@RequestParam(name = "removeImagesAfter", required = false) Boolean removeImagesAfter)
|
||||
throws IOException, InterruptedException {
|
||||
|
||||
// --output-type pdfa
|
||||
|
@ -114,11 +115,24 @@ public class OCRController {
|
|||
// Run CLI command
|
||||
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// Remove images from the OCR processed PDF if the flag is set to true
|
||||
if (removeImagesAfter != null && removeImagesAfter) {
|
||||
Path tempPdfWithoutImages = Files.createTempFile("output_", "_no_images.pdf");
|
||||
|
||||
List<String> gsCommand = Arrays.asList("gs", "-sDEVICE=pdfwrite", "-dFILTERIMAGE", "-o", tempPdfWithoutImages.toString(), tempOutputFile.toString());
|
||||
|
||||
int gsReturnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.GHOSTSCRIPT).runCommandWithOutputHandling(gsCommand);
|
||||
tempOutputFile = tempPdfWithoutImages;
|
||||
}
|
||||
// Read the OCR processed PDF file
|
||||
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
||||
|
||||
// Clean up the temporary files
|
||||
Files.delete(tempInputFile);
|
||||
|
||||
// Return the OCR processed PDF as a response
|
||||
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
||||
|
||||
|
|
|
@ -72,4 +72,13 @@ public class OtherWebController {
|
|||
model.addAttribute("currentPage", "add-image");
|
||||
return "other/add-image";
|
||||
}
|
||||
|
||||
/**
 * Handles GET requests to {@code /adjust-contrast}: marks the current page in the
 * model and returns the name of the view to render.
 *
 * NOTE(review): {@code @Hidden} presumably keeps this page route out of the
 * generated API documentation — confirm against the annotation's library.
 *
 * @param model view model; receives the attribute {@code currentPage} set to {@code "adjust-contrast"}
 * @return the view name {@code "other/adjust-contrast"}
 */
@GetMapping("/adjust-contrast")
|
||||
@Hidden
|
||||
public String contrast(Model model) {
|
||||
model.addAttribute("currentPage", "adjust-contrast");
|
||||
return "other/adjust-contrast";
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
|
|
@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore;
|
|||
public class ProcessExecutor {
|
||||
|
||||
public enum Processes {
|
||||
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV
|
||||
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV, GHOSTSCRIPT
|
||||
}
|
||||
|
||||
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
||||
|
@ -24,6 +24,7 @@ public class ProcessExecutor {
|
|||
case LIBRE_OFFICE -> 1;
|
||||
case OCR_MY_PDF -> 2;
|
||||
case PYTHON_OPENCV -> 8;
|
||||
case GHOSTSCRIPT -> 16;
|
||||
};
|
||||
return new ProcessExecutor(semaphoreLimit);
|
||||
});
|
||||
|
|
|
@ -149,6 +149,8 @@ ocr.selectText.7=\u0641\u0631\u0636 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\
|
|||
ocr.selectText.8=\u0639\u0627\u062F\u064A (\u062E\u0637\u0623 \u0625\u0630\u0627 \u0643\u0627\u0646 PDF \u064A\u062D\u062A\u0648\u064A \u0639\u0644\u0649 \u0646\u0635)
|
||||
ocr.selectText.9=\u0625\u0639\u062F\u0627\u062F\u0627\u062A \u0625\u0636\u0627\u0641\u064A\u0629
|
||||
ocr.selectText.10=\u0648\u0636\u0639 \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641
|
||||
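ocr.selectText.11=\u0625\u0632\u0627\u0644\u0629 \u0627\u0644\u0635\u0648\u0631 \u0628\u0639\u062F \u0627\u0644\u062A\u0639\u0631\u0641 \u0627\u0644\u0636\u0648\u0626\u064A \u0639\u0644\u0649 \u0627\u0644\u062D\u0631\u0648\u0641 (\u064A\u0632\u064A\u0644 \u0643\u0644 \u0627\u0644\u0635\u0648\u0631 \u060C \u064A\u0643\u0648\u0646 \u0645\u0641\u064A\u062F\u064B\u0627 \u0641\u0642\u0637 \u0625\u0630\u0627 \u0643\u0627\u0646 \u062C\u0632\u0621\u064B\u0627 \u0645\u0646 \u062E\u0637\u0648\u0629 \u0627\u0644\u062A\u062D\u0648\u064A\u0644)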
|
||||
ocr.selectText.12=\u0646\u0648\u0639 \u0627\u0644\u0639\u0631\u0636 (\u0645\u062A\u0642\u062F\u0645)
|
||||
ocr.help=\u064A\u0631\u062C\u0649 \u0642\u0631\u0627\u0621\u0629 \u0647\u0630\u0647 \u0627\u0644\u0648\u062B\u0627\u0626\u0642 \u062D\u0648\u0644 \u0643\u064A\u0641\u064A\u0629 \u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0647\u0630\u0627 \u0644\u0644\u063A\u0627\u062A \u0623\u062E\u0631\u0649 \u0648 / \u0623\u0648 \u0627\u0644\u0627\u0633\u062A\u062E\u062F\u0627\u0645 \u0644\u064A\u0633 \u0641\u064A \u0639\u0627\u0645\u0644 \u0627\u0644\u0625\u0631\u0633\u0627\u0621
|
||||
ocr.credit=\u062A\u0633\u062A\u062E\u062F\u0645 \u0647\u0630\u0647 \u0627\u0644\u062E\u062F\u0645\u0629 OCRmyPDF \u0648 Tesseract \u0644 OCR.
|
||||
ocr.submit=\u0645\u0639\u0627\u0644\u062C\u0629 PDF \u0628\u0627\u0633\u062A\u062E\u062F\u0627\u0645 OCR
|
||||
|
|
|
@ -142,6 +142,8 @@ ocr.selectText.7=OCR erzwingen, OCR wird jede Seite entfernen und alle ursprüng
|
|||
ocr.selectText.8=Normal (Fehler, wenn PDF Text enthält)
|
||||
ocr.selectText.9=Zusätzliche Einstellungen
|
||||
ocr.selectText.10=OCR-Modus
|
||||
ocr.selectText.11=Bilder nach OCR entfernen (Entfernt ALLE Bilder, nur sinnvoll, wenn Teil des Konvertierungsschritts)
|
||||
ocr.selectText.12=Rendertyp (Erweitert)
|
||||
ocr.help=Bitte lesen Sie diese Dokumentation, um zu erfahren, wie Sie dies für andere Sprachen verwenden und/oder nicht in Docker verwenden können
|
||||
ocr.credit=Dieser Dienst verwendet OCRmyPDF und Tesseract für OCR.
|
||||
ocr.submit=PDF mit OCR verarbeiten
|
||||
|
|
|
@ -148,6 +148,8 @@ ocr.selectText.7=Force OCR, will OCR Every page removing all original text eleme
|
|||
ocr.selectText.8=Normal (Will error if PDF contains text)
|
||||
ocr.selectText.9=Additional Settings
|
||||
ocr.selectText.10=OCR Mode
|
||||
ocr.selectText.11=Remove images after OCR (Removes ALL images, only useful if part of conversion step)
|
||||
ocr.selectText.12=Render Type (Advanced)
|
||||
ocr.help=Please read this documentation on how to use this for other languages and/or use not in docker
|
||||
ocr.credit=This service uses OCRmyPDF and Tesseract for OCR.
|
||||
ocr.submit=Process PDF with OCR
|
||||
|
|
|
@ -144,6 +144,8 @@ ocr.selectText.7=Fuerza OCR, OCR eliminará en cada página todo el texto origin
|
|||
ocr.selectText.8=Normal (Se producirá un error si el PDF contiene texto)
|
||||
ocr.selectText.9=Ajustes Adicionales
|
||||
ocr.selectText.10=Modo OCR
|
||||
ocr.selectText.11=Eliminar imágenes después de OCR (Elimina TODAS las imágenes, solo es útil si es parte del paso de conversión)
|
||||
ocr.selectText.12=Tipo de procesamiento (avanzado)
|
||||
ocr.help=Lea esta documentación sobre cómo usar esto para otros idiomas y/o no usarlo en docker
|
||||
ocr.credit=Este servicio utiliza OCRmyPDF y Tesseract para OCR.
|
||||
ocr.submit=Procesa PDF con OCR
|
||||
|
|
|
@ -148,6 +148,8 @@ ocr.selectText.7=Forcer l'OCR, OCR chaque page supprimera tous les éléments de
|
|||
ocr.selectText.8=Normal (Erreur si le PDF contient du texte)
|
||||
ocr.selectText.9=Paramètres supplémentaires
|
||||
ocr.selectText.10=Mode ROC
|
||||
ocr.selectText.11=Supprimer les images après l'OCR (Supprime TOUTES les images, utile uniquement si elles font partie de l'étape de conversion)
|
||||
ocr.selectText.12=Type de rendu (avancé)
|
||||
ocr.help=Veuillez lire cette documentation pour savoir comment l'utiliser pour d'autres langues et/ou une utilisation non dans docker
|
||||
ocr.credit=Ce service utilise OCRmyPDF et Tesseract pour l'OCR.
|
||||
ocr.submit=Traiter PDF avec OCR
|
||||
|
|
32
src/main/resources/templates/other/adjust-contrast.html
Normal file
32
src/main/resources/templates/other/adjust-contrast.html
Normal file
|
@ -0,0 +1,32 @@
|
|||
<!DOCTYPE html>
|
||||
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
|
||||
|
||||
<th:block th:insert="~{fragments/common :: head(title=#{extractImages.title})}"></th:block>
|
||||
|
||||
|
||||
<body>
|
||||
<div id="page-container">
|
||||
<div id="content-wrap">
|
||||
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
|
||||
<br> <br>
|
||||
<div class="container">
|
||||
<div class="row justify-content-center">
|
||||
<div class="col-md-6">
|
||||
<h2 th:text="#{extractImages.header}"></h2>
|
||||
|
||||
<!-- Upload form for the adjust-contrast page: one PDF plus a contrast slider (-100..100, default 0), posted as multipart/form-data to the relative URL "adjust-contrast". -->
<!-- NOTE(review): the submit button text uses the extractImages.submit message key and the "Contrast" label is hard-coded English; presumably placeholders until adjust-contrast message keys exist. No POST handler for this URL is visible in this change. Confirm both. -->
<form id="multiPdfForm" th:action="@{adjust-contrast}" method="post" enctype="multipart/form-data">
|
||||
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
|
||||
<div class="form-group">
|
||||
<label for="contrastRange">Contrast</label>
|
||||
<input name="contrastRange" type="range" class="form-control-range" id="contrastRange" min="-100" max="100" value="0" step="1">
|
||||
</div>
|
||||
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{extractImages.submit}"></button>
|
||||
</form>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
|
@ -53,8 +53,14 @@
|
|||
<input type="checkbox" class="form-check-input" name="clean-final" id="clean-final" />
|
||||
<label class="form-check-label" for="clean-final" th:text="#{ocr.selectText.5}"></label>
|
||||
</div>
|
||||
<div class="form-check">
|
||||
<input type="checkbox" class="form-check-input" name="removeImagesAfter" id="removeImagesAfter" />
|
||||
<label class="form-check-label" for="removeImagesAfter" th:text="#{ocr.selectText.11}"></label>
|
||||
</div>
|
||||
|
||||
|
||||
<div class="form-group">
|
||||
<label>Render Type (Advanced)</label>
|
||||
<label th:text="#{ocr.selectText.12}"></label>
|
||||
<select class="form-control" name="ocrRenderType">
|
||||
<option value="hocr">HOCR (Latin/Roman alphabet only)</option>
|
||||
<option value="sandwich">Sandwich</option>
|
||||
|
|
Loading…
Reference in a new issue