From 80c26a9550efd118dd2107bfd0509ac1d4c61ba8 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Sun, 30 Apr 2023 14:42:26 +0100 Subject: [PATCH] fix for OCR multi lang --- HowToUseOCR.md | 2 ++ .../controller/api/other/OCRController.java | 19 +++++++++++++++---- .../resources/templates/other/ocr-pdf.html | 4 ++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/HowToUseOCR.md b/HowToUseOCR.md index 1b7d75f9..9a867cbb 100644 --- a/HowToUseOCR.md +++ b/HowToUseOCR.md @@ -20,6 +20,8 @@ Depending on your requirements, you can choose the appropriate language pack for 1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need. 2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata` +# DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED. + #### Docker If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs. 
diff --git a/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java b/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java index 9a9b527c..e7534b12 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java +++ b/src/main/java/stirling/software/SPDF/controller/api/other/OCRController.java @@ -36,6 +36,15 @@ public class OCRController { private static final Logger logger = LoggerFactory.getLogger(OCRController.class); + public List getAvailableTesseractLanguages() { + String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata"; + File[] files = new File(tessdataDir).listFiles(); + if (files == null) { + return Collections.emptyList(); + } + return Arrays.stream(files).filter(file -> file.getName().endsWith(".traineddata")).map(file -> file.getName().replace(".traineddata", "")) + .filter(lang -> !lang.equalsIgnoreCase("osd")).collect(Collectors.toList()); + } @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf") public ResponseEntity processPdfWithOCR(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, @@ -49,9 +58,11 @@ public class OCRController { throw new IOException("Please select at least one language."); } - // Validate and sanitize selected languages using regex - String languagePattern = "^[a-zA-Z]{3}$"; // Regex pattern for three-letter language codes - selectedLanguages = selectedLanguages.stream().filter(lang -> Pattern.matches(languagePattern, lang)).collect(Collectors.toList()); + // Get available Tesseract languages + List availableLanguages = getAvailableTesseractLanguages(); + + // Validate selected languages + selectedLanguages = selectedLanguages.stream().filter(availableLanguages::contains).collect(Collectors.toList()); if (selectedLanguages.isEmpty()) { throw new IOException("None of the selected languages are valid."); @@ -69,7 +80,7 @@ public class OCRController { // Run OCR Command String languageOption = String.join("+", 
selectedLanguages); - List command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf")); + List command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf", "--pdf-renderer" , "hocr")); if (sidecar != null && sidecar) { sidecarTextPath = Files.createTempFile("sidecar", ".txt"); diff --git a/src/main/resources/templates/other/ocr-pdf.html b/src/main/resources/templates/other/ocr-pdf.html index a1470ac9..b63089e1 100644 --- a/src/main/resources/templates/other/ocr-pdf.html +++ b/src/main/resources/templates/other/ocr-pdf.html @@ -20,9 +20,9 @@
-
+
- +