fix for OCR multi lang

This commit is contained in:
Anthony Stirling 2023-04-30 14:42:26 +01:00
parent 585bf4ccb4
commit 80c26a9550
3 changed files with 19 additions and 6 deletions

View file

@ -20,6 +20,8 @@ Depending on your requirements, you can choose the appropriate language pack for
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need. 1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata` 2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`
# DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.
#### Docker #### Docker
If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs. If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs.

View file

@ -36,6 +36,15 @@ public class OCRController {
private static final Logger logger = LoggerFactory.getLogger(OCRController.class); private static final Logger logger = LoggerFactory.getLogger(OCRController.class);
/**
 * Lists the Tesseract language codes installed on this host.
 *
 * <p>Scans the tessdata directory for regular files named {@code <code>.traineddata} and
 * returns each name with that trailing extension removed. The {@code osd} entry is
 * excluded (compared case-insensitively), as are entries whose code would be empty.
 *
 * @return the language codes in natural (alphabetical) order, or an empty list when the
 *         tessdata directory is missing or cannot be listed
 */
public List<String> getAvailableTesseractLanguages() {
    final String extension = ".traineddata";
    String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
    File[] files = new File(tessdataDir).listFiles();
    if (files == null) {
        // listFiles() yields null when the path is not a directory or an I/O error occurs.
        return Collections.emptyList();
    }
    return Arrays.stream(files)
            // A sub-directory that happens to end in ".traineddata" is not a language pack.
            .filter(File::isFile)
            .map(File::getName)
            .filter(name -> name.endsWith(extension))
            // Strip only the trailing extension; String.replace would remove every occurrence.
            .map(name -> name.substring(0, name.length() - extension.length()))
            .filter(lang -> !lang.isEmpty() && !lang.equalsIgnoreCase("osd"))
            // listFiles() gives no ordering guarantee, so sort for a stable result.
            .sorted()
            .collect(Collectors.toList());
}
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf") @PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
public ResponseEntity<byte[]> processPdfWithOCR(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile, public ResponseEntity<byte[]> processPdfWithOCR(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
@ -49,9 +58,11 @@ public class OCRController {
throw new IOException("Please select at least one language."); throw new IOException("Please select at least one language.");
} }
// Validate and sanitize selected languages using regex // Get available Tesseract languages
String languagePattern = "^[a-zA-Z]{3}$"; // Regex pattern for three-letter language codes List<String> availableLanguages = getAvailableTesseractLanguages();
selectedLanguages = selectedLanguages.stream().filter(lang -> Pattern.matches(languagePattern, lang)).collect(Collectors.toList());
// Validate selected languages
selectedLanguages = selectedLanguages.stream().filter(availableLanguages::contains).collect(Collectors.toList());
if (selectedLanguages.isEmpty()) { if (selectedLanguages.isEmpty()) {
throw new IOException("None of the selected languages are valid."); throw new IOException("None of the selected languages are valid.");
@ -69,7 +80,7 @@ public class OCRController {
// Run OCR Command // Run OCR Command
String languageOption = String.join("+", selectedLanguages); String languageOption = String.join("+", selectedLanguages);
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf")); List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf", "--pdf-renderer" , "hocr"));
if (sidecar != null && sidecar) { if (sidecar != null && sidecar) {
sidecarTextPath = Files.createTempFile("sidecar", ".txt"); sidecarTextPath = Files.createTempFile("sidecar", ".txt");

View file

@ -22,7 +22,7 @@
<div id="languages"> <div id="languages">
<div th:each="language, iterStat : ${languages}"> <div th:each="language, iterStat : ${languages}">
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" /> <input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
<label class="form-check-label" th:for="${'language-' + language}" th:text=" ${language}"></label> <label class="form-check-label" th:for="${'language-' + language}" th:text="${(language == 'eng') ? 'English' : language}"></label>
</div> </div>
</div> </div>
<hr> <hr>