fix for OCR multi lang
This commit is contained in:
parent
585bf4ccb4
commit
80c26a9550
3 changed files with 19 additions and 6 deletions
|
@ -20,6 +20,8 @@ Depending on your requirements, you can choose the appropriate language pack for
|
||||||
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
|
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
|
||||||
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`
|
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/4.00/tessdata`
|
||||||
|
|
||||||
|
# DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.
|
||||||
|
|
||||||
#### Docker
|
#### Docker
|
||||||
|
|
||||||
If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs.
|
If you are using Docker, you need to expose the Tesseract tessdata directory as a volume in order to use the additional language packs.
|
||||||
|
|
|
@ -36,6 +36,15 @@ public class OCRController {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(OCRController.class);
|
private static final Logger logger = LoggerFactory.getLogger(OCRController.class);
|
||||||
|
|
||||||
|
public List<String> getAvailableTesseractLanguages() {
|
||||||
|
String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
|
||||||
|
File[] files = new File(tessdataDir).listFiles();
|
||||||
|
if (files == null) {
|
||||||
|
return Collections.emptyList();
|
||||||
|
}
|
||||||
|
return Arrays.stream(files).filter(file -> file.getName().endsWith(".traineddata")).map(file -> file.getName().replace(".traineddata", ""))
|
||||||
|
.filter(lang -> !lang.equalsIgnoreCase("osd")).collect(Collectors.toList());
|
||||||
|
}
|
||||||
|
|
||||||
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
@PostMapping(consumes = "multipart/form-data", value = "/ocr-pdf")
|
||||||
public ResponseEntity<byte[]> processPdfWithOCR(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
|
public ResponseEntity<byte[]> processPdfWithOCR(@RequestPart(required = true, value = "fileInput") MultipartFile inputFile,
|
||||||
|
@ -49,9 +58,11 @@ public class OCRController {
|
||||||
throw new IOException("Please select at least one language.");
|
throw new IOException("Please select at least one language.");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Validate and sanitize selected languages using regex
|
// Get available Tesseract languages
|
||||||
String languagePattern = "^[a-zA-Z]{3}$"; // Regex pattern for three-letter language codes
|
List<String> availableLanguages = getAvailableTesseractLanguages();
|
||||||
selectedLanguages = selectedLanguages.stream().filter(lang -> Pattern.matches(languagePattern, lang)).collect(Collectors.toList());
|
|
||||||
|
// Validate selected languages
|
||||||
|
selectedLanguages = selectedLanguages.stream().filter(availableLanguages::contains).collect(Collectors.toList());
|
||||||
|
|
||||||
if (selectedLanguages.isEmpty()) {
|
if (selectedLanguages.isEmpty()) {
|
||||||
throw new IOException("None of the selected languages are valid.");
|
throw new IOException("None of the selected languages are valid.");
|
||||||
|
@ -69,7 +80,7 @@ public class OCRController {
|
||||||
// Run OCR Command
|
// Run OCR Command
|
||||||
String languageOption = String.join("+", selectedLanguages);
|
String languageOption = String.join("+", selectedLanguages);
|
||||||
|
|
||||||
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf"));
|
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf", "--verbose", "2", "--output-type", "pdf", "--pdf-renderer" , "hocr"));
|
||||||
|
|
||||||
if (sidecar != null && sidecar) {
|
if (sidecar != null && sidecar) {
|
||||||
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
||||||
|
|
|
@ -22,7 +22,7 @@
|
||||||
<div id="languages">
|
<div id="languages">
|
||||||
<div th:each="language, iterStat : ${languages}">
|
<div th:each="language, iterStat : ${languages}">
|
||||||
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
|
<input type="checkbox" class="form-check-input" th:name="languages" th:value="${language}" th:id="${'language-' + language}" />
|
||||||
<label class="form-check-label" th:for="${'language-' + language}" th:text=" ${language}"></label>
|
<label class="form-check-label" th:for="${'language-' + language}" th:text="${(language == 'eng') ? 'English' : language}"></label>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<hr>
|
<hr>
|
||||||
|
|
Loading…
Reference in a new issue