2023-03-20 22:55:11 +01:00
|
|
|
package stirling.software.SPDF.controller;
|
|
|
|
|
|
|
|
import java.io.File;
|
2023-03-25 23:16:26 +01:00
|
|
|
import java.io.FileOutputStream;
|
2023-03-20 22:55:11 +01:00
|
|
|
import java.io.IOException;
|
|
|
|
import java.nio.file.Files;
|
|
|
|
import java.nio.file.Path;
|
|
|
|
import java.nio.file.Paths;
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Arrays;
|
|
|
|
import java.util.Collections;
|
|
|
|
import java.util.List;
|
|
|
|
import java.util.stream.Collectors;
|
2023-03-25 23:16:26 +01:00
|
|
|
import java.util.zip.ZipEntry;
|
|
|
|
import java.util.zip.ZipOutputStream;
|
2023-03-20 22:55:11 +01:00
|
|
|
|
|
|
|
import org.slf4j.Logger;
|
|
|
|
import org.slf4j.LoggerFactory;
|
|
|
|
import org.springframework.http.HttpHeaders;
|
|
|
|
import org.springframework.http.MediaType;
|
|
|
|
import org.springframework.http.ResponseEntity;
|
|
|
|
import org.springframework.stereotype.Controller;
|
|
|
|
import org.springframework.web.bind.annotation.GetMapping;
|
|
|
|
import org.springframework.web.bind.annotation.PostMapping;
|
|
|
|
import org.springframework.web.bind.annotation.RequestParam;
|
|
|
|
import org.springframework.web.multipart.MultipartFile;
|
|
|
|
import org.springframework.web.servlet.ModelAndView;
|
|
|
|
|
|
|
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
|
|
|
//import com.spire.pdf.*;
|
2023-04-01 22:02:54 +02:00
|
|
|
import java.util.concurrent.Semaphore;
|
2023-03-20 22:55:11 +01:00
|
|
|
@Controller
|
|
|
|
public class OCRController {
|
|
|
|
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger(OCRController.class);
|
|
|
|
|
|
|
|
@GetMapping("/ocr-pdf")
|
|
|
|
public ModelAndView ocrPdfPage() {
|
|
|
|
ModelAndView modelAndView = new ModelAndView("ocr-pdf");
|
|
|
|
modelAndView.addObject("languages", getAvailableTesseractLanguages());
|
|
|
|
modelAndView.addObject("currentPage", "ocr-pdf");
|
|
|
|
return modelAndView;
|
|
|
|
}
|
|
|
|
|
2023-04-01 22:02:54 +02:00
|
|
|
private final Semaphore semaphore = new Semaphore(2);
|
|
|
|
|
2023-03-20 22:55:11 +01:00
|
|
|
@PostMapping("/ocr-pdf")
|
|
|
|
public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
|
|
|
|
@RequestParam("languages") List<String> selectedLanguages,
|
2023-04-01 22:02:54 +02:00
|
|
|
@RequestParam(name = "sidecar", required = false) Boolean sidecar,
|
|
|
|
@RequestParam(name = "deskew", required = false) Boolean deskew,
|
|
|
|
@RequestParam(name = "clean", required = false) Boolean clean,
|
|
|
|
@RequestParam(name = "clean-final", required = false) Boolean cleanFinal,
|
|
|
|
@RequestParam(name = "ocrType", required = false) String ocrType) throws IOException, InterruptedException {
|
2023-03-20 22:55:11 +01:00
|
|
|
|
2023-04-01 22:02:54 +02:00
|
|
|
|
2023-03-20 22:55:11 +01:00
|
|
|
//--output-type pdfa
|
|
|
|
if (selectedLanguages == null || selectedLanguages.size() < 1) {
|
|
|
|
throw new IOException("Please select at least one language.");
|
|
|
|
}
|
|
|
|
|
|
|
|
// Save the uploaded file to a temporary location
|
|
|
|
Path tempInputFile = Files.createTempFile("input_", ".pdf");
|
|
|
|
inputFile.transferTo(tempInputFile.toFile());
|
|
|
|
|
|
|
|
// Prepare the output file path
|
|
|
|
Path tempOutputFile = Files.createTempFile("output_", ".pdf");
|
|
|
|
|
2023-04-01 22:02:54 +02:00
|
|
|
// Prepare the output file path
|
|
|
|
Path sidecarTextPath = null;
|
|
|
|
|
2023-03-20 22:55:11 +01:00
|
|
|
// Run OCR Command
|
|
|
|
String languageOption = String.join("+", selectedLanguages);
|
2023-04-01 22:02:54 +02:00
|
|
|
|
|
|
|
List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--output-type", "pdf"));
|
|
|
|
|
|
|
|
|
2023-03-20 22:55:11 +01:00
|
|
|
if (sidecar != null && sidecar) {
|
2023-04-01 22:02:54 +02:00
|
|
|
sidecarTextPath = Files.createTempFile("sidecar", ".txt");
|
2023-03-20 22:55:11 +01:00
|
|
|
command.add("--sidecar");
|
2023-04-01 22:02:54 +02:00
|
|
|
command.add(sidecarTextPath.toString());
|
2023-03-20 22:55:11 +01:00
|
|
|
}
|
2023-04-01 22:02:54 +02:00
|
|
|
|
|
|
|
if (deskew != null && deskew) {
|
|
|
|
command.add("--deskew");
|
|
|
|
}
|
|
|
|
if (clean != null && clean) {
|
|
|
|
command.add("--clean");
|
|
|
|
}
|
|
|
|
if (cleanFinal != null && cleanFinal) {
|
|
|
|
command.add("--clean-final");
|
|
|
|
}
|
|
|
|
if (ocrType != null && !ocrType.equals("")) {
|
|
|
|
if("skip-text".equals(ocrType)) {
|
|
|
|
command.add("--skip-text");
|
|
|
|
} else if("force-ocr".equals(ocrType)) {
|
|
|
|
command.add("--force-ocr");
|
|
|
|
} else if("Normal".equals(ocrType)) {
|
|
|
|
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
command.addAll(Arrays.asList("--language", languageOption,
|
|
|
|
tempInputFile.toString(), tempOutputFile.toString()));
|
|
|
|
|
|
|
|
//Run CLI command
|
|
|
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.OCR_MY_PDF).runCommandWithOutputHandling(command);
|
|
|
|
|
2023-03-20 22:55:11 +01:00
|
|
|
// Read the OCR processed PDF file
|
|
|
|
byte[] pdfBytes = Files.readAllBytes(tempOutputFile);
|
2023-04-01 22:02:54 +02:00
|
|
|
|
|
|
|
|
2023-03-20 22:55:11 +01:00
|
|
|
// Clean up the temporary files
|
|
|
|
Files.delete(tempInputFile);
|
|
|
|
// Return the OCR processed PDF as a response
|
|
|
|
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
|
|
|
|
|
|
|
HttpHeaders headers = new HttpHeaders();
|
|
|
|
|
|
|
|
if (sidecar != null && sidecar) {
|
|
|
|
// Create a zip file containing both the PDF and the text file
|
|
|
|
String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
|
|
|
|
Path tempZipFile = Files.createTempFile("output_", ".zip");
|
|
|
|
|
|
|
|
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
|
|
|
|
// Add PDF file to the zip
|
|
|
|
ZipEntry pdfEntry = new ZipEntry(outputFilename);
|
|
|
|
zipOut.putNextEntry(pdfEntry);
|
|
|
|
Files.copy(tempOutputFile, zipOut);
|
|
|
|
zipOut.closeEntry();
|
|
|
|
|
|
|
|
// Add text file to the zip
|
2023-04-01 22:02:54 +02:00
|
|
|
ZipEntry txtEntry = new ZipEntry(outputFilename.replace(".pdf", ".txt"));
|
2023-03-20 22:55:11 +01:00
|
|
|
zipOut.putNextEntry(txtEntry);
|
2023-04-01 22:02:54 +02:00
|
|
|
Files.copy(sidecarTextPath, zipOut);
|
2023-03-20 22:55:11 +01:00
|
|
|
zipOut.closeEntry();
|
|
|
|
}
|
|
|
|
|
|
|
|
byte[] zipBytes = Files.readAllBytes(tempZipFile);
|
|
|
|
|
|
|
|
// Clean up the temporary zip file
|
|
|
|
Files.delete(tempZipFile);
|
|
|
|
Files.delete(tempOutputFile);
|
2023-04-01 22:02:54 +02:00
|
|
|
Files.delete(sidecarTextPath);
|
2023-03-20 22:55:11 +01:00
|
|
|
|
|
|
|
// Return the zip file containing both the PDF and the text file
|
|
|
|
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
|
|
|
headers.setContentDispositionFormData("attachment", outputZipFilename);
|
|
|
|
return ResponseEntity.ok().headers(headers).body(zipBytes);
|
|
|
|
} else {
|
|
|
|
// Return the OCR processed PDF as a response
|
|
|
|
Files.delete(tempOutputFile);
|
|
|
|
headers.setContentType(MediaType.APPLICATION_PDF);
|
|
|
|
headers.setContentDispositionFormData("attachment", outputFilename);
|
|
|
|
return ResponseEntity.ok().headers(headers).body(pdfBytes);
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
public List<String> getAvailableTesseractLanguages() {
|
|
|
|
String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
|
|
|
|
File[] files = new File(tessdataDir).listFiles();
|
|
|
|
if (files == null) {
|
|
|
|
return Collections.emptyList();
|
|
|
|
}
|
|
|
|
return Arrays.stream(files)
|
|
|
|
.filter(file -> file.getName().endsWith(".traineddata"))
|
|
|
|
.map(file -> file.getName().replace(".traineddata", ""))
|
|
|
|
.filter(lang -> !lang.equalsIgnoreCase("osd"))
|
|
|
|
.collect(Collectors.toList());
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|