Stirling-PDF/src/main/java/stirling/software/SPDF/controller/OCRController.java

package stirling.software.SPDF.controller;

import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.http.HttpHeaders;
import org.springframework.http.HttpStatus;
import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity;
import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.multipart.MultipartFile;
import org.springframework.web.servlet.ModelAndView;

import stirling.software.SPDF.utils.ProcessExecutor;

import java.io.FileOutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
//import com.spire.pdf.*;
@Controller
public class OCRController {

	private static final Logger logger = LoggerFactory.getLogger(OCRController.class);

	@GetMapping("/ocr-pdf")
	public ModelAndView ocrPdfPage() {
		ModelAndView modelAndView = new ModelAndView("ocr-pdf");
		modelAndView.addObject("languages", getAvailableTesseractLanguages());
		modelAndView.addObject("currentPage", "ocr-pdf");
		return modelAndView;
	}

	@PostMapping("/ocr-pdf")
	public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,
			@RequestParam("languages") List<String> selectedLanguages,
			@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {

		//--output-type pdfa
		if (selectedLanguages == null || selectedLanguages.size() < 1) {
			throw new IOException("Please select at least one language.");
	    }
		
		// Save the uploaded file to a temporary location
		Path tempInputFile = Files.createTempFile("input_", ".pdf");
		inputFile.transferTo(tempInputFile.toFile());

		// Prepare the output file path
		Path tempOutputFile = Files.createTempFile("output_", ".pdf");

		// Run OCR Command
	    String languageOption = String.join("+", selectedLanguages);
	    List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,
	            tempInputFile.toString(), tempOutputFile.toString()));
	    String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");
	    if (sidecar != null && sidecar) {
	        command.add("--sidecar");
	        command.add(sidecarFile);
	    }
	    int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);

		// Read the OCR processed PDF file
		byte[] pdfBytes = Files.readAllBytes(tempOutputFile);

		// Clean up the temporary files
		Files.delete(tempInputFile);
		// Return the OCR processed PDF as a response
		String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";

		HttpHeaders headers = new HttpHeaders();

	    if (sidecar != null && sidecar) {
	        // Create a zip file containing both the PDF and the text file
	        String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
	        Path tempZipFile = Files.createTempFile("output_", ".zip");

	        try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
	            // Add PDF file to the zip
	            ZipEntry pdfEntry = new ZipEntry(outputFilename);
	            zipOut.putNextEntry(pdfEntry);
	            Files.copy(tempOutputFile, zipOut);
	            zipOut.closeEntry();

	            // Add text file to the zip
	            ZipEntry txtEntry = new ZipEntry(sidecarFile);
	            zipOut.putNextEntry(txtEntry);
	            Files.copy(Paths.get(sidecarFile), zipOut);
	            zipOut.closeEntry();
	        }

	        byte[] zipBytes = Files.readAllBytes(tempZipFile);

	        // Clean up the temporary zip file
	        Files.delete(tempZipFile);
	        Files.delete(tempOutputFile);
	        Files.delete(Paths.get(sidecarFile));
	        
	        // Return the zip file containing both the PDF and the text file
	        headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
	        headers.setContentDispositionFormData("attachment", outputZipFilename);
	        return ResponseEntity.ok().headers(headers).body(zipBytes);
	    } else {
	        // Return the OCR processed PDF as a response
	    	Files.delete(tempOutputFile);
	        headers.setContentType(MediaType.APPLICATION_PDF);
	        headers.setContentDispositionFormData("attachment", outputFilename);
	        return ResponseEntity.ok().headers(headers).body(pdfBytes);
	    }
	    
	}

	public List<String> getAvailableTesseractLanguages() {
	    String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";
	    File[] files = new File(tessdataDir).listFiles();
	    if (files == null) {
	        return Collections.emptyList();
	    }
	    return Arrays.stream(files)
	            .filter(file -> file.getName().endsWith(".traineddata"))
	            .map(file -> file.getName().replace(".traineddata", ""))
	            .filter(lang -> !lang.equalsIgnoreCase("osd"))
	            .collect(Collectors.toList());
	}

}
Lots of changes (#70) Image extraction and conversion to formats Multi parallel file execution for all forms so you can input multiple files quickly Any file at all pdf using libreoffice, super powerful Sadly makes docker image larger but worth it OCR PDF using ocr my pdf Works awesomely for adding text to a image Improved compression using ocr my pdf app Settings page with custom download options such as - open in same window - open in new window - download - download as zip Update detection in settings page it should show notification if there is a update (very hidden) UI cleanups Add other image formats to PDF to Image Various fies to icons, and pdf.js usage 2023-03-20 22:55:11 +01:00			`package stirling.software.SPDF.controller;`

			`import java.io.BufferedReader;`
			`import java.io.File;`
			`import java.io.IOException;`
			`import java.io.InputStreamReader;`
			`import java.nio.charset.StandardCharsets;`
			`import java.nio.file.Files;`
			`import java.nio.file.Path;`
			`import java.nio.file.Paths;`
			`import java.util.ArrayList;`
			`import java.util.Arrays;`
			`import java.util.Collections;`
			`import java.util.List;`
			`import java.util.stream.Collectors;`

			`import org.slf4j.Logger;`
			`import org.slf4j.LoggerFactory;`
			`import org.springframework.http.HttpHeaders;`
			`import org.springframework.http.HttpStatus;`
			`import org.springframework.http.MediaType;`
			`import org.springframework.http.ResponseEntity;`
			`import org.springframework.stereotype.Controller;`
			`import org.springframework.web.bind.annotation.GetMapping;`
			`import org.springframework.web.bind.annotation.PostMapping;`
			`import org.springframework.web.bind.annotation.RequestParam;`
			`import org.springframework.web.multipart.MultipartFile;`
			`import org.springframework.web.servlet.ModelAndView;`

			`import stirling.software.SPDF.utils.ProcessExecutor;`

			`import java.io.FileOutputStream;`
			`import java.nio.file.Files;`
			`import java.nio.file.Path;`
			`import java.nio.file.Paths;`
			`import java.util.zip.ZipEntry;`
			`import java.util.zip.ZipOutputStream;`
			`//import com.spire.pdf.*;`
			`@Controller`
			`public class OCRController {`

			`private static final Logger logger = LoggerFactory.getLogger(OCRController.class);`

			`@GetMapping("/ocr-pdf")`
			`public ModelAndView ocrPdfPage() {`
			`ModelAndView modelAndView = new ModelAndView("ocr-pdf");`
			`modelAndView.addObject("languages", getAvailableTesseractLanguages());`
			`modelAndView.addObject("currentPage", "ocr-pdf");`
			`return modelAndView;`
			`}`

			`@PostMapping("/ocr-pdf")`
			`public ResponseEntity<byte[]> processPdfWithOCR(@RequestParam("fileInput") MultipartFile inputFile,`
			`@RequestParam("languages") List<String> selectedLanguages,`
			`@RequestParam(name = "sidecar", required = false) Boolean sidecar) throws IOException, InterruptedException {`

			`//--output-type pdfa`
			`if (selectedLanguages == null \|\| selectedLanguages.size() < 1) {`
			`throw new IOException("Please select at least one language.");`
			`}`

			`// Save the uploaded file to a temporary location`
			`Path tempInputFile = Files.createTempFile("input_", ".pdf");`
			`inputFile.transferTo(tempInputFile.toFile());`

			`// Prepare the output file path`
			`Path tempOutputFile = Files.createTempFile("output_", ".pdf");`

			`// Run OCR Command`
			`String languageOption = String.join("+", selectedLanguages);`
			`List<String> command = new ArrayList<>(Arrays.asList("ocrmypdf","--verbose", "2", "--language", languageOption,`
			`tempInputFile.toString(), tempOutputFile.toString()));`
			`String sidecarFile = tempOutputFile.toString().replace(".pdf", ".txt");`
			`if (sidecar != null && sidecar) {`
			`command.add("--sidecar");`
			`command.add(sidecarFile);`
			`}`
			`int returnCode = ProcessExecutor.runCommandWithOutputHandling(command);`

			`// Read the OCR processed PDF file`
			`byte[] pdfBytes = Files.readAllBytes(tempOutputFile);`

			`// Clean up the temporary files`
			`Files.delete(tempInputFile);`
			`// Return the OCR processed PDF as a response`
			`String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";`

			`HttpHeaders headers = new HttpHeaders();`

			`if (sidecar != null && sidecar) {`
			`// Create a zip file containing both the PDF and the text file`
			`String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";`
			`Path tempZipFile = Files.createTempFile("output_", ".zip");`

			`try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {`
			`// Add PDF file to the zip`
			`ZipEntry pdfEntry = new ZipEntry(outputFilename);`
			`zipOut.putNextEntry(pdfEntry);`
			`Files.copy(tempOutputFile, zipOut);`
			`zipOut.closeEntry();`

			`// Add text file to the zip`
			`ZipEntry txtEntry = new ZipEntry(sidecarFile);`
			`zipOut.putNextEntry(txtEntry);`
			`Files.copy(Paths.get(sidecarFile), zipOut);`
			`zipOut.closeEntry();`
			`}`

			`byte[] zipBytes = Files.readAllBytes(tempZipFile);`

			`// Clean up the temporary zip file`
			`Files.delete(tempZipFile);`
			`Files.delete(tempOutputFile);`
			`Files.delete(Paths.get(sidecarFile));`

			`// Return the zip file containing both the PDF and the text file`
			`headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);`
			`headers.setContentDispositionFormData("attachment", outputZipFilename);`
			`return ResponseEntity.ok().headers(headers).body(zipBytes);`
			`} else {`
			`// Return the OCR processed PDF as a response`
			`Files.delete(tempOutputFile);`
			`headers.setContentType(MediaType.APPLICATION_PDF);`
			`headers.setContentDispositionFormData("attachment", outputFilename);`
			`return ResponseEntity.ok().headers(headers).body(pdfBytes);`
			`}`

			`}`

			`public List<String> getAvailableTesseractLanguages() {`
			`String tessdataDir = "/usr/share/tesseract-ocr/4.00/tessdata";`
			`File[] files = new File(tessdataDir).listFiles();`
			`if (files == null) {`
			`return Collections.emptyList();`
			`}`
			`return Arrays.stream(files)`
			`.filter(file -> file.getName().endsWith(".traineddata"))`
			`.map(file -> file.getName().replace(".traineddata", ""))`
			`.filter(lang -> !lang.equalsIgnoreCase("osd"))`
			`.collect(Collectors.toList());`
			`}`

			`}`