bug fixes and image scan

2023-04-26 13:18:24 +01:00 · 2023-04-26 13:18:24 +01:00 · 4327af5133
commit 4327af5133
parent ab4aea315a
13 changed files with 397 additions and 60 deletions
--- a/6
+++ b/6
@ -1,5 +1,9 @@
 # Build jbig2enc in a separate stage
-FROM frooodle/stirling-pdf-base:latest
+FROM frooodle/stirling-pdf-base:beta
+
+# Create pythonScripts folder and copy local scripts
+RUN mkdir /pythonScripts
+COPY ./pythonScripts/* /pythonScripts/

 # Copy the application JAR file
 COPY build/libs/*.jar app.jar
--- a/57
+++ b/57
@ -11,8 +11,8 @@ RUN apt-get update && \
        pkg-config \
        ca-certificates \
        zlib1g-dev \
-		make \
-		g++
+        make \
+        g++

 RUN git clone https://github.com/agl/jbig2enc && \
    cd jbig2enc && \
@ -33,12 +33,57 @@ RUN apt-get update && \
        libreoffice-calc \
        libreoffice-impress \
        python3-uno \
-		python3-pip \
+        python3-pip \
        unoconv \
-		pngquant \
-		unpaper \
+        pngquant \
+        unpaper \
        ocrmypdf && \
-	pip install --user --upgrade ocrmypdf
+    pip install --user --upgrade ocrmypdf && \
+    pip3 install opencv-python-headless
+
+# Copy the jbig2enc binary from the builder stage
+COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2
+
+# NOTE(review): the stages below appear to repeat the builder and main stages
+# already defined earlier in this Dockerfile (a second copy of the file seems
+# to have been appended without a separating newline) - confirm and de-duplicate.
+# Build jbig2enc in a separate stage
+FROM debian:bullseye-slim as jbig2enc_builder
+
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        git \
+        automake \
+        autoconf \
+        libtool \
+        libleptonica-dev \
+        pkg-config \
+        ca-certificates \
+        zlib1g-dev \
+        make \
+        g++
+
+RUN git clone https://github.com/agl/jbig2enc && \
+    cd jbig2enc && \
+    ./autogen.sh && \
+    ./configure && \
+    make && \
+    make install
+
+# Main stage
+FROM openjdk:17-jdk-slim
+
+# Install necessary dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        libreoffice-core \
+        libreoffice-common \
+        libreoffice-writer \
+        libreoffice-calc \
+        libreoffice-impress \
+        python3-uno \
+        python3-pip \
+        unoconv \
+        pngquant \
+        unpaper \
+        ocrmypdf && \
+    pip install --user --upgrade ocrmypdf && \
+    pip3 install opencv-python-headless

 # Copy the jbig2enc binary from the builder stage
 COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2
--- a/pythonScripts/split_photos.py
+++ b/pythonScripts/split_photos.py
@ -0,0 +1,133 @@
+import sys
+import cv2
+import numpy as np
+import os 
+
+def find_photo_boundaries(image, background_color, tolerance=30, min_area=10000, min_contour_area=500):
+    """Return bounding boxes of regions that differ from the background color.
+
+    Pixels within ``tolerance`` of ``background_color`` are treated as
+    background; everything else is dilated and grouped into external contours.
+
+    Args:
+        image: Image array passed straight to ``cv2.inRange``.
+        background_color: Reference color; ``tolerance`` is subtracted from and
+            added to it to form the background range.
+        tolerance: Half-width of the accepted background range.
+        min_area: Minimum bounding-box area (``w * h``) for a region to be kept.
+        min_contour_area: Minimum ``cv2.contourArea`` for a region to be kept.
+
+    Returns:
+        A list of ``(x, y, w, h)`` tuples, one per accepted region.
+    """
+    lower_bound = background_color - tolerance
+    upper_bound = background_color + tolerance
+
+    # Invert the background mask so that foreground (photo) pixels are set.
+    foreground = cv2.bitwise_not(cv2.inRange(image, lower_bound, upper_bound))
+    # Dilate to merge nearby foreground fragments into one region.
+    foreground = cv2.dilate(foreground, np.ones((5, 5), np.uint8), iterations=2)
+    outlines, _ = cv2.findContours(foreground, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    boxes = []
+    for outline in outlines:
+        left, top, width, height = cv2.boundingRect(outline)
+        if width * height < min_area:
+            continue
+        if cv2.contourArea(outline) < min_contour_area:
+            continue
+        boxes.append((left, top, width, height))
+
+    return boxes
+
+def estimate_background_color(image, sample_points=5):
+    """Estimate the background color as the per-channel median of five pixels.
+
+    The pixels sampled are the four corners and the center of the image.
+
+    Args:
+        image: Three-dimensional image array indexed as ``image[y, x]``.
+        sample_points: Accepted for interface compatibility; not used.
+
+    Returns:
+        The channel-wise median of the sampled pixels (``np.median`` result).
+    """
+    rows, cols, _channels = image.shape
+    right, bottom = cols - 1, rows - 1
+
+    # (x, y) probe positions: corners clockwise from top-left, then the center.
+    probes = (
+        (0, 0),
+        (right, 0),
+        (right, bottom),
+        (0, bottom),
+        (cols // 2, rows // 2),
+    )
+
+    samples = [image[py, px] for px, py in probes]
+    return np.median(samples, axis=0)
+
+def auto_rotate(image, angle_threshold=10):
+    """Rotate ``image`` about its center when an estimated skew angle is large enough.
+
+    Args:
+        image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
+        angle_threshold: Minimum absolute angle, in degrees, below which the
+            image is returned unchanged.
+
+    Returns:
+        The input image itself when no contour is found, the largest contour
+        has zero area, or the angle is below the threshold; otherwise a rotated
+        copy with the same width and height.
+    """
+    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+    # Otsu picks the threshold automatically; the result is inverted.
+    ret, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
+    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    if len(contours) == 0:
+        return image
+
+    largest_contour = max(contours, key=cv2.contourArea)
+    mu = cv2.moments(largest_contour)
+    
+    # m00 is the contour area; a zero value would make the centroid undefined.
+    if mu["m00"] == 0:
+        return image
+    
+    x_centroid = int(mu["m10"] / mu["m00"])
+    y_centroid = int(mu["m01"] / mu["m00"])
+
+    # np.where yields (row, col) indices, hence the [y, x] ordering of the centroid.
+    coords = np.column_stack(np.where(binary > 0))
+    u, _, vt = np.linalg.svd(coords - np.array([[y_centroid, x_centroid]]), full_matrices=False)
+
+    # NOTE(review): the angle is taken from u[1, 0] and u[0, 0] (first-column
+    # entries for the first two pixel rows) while vt is never used; a principal
+    # direction would normally come from vt[0] - confirm this is intended.
+    angle = np.arctan2(u[1, 0], u[0, 0]) * 180 / np.pi
+
+    if angle < -45:
+        angle = -(90 + angle)
+    else:
+        angle = -angle
+
+    if abs(angle) < angle_threshold:
+        return image
+
+    (h, w) = image.shape[:2]
+    center = (w // 2, h // 2)
+    M = cv2.getRotationMatrix2D(center, angle, 1.0)
+    # Output keeps the original size; edge pixels are replicated into exposed areas.
+    return cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
+
+
+
+def crop_borders(image, border_color, tolerance=30):
+    """Crop ``image`` to the bounding box of its largest ``border_color`` region.
+
+    Args:
+        image: Image array passed to ``cv2.inRange``.
+        border_color: Reference color; ``tolerance`` is subtracted from and
+            added to it to form the matching range.
+        tolerance: Half-width of the accepted color range.
+
+    Returns:
+        The cropped view of ``image``, or ``image`` itself when no pixel region
+        matches the color range.
+    """
+    in_range = cv2.inRange(image, border_color - tolerance, border_color + tolerance)
+    regions, _ = cv2.findContours(in_range, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
+
+    if not len(regions):
+        return image
+
+    # Keep only the bounding rectangle of the biggest matching region.
+    left, top, width, height = cv2.boundingRect(max(regions, key=cv2.contourArea))
+    return image[top:top + height, left:left + width]
+    
+def split_photos(input_file, output_directory, tolerance=30, min_area=10000, min_contour_area=500, angle_threshold=10, border_size=10):
+    """Detect the individual photos in a scanned image and save each one as a PNG.
+
+    Args:
+        input_file: Path of the scanned image to read.
+        output_directory: Directory for the results; created if missing.
+        tolerance: Color tolerance used when separating photos from background.
+        min_area: Minimum bounding-box area for a region to count as a photo.
+        min_contour_area: Minimum contour area for a region to count as a photo.
+        angle_threshold: Minimum absolute angle (degrees) before a crop is rotated.
+        border_size: Width of the background-colored border added around the
+            scan, and the number of pixels trimmed from every edge of each crop.
+
+    Raises:
+        ValueError: If ``input_file`` cannot be read as an image.
+    """
+    image = cv2.imread(input_file)
+    if image is None:
+        # cv2.imread reports a missing or undecodable file by returning None.
+        raise ValueError(f"Could not read image: {input_file}")
+    background_color = estimate_background_color(image)
+
+    # Add a constant border around the image
+    image = cv2.copyMakeBorder(image, border_size, border_size, border_size, border_size, cv2.BORDER_CONSTANT, value=background_color)
+
+    # Forward the area thresholds; previously they were accepted but ignored.
+    photo_boundaries = find_photo_boundaries(image, background_color, tolerance, min_area, min_contour_area)
+
+    os.makedirs(output_directory, exist_ok=True)
+
+    # Get the input file's base name without the extension
+    input_file_basename = os.path.splitext(os.path.basename(input_file))[0]
+
+    for idx, (x, y, w, h) in enumerate(photo_boundaries, start=1):
+        cropped_image = image[y:y+h, x:x+w]
+        cropped_image = auto_rotate(cropped_image, angle_threshold)
+
+        # Trim border_size pixels from every edge. A slice of [0:-0] would be
+        # empty, so skip the trim entirely when no border was requested.
+        if border_size > 0:
+            cropped_image = cropped_image[border_size:-border_size, border_size:-border_size]
+
+        # A crop thinner than the trim leaves nothing to write.
+        if cropped_image.size == 0:
+            continue
+
+        output_path = os.path.join(output_directory, f"{input_file_basename}_{idx}.png")
+        cv2.imwrite(output_path, cropped_image)
+        print(f"Saved {output_path}")
+
+if __name__ == "__main__":
+    # Both <input_file> and <output_directory> are required; sys.argv[0] is the
+    # script itself, so fewer than three entries means an argument is missing.
+    if len(sys.argv) < 3:
+        print("Usage: python3 split_photos.py <input_file> <output_directory> [tolerance] [min_area] [min_contour_area] [angle_threshold] [border_size]")
+        print("\nParameters:")
+        print("  <input_file>       - The input scanned image containing multiple photos.")
+        print("  <output_directory> - The directory where the result images should be placed.")
+        print("  [tolerance]        - Optional. Determines the range of color variation around the estimated background color (default: 20).")
+        print("  [min_area]         - Optional. Sets the minimum area threshold for a photo (default: 8000).")
+        print("  [min_contour_area] - Optional. Sets the minimum contour area threshold for a photo (default: 500).")
+        print("  [angle_threshold]  - Optional. Sets the minimum absolute angle required for the image to be rotated (default: 60).")
+        print("  [border_size]      - Optional. Sets the size of the border added and removed to prevent white borders in the output (default: 10).")
+        sys.exit(1)
+
+    input_file = sys.argv[1]
+    output_directory = sys.argv[2]
+    # Optional positional arguments; the fallbacks are the defaults shown in the usage text.
+    tolerance = int(sys.argv[3]) if len(sys.argv) > 3 else 20
+    min_area = int(sys.argv[4]) if len(sys.argv) > 4 else 8000
+    min_contour_area = int(sys.argv[5]) if len(sys.argv) > 5 else 500
+    angle_threshold = int(sys.argv[6]) if len(sys.argv) > 6 else 60
+    border_size = int(sys.argv[7]) if len(sys.argv) > 7 else 10
+    split_photos(input_file, output_directory, tolerance=tolerance, min_area=min_area, min_contour_area=min_contour_area, angle_threshold=angle_threshold, border_size=border_size)
--- a/src/main/java/stirling/software/SPDF/controller/converters/ConvertPDFToPDFA.java
+++ b/src/main/java/stirling/software/SPDF/controller/converters/ConvertPDFToPDFA.java
@ -6,8 +6,6 @@ import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.List;

-import org.springframework.http.HttpHeaders;
-import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.stereotype.Controller;
 import org.springframework.ui.Model;
@ -16,6 +14,7 @@ import org.springframework.web.bind.annotation.PostMapping;
 import org.springframework.web.bind.annotation.RequestParam;
 import org.springframework.web.multipart.MultipartFile;

+import stirling.software.SPDF.utils.PdfUtils;
 import stirling.software.SPDF.utils.ProcessExecutor;

@Controller
@ -52,10 +51,7 @@ public class ConvertPDFToPDFA {

        // Return the optimized PDF as a response
        String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
-        HttpHeaders headers = new HttpHeaders();
-        headers.setContentType(MediaType.APPLICATION_PDF);
-        headers.setContentDispositionFormData("attachment", outputFilename);
-        return ResponseEntity.ok().headers(headers).body(pdfBytes);
+        return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
    }

    @GetMapping("/pdf-to-pdfa")
--- a/src/main/java/stirling/software/SPDF/controller/other/CompressController.java
+++ b/src/main/java/stirling/software/SPDF/controller/other/CompressController.java
@ -8,8 +8,6 @@ import java.util.List;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.http.HttpHeaders;
-import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.stereotype.Controller;
 import org.springframework.ui.Model;
@ -18,6 +16,7 @@ import org.springframework.web.bind.annotation.PostMapping;
 import org.springframework.web.bind.annotation.RequestParam;
 import org.springframework.web.multipart.MultipartFile;

+import stirling.software.SPDF.utils.PdfUtils;
 import stirling.software.SPDF.utils.ProcessExecutor;

@Controller
@ -78,10 +77,7 @@ public class CompressController {

        // Return the optimized PDF as a response
        String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_Optimized.pdf";
-        HttpHeaders headers = new HttpHeaders();
-        headers.setContentType(MediaType.APPLICATION_PDF);
-        headers.setContentDispositionFormData("attachment", outputFilename);
-        return ResponseEntity.ok().headers(headers).body(pdfBytes);
+        return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
    }

 }
--- a/src/main/java/stirling/software/SPDF/controller/other/ExtractImageScansController.java
+++ b/src/main/java/stirling/software/SPDF/controller/other/ExtractImageScansController.java
@ -0,0 +1,145 @@
+package stirling.software.SPDF.controller.other;
+
+import java.awt.image.BufferedImage;
+import java.io.ByteArrayInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.StandardCopyOption;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipOutputStream;
+
+import javax.imageio.ImageIO;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.rendering.PDFRenderer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.http.MediaType;
+import org.springframework.http.ResponseEntity;
+import org.springframework.stereotype.Controller;
+import org.springframework.web.bind.annotation.GetMapping;
+import org.springframework.web.bind.annotation.PostMapping;
+import org.springframework.web.bind.annotation.RequestParam;
+import org.springframework.web.multipart.MultipartFile;
+import org.springframework.web.servlet.ModelAndView;
+
+import stirling.software.SPDF.utils.PdfUtils;
+import stirling.software.SPDF.utils.ProcessExecutor;
+
+/**
+ * Splits a scanned page (image or PDF) into the individual photos it contains
+ * by delegating to the {@code split_photos.py} OpenCV script.
+ */
+@Controller
+public class ExtractImageScansController {
+
+    private static final Logger logger = LoggerFactory.getLogger(ExtractImageScansController.class);
+
+    /** Serves the upload form for the extract-image-scans tool. */
+    @GetMapping("/extract-image-scans")
+    public ModelAndView extractImageScansForm() {
+        ModelAndView modelAndView = new ModelAndView("other/extract-image-scans");
+        modelAndView.addObject("currentPage", "extract-image-scans");
+        return modelAndView;
+    }
+
+    /**
+     * Runs the photo splitter over the uploaded file and returns the results.
+     *
+     * @param inputFile      uploaded image, or a PDF whose pages are rendered to PNG at 300 DPI
+     * @param angleThreshold minimum absolute angle before a photo is rotated
+     * @param tolerance      color tolerance around the estimated background color
+     * @param minArea        minimum bounding-box area for a detected photo
+     * @param minContourArea minimum contour area for a detected photo
+     * @return a single PNG when exactly one photo is found, otherwise a zip of PNGs
+     * @throws IOException          on file handling failures, or when no photo is detected
+     * @throws InterruptedException if waiting for the external process is interrupted
+     */
+    @PostMapping("/extract-image-scans")
+    public ResponseEntity<byte[]> extractImageScans(@RequestParam("fileInput") MultipartFile inputFile,
+                                                    @RequestParam(name = "angle_threshold", defaultValue = "5") int angleThreshold,
+                                                    @RequestParam(name = "tolerance", defaultValue = "20") int tolerance,
+                                                    @RequestParam(name = "min_area", defaultValue = "8000") int minArea,
+                                                    @RequestParam(name = "min_contour_area", defaultValue = "500") int minContourArea) throws IOException, InterruptedException {
+
+        String fileName = inputFile.getOriginalFilename();
+        String extension = fileName.substring(fileName.lastIndexOf(".") + 1);
+        String baseName = fileName.replaceFirst("[.][^.]+$", "");
+
+        // Temp files handed to the script; tracked so they are always removed.
+        List<Path> inputImages = new ArrayList<>();
+        List<byte[]> processedImageBytes = new ArrayList<>();
+
+        try {
+            // Check if input file is a PDF
+            if (extension.equalsIgnoreCase("pdf")) {
+                // Load PDF document
+                try (PDDocument document = PDDocument.load(new ByteArrayInputStream(inputFile.getBytes()))) {
+                    PDFRenderer pdfRenderer = new PDFRenderer(document);
+                    int pageCount = document.getNumberOfPages();
+
+                    // Create images of all pages
+                    for (int i = 0; i < pageCount; i++) {
+                        // Register the temp file before writing so it is cleaned up on failure
+                        Path tempFile = Files.createTempFile("image_", ".png");
+                        inputImages.add(tempFile);
+
+                        // Render image and save as temp file
+                        BufferedImage image = pdfRenderer.renderImageWithDPI(i, 300);
+                        ImageIO.write(image, "png", tempFile.toFile());
+                    }
+                }
+            } else {
+                Path tempInputFile = Files.createTempFile("input_", "." + extension);
+                inputImages.add(tempInputFile);
+                Files.copy(inputFile.getInputStream(), tempInputFile, StandardCopyOption.REPLACE_EXISTING);
+            }
+
+            // Process each image
+            for (Path inputImage : inputImages) {
+                Path tempDir = Files.createTempDirectory("openCV_output");
+                try {
+                    // Positional order expected by split_photos.py:
+                    // <input> <output_dir> [tolerance] [min_area] [min_contour_area] [angle_threshold]
+                    List<String> command = new ArrayList<>(Arrays.asList("python3", "/pythonScripts/split_photos.py",
+                            inputImage.toString(), tempDir.toString(),
+                            String.valueOf(tolerance), String.valueOf(minArea),
+                            String.valueOf(minContourArea), String.valueOf(angleThreshold)));
+
+                    // Run CLI command
+                    ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
+
+                    // Read the output photos; Files.list holds a directory handle until closed
+                    try (var outputFiles = Files.list(tempDir)) {
+                        for (Path tempOutputFile : outputFiles.sorted().collect(Collectors.toList())) {
+                            processedImageBytes.add(Files.readAllBytes(tempOutputFile));
+                        }
+                    }
+                } finally {
+                    // Clean up the temporary directory
+                    FileUtils.deleteDirectory(tempDir.toFile());
+                }
+            }
+        } finally {
+            for (Path inputImage : inputImages) {
+                try {
+                    Files.deleteIfExists(inputImage);
+                } catch (IOException e) {
+                    // Best effort: a cleanup failure must not mask the real outcome
+                    logger.warn("Could not delete temporary file {}", inputImage, e);
+                }
+            }
+        }
+
+        if (processedImageBytes.isEmpty()) {
+            // Previously surfaced as an IndexOutOfBoundsException from get(0)
+            throw new IOException("No photos could be detected in " + fileName);
+        }
+
+        // Create zip file if multiple images
+        if (processedImageBytes.size() > 1) {
+            String outputZipFilename = baseName + "_processed.zip";
+            Path tempZipFile = Files.createTempFile("output_", ".zip");
+            byte[] zipBytes;
+
+            try {
+                try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
+                    // Add processed images to the zip
+                    for (int i = 0; i < processedImageBytes.size(); i++) {
+                        ZipEntry entry = new ZipEntry(baseName + "_" + (i + 1) + ".png");
+                        zipOut.putNextEntry(entry);
+                        zipOut.write(processedImageBytes.get(i));
+                        zipOut.closeEntry();
+                    }
+                }
+                zipBytes = Files.readAllBytes(tempZipFile);
+            } finally {
+                // Clean up the temporary zip file
+                Files.deleteIfExists(tempZipFile);
+            }
+
+            return PdfUtils.bytesToWebResponse(zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
+        }
+
+        // Return the single processed image as a response
+        return PdfUtils.bytesToWebResponse(processedImageBytes.get(0), baseName + ".png", MediaType.IMAGE_PNG);
+    }
+
+}
--- a/src/main/java/stirling/software/SPDF/controller/other/ExtractImagesController.java
+++ b/src/main/java/stirling/software/SPDF/controller/other/ExtractImagesController.java
@ -18,10 +18,6 @@ import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.core.io.ByteArrayResource;
-import org.springframework.core.io.Resource;
-import org.springframework.http.HttpHeaders;
-import org.springframework.http.HttpStatus;
 import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.stereotype.Controller;
@ -31,13 +27,15 @@ import org.springframework.web.bind.annotation.PostMapping;
 import org.springframework.web.bind.annotation.RequestParam;
 import org.springframework.web.multipart.MultipartFile;

+import stirling.software.SPDF.utils.PdfUtils;
+
@Controller
 public class ExtractImagesController {

    private static final Logger logger = LoggerFactory.getLogger(ExtractImagesController.class);

    @PostMapping("/extract-images")
-    public ResponseEntity<Resource> extractImages(@RequestParam("fileInput") MultipartFile file, @RequestParam("format") String format) throws IOException {
+    public ResponseEntity<byte[]> extractImages(@RequestParam("fileInput") MultipartFile file, @RequestParam("format") String format) throws IOException {

        System.out.println(System.currentTimeMillis() + "file=" + file.getName() + ", format=" + format);
        PDDocument document = PDDocument.load(file.getBytes());
@ -98,18 +96,8 @@ public class ExtractImagesController {

        // Create ByteArrayResource from byte array
        byte[] zipContents = baos.toByteArray();
-        ByteArrayResource resource = new ByteArrayResource(zipContents);
-
-        // Set content disposition header to indicate that the response should be
-        // downloaded as a file
-        HttpHeaders headers = new HttpHeaders();
-        headers.setContentLength(zipContents.length);
-        headers.add(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted-images.zip");
-
-        // Return ResponseEntity with ByteArrayResource and headers
-        return ResponseEntity.status(HttpStatus.OK).headers(headers)
-
-                .header("Cache-Control", "no-cache").contentType(MediaType.APPLICATION_OCTET_STREAM).body(resource);
+        
+        return PdfUtils.boasToWebResponse(baos, file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM);
    }

    @GetMapping("/extract-images")
--- a/src/main/java/stirling/software/SPDF/controller/other/OCRController.java
+++ b/src/main/java/stirling/software/SPDF/controller/other/OCRController.java
@ -17,7 +17,6 @@ import java.util.zip.ZipOutputStream;

 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
-import org.springframework.http.HttpHeaders;
 import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
 import org.springframework.stereotype.Controller;
@ -27,6 +26,7 @@ import org.springframework.web.bind.annotation.RequestParam;
 import org.springframework.web.multipart.MultipartFile;
 import org.springframework.web.servlet.ModelAndView;

+import stirling.software.SPDF.utils.PdfUtils;
 import stirling.software.SPDF.utils.ProcessExecutor;

@Controller
@ -123,8 +123,6 @@ public class OCRController {
        // Return the OCR processed PDF as a response
        String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";

-        HttpHeaders headers = new HttpHeaders();
-
        if (sidecar != null && sidecar) {
            // Create a zip file containing both the PDF and the text file
            String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
@ -150,17 +148,13 @@ public class OCRController {
            Files.delete(tempZipFile);
            Files.delete(tempOutputFile);
            Files.delete(sidecarTextPath);
-
+            
            // Return the zip file containing both the PDF and the text file
-            headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
-            headers.setContentDispositionFormData("attachment", outputZipFilename);
-            return ResponseEntity.ok().headers(headers).body(zipBytes);
+            return PdfUtils.bytesToWebResponse(zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
        } else {
            // Return the OCR processed PDF as a response
            Files.delete(tempOutputFile);
-            headers.setContentType(MediaType.APPLICATION_PDF);
-            headers.setContentDispositionFormData("attachment", outputFilename);
-            return ResponseEntity.ok().headers(headers).body(pdfBytes);
+            return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
        }

    }
--- a/src/main/java/stirling/software/SPDF/utils/PDFToFile.java
+++ b/src/main/java/stirling/software/SPDF/utils/PDFToFile.java
@ -15,7 +15,6 @@ import java.util.zip.ZipOutputStream;

 import org.apache.commons.io.FileUtils;
 import org.apache.commons.io.IOUtils;
-import org.springframework.http.HttpHeaders;
 import org.springframework.http.HttpStatus;
 import org.springframework.http.MediaType;
 import org.springframework.http.ResponseEntity;
@ -41,8 +40,7 @@ public class PDFToFile {
        Path tempInputFile = null;
        Path tempOutputDir = null;
        byte[] fileBytes;
-        // Prepare response
-        HttpHeaders headers = new HttpHeaders();
+        String fileName = "temp.file";

        try {
            // Save the uploaded file to a temporary location
@ -60,19 +58,18 @@ public class PDFToFile {
            // Get output files
            List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());

+            
            if (outputFiles.size() == 1) {
                // Return single output file
                File outputFile = outputFiles.get(0);
-                headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
                if (outputFormat.equals("txt:Text")) {
                    outputFormat = "txt";
                }
-                headers.setContentDispositionFormData("attachment", pdfBaseName + "." + outputFormat);
+                fileName = pdfBaseName + "." + outputFormat;
                fileBytes = FileUtils.readFileToByteArray(outputFile);
            } else {
                // Return output files in a ZIP archive
-                headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
-                headers.setContentDispositionFormData("attachment", pdfBaseName + "To" + outputFormat + ".zip");
+                fileName =  pdfBaseName + "To" + outputFormat + ".zip";
                ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
                ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream);

@ -96,6 +93,6 @@ public class PDFToFile {
            if (tempOutputDir != null)
                FileUtils.deleteDirectory(tempOutputDir.toFile());
        }
-        return new ResponseEntity<>(fileBytes, headers, HttpStatus.OK);
+        return PdfUtils.bytesToWebResponse(fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
    }
 }
--- a/src/main/java/stirling/software/SPDF/utils/PdfUtils.java
+++ b/src/main/java/stirling/software/SPDF/utils/PdfUtils.java
@ -8,6 +8,8 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.net.URLEncoder;
+import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.security.KeyPair;
 import java.security.KeyStore;
@ -43,18 +45,26 @@ public class PdfUtils {

    public static ResponseEntity<byte[]> boasToWebResponse(ByteArrayOutputStream baos, String docName) throws IOException {
        return PdfUtils.bytesToWebResponse(baos.toByteArray(), docName);
-
    }

-    public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName) throws IOException {
+    public static ResponseEntity<byte[]> boasToWebResponse(ByteArrayOutputStream baos, String docName, MediaType mediaType) throws IOException {
+        return PdfUtils.bytesToWebResponse(baos.toByteArray(), docName, mediaType );
+    }
+    
+    public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName, MediaType mediaType ) throws IOException {

        // Return the PDF as a response
        HttpHeaders headers = new HttpHeaders();
-        headers.setContentType(MediaType.APPLICATION_PDF);
+        headers.setContentType(mediaType);
        headers.setContentLength(bytes.length);
-        headers.setContentDispositionFormData("attachment", docName);
+        String encodedDocName = URLEncoder.encode(docName, StandardCharsets.UTF_8.toString()).replaceAll("\\+", "%20");
+        headers.setContentDispositionFormData("attachment", encodedDocName);
        return new ResponseEntity<>(bytes, headers, HttpStatus.OK);
    }
+    
+    public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName) throws IOException {
+        return bytesToWebResponse(bytes, docName, MediaType.APPLICATION_PDF);
+    }

    public static byte[] convertFromPdf(byte[] inputStream, String imageType, ImageType colorType, boolean singleImage, int DPI) throws IOException, Exception {
        try (PDDocument document = PDDocument.load(new ByteArrayInputStream(inputStream))) {
--- a/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
+++ b/src/main/java/stirling/software/SPDF/utils/ProcessExecutor.java
@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore;
 public class ProcessExecutor {

    public enum Processes {
-        LIBRE_OFFICE, OCR_MY_PDF
+        LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV
    }

    private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
@ -23,6 +23,7 @@ public class ProcessExecutor {
            int semaphoreLimit = switch (key) {
            case LIBRE_OFFICE -> 1;
            case OCR_MY_PDF -> 2;
+            case PYTHON_OPENCV -> 8;
            };
            return new ProcessExecutor(semaphoreLimit);
        });
--- a/src/main/resources/templates/fragments/common.html
+++ b/src/main/resources/templates/fragments/common.html
@ -136,7 +136,7 @@ document.addEventListener("DOMContentLoaded", function () {
                        const contentDispositionHeader = response.headers.get('Content-Disposition');
                        console.log(contentDispositionHeader)
                        if (contentDispositionHeader && contentDispositionHeader.indexOf('attachment') !== -1) {
-                        	filename = contentDispositionHeader.split('filename=')[1].replace(/"/g, '');
+                        	filename = decodeURIComponent(contentDispositionHeader.split('filename=')[1].replace(/"/g, ''));
                        } else {
                            // If the Content-Disposition header is not present or does not contain the filename, use a default filename
                            filename = 'download';
--- a/src/main/resources/templates/other/extract-image-scans.html
+++ b/src/main/resources/templates/other/extract-image-scans.html
@ -0,0 +1,28 @@
+<!DOCTYPE html>
+<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
+
+<!-- Upload page for the extract-image-scans tool: a single-file form (image or PDF)
+     posted as multipart/form-data to "extract-image-scans" under the field name
+     "fileInput". No tuning inputs are rendered here, so only the file is submitted. -->
+<th:block th:insert="~{fragments/common :: head(title=#{extractImageScans.title})}"></th:block>
+
+
+<body>
+  <div id="page-container">
+    <div id="content-wrap">
+      <div th:insert="~{fragments/navbar.html :: navbar}"></div>
+      <br> <br>
+      <div class="container">
+        <div class="row justify-content-center">
+          <div class="col-md-6">
+            <h2 th:text="#{extractImageScans.header}"></h2>
+
+            <form id="multiPdfForm" th:action="@{extract-image-scans}" method="post" enctype="multipart/form-data">
+              <div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='image/*, application/pdf')}"></div>
+              <button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{extractImageScans.submit}"></button>
+            </form>
+          </div>
+        </div>
+      </div>
+    </div>
+    <div th:insert="~{fragments/footer.html :: footer}"></div>
+  </div>
+</body>
+</html>