bug fixes and image scan
This commit is contained in:
parent
ab4aea315a
commit
4327af5133
13 changed files with 397 additions and 60 deletions
|
@ -1,5 +1,9 @@
|
||||||
# Build jbig2enc in a separate stage
|
# Build jbig2enc in a separate stage
|
||||||
FROM frooodle/stirling-pdf-base:latest
|
FROM frooodle/stirling-pdf-base:beta
|
||||||
|
|
||||||
|
# Create pythonScripts folder and copy local scripts
|
||||||
|
RUN mkdir /pythonScripts
|
||||||
|
COPY ./pythonScripts/* /pythonScripts/
|
||||||
|
|
||||||
# Copy the application JAR file
|
# Copy the application JAR file
|
||||||
COPY build/libs/*.jar app.jar
|
COPY build/libs/*.jar app.jar
|
||||||
|
|
|
@ -38,7 +38,52 @@ RUN apt-get update && \
|
||||||
pngquant \
|
pngquant \
|
||||||
unpaper \
|
unpaper \
|
||||||
ocrmypdf && \
|
ocrmypdf && \
|
||||||
pip install --user --upgrade ocrmypdf
|
pip install --user --upgrade ocrmypdf && \
|
||||||
|
pip3 install opencv-python-headless
|
||||||
|
|
||||||
|
# Copy the jbig2enc binary from the builder stage
|
||||||
|
COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2# Build jbig2enc in a separate stage
|
||||||
|
FROM debian:bullseye-slim as jbig2enc_builder
|
||||||
|
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
git \
|
||||||
|
automake \
|
||||||
|
autoconf \
|
||||||
|
libtool \
|
||||||
|
libleptonica-dev \
|
||||||
|
pkg-config \
|
||||||
|
ca-certificates \
|
||||||
|
zlib1g-dev \
|
||||||
|
make \
|
||||||
|
g++
|
||||||
|
|
||||||
|
RUN git clone https://github.com/agl/jbig2enc && \
|
||||||
|
cd jbig2enc && \
|
||||||
|
./autogen.sh && \
|
||||||
|
./configure && \
|
||||||
|
make && \
|
||||||
|
make install
|
||||||
|
|
||||||
|
# Main stage
|
||||||
|
FROM openjdk:17-jdk-slim
|
||||||
|
|
||||||
|
# Install necessary dependencies
|
||||||
|
RUN apt-get update && \
|
||||||
|
apt-get install -y --no-install-recommends \
|
||||||
|
libreoffice-core \
|
||||||
|
libreoffice-common \
|
||||||
|
libreoffice-writer \
|
||||||
|
libreoffice-calc \
|
||||||
|
libreoffice-impress \
|
||||||
|
python3-uno \
|
||||||
|
python3-pip \
|
||||||
|
unoconv \
|
||||||
|
pngquant \
|
||||||
|
unpaper \
|
||||||
|
ocrmypdf && \
|
||||||
|
pip install --user --upgrade ocrmypdf && \
|
||||||
|
pip3 install opencv-python-headless
|
||||||
|
|
||||||
# Copy the jbig2enc binary from the builder stage
|
# Copy the jbig2enc binary from the builder stage
|
||||||
COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2
|
COPY --from=jbig2enc_builder /usr/local/bin/jbig2 /usr/local/bin/jbig2
|
133
pythonScripts/split_photos.py
Normal file
133
pythonScripts/split_photos.py
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
import sys
|
||||||
|
import cv2
|
||||||
|
import numpy as np
|
||||||
|
import os
|
||||||
|
|
||||||
|
def find_photo_boundaries(image, background_color, tolerance=30, min_area=10000, min_contour_area=500):
    """Locate the bounding boxes of regions that differ from the background.

    Pixels within ``background_color +/- tolerance`` (per channel) count as
    background; everything else is foreground. The foreground mask is dilated
    (5x5 kernel, two iterations) before external contours are extracted.

    Args:
        image: Image array accepted by ``cv2.inRange``.
        background_color: Background colour; must support ``- tolerance`` and
            ``+ tolerance`` arithmetic (e.g. a numpy array).
        tolerance: Allowed per-channel deviation from ``background_color``.
        min_area: Minimum bounding-box area (``w * h``) for a region to be kept.
        min_contour_area: Minimum ``cv2.contourArea`` for a region to be kept.

    Returns:
        A list of ``(x, y, w, h)`` bounding boxes, in contour order.
    """
    lower_bound = background_color - tolerance
    upper_bound = background_color + tolerance

    # inRange marks background pixels; invert so the foreground is non-zero.
    foreground = cv2.bitwise_not(cv2.inRange(image, lower_bound, upper_bound))
    foreground = cv2.dilate(foreground, np.ones((5, 5), np.uint8), iterations=2)
    outlines, _ = cv2.findContours(foreground, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    def _is_photo(rect, outline):
        _, _, width, height = rect
        return width * height >= min_area and cv2.contourArea(outline) >= min_contour_area

    candidates = ((cv2.boundingRect(outline), outline) for outline in outlines)
    return [tuple(rect) for rect, outline in candidates if _is_photo(rect, outline)]
|
||||||
|
|
||||||
|
def estimate_background_color(image, sample_points=5):
    """Estimate the background colour from five fixed probe pixels.

    The four corner pixels and the centre pixel are sampled and their
    per-channel median is returned.

    Args:
        image: 3-dimensional array indexed as ``image[row, column]``.
        sample_points: Currently unused; exactly five pixels are always sampled.

    Returns:
        The per-channel median of the sampled pixels (``np.median`` result).
    """
    height, width, _ = image.shape
    right, bottom = width - 1, height - 1

    # Probe order: top-left, top-right, bottom-right, bottom-left, centre.
    columns = (0, right, right, 0, width // 2)
    rows = (0, 0, bottom, bottom, height // 2)

    samples = [image[row, column] for row, column in zip(rows, columns)]
    return np.median(samples, axis=0)
|
||||||
|
|
||||||
|
def auto_rotate(image, angle_threshold=10):
    """Deskew *image* along the principal axis of its dark pixels.

    The image is Otsu-thresholded (inverted, so dark pixels become foreground)
    and the dominant direction of the foreground pixels is found with a
    principal-component analysis. The skew is the deviation of that direction
    from the nearest image axis, folded into ``[-45, 45)`` degrees.

    Fix over the previous implementation: the angle used to be read from the
    left singular vectors (``u``), whose first rows are merely the projections
    of the first two foreground pixels, so the result was unrelated to the
    actual orientation. The principal direction lives in ``vt``.

    Args:
        image: BGR image (it is converted with ``cv2.COLOR_BGR2GRAY``).
        angle_threshold: Minimum absolute skew, in degrees, required before
            the image is rotated. Values above 45 disable rotation.

    Returns:
        The input image unchanged when no usable foreground exists or the skew
        is below ``angle_threshold``; otherwise a rotated copy of the same size.
    """
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    _, binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if len(contours) == 0:
        return image

    # Keep the original guard: a largest contour with zero area (isolated
    # pixels or a thin line) carries no usable orientation information.
    largest_contour = max(contours, key=cv2.contourArea)
    if cv2.moments(largest_contour)["m00"] == 0:
        return image

    # Foreground coordinates as (row, col), centred on their own mean so the
    # scatter matrix describes spread only, not the offset from the origin.
    coords = np.column_stack(np.where(binary > 0)).astype(np.float64)
    centered = coords - coords.mean(axis=0)

    # The 2x2 scatter matrix has the same right singular vectors as the full
    # N x 2 coordinate matrix, without materialising an N x 2 ``u``.
    _, _, vt = np.linalg.svd(centered.T @ centered)
    d_row, d_col = vt[0]

    # Angle of the principal axis against the x axis. The sign of a singular
    # vector is arbitrary and either image axis is an acceptable target, so
    # fold modulo 90 degrees into [-45, 45).
    angle = float(np.degrees(np.arctan2(d_row, d_col)))
    angle = (angle + 45.0) % 90.0 - 45.0

    if abs(angle) < angle_threshold:
        return image

    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    # getRotationMatrix2D maps a direction at angle t to t - angle, so rotating
    # by the measured skew itself brings the principal axis onto an image axis.
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    return cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def crop_borders(image, border_color, tolerance=30):
    """Crop *image* to the bounding box of its largest border-coloured region.

    Pixels within ``border_color +/- tolerance`` (per channel) are selected,
    the external contours of that selection are extracted, and the image is
    cropped to the bounding rectangle of the contour with the largest area.

    Args:
        image: Image array accepted by ``cv2.inRange``.
        border_color: Colour to match; must support ``+/- tolerance``
            arithmetic (e.g. a numpy array).
        tolerance: Allowed per-channel deviation from ``border_color``.

    Returns:
        The cropped view of ``image``, or ``image`` itself when no pixel
        matches the border colour.
    """
    border_mask = cv2.inRange(image, border_color - tolerance, border_color + tolerance)
    regions, _ = cv2.findContours(border_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if not regions:
        return image

    left, top, width, height = cv2.boundingRect(max(regions, key=cv2.contourArea))
    return image[top:top + height, left:left + width]
|
||||||
|
|
||||||
|
def split_photos(input_file, output_directory, tolerance=30, min_area=10000, min_contour_area=500, angle_threshold=10, border_size=10):
    """Split a scan containing several photos into one PNG file per photo.

    The background colour is estimated from the scan, a border of that colour
    is added around it, photo regions are detected, and each region is
    cropped, optionally deskewed, trimmed by ``border_size`` pixels on every
    side and written to ``<output_directory>/<input basename>_<n>.png``.

    Args:
        input_file: Path of the scanned image to read.
        output_directory: Directory for the results; created when missing.
        tolerance: Per-channel colour tolerance around the background colour.
        min_area: Minimum bounding-box area for a region to count as a photo.
        min_contour_area: Minimum contour area for a region to count as a photo.
        angle_threshold: Minimum absolute skew (degrees) before a crop is rotated.
        border_size: Width in pixels of the border added around the scan and
            trimmed from every side of each crop.

    Raises:
        ValueError: If ``input_file`` cannot be read as an image.
    """
    image = cv2.imread(input_file)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read image file: {input_file}")

    background_color = estimate_background_color(image)

    # Add a constant border around the image
    image = cv2.copyMakeBorder(image, border_size, border_size, border_size, border_size, cv2.BORDER_CONSTANT, value=background_color)

    # Forward the area thresholds; they used to be accepted but silently
    # ignored in favour of find_photo_boundaries' own defaults.
    photo_boundaries = find_photo_boundaries(image, background_color, tolerance, min_area, min_contour_area)

    os.makedirs(output_directory, exist_ok=True)

    # Get the input file's base name without the extension
    input_file_basename = os.path.splitext(os.path.basename(input_file))[0]

    for idx, (x, y, w, h) in enumerate(photo_boundaries):
        cropped_image = image[y:y+h, x:x+w]
        cropped_image = auto_rotate(cropped_image, angle_threshold)

        # Trim the margin. Explicit end indices are required because a
        # ``-border_size`` end index yields an empty slice for border_size == 0.
        crop_h, crop_w = cropped_image.shape[:2]
        cropped_image = cropped_image[border_size:crop_h - border_size, border_size:crop_w - border_size]

        if cropped_image.size == 0:
            # Region smaller than the trimmed margin; nothing left to save.
            continue

        output_path = os.path.join(output_directory, f"{input_file_basename}_{idx+1}.png")
        cv2.imwrite(output_path, cropped_image)
        print(f"Saved {output_path}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Command-line defaults. The usage text is generated from these values so
    # it cannot drift from what the script actually does.
    default_tolerance = 20
    default_min_area = 8000
    default_min_contour_area = 500
    default_angle_threshold = 60
    default_border_size = 10

    # Both <input_file> and <output_directory> are mandatory, so at least
    # three argv entries (script name included) are required.
    if len(sys.argv) < 3:
        print("Usage: python3 split_photos.py <input_file> <output_directory> [tolerance] [min_area] [min_contour_area] [angle_threshold] [border_size]")
        print("\nParameters:")
        print("  <input_file>       - The input scanned image containing multiple photos.")
        print("  <output_directory> - The directory where the result images should be placed.")
        print(f"  [tolerance]        - Optional. Determines the range of color variation around the estimated background color (default: {default_tolerance}).")
        print(f"  [min_area]         - Optional. Sets the minimum area threshold for a photo (default: {default_min_area}).")
        print(f"  [min_contour_area] - Optional. Sets the minimum contour area threshold for a photo (default: {default_min_contour_area}).")
        print(f"  [angle_threshold]  - Optional. Sets the minimum absolute angle required for the image to be rotated (default: {default_angle_threshold}).")
        print(f"  [border_size]      - Optional. Sets the size of the border added and removed to prevent white borders in the output (default: {default_border_size}).")
        sys.exit(1)

    def _optional_int(position, default):
        """Return argv[position] as an int, or *default* when it is absent."""
        return int(sys.argv[position]) if len(sys.argv) > position else default

    input_file = sys.argv[1]
    output_directory = sys.argv[2]
    tolerance = _optional_int(3, default_tolerance)
    min_area = _optional_int(4, default_min_area)
    min_contour_area = _optional_int(5, default_min_contour_area)
    angle_threshold = _optional_int(6, default_angle_threshold)
    # [border_size] was documented in the usage text but never parsed.
    border_size = _optional_int(7, default_border_size)

    split_photos(
        input_file,
        output_directory,
        tolerance=tolerance,
        min_area=min_area,
        min_contour_area=min_contour_area,
        angle_threshold=angle_threshold,
        border_size=border_size,
    )
|
|
@ -6,8 +6,6 @@ import java.nio.file.Path;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
import org.springframework.http.HttpHeaders;
|
|
||||||
import org.springframework.http.MediaType;
|
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.stereotype.Controller;
|
import org.springframework.stereotype.Controller;
|
||||||
import org.springframework.ui.Model;
|
import org.springframework.ui.Model;
|
||||||
|
@ -16,6 +14,7 @@ import org.springframework.web.bind.annotation.PostMapping;
|
||||||
import org.springframework.web.bind.annotation.RequestParam;
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import stirling.software.SPDF.utils.PdfUtils;
|
||||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
|
|
||||||
@Controller
|
@Controller
|
||||||
|
@ -52,10 +51,7 @@ public class ConvertPDFToPDFA {
|
||||||
|
|
||||||
// Return the optimized PDF as a response
|
// Return the optimized PDF as a response
|
||||||
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
|
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_PDFA.pdf";
|
||||||
HttpHeaders headers = new HttpHeaders();
|
return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||||
headers.setContentType(MediaType.APPLICATION_PDF);
|
|
||||||
headers.setContentDispositionFormData("attachment", outputFilename);
|
|
||||||
return ResponseEntity.ok().headers(headers).body(pdfBytes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@GetMapping("/pdf-to-pdfa")
|
@GetMapping("/pdf-to-pdfa")
|
||||||
|
|
|
@ -8,8 +8,6 @@ import java.util.List;
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.http.HttpHeaders;
|
|
||||||
import org.springframework.http.MediaType;
|
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.stereotype.Controller;
|
import org.springframework.stereotype.Controller;
|
||||||
import org.springframework.ui.Model;
|
import org.springframework.ui.Model;
|
||||||
|
@ -18,6 +16,7 @@ import org.springframework.web.bind.annotation.PostMapping;
|
||||||
import org.springframework.web.bind.annotation.RequestParam;
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import stirling.software.SPDF.utils.PdfUtils;
|
||||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
|
|
||||||
@Controller
|
@Controller
|
||||||
|
@ -78,10 +77,7 @@ public class CompressController {
|
||||||
|
|
||||||
// Return the optimized PDF as a response
|
// Return the optimized PDF as a response
|
||||||
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_Optimized.pdf";
|
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_Optimized.pdf";
|
||||||
HttpHeaders headers = new HttpHeaders();
|
return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||||
headers.setContentType(MediaType.APPLICATION_PDF);
|
|
||||||
headers.setContentDispositionFormData("attachment", outputFilename);
|
|
||||||
return ResponseEntity.ok().headers(headers).body(pdfBytes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,145 @@
|
||||||
|
package stirling.software.SPDF.controller.other;
|
||||||
|
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.io.ByteArrayInputStream;
|
||||||
|
import java.io.FileOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.io.InputStream;
|
||||||
|
import java.nio.file.Files;
|
||||||
|
import java.nio.file.Path;
|
||||||
|
import java.nio.file.StandardCopyOption;
|
||||||
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
|
import java.util.List;
|
||||||
|
import java.util.stream.Collectors;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
|
||||||
|
import org.apache.commons.io.FileUtils;
|
||||||
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
import org.apache.pdfbox.rendering.PDFRenderer;
|
||||||
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
|
import org.springframework.http.MediaType;
|
||||||
|
import org.springframework.http.ResponseEntity;
|
||||||
|
import org.springframework.stereotype.Controller;
|
||||||
|
import org.springframework.web.bind.annotation.GetMapping;
|
||||||
|
import org.springframework.web.bind.annotation.PostMapping;
|
||||||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
import org.springframework.web.servlet.ModelAndView;
|
||||||
|
|
||||||
|
import stirling.software.SPDF.utils.PdfUtils;
|
||||||
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
|
|
||||||
|
@Controller
|
||||||
|
public class ExtractImageScansController {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(ExtractImageScansController.class);
|
||||||
|
|
||||||
|
|
||||||
|
@GetMapping("/extract-image-scans")
|
||||||
|
public ModelAndView extractImageScansForm() {
|
||||||
|
ModelAndView modelAndView = new ModelAndView("other/extract-image-scans");
|
||||||
|
modelAndView.addObject("currentPage", "extract-image-scans");
|
||||||
|
return modelAndView;
|
||||||
|
}
|
||||||
|
|
||||||
|
@PostMapping("/extract-image-scans")
|
||||||
|
public ResponseEntity<byte[]> extractImageScans(@RequestParam("fileInput") MultipartFile inputFile,
|
||||||
|
@RequestParam(name = "angle_threshold", defaultValue = "5") int angleThreshold,
|
||||||
|
@RequestParam(name = "tolerance", defaultValue = "20") int tolerance,
|
||||||
|
@RequestParam(name = "min_area", defaultValue = "8000") int minArea,
|
||||||
|
@RequestParam(name = "min_contour_area", defaultValue = "500") int minContourArea) throws IOException, InterruptedException {
|
||||||
|
|
||||||
|
String fileName = inputFile.getOriginalFilename();
|
||||||
|
String extension = fileName.substring(fileName.lastIndexOf(".") + 1);
|
||||||
|
|
||||||
|
List<String> images = new ArrayList<>();
|
||||||
|
|
||||||
|
// Check if input file is a PDF
|
||||||
|
if (extension.equalsIgnoreCase("pdf")) {
|
||||||
|
// Load PDF document
|
||||||
|
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(inputFile.getBytes()))) {
|
||||||
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
|
int pageCount = document.getNumberOfPages();
|
||||||
|
images = new ArrayList<>();
|
||||||
|
|
||||||
|
// Create images of all pages
|
||||||
|
for (int i = 0; i < pageCount; i++) {
|
||||||
|
// Create temp file to save the image
|
||||||
|
Path tempFile = Files.createTempFile("image_", ".png");
|
||||||
|
|
||||||
|
// Render image and save as temp file
|
||||||
|
BufferedImage image = pdfRenderer.renderImageWithDPI(i, 300);
|
||||||
|
ImageIO.write(image, "png", tempFile.toFile());
|
||||||
|
|
||||||
|
// Add temp file path to images list
|
||||||
|
images.add(tempFile.toString());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Path tempInputFile = Files.createTempFile("input_", "." + extension);
|
||||||
|
Files.copy(inputFile.getInputStream(), tempInputFile, StandardCopyOption.REPLACE_EXISTING);
|
||||||
|
// Add input file path to images list
|
||||||
|
images.add(tempInputFile.toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
List<byte[]> processedImageBytes = new ArrayList<>();
|
||||||
|
|
||||||
|
// Process each image
|
||||||
|
for (int i = 0; i < images.size(); i++) {
|
||||||
|
|
||||||
|
Path tempDir = Files.createTempDirectory("openCV_output");
|
||||||
|
List<String> command = new ArrayList<>(Arrays.asList("python3", "/pythonScripts/split_photos.py", images.get(i), tempDir.toString(), String.valueOf(angleThreshold), String.valueOf(tolerance),String.valueOf(minArea),String.valueOf(minContourArea)));
|
||||||
|
|
||||||
|
// Run CLI command
|
||||||
|
int returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.PYTHON_OPENCV).runCommandWithOutputHandling(command);
|
||||||
|
|
||||||
|
// Read the output photos in temp directory
|
||||||
|
List<Path> tempOutputFiles = Files.list(tempDir).sorted().collect(Collectors.toList());
|
||||||
|
for (Path tempOutputFile : tempOutputFiles) {
|
||||||
|
byte[] imageBytes = Files.readAllBytes(tempOutputFile);
|
||||||
|
processedImageBytes.add(imageBytes);
|
||||||
|
}
|
||||||
|
// Clean up the temporary directory
|
||||||
|
FileUtils.deleteDirectory(tempDir.toFile());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create zip file if multiple images
|
||||||
|
if (processedImageBytes.size() > 1) {
|
||||||
|
String outputZipFilename = fileName.replaceFirst("[.][^.]+$", "") + "_processed.zip";
|
||||||
|
Path tempZipFile = Files.createTempFile("output_", ".zip");
|
||||||
|
|
||||||
|
try (ZipOutputStream zipOut = new ZipOutputStream(new FileOutputStream(tempZipFile.toFile()))) {
|
||||||
|
// Add processed images to the zip
|
||||||
|
for (int i = 0; i < processedImageBytes.size(); i++) {
|
||||||
|
ZipEntry entry = new ZipEntry(fileName.replaceFirst("[.][^.]+$", "") + "_" + (i+1) + ".png");
|
||||||
|
zipOut.putNextEntry(entry);
|
||||||
|
zipOut.write(processedImageBytes.get(i));
|
||||||
|
zipOut.closeEntry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
byte[] zipBytes = Files.readAllBytes(tempZipFile);
|
||||||
|
|
||||||
|
// Clean up the temporary zip file
|
||||||
|
Files.delete(tempZipFile);
|
||||||
|
|
||||||
|
return PdfUtils.bytesToWebResponse(zipBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
|
||||||
|
} else {
|
||||||
|
// Return the processed image as a response
|
||||||
|
byte[] imageBytes = processedImageBytes.get(0);
|
||||||
|
return PdfUtils.bytesToWebResponse(imageBytes, fileName.replaceFirst("[.][^.]+$", "") + ".png", MediaType.IMAGE_PNG);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
}
|
|
@ -18,10 +18,6 @@ import org.apache.pdfbox.pdmodel.PDPage;
|
||||||
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.core.io.ByteArrayResource;
|
|
||||||
import org.springframework.core.io.Resource;
|
|
||||||
import org.springframework.http.HttpHeaders;
|
|
||||||
import org.springframework.http.HttpStatus;
|
|
||||||
import org.springframework.http.MediaType;
|
import org.springframework.http.MediaType;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.stereotype.Controller;
|
import org.springframework.stereotype.Controller;
|
||||||
|
@ -31,13 +27,15 @@ import org.springframework.web.bind.annotation.PostMapping;
|
||||||
import org.springframework.web.bind.annotation.RequestParam;
|
import org.springframework.web.bind.annotation.RequestParam;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
|
import stirling.software.SPDF.utils.PdfUtils;
|
||||||
|
|
||||||
@Controller
|
@Controller
|
||||||
public class ExtractImagesController {
|
public class ExtractImagesController {
|
||||||
|
|
||||||
private static final Logger logger = LoggerFactory.getLogger(ExtractImagesController.class);
|
private static final Logger logger = LoggerFactory.getLogger(ExtractImagesController.class);
|
||||||
|
|
||||||
@PostMapping("/extract-images")
|
@PostMapping("/extract-images")
|
||||||
public ResponseEntity<Resource> extractImages(@RequestParam("fileInput") MultipartFile file, @RequestParam("format") String format) throws IOException {
|
public ResponseEntity<byte[]> extractImages(@RequestParam("fileInput") MultipartFile file, @RequestParam("format") String format) throws IOException {
|
||||||
|
|
||||||
System.out.println(System.currentTimeMillis() + "file=" + file.getName() + ", format=" + format);
|
System.out.println(System.currentTimeMillis() + "file=" + file.getName() + ", format=" + format);
|
||||||
PDDocument document = PDDocument.load(file.getBytes());
|
PDDocument document = PDDocument.load(file.getBytes());
|
||||||
|
@ -98,18 +96,8 @@ public class ExtractImagesController {
|
||||||
|
|
||||||
// Create ByteArrayResource from byte array
|
// Create ByteArrayResource from byte array
|
||||||
byte[] zipContents = baos.toByteArray();
|
byte[] zipContents = baos.toByteArray();
|
||||||
ByteArrayResource resource = new ByteArrayResource(zipContents);
|
|
||||||
|
|
||||||
// Set content disposition header to indicate that the response should be
|
return PdfUtils.boasToWebResponse(baos, file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM);
|
||||||
// downloaded as a file
|
|
||||||
HttpHeaders headers = new HttpHeaders();
|
|
||||||
headers.setContentLength(zipContents.length);
|
|
||||||
headers.add(HttpHeaders.CONTENT_DISPOSITION, "attachment; filename=" + file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_extracted-images.zip");
|
|
||||||
|
|
||||||
// Return ResponseEntity with ByteArrayResource and headers
|
|
||||||
return ResponseEntity.status(HttpStatus.OK).headers(headers)
|
|
||||||
|
|
||||||
.header("Cache-Control", "no-cache").contentType(MediaType.APPLICATION_OCTET_STREAM).body(resource);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@GetMapping("/extract-images")
|
@GetMapping("/extract-images")
|
||||||
|
|
|
@ -17,7 +17,6 @@ import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
import org.slf4j.Logger;
|
import org.slf4j.Logger;
|
||||||
import org.slf4j.LoggerFactory;
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.http.HttpHeaders;
|
|
||||||
import org.springframework.http.MediaType;
|
import org.springframework.http.MediaType;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
import org.springframework.stereotype.Controller;
|
import org.springframework.stereotype.Controller;
|
||||||
|
@ -27,6 +26,7 @@ import org.springframework.web.bind.annotation.RequestParam;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
import org.springframework.web.servlet.ModelAndView;
|
import org.springframework.web.servlet.ModelAndView;
|
||||||
|
|
||||||
|
import stirling.software.SPDF.utils.PdfUtils;
|
||||||
import stirling.software.SPDF.utils.ProcessExecutor;
|
import stirling.software.SPDF.utils.ProcessExecutor;
|
||||||
|
|
||||||
@Controller
|
@Controller
|
||||||
|
@ -123,8 +123,6 @@ public class OCRController {
|
||||||
// Return the OCR processed PDF as a response
|
// Return the OCR processed PDF as a response
|
||||||
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
String outputFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.pdf";
|
||||||
|
|
||||||
HttpHeaders headers = new HttpHeaders();
|
|
||||||
|
|
||||||
if (sidecar != null && sidecar) {
|
if (sidecar != null && sidecar) {
|
||||||
// Create a zip file containing both the PDF and the text file
|
// Create a zip file containing both the PDF and the text file
|
||||||
String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
|
String outputZipFilename = inputFile.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_OCR.zip";
|
||||||
|
@ -152,15 +150,11 @@ public class OCRController {
|
||||||
Files.delete(sidecarTextPath);
|
Files.delete(sidecarTextPath);
|
||||||
|
|
||||||
// Return the zip file containing both the PDF and the text file
|
// Return the zip file containing both the PDF and the text file
|
||||||
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
return PdfUtils.bytesToWebResponse(pdfBytes, outputZipFilename, MediaType.APPLICATION_OCTET_STREAM);
|
||||||
headers.setContentDispositionFormData("attachment", outputZipFilename);
|
|
||||||
return ResponseEntity.ok().headers(headers).body(zipBytes);
|
|
||||||
} else {
|
} else {
|
||||||
// Return the OCR processed PDF as a response
|
// Return the OCR processed PDF as a response
|
||||||
Files.delete(tempOutputFile);
|
Files.delete(tempOutputFile);
|
||||||
headers.setContentType(MediaType.APPLICATION_PDF);
|
return PdfUtils.bytesToWebResponse(pdfBytes, outputFilename);
|
||||||
headers.setContentDispositionFormData("attachment", outputFilename);
|
|
||||||
return ResponseEntity.ok().headers(headers).body(pdfBytes);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -15,7 +15,6 @@ import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
import org.apache.commons.io.FileUtils;
|
import org.apache.commons.io.FileUtils;
|
||||||
import org.apache.commons.io.IOUtils;
|
import org.apache.commons.io.IOUtils;
|
||||||
import org.springframework.http.HttpHeaders;
|
|
||||||
import org.springframework.http.HttpStatus;
|
import org.springframework.http.HttpStatus;
|
||||||
import org.springframework.http.MediaType;
|
import org.springframework.http.MediaType;
|
||||||
import org.springframework.http.ResponseEntity;
|
import org.springframework.http.ResponseEntity;
|
||||||
|
@ -41,8 +40,7 @@ public class PDFToFile {
|
||||||
Path tempInputFile = null;
|
Path tempInputFile = null;
|
||||||
Path tempOutputDir = null;
|
Path tempOutputDir = null;
|
||||||
byte[] fileBytes;
|
byte[] fileBytes;
|
||||||
// Prepare response
|
String fileName = "temp.file";
|
||||||
HttpHeaders headers = new HttpHeaders();
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Save the uploaded file to a temporary location
|
// Save the uploaded file to a temporary location
|
||||||
|
@ -60,19 +58,18 @@ public class PDFToFile {
|
||||||
// Get output files
|
// Get output files
|
||||||
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
|
List<File> outputFiles = Arrays.asList(tempOutputDir.toFile().listFiles());
|
||||||
|
|
||||||
|
|
||||||
if (outputFiles.size() == 1) {
|
if (outputFiles.size() == 1) {
|
||||||
// Return single output file
|
// Return single output file
|
||||||
File outputFile = outputFiles.get(0);
|
File outputFile = outputFiles.get(0);
|
||||||
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
|
||||||
if (outputFormat.equals("txt:Text")) {
|
if (outputFormat.equals("txt:Text")) {
|
||||||
outputFormat = "txt";
|
outputFormat = "txt";
|
||||||
}
|
}
|
||||||
headers.setContentDispositionFormData("attachment", pdfBaseName + "." + outputFormat);
|
fileName = pdfBaseName + "." + outputFormat;
|
||||||
fileBytes = FileUtils.readFileToByteArray(outputFile);
|
fileBytes = FileUtils.readFileToByteArray(outputFile);
|
||||||
} else {
|
} else {
|
||||||
// Return output files in a ZIP archive
|
// Return output files in a ZIP archive
|
||||||
headers.setContentType(MediaType.APPLICATION_OCTET_STREAM);
|
fileName = pdfBaseName + "To" + outputFormat + ".zip";
|
||||||
headers.setContentDispositionFormData("attachment", pdfBaseName + "To" + outputFormat + ".zip");
|
|
||||||
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
|
||||||
ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream);
|
ZipOutputStream zipOutputStream = new ZipOutputStream(byteArrayOutputStream);
|
||||||
|
|
||||||
|
@ -96,6 +93,6 @@ public class PDFToFile {
|
||||||
if (tempOutputDir != null)
|
if (tempOutputDir != null)
|
||||||
FileUtils.deleteDirectory(tempOutputDir.toFile());
|
FileUtils.deleteDirectory(tempOutputDir.toFile());
|
||||||
}
|
}
|
||||||
return new ResponseEntity<>(fileBytes, headers, HttpStatus.OK);
|
return PdfUtils.bytesToWebResponse(fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,8 @@ import java.io.File;
|
||||||
import java.io.FileOutputStream;
|
import java.io.FileOutputStream;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.io.InputStream;
|
import java.io.InputStream;
|
||||||
|
import java.net.URLEncoder;
|
||||||
|
import java.nio.charset.StandardCharsets;
|
||||||
import java.nio.file.Files;
|
import java.nio.file.Files;
|
||||||
import java.security.KeyPair;
|
import java.security.KeyPair;
|
||||||
import java.security.KeyStore;
|
import java.security.KeyStore;
|
||||||
|
@ -43,19 +45,27 @@ public class PdfUtils {
|
||||||
|
|
||||||
public static ResponseEntity<byte[]> boasToWebResponse(ByteArrayOutputStream baos, String docName) throws IOException {
|
public static ResponseEntity<byte[]> boasToWebResponse(ByteArrayOutputStream baos, String docName) throws IOException {
|
||||||
return PdfUtils.bytesToWebResponse(baos.toByteArray(), docName);
|
return PdfUtils.bytesToWebResponse(baos.toByteArray(), docName);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName) throws IOException {
|
public static ResponseEntity<byte[]> boasToWebResponse(ByteArrayOutputStream baos, String docName, MediaType mediaType) throws IOException {
|
||||||
|
return PdfUtils.bytesToWebResponse(baos.toByteArray(), docName, mediaType );
|
||||||
|
}
|
||||||
|
|
||||||
|
public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName, MediaType mediaType ) throws IOException {
|
||||||
|
|
||||||
// Return the PDF as a response
|
// Return the PDF as a response
|
||||||
HttpHeaders headers = new HttpHeaders();
|
HttpHeaders headers = new HttpHeaders();
|
||||||
headers.setContentType(MediaType.APPLICATION_PDF);
|
headers.setContentType(mediaType);
|
||||||
headers.setContentLength(bytes.length);
|
headers.setContentLength(bytes.length);
|
||||||
headers.setContentDispositionFormData("attachment", docName);
|
String encodedDocName = URLEncoder.encode(docName, StandardCharsets.UTF_8.toString()).replaceAll("\\+", "%20");
|
||||||
|
headers.setContentDispositionFormData("attachment", encodedDocName);
|
||||||
return new ResponseEntity<>(bytes, headers, HttpStatus.OK);
|
return new ResponseEntity<>(bytes, headers, HttpStatus.OK);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public static ResponseEntity<byte[]> bytesToWebResponse(byte[] bytes, String docName) throws IOException {
|
||||||
|
return bytesToWebResponse(bytes, docName, MediaType.APPLICATION_PDF);
|
||||||
|
}
|
||||||
|
|
||||||
public static byte[] convertFromPdf(byte[] inputStream, String imageType, ImageType colorType, boolean singleImage, int DPI) throws IOException, Exception {
|
public static byte[] convertFromPdf(byte[] inputStream, String imageType, ImageType colorType, boolean singleImage, int DPI) throws IOException, Exception {
|
||||||
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(inputStream))) {
|
try (PDDocument document = PDDocument.load(new ByteArrayInputStream(inputStream))) {
|
||||||
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
PDFRenderer pdfRenderer = new PDFRenderer(document);
|
||||||
|
|
|
@ -13,7 +13,7 @@ import java.util.concurrent.Semaphore;
|
||||||
public class ProcessExecutor {
|
public class ProcessExecutor {
|
||||||
|
|
||||||
public enum Processes {
|
public enum Processes {
|
||||||
LIBRE_OFFICE, OCR_MY_PDF
|
LIBRE_OFFICE, OCR_MY_PDF, PYTHON_OPENCV
|
||||||
}
|
}
|
||||||
|
|
||||||
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
private static final Map<Processes, ProcessExecutor> instances = new ConcurrentHashMap<>();
|
||||||
|
@ -23,6 +23,7 @@ public class ProcessExecutor {
|
||||||
int semaphoreLimit = switch (key) {
|
int semaphoreLimit = switch (key) {
|
||||||
case LIBRE_OFFICE -> 1;
|
case LIBRE_OFFICE -> 1;
|
||||||
case OCR_MY_PDF -> 2;
|
case OCR_MY_PDF -> 2;
|
||||||
|
case PYTHON_OPENCV -> 8;
|
||||||
};
|
};
|
||||||
return new ProcessExecutor(semaphoreLimit);
|
return new ProcessExecutor(semaphoreLimit);
|
||||||
});
|
});
|
||||||
|
|
|
@ -136,7 +136,7 @@ document.addEventListener("DOMContentLoaded", function () {
|
||||||
const contentDispositionHeader = response.headers.get('Content-Disposition');
|
const contentDispositionHeader = response.headers.get('Content-Disposition');
|
||||||
console.log(contentDispositionHeader)
|
console.log(contentDispositionHeader)
|
||||||
if (contentDispositionHeader && contentDispositionHeader.indexOf('attachment') !== -1) {
|
if (contentDispositionHeader && contentDispositionHeader.indexOf('attachment') !== -1) {
|
||||||
filename = contentDispositionHeader.split('filename=')[1].replace(/"/g, '');
|
filename = decodeURIComponent(contentDispositionHeader.split('filename=')[1].replace(/"/g, ''));
|
||||||
} else {
|
} else {
|
||||||
// If the Content-Disposition header is not present or does not contain the filename, use a default filename
|
// If the Content-Disposition header is not present or does not contain the filename, use a default filename
|
||||||
filename = 'download';
|
filename = 'download';
|
||||||
|
|
28
src/main/resources/templates/other/extract-image-scans.html
Normal file
28
src/main/resources/templates/other/extract-image-scans.html
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
<!DOCTYPE html>
|
||||||
|
<html th:lang="${#locale.toString()}" th:lang-direction="#{language.direction}" xmlns:th="http://www.thymeleaf.org">
|
||||||
|
|
||||||
|
<th:block th:insert="~{fragments/common :: head(title=#{extractImageScans.title})}"></th:block>
|
||||||
|
|
||||||
|
|
||||||
|
<body>
|
||||||
|
<div id="page-container">
|
||||||
|
<div id="content-wrap">
|
||||||
|
<div th:insert="~{fragments/navbar.html :: navbar}"></div>
|
||||||
|
<br> <br>
|
||||||
|
<div class="container">
|
||||||
|
<div class="row justify-content-center">
|
||||||
|
<div class="col-md-6">
|
||||||
|
<h2 th:text="#{extractImageScans.header}"></h2>
|
||||||
|
|
||||||
|
<form id="multiPdfForm" th:action="@{extract-image-scans}" method="post" enctype="multipart/form-data">
|
||||||
|
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='image/*, application/pdf')}"></div>
|
||||||
|
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{extractImageScans.submit}"></button>
|
||||||
|
</form>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div th:insert="~{fragments/footer.html :: footer}"></div>
|
||||||
|
</div>
|
||||||
|
</body>
|
||||||
|
</html>
|
Loading…
Reference in a new issue