Added multithreaded feature for image extraction (#1641)
Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
This commit is contained in:
parent
96b080528b
commit
e50391a44a
1 changed files with 123 additions and 75 deletions
|
@ -1,19 +1,8 @@
|
||||||
package stirling.software.SPDF.controller.api.misc;
|
package stirling.software.SPDF.controller.api.misc;
|
||||||
|
|
||||||
import java.awt.Graphics2D;
|
import io.github.pixee.security.Filenames;
|
||||||
import java.awt.Image;
|
import io.swagger.v3.oas.annotations.Operation;
|
||||||
import java.awt.image.BufferedImage;
|
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||||
import java.awt.image.RenderedImage;
|
|
||||||
import java.io.ByteArrayOutputStream;
|
|
||||||
import java.io.IOException;
|
|
||||||
import java.util.HashSet;
|
|
||||||
import java.util.Set;
|
|
||||||
import java.util.zip.Deflater;
|
|
||||||
import java.util.zip.ZipEntry;
|
|
||||||
import java.util.zip.ZipOutputStream;
|
|
||||||
|
|
||||||
import javax.imageio.ImageIO;
|
|
||||||
|
|
||||||
import org.apache.pdfbox.Loader;
|
import org.apache.pdfbox.Loader;
|
||||||
import org.apache.pdfbox.cos.COSName;
|
import org.apache.pdfbox.cos.COSName;
|
||||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||||
|
@ -28,14 +17,25 @@ import org.springframework.web.bind.annotation.PostMapping;
|
||||||
import org.springframework.web.bind.annotation.RequestMapping;
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||||||
import org.springframework.web.bind.annotation.RestController;
|
import org.springframework.web.bind.annotation.RestController;
|
||||||
import org.springframework.web.multipart.MultipartFile;
|
import org.springframework.web.multipart.MultipartFile;
|
||||||
|
|
||||||
import io.github.pixee.security.Filenames;
|
|
||||||
import io.swagger.v3.oas.annotations.Operation;
|
|
||||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
|
||||||
|
|
||||||
import stirling.software.SPDF.model.api.PDFWithImageFormatRequest;
|
import stirling.software.SPDF.model.api.PDFWithImageFormatRequest;
|
||||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||||
|
|
||||||
|
import javax.imageio.ImageIO;
|
||||||
|
import java.awt.*;
|
||||||
|
import java.awt.image.BufferedImage;
|
||||||
|
import java.awt.image.RenderedImage;
|
||||||
|
import java.io.ByteArrayOutputStream;
|
||||||
|
import java.io.IOException;
|
||||||
|
import java.util.HashSet;
|
||||||
|
import java.util.Set;
|
||||||
|
import java.util.concurrent.ExecutionException;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.Future;
|
||||||
|
import java.util.zip.Deflater;
|
||||||
|
import java.util.zip.ZipEntry;
|
||||||
|
import java.util.zip.ZipOutputStream;
|
||||||
|
|
||||||
@RestController
|
@RestController
|
||||||
@RequestMapping("/api/v1/misc")
|
@RequestMapping("/api/v1/misc")
|
||||||
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
@Tag(name = "Misc", description = "Miscellaneous APIs")
|
||||||
|
@ -47,16 +47,19 @@ public class ExtractImagesController {
|
||||||
@Operation(
|
@Operation(
|
||||||
summary = "Extract images from a PDF file",
|
summary = "Extract images from a PDF file",
|
||||||
description =
|
description =
|
||||||
"This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input:PDF Output:IMAGE/ZIP Type:SIMO")
|
"This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input: PDF Output: IMAGE/ZIP Type: SIMO")
|
||||||
public ResponseEntity<byte[]> extractImages(@ModelAttribute PDFWithImageFormatRequest request)
|
public ResponseEntity<byte[]> extractImages(@ModelAttribute PDFWithImageFormatRequest request)
|
||||||
throws IOException {
|
throws IOException, InterruptedException, ExecutionException {
|
||||||
MultipartFile file = request.getFileInput();
|
MultipartFile file = request.getFileInput();
|
||||||
String format = request.getFormat();
|
String format = request.getFormat();
|
||||||
|
|
||||||
System.out.println(
|
System.out.println(
|
||||||
System.currentTimeMillis() + "file=" + file.getName() + ", format=" + format);
|
System.currentTimeMillis() + " file=" + file.getName() + ", format=" + format);
|
||||||
PDDocument document = Loader.loadPDF(file.getBytes());
|
PDDocument document = Loader.loadPDF(file.getBytes());
|
||||||
|
|
||||||
|
// Determine if multithreading should be used based on PDF size or number of pages
|
||||||
|
boolean useMultithreading = shouldUseMultithreading(file, document);
|
||||||
|
|
||||||
// Create ByteArrayOutputStream to write zip file to byte array
|
// Create ByteArrayOutputStream to write zip file to byte array
|
||||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||||
|
|
||||||
|
@ -66,71 +69,51 @@ public class ExtractImagesController {
|
||||||
// Set compression level
|
// Set compression level
|
||||||
zos.setLevel(Deflater.BEST_COMPRESSION);
|
zos.setLevel(Deflater.BEST_COMPRESSION);
|
||||||
|
|
||||||
int imageIndex = 1;
|
|
||||||
String filename =
|
String filename =
|
||||||
Filenames.toSimpleFileName(file.getOriginalFilename())
|
Filenames.toSimpleFileName(file.getOriginalFilename())
|
||||||
.replaceFirst("[.][^.]+$", "");
|
.replaceFirst("[.][^.]+$", "");
|
||||||
int pageNum = 0;
|
|
||||||
Set<Integer> processedImages = new HashSet<>();
|
Set<Integer> processedImages = new HashSet<>();
|
||||||
// Iterate over each page
|
|
||||||
for (PDPage page : document.getPages()) {
|
|
||||||
++pageNum;
|
|
||||||
// Extract images from page
|
|
||||||
for (COSName name : page.getResources().getXObjectNames()) {
|
|
||||||
if (page.getResources().isImageXObject(name)) {
|
|
||||||
PDImageXObject image = (PDImageXObject) page.getResources().getXObject(name);
|
|
||||||
int imageHash = image.hashCode();
|
|
||||||
if (processedImages.contains(imageHash)) {
|
|
||||||
continue; // Skip already processed images
|
|
||||||
}
|
|
||||||
processedImages.add(imageHash);
|
|
||||||
|
|
||||||
// Convert image to desired format
|
if (useMultithreading) {
|
||||||
RenderedImage renderedImage = image.getImage();
|
// Executor service to handle multithreading
|
||||||
BufferedImage bufferedImage = null;
|
ExecutorService executor =
|
||||||
if ("png".equalsIgnoreCase(format)) {
|
Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
|
||||||
bufferedImage =
|
Set<Future<Void>> futures = new HashSet<>();
|
||||||
new BufferedImage(
|
|
||||||
renderedImage.getWidth(),
|
|
||||||
renderedImage.getHeight(),
|
|
||||||
BufferedImage.TYPE_INT_ARGB);
|
|
||||||
} else if ("jpeg".equalsIgnoreCase(format) || "jpg".equalsIgnoreCase(format)) {
|
|
||||||
bufferedImage =
|
|
||||||
new BufferedImage(
|
|
||||||
renderedImage.getWidth(),
|
|
||||||
renderedImage.getHeight(),
|
|
||||||
BufferedImage.TYPE_INT_RGB);
|
|
||||||
} else if ("gif".equalsIgnoreCase(format)) {
|
|
||||||
bufferedImage =
|
|
||||||
new BufferedImage(
|
|
||||||
renderedImage.getWidth(),
|
|
||||||
renderedImage.getHeight(),
|
|
||||||
BufferedImage.TYPE_BYTE_INDEXED);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Write image to zip file
|
// Iterate over each page
|
||||||
String imageName =
|
for (int pgNum = 0; pgNum < document.getPages().getCount(); pgNum++) {
|
||||||
filename + "_" + imageIndex + " (Page " + pageNum + ")." + format;
|
PDPage page = document.getPage(pgNum);
|
||||||
ZipEntry zipEntry = new ZipEntry(imageName);
|
int pageNum = document.getPages().indexOf(page) + 1;
|
||||||
zos.putNextEntry(zipEntry);
|
// Submit a task for processing each page
|
||||||
|
Future<Void> future =
|
||||||
|
executor.submit(
|
||||||
|
() -> {
|
||||||
|
extractImagesFromPage(
|
||||||
|
page, format, filename, pageNum, processedImages, zos);
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
Graphics2D g = bufferedImage.createGraphics();
|
futures.add(future);
|
||||||
g.drawImage((Image) renderedImage, 0, 0, null);
|
}
|
||||||
g.dispose();
|
|
||||||
// Write image bytes to zip file
|
|
||||||
ByteArrayOutputStream imageBaos = new ByteArrayOutputStream();
|
|
||||||
ImageIO.write(bufferedImage, format, imageBaos);
|
|
||||||
zos.write(imageBaos.toByteArray());
|
|
||||||
|
|
||||||
zos.closeEntry();
|
// Wait for all tasks to complete
|
||||||
imageIndex++;
|
for (Future<Void> future : futures) {
|
||||||
}
|
future.get();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Close executor service
|
||||||
|
executor.shutdown();
|
||||||
|
} else {
|
||||||
|
// Single-threaded extraction
|
||||||
|
for (int pgNum = 0; pgNum < document.getPages().getCount(); pgNum++) {
|
||||||
|
PDPage page = document.getPage(pgNum);
|
||||||
|
extractImagesFromPage(page, format, filename, pgNum + 1, processedImages, zos);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Close ZipOutputStream and PDDocument
|
// Close PDDocument and ZipOutputStream
|
||||||
zos.close();
|
|
||||||
document.close();
|
document.close();
|
||||||
|
zos.close();
|
||||||
|
|
||||||
// Create ByteArrayResource from byte array
|
// Create ByteArrayResource from byte array
|
||||||
byte[] zipContents = baos.toByteArray();
|
byte[] zipContents = baos.toByteArray();
|
||||||
|
@ -138,4 +121,69 @@ public class ExtractImagesController {
|
||||||
return WebResponseUtils.boasToWebResponse(
|
return WebResponseUtils.boasToWebResponse(
|
||||||
baos, filename + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM);
|
baos, filename + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private boolean shouldUseMultithreading(MultipartFile file, PDDocument document) {
|
||||||
|
// Criteria: Use multithreading if file size > 10MB or number of pages > 20
|
||||||
|
long fileSizeInMB = file.getSize() / (1024 * 1024);
|
||||||
|
int numberOfPages = document.getPages().getCount();
|
||||||
|
return fileSizeInMB > 10 || numberOfPages > 20;
|
||||||
|
}
|
||||||
|
|
||||||
|
private void extractImagesFromPage(
|
||||||
|
PDPage page,
|
||||||
|
String format,
|
||||||
|
String filename,
|
||||||
|
int pageNum,
|
||||||
|
Set<Integer> processedImages,
|
||||||
|
ZipOutputStream zos)
|
||||||
|
throws IOException {
|
||||||
|
for (COSName name : page.getResources().getXObjectNames()) {
|
||||||
|
if (page.getResources().isImageXObject(name)) {
|
||||||
|
PDImageXObject image = (PDImageXObject) page.getResources().getXObject(name);
|
||||||
|
int imageHash = image.hashCode();
|
||||||
|
synchronized (processedImages) {
|
||||||
|
if (processedImages.contains(imageHash)) {
|
||||||
|
continue; // Skip already processed images
|
||||||
|
}
|
||||||
|
processedImages.add(imageHash);
|
||||||
|
}
|
||||||
|
|
||||||
|
RenderedImage renderedImage = image.getImage();
|
||||||
|
|
||||||
|
// Convert to standard RGB colorspace if needed
|
||||||
|
BufferedImage bufferedImage = convertToRGB(renderedImage, format);
|
||||||
|
|
||||||
|
// Write image to zip file
|
||||||
|
String imageName = filename + "_" + imageHash + " (Page " + pageNum + ")." + format;
|
||||||
|
synchronized (zos) {
|
||||||
|
zos.putNextEntry(new ZipEntry(imageName));
|
||||||
|
ByteArrayOutputStream imageBaos = new ByteArrayOutputStream();
|
||||||
|
ImageIO.write(bufferedImage, format, imageBaos);
|
||||||
|
zos.write(imageBaos.toByteArray());
|
||||||
|
zos.closeEntry();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private BufferedImage convertToRGB(RenderedImage renderedImage, String format) {
|
||||||
|
int width = renderedImage.getWidth();
|
||||||
|
int height = renderedImage.getHeight();
|
||||||
|
BufferedImage rgbImage;
|
||||||
|
|
||||||
|
if ("png".equalsIgnoreCase(format)) {
|
||||||
|
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
|
||||||
|
} else if ("jpeg".equalsIgnoreCase(format) || "jpg".equalsIgnoreCase(format)) {
|
||||||
|
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
|
||||||
|
} else if ("gif".equalsIgnoreCase(format)) {
|
||||||
|
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_INDEXED);
|
||||||
|
} else {
|
||||||
|
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
|
||||||
|
}
|
||||||
|
|
||||||
|
Graphics2D g = rgbImage.createGraphics();
|
||||||
|
g.drawImage((Image) renderedImage, 0, 0, null);
|
||||||
|
g.dispose();
|
||||||
|
return rgbImage;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue