Added multithreaded feature for image extraction (#1641)

Co-authored-by: Anthony Stirling <77850077+Frooodle@users.noreply.github.com>
This commit is contained in:
HimaGirija 2024-08-08 02:46:57 +05:30 committed by GitHub
parent 96b080528b
commit e50391a44a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194

View file

@ -1,19 +1,8 @@
package stirling.software.SPDF.controller.api.misc; package stirling.software.SPDF.controller.api.misc;
import java.awt.Graphics2D; import io.github.pixee.security.Filenames;
import java.awt.Image; import io.swagger.v3.oas.annotations.Operation;
import java.awt.image.BufferedImage; import io.swagger.v3.oas.annotations.tags.Tag;
import java.awt.image.RenderedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.zip.Deflater;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
import javax.imageio.ImageIO;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.cos.COSName; import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
@ -28,14 +17,25 @@ import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation;
import io.swagger.v3.oas.annotations.tags.Tag;
import stirling.software.SPDF.model.api.PDFWithImageFormatRequest; import stirling.software.SPDF.model.api.PDFWithImageFormatRequest;
import stirling.software.SPDF.utils.WebResponseUtils; import stirling.software.SPDF.utils.WebResponseUtils;
import javax.imageio.ImageIO;
import java.awt.*;
import java.awt.image.BufferedImage;
import java.awt.image.RenderedImage;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.zip.Deflater;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
@RestController @RestController
@RequestMapping("/api/v1/misc") @RequestMapping("/api/v1/misc")
@Tag(name = "Misc", description = "Miscellaneous APIs") @Tag(name = "Misc", description = "Miscellaneous APIs")
@ -47,16 +47,19 @@ public class ExtractImagesController {
@Operation( @Operation(
summary = "Extract images from a PDF file", summary = "Extract images from a PDF file",
description = description =
"This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input:PDF Output:IMAGE/ZIP Type:SIMO") "This endpoint extracts images from a given PDF file and returns them in a zip file. Users can specify the output image format. Input: PDF Output: IMAGE/ZIP Type: SIMO")
public ResponseEntity<byte[]> extractImages(@ModelAttribute PDFWithImageFormatRequest request) public ResponseEntity<byte[]> extractImages(@ModelAttribute PDFWithImageFormatRequest request)
throws IOException { throws IOException, InterruptedException, ExecutionException {
MultipartFile file = request.getFileInput(); MultipartFile file = request.getFileInput();
String format = request.getFormat(); String format = request.getFormat();
System.out.println( System.out.println(
System.currentTimeMillis() + "file=" + file.getName() + ", format=" + format); System.currentTimeMillis() + " file=" + file.getName() + ", format=" + format);
PDDocument document = Loader.loadPDF(file.getBytes()); PDDocument document = Loader.loadPDF(file.getBytes());
// Determine if multithreading should be used based on PDF size or number of pages
boolean useMultithreading = shouldUseMultithreading(file, document);
// Create ByteArrayOutputStream to write zip file to byte array // Create ByteArrayOutputStream to write zip file to byte array
ByteArrayOutputStream baos = new ByteArrayOutputStream(); ByteArrayOutputStream baos = new ByteArrayOutputStream();
@ -66,71 +69,51 @@ public class ExtractImagesController {
// Set compression level // Set compression level
zos.setLevel(Deflater.BEST_COMPRESSION); zos.setLevel(Deflater.BEST_COMPRESSION);
int imageIndex = 1;
String filename = String filename =
Filenames.toSimpleFileName(file.getOriginalFilename()) Filenames.toSimpleFileName(file.getOriginalFilename())
.replaceFirst("[.][^.]+$", ""); .replaceFirst("[.][^.]+$", "");
int pageNum = 0;
Set<Integer> processedImages = new HashSet<>(); Set<Integer> processedImages = new HashSet<>();
// Iterate over each page
for (PDPage page : document.getPages()) {
++pageNum;
// Extract images from page
for (COSName name : page.getResources().getXObjectNames()) {
if (page.getResources().isImageXObject(name)) {
PDImageXObject image = (PDImageXObject) page.getResources().getXObject(name);
int imageHash = image.hashCode();
if (processedImages.contains(imageHash)) {
continue; // Skip already processed images
}
processedImages.add(imageHash);
// Convert image to desired format if (useMultithreading) {
RenderedImage renderedImage = image.getImage(); // Executor service to handle multithreading
BufferedImage bufferedImage = null; ExecutorService executor =
if ("png".equalsIgnoreCase(format)) { Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());
bufferedImage = Set<Future<Void>> futures = new HashSet<>();
new BufferedImage(
renderedImage.getWidth(),
renderedImage.getHeight(),
BufferedImage.TYPE_INT_ARGB);
} else if ("jpeg".equalsIgnoreCase(format) || "jpg".equalsIgnoreCase(format)) {
bufferedImage =
new BufferedImage(
renderedImage.getWidth(),
renderedImage.getHeight(),
BufferedImage.TYPE_INT_RGB);
} else if ("gif".equalsIgnoreCase(format)) {
bufferedImage =
new BufferedImage(
renderedImage.getWidth(),
renderedImage.getHeight(),
BufferedImage.TYPE_BYTE_INDEXED);
}
// Write image to zip file // Iterate over each page
String imageName = for (int pgNum = 0; pgNum < document.getPages().getCount(); pgNum++) {
filename + "_" + imageIndex + " (Page " + pageNum + ")." + format; PDPage page = document.getPage(pgNum);
ZipEntry zipEntry = new ZipEntry(imageName); int pageNum = document.getPages().indexOf(page) + 1;
zos.putNextEntry(zipEntry); // Submit a task for processing each page
Future<Void> future =
executor.submit(
() -> {
extractImagesFromPage(
page, format, filename, pageNum, processedImages, zos);
return null;
});
Graphics2D g = bufferedImage.createGraphics(); futures.add(future);
g.drawImage((Image) renderedImage, 0, 0, null); }
g.dispose();
// Write image bytes to zip file
ByteArrayOutputStream imageBaos = new ByteArrayOutputStream();
ImageIO.write(bufferedImage, format, imageBaos);
zos.write(imageBaos.toByteArray());
zos.closeEntry(); // Wait for all tasks to complete
imageIndex++; for (Future<Void> future : futures) {
} future.get();
}
// Close executor service
executor.shutdown();
} else {
// Single-threaded extraction
for (int pgNum = 0; pgNum < document.getPages().getCount(); pgNum++) {
PDPage page = document.getPage(pgNum);
extractImagesFromPage(page, format, filename, pgNum + 1, processedImages, zos);
} }
} }
// Close ZipOutputStream and PDDocument // Close PDDocument and ZipOutputStream
zos.close();
document.close(); document.close();
zos.close();
// Create ByteArrayResource from byte array // Create ByteArrayResource from byte array
byte[] zipContents = baos.toByteArray(); byte[] zipContents = baos.toByteArray();
@ -138,4 +121,69 @@ public class ExtractImagesController {
return WebResponseUtils.boasToWebResponse( return WebResponseUtils.boasToWebResponse(
baos, filename + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM); baos, filename + "_extracted-images.zip", MediaType.APPLICATION_OCTET_STREAM);
} }
private boolean shouldUseMultithreading(MultipartFile file, PDDocument document) {
// Criteria: Use multithreading if file size > 10MB or number of pages > 20
long fileSizeInMB = file.getSize() / (1024 * 1024);
int numberOfPages = document.getPages().getCount();
return fileSizeInMB > 10 || numberOfPages > 20;
}
private void extractImagesFromPage(
PDPage page,
String format,
String filename,
int pageNum,
Set<Integer> processedImages,
ZipOutputStream zos)
throws IOException {
for (COSName name : page.getResources().getXObjectNames()) {
if (page.getResources().isImageXObject(name)) {
PDImageXObject image = (PDImageXObject) page.getResources().getXObject(name);
int imageHash = image.hashCode();
synchronized (processedImages) {
if (processedImages.contains(imageHash)) {
continue; // Skip already processed images
}
processedImages.add(imageHash);
}
RenderedImage renderedImage = image.getImage();
// Convert to standard RGB colorspace if needed
BufferedImage bufferedImage = convertToRGB(renderedImage, format);
// Write image to zip file
String imageName = filename + "_" + imageHash + " (Page " + pageNum + ")." + format;
synchronized (zos) {
zos.putNextEntry(new ZipEntry(imageName));
ByteArrayOutputStream imageBaos = new ByteArrayOutputStream();
ImageIO.write(bufferedImage, format, imageBaos);
zos.write(imageBaos.toByteArray());
zos.closeEntry();
}
}
}
}
private BufferedImage convertToRGB(RenderedImage renderedImage, String format) {
int width = renderedImage.getWidth();
int height = renderedImage.getHeight();
BufferedImage rgbImage;
if ("png".equalsIgnoreCase(format)) {
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_ARGB);
} else if ("jpeg".equalsIgnoreCase(format) || "jpg".equalsIgnoreCase(format)) {
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
} else if ("gif".equalsIgnoreCase(format)) {
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_BYTE_INDEXED);
} else {
rgbImage = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
}
Graphics2D g = rgbImage.createGraphics();
g.drawImage((Image) renderedImage, 0, 0, null);
g.dispose();
return rgbImage;
}
} }