Feature: Split PDFs by Chapters/Bookmarks (#1786)
* feature:split pdf by chapters * Update SplitPdfByChaptersController.java * Update SplitPdfByChaptersController.java * Update SplitPdfByChaptersController.java
This commit is contained in:
parent
c1f78d0f9b
commit
03887cc9f9
2 changed files with 320 additions and 0 deletions
|
@ -0,0 +1,299 @@
|
|||
package stirling.software.SPDF.controller.api;
|
||||
|
||||
import java.io.ByteArrayOutputStream;
|
||||
import java.nio.file.Files;
|
||||
import java.nio.file.Path;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
import org.apache.pdfbox.Loader;
|
||||
import org.apache.pdfbox.pdmodel.PDDocument;
|
||||
import org.apache.pdfbox.pdmodel.PDPage;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDDocumentOutline;
|
||||
import org.apache.pdfbox.pdmodel.interactive.documentnavigation.outline.PDOutlineItem;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.springframework.http.MediaType;
|
||||
import org.springframework.http.ResponseEntity;
|
||||
import org.springframework.web.bind.annotation.ModelAttribute;
|
||||
import org.springframework.web.bind.annotation.PostMapping;
|
||||
import org.springframework.web.bind.annotation.RequestMapping;
|
||||
import org.springframework.web.bind.annotation.RestController;
|
||||
import org.springframework.web.multipart.MultipartFile;
|
||||
|
||||
import io.github.pixee.security.Filenames;
|
||||
import io.swagger.v3.oas.annotations.Operation;
|
||||
import io.swagger.v3.oas.annotations.tags.Tag;
|
||||
|
||||
import lombok.AllArgsConstructor;
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
import stirling.software.SPDF.model.PdfMetadata;
|
||||
import stirling.software.SPDF.model.api.SplitPdfByChaptersRequest;
|
||||
import stirling.software.SPDF.utils.PdfUtils;
|
||||
import stirling.software.SPDF.utils.WebResponseUtils;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/api/v1/general")
|
||||
@Tag(name = "General", description = "General APIs")
|
||||
public class SplitPdfByChaptersController {
|
||||
|
||||
private static final Logger logger =
|
||||
LoggerFactory.getLogger(SplitPdfByChaptersController.class);
|
||||
|
||||
@PostMapping(value = "/split-pdf-by-chapters", consumes = "multipart/form-data")
|
||||
@Operation(
|
||||
summary = "Split PDFs by Chapters",
|
||||
description = "Splits a PDF into chapters and returns a ZIP file.")
|
||||
public ResponseEntity<byte[]> splitPdf(@ModelAttribute SplitPdfByChaptersRequest request)
|
||||
throws Exception {
|
||||
MultipartFile file = request.getFileInput();
|
||||
boolean includeMetadata = request.getIncludeMetadata();
|
||||
Integer bookmarkLevel =
|
||||
request.getBookmarkLevel(); // levels start from 0 (top most bookmarks)
|
||||
if (bookmarkLevel < 0) {
|
||||
return ResponseEntity.badRequest().body("Invalid bookmark level".getBytes());
|
||||
}
|
||||
PDDocument sourceDocument = Loader.loadPDF(file.getBytes());
|
||||
|
||||
// checks if the document is encrypted by an empty user password
|
||||
if (sourceDocument.isEncrypted()) {
|
||||
try {
|
||||
sourceDocument.setAllSecurityToBeRemoved(true);
|
||||
logger.info("Removing security from the source document ");
|
||||
} catch (Exception e) {
|
||||
logger.warn("Cannot decrypt the pdf");
|
||||
}
|
||||
}
|
||||
PDDocumentOutline outline = sourceDocument.getDocumentCatalog().getDocumentOutline();
|
||||
|
||||
if (outline == null) {
|
||||
logger.warn("No outline found for {}", file.getOriginalFilename());
|
||||
return ResponseEntity.badRequest().body("No outline found".getBytes());
|
||||
}
|
||||
List<Bookmark> bookmarks = new ArrayList<>();
|
||||
try {
|
||||
bookmarks =
|
||||
extractOutlineItems(
|
||||
sourceDocument,
|
||||
outline.getFirstChild(),
|
||||
bookmarks,
|
||||
outline.getFirstChild().getNextSibling(),
|
||||
0,
|
||||
bookmarkLevel);
|
||||
// to handle last page edge case
|
||||
bookmarks.get(bookmarks.size() - 1).setEndPage(sourceDocument.getNumberOfPages());
|
||||
Bookmark lastBookmark = bookmarks.get(bookmarks.size() - 1);
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Unable to extract outline items", e);
|
||||
return ResponseEntity.internalServerError()
|
||||
.body("Unable to extract outline items".getBytes());
|
||||
}
|
||||
|
||||
boolean allowDuplicates = request.getAllowDuplicates();
|
||||
if (!allowDuplicates) {
|
||||
/*
|
||||
duplicates are generated when multiple bookmarks correspond to the same page,
|
||||
if the user doesn't want duplicates mergeBookmarksThatCorrespondToSamePage() method will merge the titles of all
|
||||
the bookmarks that correspond to the same page, and treat them as a single bookmark
|
||||
*/
|
||||
bookmarks = mergeBookmarksThatCorrespondToSamePage(bookmarks);
|
||||
}
|
||||
for (Bookmark bookmark : bookmarks) {
|
||||
logger.info(
|
||||
"{}::::{} to {}",
|
||||
bookmark.getTitle(),
|
||||
bookmark.getStartPage(),
|
||||
bookmark.getEndPage());
|
||||
}
|
||||
List<ByteArrayOutputStream> splitDocumentsBoas =
|
||||
getSplitDocumentsBoas(sourceDocument, bookmarks, includeMetadata);
|
||||
|
||||
Path zipFile = createZipFile(bookmarks, splitDocumentsBoas);
|
||||
|
||||
byte[] data = Files.readAllBytes(zipFile);
|
||||
Files.deleteIfExists(zipFile);
|
||||
|
||||
String filename =
|
||||
Filenames.toSimpleFileName(file.getOriginalFilename())
|
||||
.replaceFirst("[.][^.]+$", "");
|
||||
sourceDocument.close();
|
||||
return WebResponseUtils.bytesToWebResponse(
|
||||
data, filename + ".zip", MediaType.APPLICATION_OCTET_STREAM);
|
||||
}
|
||||
|
||||
private List<Bookmark> mergeBookmarksThatCorrespondToSamePage(List<Bookmark> bookmarks) {
|
||||
String mergedTitle = "";
|
||||
List<Bookmark> chaptersToBeRemoved = new ArrayList<>();
|
||||
for (Bookmark bookmark : bookmarks) {
|
||||
if (bookmark.getStartPage() == bookmark.getEndPage()) {
|
||||
mergedTitle = mergedTitle.concat(bookmark.getTitle().concat(" "));
|
||||
chaptersToBeRemoved.add(bookmark);
|
||||
} else {
|
||||
if (!mergedTitle.isEmpty()) {
|
||||
if (mergedTitle.length() > 255) {
|
||||
mergedTitle = mergedTitle.substring(0, 253) + "...";
|
||||
}
|
||||
|
||||
bookmarks.set(
|
||||
bookmarks.indexOf(bookmark),
|
||||
new Bookmark(
|
||||
mergedTitle, bookmark.getStartPage(), bookmark.getEndPage()));
|
||||
}
|
||||
mergedTitle = "";
|
||||
}
|
||||
}
|
||||
bookmarks.removeAll(chaptersToBeRemoved);
|
||||
return bookmarks;
|
||||
}
|
||||
|
||||
private static List<Bookmark> extractOutlineItems(
|
||||
PDDocument sourceDocument,
|
||||
PDOutlineItem current,
|
||||
List<Bookmark> bookmarks,
|
||||
PDOutlineItem nextParent,
|
||||
int level,
|
||||
int maxLevel)
|
||||
throws Exception {
|
||||
|
||||
while (current != null) {
|
||||
|
||||
String currentTitle = current.getTitle().replace("/", "");
|
||||
int firstPage =
|
||||
sourceDocument.getPages().indexOf(current.findDestinationPage(sourceDocument));
|
||||
PDOutlineItem child = current.getFirstChild();
|
||||
PDOutlineItem nextSibling = current.getNextSibling();
|
||||
int endPage;
|
||||
if (child != null && level < maxLevel) {
|
||||
endPage =
|
||||
sourceDocument
|
||||
.getPages()
|
||||
.indexOf(child.findDestinationPage(sourceDocument));
|
||||
} else if (nextSibling != null) {
|
||||
endPage =
|
||||
sourceDocument
|
||||
.getPages()
|
||||
.indexOf(nextSibling.findDestinationPage(sourceDocument));
|
||||
} else if (nextParent != null) {
|
||||
|
||||
endPage =
|
||||
sourceDocument
|
||||
.getPages()
|
||||
.indexOf(nextParent.findDestinationPage(sourceDocument));
|
||||
} else {
|
||||
endPage = -2;
|
||||
/*
|
||||
happens when we have something like this:
|
||||
Outline Item 2
|
||||
Outline Item 2.1
|
||||
Outline Item 2.1.1
|
||||
Outline Item 2.2
|
||||
Outline 2.2.1
|
||||
Outline 2.2.2 <--- this item neither has an immediate next parent nor an immediate next sibling
|
||||
Outline Item 3
|
||||
*/
|
||||
}
|
||||
if (!bookmarks.isEmpty()
|
||||
&& bookmarks.get(bookmarks.size() - 1).getEndPage() == -2
|
||||
&& firstPage
|
||||
>= bookmarks
|
||||
.get(bookmarks.size() - 1)
|
||||
.getStartPage()) { // for handling the above-mentioned case
|
||||
Bookmark previousBookmark = bookmarks.get(bookmarks.size() - 1);
|
||||
previousBookmark.setEndPage(firstPage);
|
||||
}
|
||||
bookmarks.add(new Bookmark(currentTitle, firstPage, endPage));
|
||||
|
||||
// Recursively process children
|
||||
if (child != null && level < maxLevel) {
|
||||
extractOutlineItems(
|
||||
sourceDocument, child, bookmarks, nextSibling, level + 1, maxLevel);
|
||||
}
|
||||
|
||||
current = nextSibling;
|
||||
}
|
||||
return bookmarks;
|
||||
}
|
||||
|
||||
private Path createZipFile(
|
||||
List<Bookmark> bookmarks, List<ByteArrayOutputStream> splitDocumentsBoas)
|
||||
throws Exception {
|
||||
Path zipFile = Files.createTempFile("split_documents", ".zip");
|
||||
String fileNumberFormatter = "%0" + (Integer.toString(bookmarks.size()).length()) + "d ";
|
||||
try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile))) {
|
||||
for (int i = 0; i < splitDocumentsBoas.size(); i++) {
|
||||
|
||||
// split files will be named as "[FILE_NUMBER] [BOOKMARK_TITLE].pdf"
|
||||
|
||||
String fileName =
|
||||
String.format(fileNumberFormatter, i)
|
||||
+ bookmarks.get(i).getTitle()
|
||||
+ ".pdf";
|
||||
ByteArrayOutputStream baos = splitDocumentsBoas.get(i);
|
||||
byte[] pdf = baos.toByteArray();
|
||||
|
||||
ZipEntry pdfEntry = new ZipEntry(fileName);
|
||||
zipOut.putNextEntry(pdfEntry);
|
||||
zipOut.write(pdf);
|
||||
zipOut.closeEntry();
|
||||
|
||||
logger.info("Wrote split document {} to zip file", fileName);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed writing to zip", e);
|
||||
throw e;
|
||||
}
|
||||
|
||||
logger.info("Successfully created zip file with split documents: {}", zipFile);
|
||||
return zipFile;
|
||||
}
|
||||
|
||||
public List<ByteArrayOutputStream> getSplitDocumentsBoas(
|
||||
PDDocument sourceDocument, List<Bookmark> bookmarks, boolean includeMetadata)
|
||||
throws Exception {
|
||||
List<ByteArrayOutputStream> splitDocumentsBoas = new ArrayList<>();
|
||||
PdfMetadata metadata = null;
|
||||
if (includeMetadata) {
|
||||
metadata = PdfUtils.extractMetadataFromPdf(sourceDocument);
|
||||
}
|
||||
for (Bookmark bookmark : bookmarks) {
|
||||
try (PDDocument splitDocument = new PDDocument()) {
|
||||
boolean isSinglePage = (bookmark.getStartPage() == bookmark.getEndPage());
|
||||
|
||||
for (int i = bookmark.getStartPage();
|
||||
i < bookmark.getEndPage() + (isSinglePage ? 1 : 0);
|
||||
i++) {
|
||||
PDPage page = sourceDocument.getPage(i);
|
||||
splitDocument.addPage(page);
|
||||
logger.info("Adding page {} to split document", i);
|
||||
}
|
||||
ByteArrayOutputStream baos = new ByteArrayOutputStream();
|
||||
if (includeMetadata) {
|
||||
PdfUtils.setMetadataToPdf(splitDocument, metadata);
|
||||
}
|
||||
|
||||
splitDocument.save(baos);
|
||||
|
||||
splitDocumentsBoas.add(baos);
|
||||
} catch (Exception e) {
|
||||
logger.error("Failed splitting documents and saving them", e);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
return splitDocumentsBoas;
|
||||
}
|
||||
}
|
||||
|
||||
@Data
|
||||
@EqualsAndHashCode
|
||||
@NoArgsConstructor
|
||||
@AllArgsConstructor
|
||||
class Bookmark {
|
||||
private String title;
|
||||
private int startPage;
|
||||
private int endPage;
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
package stirling.software.SPDF.model.api;
|
||||
|
||||
import io.swagger.v3.oas.annotations.media.Schema;
|
||||
|
||||
import lombok.Data;
|
||||
import lombok.EqualsAndHashCode;
|
||||
import lombok.NoArgsConstructor;
|
||||
|
||||
@Data
|
||||
@NoArgsConstructor
|
||||
@EqualsAndHashCode(callSuper = false)
|
||||
public class SplitPdfByChaptersRequest extends PDFFile {
|
||||
@Schema(description = "Whether to include Metadata or not", example = "true")
|
||||
private Boolean includeMetadata;
|
||||
|
||||
@Schema(description = "Whether to allow duplicates or not", example = "true")
|
||||
private Boolean allowDuplicates;
|
||||
|
||||
@Schema(description = "Maximum bookmark level required", example = "2")
|
||||
private Integer bookmarkLevel;
|
||||
}
|
Loading…
Reference in a new issue