From 52a7885f3c73a4f2d0cb4446289f312de278c17e Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Sun, 30 Jul 2023 14:43:34 +0100 Subject: [PATCH 1/9] all inf --- .../controller/api/security/PDFExtractor.java | 335 ++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java b/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java new file mode 100644 index 00000000..4113691d --- /dev/null +++ b/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java @@ -0,0 +1,335 @@ +package stirling.software.SPDF.controller.api.security; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import com.itextpdf.kernel.pdf.PdfObject; +import com.itextpdf.forms.PdfAcroForm; +import com.itextpdf.forms.fields.PdfFormField; +import com.itextpdf.kernel.geom.Rectangle; +import com.itextpdf.kernel.pdf.PdfArray; +import com.itextpdf.kernel.pdf.PdfCatalog; +import com.itextpdf.kernel.pdf.PdfDictionary; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfDocumentInfo; +import com.itextpdf.kernel.pdf.PdfEncryption; +import com.itextpdf.kernel.pdf.PdfReader; +import com.itextpdf.kernel.pdf.PdfResources; +import com.itextpdf.kernel.pdf.PdfStream; +import com.itextpdf.kernel.pdf.PdfName; +import com.itextpdf.kernel.pdf.PdfViewerPreferences; +import com.itextpdf.kernel.pdf.PdfWriter; +import com.itextpdf.kernel.pdf.annot.PdfAnnotation; +import com.itextpdf.kernel.pdf.annot.PdfFileAttachmentAnnotation; +import com.itextpdf.kernel.pdf.annot.PdfLinkAnnotation; +import com.itextpdf.kernel.pdf.layer.PdfLayer; +import com.itextpdf.kernel.pdf.layer.PdfOCProperties; +import com.fasterxml.jackson.databind.ObjectMapper; +import 
com.fasterxml.jackson.databind.node.ArrayNode; +import com.fasterxml.jackson.databind.node.ObjectNode; +import org.apache.pdfbox.text.PDFTextStripper; +import java.io.File; +import java.io.FileWriter; +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.List; +import java.util.Map; + +public class PDFExtractor { + public static void main(String[] args) { + try { + PDDocument pdfBoxDoc = PDDocument.load(new File("path_to_pdf.pdf")); + ObjectMapper objectMapper = new ObjectMapper(); + ObjectNode jsonOutput = objectMapper.createObjectNode(); + + // Metadata using PDFBox + PDDocumentInformation info = pdfBoxDoc.getDocumentInformation(); + ObjectNode metadata = objectMapper.createObjectNode(); + metadata.put("Title", info.getTitle()); + metadata.put("Author", info.getAuthor()); + metadata.put("Subject", info.getSubject()); + metadata.put("Keywords", info.getKeywords()); + metadata.put("Producer", info.getProducer()); + metadata.put("Creator", info.getCreator()); + metadata.put("CreationDate", formatDate(info.getCreationDate())); + metadata.put("ModificationDate", formatDate(info.getModificationDate())); + metadata.put("Trapped", info.getTrapped()); + jsonOutput.set("Metadata", metadata); + + // Document Information using PDFBox + ObjectNode docInfoNode = objectMapper.createObjectNode(); + docInfoNode.put("Number of pages", pdfBoxDoc.getNumberOfPages()); + docInfoNode.put("PDF version", pdfBoxDoc.getVersion()); + ; + + // Page Mode using iText7 + PdfDocument itextDoc = new PdfDocument(new PdfReader("path_to_pdf.pdf")); + PdfCatalog catalog = itextDoc.getCatalog(); + PdfName pageMode = catalog.getPdfObject().getAsName(PdfName.PageMode); + + ObjectNode itextDocInfo = objectMapper.createObjectNode(); + docInfoNode.put("Page Mode", getPageModeDescription(pageMode));; + + jsonOutput.set("Document Information", docInfoNode); + + for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { + ObjectNode pageInfo = 
objectMapper.createObjectNode(); + + // Page-level Information + Rectangle pageSize = itextDoc.getPage(pageNum).getPageSize(); + pageInfo.put("Width", pageSize.getWidth()); + pageInfo.put("Height", pageSize.getHeight()); + pageInfo.put("Rotation", itextDoc.getPage(pageNum).getRotation()); + + // Boxes + pageInfo.put("MediaBox", itextDoc.getPage(pageNum).getMediaBox().toString()); + pageInfo.put("CropBox", itextDoc.getPage(pageNum).getCropBox().toString()); + pageInfo.put("BleedBox", itextDoc.getPage(pageNum).getBleedBox().toString()); + pageInfo.put("TrimBox", itextDoc.getPage(pageNum).getTrimBox().toString()); + pageInfo.put("ArtBox", itextDoc.getPage(pageNum).getArtBox().toString()); + + // Content Extraction + PDFTextStripper textStripper = new PDFTextStripper(); + textStripper.setStartPage(pageNum -1); + textStripper.setEndPage(pageNum - 1); + String pageText = textStripper.getText(pdfBoxDoc); + + pageInfo.put("Text Characters Count", pageText.length()); // + + // Annotations + ArrayNode annotationsArray = objectMapper.createArrayNode(); + List annotations = itextDoc.getPage(pageNum).getAnnotations(); + for (PdfAnnotation annotation : annotations) { + ObjectNode annotationNode = objectMapper.createObjectNode(); + annotationNode.put("Subtype", annotation.getSubtype().toString()); + annotationNode.put("Contents", annotation.getContents().getValue()); + annotationsArray.add(annotationNode); + } + pageInfo.set("Annotations", annotationsArray); + + // Images (simplified) + // This part is non-trivial as images can be embedded in multiple ways in a PDF. + // Here is a basic structure to recognize image XObjects on a page. 
+ ArrayNode imagesArray = objectMapper.createArrayNode(); + PdfResources resources = itextDoc.getPage(pageNum).getResources(); + for (PdfName name : resources.getResourceNames()) { + PdfObject obj = resources.getResource(name); + if (obj instanceof PdfStream) { + PdfStream stream = (PdfStream) obj; + if (PdfName.Image.equals(stream.getAsName(PdfName.Subtype))) { + ObjectNode imageNode = objectMapper.createObjectNode(); + imageNode.put("Width", stream.getAsNumber(PdfName.Width).intValue()); + imageNode.put("Height", stream.getAsNumber(PdfName.Height).intValue()); + PdfObject colorSpace = stream.get(PdfName.ColorSpace); + if (colorSpace != null) { + imageNode.put("ColorSpace", colorSpace.toString()); + } + imagesArray.add(imageNode); + } + } + } + pageInfo.set("Images", imagesArray); + + // Links + ArrayNode linksArray = objectMapper.createArrayNode(); + for (PdfAnnotation annotation : annotations) { + if (annotation instanceof PdfLinkAnnotation) { + PdfLinkAnnotation linkAnnotation = (PdfLinkAnnotation) annotation; + ObjectNode linkNode = objectMapper.createObjectNode(); + linkNode.put("URI", linkAnnotation.getAction().toString()); // Basic, might not work for all links + linksArray.add(linkNode); + } + } + pageInfo.set("Links", linksArray); + + //Fonts + ArrayNode fontsArray = objectMapper.createArrayNode(); + PdfDictionary fontDicts = resources.getResource(PdfName.Font); + if (fontDicts != null) { + for (PdfName key : fontDicts.keySet()) { + PdfDictionary font = fontDicts.getAsDictionary(key); + ObjectNode fontNode = objectMapper.createObjectNode(); + fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString()); + + // Font Subtype (e.g., Type1, TrueType) + if (font.containsKey(PdfName.Subtype)) { + fontNode.put("Subtype", font.getAsName(PdfName.Subtype).toString()); + } + + // Font Descriptor + PdfDictionary fontDescriptor = font.getAsDictionary(PdfName.FontDescriptor); + if (fontDescriptor != null) { + // Italic Angle + if 
(fontDescriptor.containsKey(PdfName.ItalicAngle)) { + fontNode.put("ItalicAngle", fontDescriptor.getAsNumber(PdfName.ItalicAngle).floatValue()); + } + + // Flags (e.g., italic, bold) + if (fontDescriptor.containsKey(PdfName.Flags)) { + int flags = fontDescriptor.getAsNumber(PdfName.Flags).intValue(); + fontNode.put("IsItalic", (flags & 64) != 0); + fontNode.put("IsBold", (flags & 1) != 0); + } + } + + fontsArray.add(fontNode); + } + } + pageInfo.set("Fonts", fontsArray); + + + + + // Access resources dictionary + PdfDictionary resourcesDict = itextDoc.getPage(pageNum).getResources().getPdfObject(); + + // Color Spaces & ICC Profiles + ArrayNode colorSpacesArray = objectMapper.createArrayNode(); + PdfDictionary colorSpaces = resourcesDict.getAsDictionary(PdfName.ColorSpace); + if (colorSpaces != null) { + for (PdfName name : colorSpaces.keySet()) { + PdfObject colorSpaceObject = colorSpaces.get(name); + if (colorSpaceObject instanceof PdfArray) { + PdfArray colorSpaceArray = (PdfArray) colorSpaceObject; + if (colorSpaceArray.size() > 1 && colorSpaceArray.get(0) instanceof PdfName && PdfName.ICCBased.equals(colorSpaceArray.get(0))) { + ObjectNode iccProfileNode = objectMapper.createObjectNode(); + PdfStream iccStream = (PdfStream) colorSpaceArray.get(1); + byte[] iccData = iccStream.getBytes(); + // TODO: Further decode and analyze the ICC data if needed + iccProfileNode.put("ICC Profile Length", iccData.length); + colorSpacesArray.add(iccProfileNode); + } + } + } + } + pageInfo.set("Color Spaces & ICC Profiles", colorSpacesArray); + + // Other XObjects + ArrayNode xObjectsArray = objectMapper.createArrayNode(); + PdfDictionary xObjects = resourcesDict.getAsDictionary(PdfName.XObject); + if (xObjects != null) { + for (PdfName name : xObjects.keySet()) { + PdfStream xObjectStream = xObjects.getAsStream(name); + ObjectNode xObjectNode = objectMapper.createObjectNode(); + xObjectNode.put("Type", xObjectStream.getAsName(PdfName.Subtype).toString()); + // TODO: Extract 
further details depending on the XObject type + xObjectsArray.add(xObjectNode); + } + } + pageInfo.set("XObjects", xObjectsArray); + + jsonOutput.set("Page " + pageNum, pageInfo); + } + + PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false); + if (acroForm != null) { + ObjectNode formFieldsNode = objectMapper.createObjectNode(); + for (Map.Entry entry : acroForm.getFormFields().entrySet()) { + formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString()); + } + jsonOutput.set("FormFields", formFieldsNode); + } + + + + //TODO bookmarks here + + + + + //embeed files TODO size + PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) + .getAsDictionary(PdfName.EmbeddedFiles); + if (embeddedFiles != null) { + ArrayNode embeddedFilesArray = objectMapper.createArrayNode(); + PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names); + for (int i = 0; i < namesArray.size(); i += 2) { + ObjectNode embeddedFileNode = objectMapper.createObjectNode(); + embeddedFileNode.put("Name", namesArray.getAsString(i).toString()); + // Add other details if required + embeddedFilesArray.add(embeddedFileNode); + } + jsonOutput.set("EmbeddedFiles", embeddedFilesArray); + } + + + //attachments TODO size + ArrayNode attachmentsArray = objectMapper.createArrayNode(); + for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { + for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) { + if (annotation instanceof PdfFileAttachmentAnnotation) { + ObjectNode attachmentNode = objectMapper.createObjectNode(); + attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString()); + attachmentNode.put("Description", annotation.getContents().getValue()); + attachmentsArray.add(attachmentNode); + } + } + } + jsonOutput.set("Attachments", attachmentsArray); + + //Javascript + PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names); + if 
(namesDict != null) { + PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript); + if (javascriptDict != null) { + ArrayNode javascriptArray = objectMapper.createArrayNode(); + PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names); + for (int i = 0; i < namesArray.size(); i += 2) { + ObjectNode jsNode = objectMapper.createObjectNode(); + jsNode.put("JS Name", namesArray.getAsString(i).toString()); + jsNode.put("JS Code", namesArray.getAsString(i + 1).toString()); + javascriptArray.add(jsNode); + } + jsonOutput.set("JavaScripts", javascriptArray); + } + } + + + //TODO size + PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false); + if (ocProperties != null) { + ArrayNode layersArray = objectMapper.createArrayNode(); + for (PdfLayer layer : ocProperties.getLayers()) { + ObjectNode layerNode = objectMapper.createObjectNode(); + layerNode.put("Name", layer.getPdfObject().getAsString(PdfName.Name).toString()); + layersArray.add(layerNode); + } + jsonOutput.set("Layers", layersArray); + } + + + //TODO Security + + + + + + + // Digital Signatures using iText7 TODO + + + // Save JSON to file + try (FileWriter file = new FileWriter("output.json")) { + file.write(objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput)); + file.flush(); + } + + pdfBoxDoc.close(); + itextDoc.close(); + } catch (Exception e) { + e.printStackTrace(); + } + } + + private static String formatDate(Calendar calendar) { + if (calendar != null) { + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); + return sdf.format(calendar.getTime()); + } else { + return null; + } + } + + private static String getPageModeDescription(PdfName pageMode) { + return pageMode != null ? 
pageMode.toString().replaceFirst("/", "") : "Unknown"; + } +} From 0da9c62ef84560cbab58b1a5a0bdd55aa3c96ab8 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Sun, 30 Jul 2023 21:56:09 +0100 Subject: [PATCH 2/9] all info --- .../controller/api/security/PDFExtractor.java | 675 ++++++++++++++---- .../controller/web/SecurityWebController.java | 7 + .../templates/security/get-info-on-pdf.html | 33 + 3 files changed, 592 insertions(+), 123 deletions(-) create mode 100644 src/main/resources/templates/security/get-info-on-pdf.html diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java b/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java index 4113691d..2cd429b4 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java @@ -1,8 +1,23 @@ package stirling.software.SPDF.controller.api.security; +import org.apache.pdfbox.cos.COSArray; +import org.apache.pdfbox.cos.COSBase; +import org.apache.pdfbox.cos.COSDictionary; +import org.apache.pdfbox.cos.COSName; +import org.apache.pdfbox.cos.COSString; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentInformation; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureElement; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureNode; +import org.apache.pdfbox.pdmodel.documentinterchange.logicalstructure.PDStructureTreeRoot; +import org.apache.pdfbox.pdmodel.encryption.PDEncryption; +import org.apache.pdfbox.pdmodel.interactive.form.PDAcroForm; +import org.apache.pdfbox.pdmodel.interactive.form.PDField; + import com.itextpdf.kernel.pdf.PdfObject; +import com.itextpdf.kernel.pdf.PdfOutline; import 
com.itextpdf.forms.PdfAcroForm; import com.itextpdf.forms.fields.PdfFormField; import com.itextpdf.kernel.geom.Rectangle; @@ -15,29 +30,64 @@ import com.itextpdf.kernel.pdf.PdfEncryption; import com.itextpdf.kernel.pdf.PdfReader; import com.itextpdf.kernel.pdf.PdfResources; import com.itextpdf.kernel.pdf.PdfStream; +import com.itextpdf.kernel.pdf.PdfString; import com.itextpdf.kernel.pdf.PdfName; import com.itextpdf.kernel.pdf.PdfViewerPreferences; import com.itextpdf.kernel.pdf.PdfWriter; import com.itextpdf.kernel.pdf.annot.PdfAnnotation; import com.itextpdf.kernel.pdf.annot.PdfFileAttachmentAnnotation; import com.itextpdf.kernel.pdf.annot.PdfLinkAnnotation; +import com.itextpdf.kernel.pdf.annot.PdfWidgetAnnotation; import com.itextpdf.kernel.pdf.layer.PdfLayer; import com.itextpdf.kernel.pdf.layer.PdfOCProperties; +import com.itextpdf.kernel.xmp.XMPException; +import com.itextpdf.kernel.xmp.XMPMeta; +import com.itextpdf.kernel.xmp.XMPMetaFactory; + +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.tags.Tag; +import stirling.software.SPDF.utils.WebResponseUtils; + import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.node.ArrayNode; import com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.pdfbox.text.PDFTextStripper; +import org.springframework.http.MediaType; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestPart; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + import java.io.File; +import java.util.HashMap; import java.io.FileWriter; +import java.io.IOException; +import java.nio.charset.StandardCharsets; import java.text.SimpleDateFormat; import java.util.Calendar; import java.util.List; import java.util.Map; - +import java.util.Set; 
+import java.util.HashSet; +@RestController +@Tag(name = "Security", description = "Security APIs") public class PDFExtractor { - public static void main(String[] args) { - try { - PDDocument pdfBoxDoc = PDDocument.load(new File("path_to_pdf.pdf")); + + static ObjectMapper objectMapper = new ObjectMapper(); + + @PostMapping(consumes = "multipart/form-data", value = "/get-info-on-pdf") + @Operation(summary = "Summary here", description = "desc. Input:PDF Output:JSON Type:SISO") + public ResponseEntity getPdfInfo( + @RequestPart(required = true, value = "fileInput") + @Parameter(description = "The input PDF file to get info on", required = true) MultipartFile inputFile) + throws IOException { + + try ( + PDDocument pdfBoxDoc = PDDocument.load(inputFile.getInputStream()); + PdfDocument itextDoc = new PdfDocument(new PdfReader(inputFile.getInputStream())) + ) { ObjectMapper objectMapper = new ObjectMapper(); ObjectNode jsonOutput = objectMapper.createObjectNode(); @@ -55,22 +105,256 @@ public class PDFExtractor { metadata.put("Trapped", info.getTrapped()); jsonOutput.set("Metadata", metadata); + + + // Total file size of the PDF + long fileSizeInBytes = inputFile.getSize(); + jsonOutput.put("FileSizeInBytes", fileSizeInBytes); + + // Number of words, paragraphs, and images in the entire document + String fullText = new PDFTextStripper().getText(pdfBoxDoc); + String[] words = fullText.split("\\s+"); + int wordCount = words.length; + int paragraphCount = fullText.split("\r\n|\r|\n").length; + jsonOutput.put("WordCount", wordCount); + jsonOutput.put("ParagraphCount", paragraphCount); + // Number of characters in the entire document (including spaces and special characters) + int charCount = fullText.length(); + jsonOutput.put("CharacterCount", charCount); + + + // Initialize the flags and types + boolean hasCompression = false; + String compressionType = "None"; + + // Check for object streams + for (int i = 1; i <= itextDoc.getNumberOfPdfObjects(); i++) { + PdfObject obj 
= itextDoc.getPdfObject(i); + if (obj != null && obj.isStream() && ((PdfStream) obj).get(PdfName.Type) == PdfName.ObjStm) { + hasCompression = true; + compressionType = "Object Streams"; + break; + } + } + + // If not compressed using object streams, check for compressed Xref tables + if (!hasCompression && itextDoc.getReader().hasRebuiltXref()) { + hasCompression = true; + compressionType = "Compressed Xref or Rebuilt Xref"; + } + jsonOutput.put("Compression", hasCompression); + if(hasCompression) + jsonOutput.put("CompressionType", compressionType); + + String language = pdfBoxDoc.getDocumentCatalog().getLanguage(); + jsonOutput.put("Language", language); + // Document Information using PDFBox ObjectNode docInfoNode = objectMapper.createObjectNode(); docInfoNode.put("Number of pages", pdfBoxDoc.getNumberOfPages()); docInfoNode.put("PDF version", pdfBoxDoc.getVersion()); - ; + // Page Mode using iText7 - PdfDocument itextDoc = new PdfDocument(new PdfReader("path_to_pdf.pdf")); PdfCatalog catalog = itextDoc.getCatalog(); PdfName pageMode = catalog.getPdfObject().getAsName(PdfName.PageMode); - ObjectNode itextDocInfo = objectMapper.createObjectNode(); + + + + PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false); + ObjectNode formFieldsNode = objectMapper.createObjectNode(); + if (acroForm != null) { + for (Map.Entry entry : acroForm.getFormFields().entrySet()) { + formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString()); + } + } + jsonOutput.set("FormFields", formFieldsNode); + + + + + + //embeed files TODO size + ArrayNode embeddedFilesArray = objectMapper.createArrayNode(); + if(itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) != null) + { + PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) + .getAsDictionary(PdfName.EmbeddedFiles); + if (embeddedFiles != null) { + + PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names); + for (int i = 0; i < namesArray.size(); i 
+= 2) { + ObjectNode embeddedFileNode = objectMapper.createObjectNode(); + embeddedFileNode.put("Name", namesArray.getAsString(i).toString()); + // Add other details if required + embeddedFilesArray.add(embeddedFileNode); + } + + } + } + jsonOutput.set("EmbeddedFiles", embeddedFilesArray); + + //attachments TODO size + ArrayNode attachmentsArray = objectMapper.createArrayNode(); + for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { + for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) { + if (annotation instanceof PdfFileAttachmentAnnotation) { + ObjectNode attachmentNode = objectMapper.createObjectNode(); + attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString()); + attachmentNode.put("Description", annotation.getContents().getValue()); + attachmentsArray.add(attachmentNode); + } + } + } + jsonOutput.set("Attachments", attachmentsArray); + + //Javascript + PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names); + ArrayNode javascriptArray = objectMapper.createArrayNode(); + if (namesDict != null) { + PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript); + if (javascriptDict != null) { + + PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names); + for (int i = 0; i < namesArray.size(); i += 2) { + ObjectNode jsNode = objectMapper.createObjectNode(); + jsNode.put("JS Name", namesArray.getAsString(i).toString()); + jsNode.put("JS Code", namesArray.getAsString(i + 1).toString()); + javascriptArray.add(jsNode); + } + + } + } + jsonOutput.set("JavaScript", javascriptArray); + + //TODO size + PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false); + ArrayNode layersArray = objectMapper.createArrayNode(); + if (ocProperties != null) { + + for (PdfLayer layer : ocProperties.getLayers()) { + ObjectNode layerNode = objectMapper.createObjectNode(); + layerNode.put("Name", 
layer.getPdfObject().getAsString(PdfName.Name).toString()); + layersArray.add(layerNode); + } + + } + jsonOutput.set("Layers", layersArray); + + //TODO Security + + + + + + + // Digital Signatures using iText7 TODO + + + + + PDAcroForm pdAcroForm = pdfBoxDoc.getDocumentCatalog().getAcroForm(); + ArrayNode formFieldsArray2 = objectMapper.createArrayNode(); + if (pdAcroForm != null) { + + for (PDField field : pdAcroForm.getFields()) { + ObjectNode fieldNode = objectMapper.createObjectNode(); + fieldNode.put("FieldName", field.getFullyQualifiedName()); + fieldNode.put("FieldType", field.getFieldType()); + // Add more attributes as needed... + formFieldsArray2.add(fieldNode); + } + + } + jsonOutput.set("FormFields", formFieldsArray2); + + + PDStructureTreeRoot structureTreeRoot = pdfBoxDoc.getDocumentCatalog().getStructureTreeRoot(); + ArrayNode structureTreeArray; + try { + if(structureTreeRoot != null) { + structureTreeArray = exploreStructureTree(structureTreeRoot.getKids()); + jsonOutput.set("StructureTree", structureTreeArray); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + + + + + + + + boolean isPdfACompliant = checkOutputIntent(itextDoc, "PDF/A"); + boolean isPdfXCompliant = checkOutputIntent(itextDoc, "PDF/X"); + boolean isPdfECompliant = checkForStandard(itextDoc, "PDF/E"); + boolean isPdfVTCompliant = checkForStandard(itextDoc, "PDF/VT"); + boolean isPdfUACompliant = checkForStandard(itextDoc, "PDF/UA"); + boolean isPdfBCompliant = checkForStandard(itextDoc, "PDF/B"); // If you want to check for PDF/Broadcast, though this isn't an official ISO standard. + boolean isPdfSECCompliant = checkForStandard(itextDoc, "PDF/SEC"); // This might not be effective since PDF/SEC was under development in 2021. 
+ + ObjectNode compliancy = objectMapper.createObjectNode(); + compliancy.put("IsPDF/ACompliant", isPdfACompliant); + compliancy.put("IsPDF/XCompliant", isPdfXCompliant); + compliancy.put("IsPDF/ECompliant", isPdfECompliant); + compliancy.put("IsPDF/VTCompliant", isPdfVTCompliant); + compliancy.put("IsPDF/UACompliant", isPdfUACompliant); + compliancy.put("IsPDF/BCompliant", isPdfBCompliant); + compliancy.put("IsPDF/SECCompliant", isPdfSECCompliant); + + jsonOutput.set("Compliancy", compliancy); + + + + + ArrayNode bookmarksArray = objectMapper.createArrayNode(); + PdfOutline root = itextDoc.getOutlines(false); + if (root != null) { + for (PdfOutline child : root.getAllChildren()) { + addOutlinesToArray(child, bookmarksArray); + } + } + jsonOutput.set("Bookmarks/Outline/TOC", bookmarksArray); + + String xmpString = null; + try { + byte[] xmpBytes = itextDoc.getXmpMetadata(); + if (xmpBytes != null) { + XMPMeta xmpMeta = XMPMetaFactory.parseFromBuffer(xmpBytes); + xmpString = xmpMeta.dumpObject(); + + } + } catch (XMPException e) { + e.printStackTrace(); + } + jsonOutput.put("XMPMetadata", xmpString); + + + + ObjectNode encryptionNode = objectMapper.createObjectNode(); + if (pdfBoxDoc.isEncrypted()) { + encryptionNode.put("IsEncrypted", true); + + // Retrieve encryption details using getEncryption() + PDEncryption encryption = pdfBoxDoc.getEncryption(); + encryptionNode.put("EncryptionAlgorithm", encryption.getFilter()); + encryptionNode.put("KeyLength", encryption.getLength()); + encryptionNode.put("Permissions", pdfBoxDoc.getCurrentAccessPermission().toString()); + + // Add other encryption-related properties as needed + } else { + encryptionNode.put("IsEncrypted", false); + } + jsonOutput.set("Encryption", encryptionNode); + docInfoNode.put("Page Mode", getPageModeDescription(pageMode));; jsonOutput.set("Document Information", docInfoNode); - + ObjectNode pageInfoParent = objectMapper.createObjectNode(); for (int pageNum = 1; pageNum <= 
itextDoc.getNumberOfPages(); pageNum++) { ObjectNode pageInfo = objectMapper.createObjectNode(); @@ -79,7 +363,9 @@ public class PDFExtractor { pageInfo.put("Width", pageSize.getWidth()); pageInfo.put("Height", pageSize.getHeight()); pageInfo.put("Rotation", itextDoc.getPage(pageNum).getRotation()); - + pageInfo.put("Page Orientation", getPageOrientation(pageSize.getWidth(),pageSize.getHeight())); + pageInfo.put("Standard Size", getPageSize(pageSize.getWidth(),pageSize.getHeight())); + // Boxes pageInfo.put("MediaBox", itextDoc.getPage(pageNum).getMediaBox().toString()); pageInfo.put("CropBox", itextDoc.getPage(pageNum).getCropBox().toString()); @@ -98,14 +384,25 @@ public class PDFExtractor { // Annotations ArrayNode annotationsArray = objectMapper.createArrayNode(); List annotations = itextDoc.getPage(pageNum).getAnnotations(); - for (PdfAnnotation annotation : annotations) { - ObjectNode annotationNode = objectMapper.createObjectNode(); - annotationNode.put("Subtype", annotation.getSubtype().toString()); - annotationNode.put("Contents", annotation.getContents().getValue()); - annotationsArray.add(annotationNode); - } - pageInfo.set("Annotations", annotationsArray); + int subtypeCount = 0; + int contentsCount = 0; + + for (PdfAnnotation annotation : annotations) { + if(annotation.getSubtype() != null) { + subtypeCount++; // Increase subtype count + } + if(annotation.getContents() != null) { + contentsCount++; // Increase contents count + } + } + + ObjectNode annotationsObject = objectMapper.createObjectNode(); + annotationsObject.put("AnnotationsCount", annotations.size()); + annotationsObject.put("SubtypeCount", subtypeCount); + annotationsObject.put("ContentsCount", contentsCount); + pageInfo.set("Annotations", annotationsObject); + // Images (simplified) // This part is non-trivial as images can be embedded in multiple ways in a PDF. // Here is a basic structure to recognize image XObjects on a page. 
@@ -129,32 +426,62 @@ public class PDFExtractor { } pageInfo.set("Images", imagesArray); + // Links ArrayNode linksArray = objectMapper.createArrayNode(); + Set uniqueURIs = new HashSet<>(); // To store unique URIs + for (PdfAnnotation annotation : annotations) { if (annotation instanceof PdfLinkAnnotation) { PdfLinkAnnotation linkAnnotation = (PdfLinkAnnotation) annotation; - ObjectNode linkNode = objectMapper.createObjectNode(); - linkNode.put("URI", linkAnnotation.getAction().toString()); // Basic, might not work for all links - linksArray.add(linkNode); + String uri = linkAnnotation.getAction().toString(); + uniqueURIs.add(uri); // Add to set to ensure uniqueness } } + + // Add unique URIs to linksArray + for (String uri : uniqueURIs) { + ObjectNode linkNode = objectMapper.createObjectNode(); + linkNode.put("URI", uri); + linksArray.add(linkNode); + } pageInfo.set("Links", linksArray); //Fonts ArrayNode fontsArray = objectMapper.createArrayNode(); PdfDictionary fontDicts = resources.getResource(PdfName.Font); + Set uniqueSubtypes = new HashSet<>(); // To store unique subtypes + + if (fontDicts != null) { for (PdfName key : fontDicts.keySet()) { + ObjectNode fontNode = objectMapper.createObjectNode(); // Create a new font node for each font PdfDictionary font = fontDicts.getAsDictionary(key); - ObjectNode fontNode = objectMapper.createObjectNode(); - fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString()); + boolean isEmbedded = font.containsKey(PdfName.FontFile) || + font.containsKey(PdfName.FontFile2) || + font.containsKey(PdfName.FontFile3); + fontNode.put("IsEmbedded", isEmbedded); + + + if (font.containsKey(PdfName.Encoding)) { + String encoding = font.getAsName(PdfName.Encoding).toString(); + fontNode.put("Encoding", encoding); + } + + + + if(font.getAsString(PdfName.BaseFont) != null) + fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString()); + + String subtype = null; // Font Subtype (e.g., Type1, TrueType) if 
(font.containsKey(PdfName.Subtype)) { - fontNode.put("Subtype", font.getAsName(PdfName.Subtype).toString()); + subtype = font.getAsName(PdfName.Subtype).toString(); + uniqueSubtypes.add(subtype); // Add to set to ensure uniqueness } - + fontNode.put("Subtype", subtype); + // Font Descriptor PdfDictionary fontDescriptor = font.getAsDictionary(PdfName.FontDescriptor); if (fontDescriptor != null) { @@ -166,14 +493,53 @@ public class PDFExtractor { // Flags (e.g., italic, bold) if (fontDescriptor.containsKey(PdfName.Flags)) { int flags = fontDescriptor.getAsNumber(PdfName.Flags).intValue(); - fontNode.put("IsItalic", (flags & 64) != 0); - fontNode.put("IsBold", (flags & 1) != 0); + fontNode.put("IsItalic", (flags & 64) != 0); // Existing italic flag + fontNode.put("IsBold", (flags & 1 << 16) != 0); // Existing bold flag + fontNode.put("IsFixedPitch", (flags & 1) != 0); + fontNode.put("IsSerif", (flags & 2) != 0); + fontNode.put("IsSymbolic", (flags & 4) != 0); + fontNode.put("IsScript", (flags & 8) != 0); + fontNode.put("IsNonsymbolic", (flags & 16) != 0); + } + + if (fontDescriptor.containsKey(PdfName.FontFamily)) { + String fontFamily = fontDescriptor.getAsString(PdfName.FontFamily).toString(); + fontNode.put("FontFamily", fontFamily); } - } - fontsArray.add(fontNode); + if (fontDescriptor.containsKey(PdfName.FontStretch)) { + String fontStretch = fontDescriptor.getAsName(PdfName.FontStretch).toString(); + fontNode.put("FontStretch", fontStretch); + } + + if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontBBox)) { + PdfArray bbox = fontDescriptor.getAsArray(PdfName.FontBBox); + fontNode.put("FontBoundingBox", bbox.toString()); + } + if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontWeight)) { + float fontWeight = fontDescriptor.getAsNumber(PdfName.FontWeight).floatValue(); + fontNode.put("FontWeight", fontWeight); + } + + } + if (font.containsKey(PdfName.ToUnicode)) { + PdfStream toUnicodeStream = 
font.getAsStream(PdfName.ToUnicode); + // Handle the stream as needed, maybe extract some details or just note its existence + fontNode.put("HasToUnicodeMap", true); + } + if (fontNode.size() > 0) { + fontsArray.add(fontNode); // Add each font node to fontsArray + } } } + + // Add unique subtypes to fontsArray + ArrayNode subtypesArray = objectMapper.createArrayNode(); + for (String subtype : uniqueSubtypes) { + subtypesArray.add(subtype); + } + pageInfo.set("FontSubtypes", subtypesArray); // Changed from Fonts to FontSubtypes + pageInfo.set("Fonts", fontsArray); @@ -204,123 +570,186 @@ public class PDFExtractor { pageInfo.set("Color Spaces & ICC Profiles", colorSpacesArray); // Other XObjects - ArrayNode xObjectsArray = objectMapper.createArrayNode(); + Map xObjectCountMap = new HashMap<>(); // To store the count for each type PdfDictionary xObjects = resourcesDict.getAsDictionary(PdfName.XObject); if (xObjects != null) { for (PdfName name : xObjects.keySet()) { PdfStream xObjectStream = xObjects.getAsStream(name); - ObjectNode xObjectNode = objectMapper.createObjectNode(); - xObjectNode.put("Type", xObjectStream.getAsName(PdfName.Subtype).toString()); - // TODO: Extract further details depending on the XObject type - xObjectsArray.add(xObjectNode); + String xObjectType = xObjectStream.getAsName(PdfName.Subtype).toString(); + + // Increment the count for this type in the map + xObjectCountMap.put(xObjectType, xObjectCountMap.getOrDefault(xObjectType, 0) + 1); } } - pageInfo.set("XObjects", xObjectsArray); - jsonOutput.set("Page " + pageNum, pageInfo); - } - - PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false); - if (acroForm != null) { - ObjectNode formFieldsNode = objectMapper.createObjectNode(); - for (Map.Entry entry : acroForm.getFormFields().entrySet()) { - formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString()); + // Add the count map to pageInfo (or wherever you want to store it) + ObjectNode xObjectCountNode = 
objectMapper.createObjectNode(); + for (Map.Entry entry : xObjectCountMap.entrySet()) { + xObjectCountNode.put(entry.getKey(), entry.getValue()); } - jsonOutput.set("FormFields", formFieldsNode); - } + pageInfo.set("XObjectCounts", xObjectCountNode); + + - - - //TODO bookmarks here - - - - - //embeed files TODO size - PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names) - .getAsDictionary(PdfName.EmbeddedFiles); - if (embeddedFiles != null) { - ArrayNode embeddedFilesArray = objectMapper.createArrayNode(); - PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names); - for (int i = 0; i < namesArray.size(); i += 2) { - ObjectNode embeddedFileNode = objectMapper.createObjectNode(); - embeddedFileNode.put("Name", namesArray.getAsString(i).toString()); - // Add other details if required - embeddedFilesArray.add(embeddedFileNode); - } - jsonOutput.set("EmbeddedFiles", embeddedFilesArray); - } - - - //attachments TODO size - ArrayNode attachmentsArray = objectMapper.createArrayNode(); - for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { - for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) { - if (annotation instanceof PdfFileAttachmentAnnotation) { - ObjectNode attachmentNode = objectMapper.createObjectNode(); - attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString()); - attachmentNode.put("Description", annotation.getContents().getValue()); - attachmentsArray.add(attachmentNode); + ArrayNode multimediaArray = objectMapper.createArrayNode(); + for (PdfAnnotation annotation : annotations) { + if (PdfName.RichMedia.equals(annotation.getSubtype())) { + ObjectNode multimediaNode = objectMapper.createObjectNode(); + // Extract details from the dictionary as needed + multimediaArray.add(multimediaNode); } } + pageInfo.set("Multimedia", multimediaArray); + + + + pageInfoParent.set("Page " + pageNum, pageInfo); } - jsonOutput.set("Attachments", 
attachmentsArray); - - //Javascript - PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names); - if (namesDict != null) { - PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript); - if (javascriptDict != null) { - ArrayNode javascriptArray = objectMapper.createArrayNode(); - PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names); - for (int i = 0; i < namesArray.size(); i += 2) { - ObjectNode jsNode = objectMapper.createObjectNode(); - jsNode.put("JS Name", namesArray.getAsString(i).toString()); - jsNode.put("JS Code", namesArray.getAsString(i + 1).toString()); - javascriptArray.add(jsNode); - } - jsonOutput.set("JavaScripts", javascriptArray); - } - } - - //TODO size - PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false); - if (ocProperties != null) { - ArrayNode layersArray = objectMapper.createArrayNode(); - for (PdfLayer layer : ocProperties.getLayers()) { - ObjectNode layerNode = objectMapper.createObjectNode(); - layerNode.put("Name", layer.getPdfObject().getAsString(PdfName.Name).toString()); - layersArray.add(layerNode); - } - jsonOutput.set("Layers", layersArray); - } - + jsonOutput.set("Per Page Info", pageInfoParent); - //TODO Security - - - - - - - // Digital Signatures using iText7 TODO // Save JSON to file - try (FileWriter file = new FileWriter("output.json")) { - file.write(objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput)); - file.flush(); - } - - pdfBoxDoc.close(); - itextDoc.close(); + String jsonString = objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput); + + + + return WebResponseUtils.bytesToWebResponse(jsonString.getBytes(StandardCharsets.UTF_8), "response.json", MediaType.APPLICATION_JSON); + } catch (Exception e) { e.printStackTrace(); } + return null; } - private static String formatDate(Calendar calendar) { + private static void addOutlinesToArray(PdfOutline outline, ArrayNode arrayNode) { + if 
(outline == null) return; + ObjectNode outlineNode = objectMapper.createObjectNode(); + outlineNode.put("Title", outline.getTitle()); + // You can add other properties if needed + arrayNode.add(outlineNode); + + for (PdfOutline child : outline.getAllChildren()) { + addOutlinesToArray(child, arrayNode); + } + } + public String getPageOrientation(double width, double height) { + if (width > height) { + return "Landscape"; + } else if (height > width) { + return "Portrait"; + } else { + return "Square"; + } + } + public String getPageSize(double width, double height) { + // Common aspect ratios used for standard paper sizes + double[] aspectRatios = {4.0 / 3.0, 3.0 / 2.0, Math.sqrt(2.0), 16.0 / 9.0}; + + // Check if the page matches any common aspect ratio + for (double aspectRatio : aspectRatios) { + if (isCloseToAspectRatio(width, height, aspectRatio)) { + return "Standard"; + } + } + + // If not a standard aspect ratio, consider it as a custom size + return "Custom"; + } + private boolean isCloseToAspectRatio(double width, double height, double aspectRatio) { + // Calculate the aspect ratio of the page + double pageAspectRatio = width / height; + + // Compare the page aspect ratio with the common aspect ratio within a threshold + return Math.abs(pageAspectRatio - aspectRatio) <= 0.05; + } + + public boolean checkForStandard(PdfDocument document, String standardKeyword) { + // Check Output Intents + boolean foundInOutputIntents = checkOutputIntent(document, standardKeyword); + if (foundInOutputIntents) return true; + + // Check XMP Metadata (rudimentary) + try { + byte[] metadataBytes = document.getXmpMetadata(); + if (metadataBytes != null) { + XMPMeta xmpMeta = XMPMetaFactory.parseFromBuffer(metadataBytes); + String xmpString = xmpMeta.dumpObject(); + if (xmpString.contains(standardKeyword)) { + return true; + } + } + } catch (XMPException e) { + e.printStackTrace(); + } + + return false; + } + + + public boolean checkOutputIntent(PdfDocument document, String 
standard) { + PdfArray outputIntents = document.getCatalog().getPdfObject().getAsArray(PdfName.OutputIntents); + if (outputIntents != null && !outputIntents.isEmpty()) { + for (int i = 0; i < outputIntents.size(); i++) { + PdfDictionary outputIntentDict = outputIntents.getAsDictionary(i); + if (outputIntentDict != null) { + PdfString s = outputIntentDict.getAsString(PdfName.S); + if (s != null && s.toString().contains(standard)) { + return true; + } + } + } + } + return false; + } + + public ArrayNode exploreStructureTree(List nodes) { + ArrayNode elementsArray = objectMapper.createArrayNode(); + if (nodes != null) { + for (Object obj : nodes) { + if (obj instanceof PDStructureNode) { + PDStructureNode node = (PDStructureNode) obj; + ObjectNode elementNode = objectMapper.createObjectNode(); + + if (node instanceof PDStructureElement) { + PDStructureElement structureElement = (PDStructureElement) node; + elementNode.put("Type", structureElement.getStructureType()); + elementNode.put("Content", getContent(structureElement)); + + // Recursively explore child elements + ArrayNode childElements = exploreStructureTree(structureElement.getKids()); + if (childElements.size() > 0) { + elementNode.set("Children", childElements); + } + } + elementsArray.add(elementNode); + } + } + } + return elementsArray; + } + + + public String getContent(PDStructureElement structureElement) { + StringBuilder contentBuilder = new StringBuilder(); + + for (Object item : structureElement.getKids()) { + if (item instanceof COSString) { + COSString cosString = (COSString) item; + contentBuilder.append(cosString.getString()); + } else if (item instanceof PDStructureElement) { + // For simplicity, we're handling only COSString and PDStructureElement here + // but a more comprehensive method would handle other types too + contentBuilder.append(getContent((PDStructureElement) item)); + } + } + + return contentBuilder.toString(); + } + + + private String formatDate(Calendar calendar) { if (calendar 
!= null) { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); return sdf.format(calendar.getTime()); @@ -329,7 +758,7 @@ public class PDFExtractor { } } - private static String getPageModeDescription(PdfName pageMode) { + private String getPageModeDescription(PdfName pageMode) { return pageMode != null ? pageMode.toString().replaceFirst("/", "") : "Unknown"; } } diff --git a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java index 66de400e..3857cc9e 100644 --- a/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java +++ b/src/main/java/stirling/software/SPDF/controller/web/SecurityWebController.java @@ -52,4 +52,11 @@ public class SecurityWebController { model.addAttribute("currentPage", "sanitize-pdf"); return "security/sanitize-pdf"; } + + @GetMapping("/get-info-on-pdf") + @Hidden + public String getInfo(Model model) { + model.addAttribute("currentPage", "get-info-on-pdf"); + return "security/get-info-on-pdf"; + } } diff --git a/src/main/resources/templates/security/get-info-on-pdf.html b/src/main/resources/templates/security/get-info-on-pdf.html new file mode 100644 index 00000000..36a0bb6e --- /dev/null +++ b/src/main/resources/templates/security/get-info-on-pdf.html @@ -0,0 +1,33 @@ + + + + + + + + +
+
+
+

+
+
+
+

+

+

+
+
+ + +
+ +
+
+
+ +
+
+
+ + \ No newline at end of file From 77411e94a48bc35a35a316c334ac1a1b664d1424 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Tue, 1 Aug 2023 00:03:13 +0100 Subject: [PATCH 3/9] new features --- build.gradle | 3 +- .../api/ToSinglePageController.java | 86 +++++++++++++++++ .../api/converters/ConvertHtmlToPDF.java | 81 ++-------------- .../api/converters/ConvertMarkdownToPdf.java | 52 ++++++++++ .../web/ConverterWebController.java | 9 +- .../controller/web/GeneralWebController.java | 14 +++ .../software/SPDF/utils/FileToPdf.java | 95 +++++++++++++++++++ .../software/SPDF/utils/WebResponseUtils.java | 17 ++++ src/main/resources/messages_en_GB.properties | 19 ++++ src/main/resources/static/images/extract.svg | 3 + src/main/resources/static/images/info.svg | 4 + src/main/resources/static/images/markdown.svg | 3 + .../resources/static/images/single-page.svg | 4 + .../resources/templates/auto-split-pdf.html | 2 +- .../templates/convert/markdown-to-pdf.html | 30 ++++++ .../resources/templates/extract-page.html | 33 +++++++ src/main/resources/templates/home.html | 7 ++ .../templates/pdf-to-single-page.html | 29 ++++++ 18 files changed, 413 insertions(+), 78 deletions(-) create mode 100644 src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java create mode 100644 src/main/java/stirling/software/SPDF/controller/api/converters/ConvertMarkdownToPdf.java create mode 100644 src/main/java/stirling/software/SPDF/utils/FileToPdf.java create mode 100644 src/main/resources/static/images/extract.svg create mode 100644 src/main/resources/static/images/info.svg create mode 100644 src/main/resources/static/images/markdown.svg create mode 100644 src/main/resources/static/images/single-page.svg create mode 100644 src/main/resources/templates/convert/markdown-to-pdf.html create mode 100644 src/main/resources/templates/extract-page.html create mode 100644 src/main/resources/templates/pdf-to-single-page.html diff --git 
a/build.gradle b/build.gradle index 688bee00..e851d9a5 100644 --- a/build.gradle +++ b/build.gradle @@ -61,8 +61,9 @@ dependencies { implementation 'com.itextpdf:itext7-core:7.2.5' implementation 'org.springframework.boot:spring-boot-starter-actuator' implementation 'io.micrometer:micrometer-core' - implementation group: 'com.google.zxing', name: 'core', version: '3.5.1' + // https://mvnrepository.com/artifact/org.commonmark/commonmark + implementation 'org.commonmark:commonmark:0.21.0' developmentOnly("org.springframework.boot:spring-boot-devtools") diff --git a/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java b/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java new file mode 100644 index 00000000..2c249b85 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/controller/api/ToSinglePageController.java @@ -0,0 +1,86 @@ +package stirling.software.SPDF.controller.api; + +import java.io.IOException; +import java.io.ByteArrayOutputStream; +import com.itextpdf.kernel.pdf.*; +import com.itextpdf.kernel.pdf.xobject.PdfFormXObject; +import com.itextpdf.kernel.geom.PageSize; +import com.itextpdf.kernel.geom.Rectangle; +import com.itextpdf.layout.Document; +import com.itextpdf.layout.element.Image; +import java.util.ArrayList; +import java.util.List; + +import org.apache.pdfbox.pdmodel.PDDocument; +import org.apache.pdfbox.pdmodel.PDPage; +import org.apache.pdfbox.pdmodel.PDPageTree; +import org.apache.pdfbox.pdmodel.common.PDRectangle; +import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestPart; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + +import io.swagger.v3.oas.annotations.Operation; +import 
io.swagger.v3.oas.annotations.Parameter; +import io.swagger.v3.oas.annotations.tags.Tag; +import stirling.software.SPDF.utils.WebResponseUtils; +import org.apache.pdfbox.pdmodel.*; +import org.apache.pdfbox.multipdf.PDFMergerUtility; +@RestController +@Tag(name = "General", description = "General APIs") +public class ToSinglePageController { + + private static final Logger logger = LoggerFactory.getLogger(ToSinglePageController.class); + + + @PostMapping(consumes = "multipart/form-data", value = "/pdf-to-single-page") + @Operation( + summary = "Convert a multi-page PDF into a single long page PDF", + description = "This endpoint converts a multi-page PDF document into a single paged PDF document. The width of the single page will be same as the input's width, but the height will be the sum of all the pages' heights. Input:PDF Output:PDF Type:SISO" + ) + public ResponseEntity pdfToSinglePage( + @RequestPart(required = true, value = "fileInput") + @Parameter(description = "The input multi-page PDF file to be converted into a single page", required = true) + MultipartFile file) throws IOException { + + PdfReader reader = new PdfReader(file.getInputStream()); + PdfDocument sourceDocument = new PdfDocument(reader); + + float totalHeight = 0; + float width = 0; + + for (int i = 1; i <= sourceDocument.getNumberOfPages(); i++) { + Rectangle pageSize = sourceDocument.getPage(i).getPageSize(); + totalHeight += pageSize.getHeight(); + if(width < pageSize.getWidth()) + width = pageSize.getWidth(); + } + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PdfWriter writer = new PdfWriter(baos); + PdfDocument newDocument = new PdfDocument(writer); + PageSize newPageSize = new PageSize(width, totalHeight); + newDocument.addNewPage(newPageSize); + + Document layoutDoc = new Document(newDocument); + float yOffset = totalHeight; + + for (int i = 1; i <= sourceDocument.getNumberOfPages(); i++) { + PdfFormXObject pageCopy = 
sourceDocument.getPage(i).copyAsFormXObject(newDocument); + Image copiedPage = new Image(pageCopy); + copiedPage.setFixedPosition(0, yOffset - sourceDocument.getPage(i).getPageSize().getHeight()); + yOffset -= sourceDocument.getPage(i).getPageSize().getHeight(); + layoutDoc.add(copiedPage); + } + + layoutDoc.close(); + sourceDocument.close(); + + byte[] result = baos.toByteArray(); + return WebResponseUtils.bytesToWebResponse(result, file.getOriginalFilename().replaceFirst("[.][^.]+$", "") + "_singlePage.pdf"); + } +} \ No newline at end of file diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java index a5878b04..e054d7f0 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertHtmlToPDF.java @@ -19,6 +19,7 @@ import org.springframework.web.multipart.MultipartFile; import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.tags.Tag; +import stirling.software.SPDF.utils.FileToPdf; import stirling.software.SPDF.utils.GeneralUtils; import stirling.software.SPDF.utils.ProcessExecutor; import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; @@ -44,87 +45,17 @@ public class ConvertHtmlToPDF { String originalFilename = fileInput.getOriginalFilename(); if (originalFilename == null || (!originalFilename.endsWith(".html") && !originalFilename.endsWith(".zip"))) { throw new IllegalArgumentException("File must be either .html or .zip format."); - } - Path tempOutputFile = Files.createTempFile("output_", ".pdf"); - Path tempInputFile = null; - byte[] pdfBytes; - try { - if (originalFilename.endsWith(".html")) { - tempInputFile = Files.createTempFile("input_", ".html"); - Files.write(tempInputFile, fileInput.getBytes()); - } else { - tempInputFile = unzipAndGetMainHtml(fileInput); - } - - 
List command = new ArrayList<>(); - command.add("weasyprint"); - command.add(tempInputFile.toString()); - command.add(tempOutputFile.toString()); - ProcessExecutorResult returnCode; - if (originalFilename.endsWith(".zip")) { - returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT) - .runCommandWithOutputHandling(command, tempInputFile.getParent().toFile()); - } else { - - returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT) - .runCommandWithOutputHandling(command); - } - - pdfBytes = Files.readAllBytes(tempOutputFile); - } finally { - // Clean up temporary files - Files.delete(tempOutputFile); - Files.delete(tempInputFile); - - if (originalFilename.endsWith(".zip")) { - GeneralUtils.deleteDirectory(tempInputFile.getParent()); - } - } + }byte[] pdfBytes = FileToPdf.convertHtmlToPdf( fileInput.getBytes(), originalFilename); + String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf"; // Remove file extension and append .pdf + return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename); } - - - private Path unzipAndGetMainHtml(MultipartFile zipFile) throws IOException { - Path tempDirectory = Files.createTempDirectory("unzipped_"); - try (ZipInputStream zipIn = new ZipInputStream(new ByteArrayInputStream(zipFile.getBytes()))) { - ZipEntry entry = zipIn.getNextEntry(); - while (entry != null) { - Path filePath = tempDirectory.resolve(entry.getName()); - if (entry.isDirectory()) { - Files.createDirectories(filePath); // Explicitly create the directory structure - } else { - Files.createDirectories(filePath.getParent()); // Create parent directories if they don't exist - Files.copy(zipIn, filePath); - } - zipIn.closeEntry(); - entry = zipIn.getNextEntry(); - } - } - - //search for the main HTML file. 
- try (Stream walk = Files.walk(tempDirectory)) { - List htmlFiles = walk.filter(file -> file.toString().endsWith(".html")) - .collect(Collectors.toList()); - - if (htmlFiles.isEmpty()) { - throw new IOException("No HTML files found in the unzipped directory."); - } - - // Prioritize 'index.html' if it exists, otherwise use the first .html file - for (Path htmlFile : htmlFiles) { - if (htmlFile.getFileName().toString().equals("index.html")) { - return htmlFile; - } - } - - return htmlFiles.get(0); - } - } + - + } diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertMarkdownToPdf.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertMarkdownToPdf.java new file mode 100644 index 00000000..c1bc1b73 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertMarkdownToPdf.java @@ -0,0 +1,52 @@ +package stirling.software.SPDF.controller.api.converters; + +import java.io.IOException; + +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestPart; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import stirling.software.SPDF.utils.FileToPdf; +import stirling.software.SPDF.utils.WebResponseUtils; + +@RestController +@Tag(name = "Convert", description = "Convert APIs") +public class ConvertMarkdownToPdf { + + @PostMapping(consumes = "multipart/form-data", value = "/markdown-to-pdf") + @Operation( + summary = "Convert a Markdown file to PDF", + description = "This endpoint takes a Markdown file input, converts it to HTML, and then to PDF format." 
+ ) + public ResponseEntity markdownToPdf( + @RequestPart(required = true, value = "fileInput") MultipartFile fileInput) + throws IOException, InterruptedException { + + if (fileInput == null) { + throw new IllegalArgumentException("Please provide a Markdown file for conversion."); + } + + String originalFilename = fileInput.getOriginalFilename(); + if (originalFilename == null || !originalFilename.endsWith(".md")) { + throw new IllegalArgumentException("File must be in .md format."); + } + + // Convert Markdown to HTML using CommonMark + Parser parser = Parser.builder().build(); + Node document = parser.parse(new String(fileInput.getBytes())); + HtmlRenderer renderer = HtmlRenderer.builder().build(); + String htmlContent = renderer.render(document); + + byte[] pdfBytes = FileToPdf.convertHtmlToPdf(htmlContent.getBytes(), "converted.html"); + + String outputFilename = originalFilename.replaceFirst("[.][^.]+$", "") + ".pdf"; // Remove file extension and append .pdf + return WebResponseUtils.bytesToWebResponse(pdfBytes, outputFilename); + } +} diff --git a/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java b/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java index 90429f1a..76e7be8f 100644 --- a/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java +++ b/src/main/java/stirling/software/SPDF/controller/web/ConverterWebController.java @@ -25,7 +25,14 @@ public class ConverterWebController { model.addAttribute("currentPage", "html-to-pdf"); return "convert/html-to-pdf"; } - + @GetMapping("/markdown-to-pdf") + @Hidden + public String convertMarkdownToPdfForm(Model model) { + model.addAttribute("currentPage", "markdown-to-pdf"); + return "convert/markdown-to-pdf"; + } + + @GetMapping("/url-to-pdf") @Hidden public String convertURLToPdfForm(Model model) { diff --git a/src/main/java/stirling/software/SPDF/controller/web/GeneralWebController.java 
b/src/main/java/stirling/software/SPDF/controller/web/GeneralWebController.java index 75d67401..4d6e991a 100644 --- a/src/main/java/stirling/software/SPDF/controller/web/GeneralWebController.java +++ b/src/main/java/stirling/software/SPDF/controller/web/GeneralWebController.java @@ -97,6 +97,20 @@ public class GeneralWebController { return "pdf-organizer"; } + @GetMapping("/extract-page") + @Hidden + public String extractPages(Model model) { + model.addAttribute("currentPage", "extract-page"); + return "extract-page"; + } + + @GetMapping("/pdf-to-single-page") + @Hidden + public String pdfToSinglePage(Model model) { + model.addAttribute("currentPage", "pdf-to-single-page"); + return "pdf-to-single-page"; + } + @GetMapping("/rotate-pdf") @Hidden public String rotatePdfForm(Model model) { diff --git a/src/main/java/stirling/software/SPDF/utils/FileToPdf.java b/src/main/java/stirling/software/SPDF/utils/FileToPdf.java new file mode 100644 index 00000000..9515a3ac --- /dev/null +++ b/src/main/java/stirling/software/SPDF/utils/FileToPdf.java @@ -0,0 +1,95 @@ +package stirling.software.SPDF.utils; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import stirling.software.SPDF.utils.ProcessExecutor.ProcessExecutorResult; + +public class FileToPdf { + public static byte[] convertHtmlToPdf(byte[] fileBytes, String fileName) throws IOException, InterruptedException { + + Path tempOutputFile = Files.createTempFile("output_", ".pdf"); + Path tempInputFile = null; + byte[] pdfBytes; + try { + if (fileName.endsWith(".html")) { + tempInputFile = Files.createTempFile("input_", ".html"); + Files.write(tempInputFile, fileBytes); + } else { + tempInputFile = unzipAndGetMainHtml(fileBytes); + } + + List command = new 
ArrayList<>(); + command.add("weasyprint"); + command.add(tempInputFile.toString()); + command.add(tempOutputFile.toString()); + ProcessExecutorResult returnCode; + if (fileName.endsWith(".zip")) { + returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT) + .runCommandWithOutputHandling(command, tempInputFile.getParent().toFile()); + } else { + + returnCode = ProcessExecutor.getInstance(ProcessExecutor.Processes.WEASYPRINT) + .runCommandWithOutputHandling(command); + } + + pdfBytes = Files.readAllBytes(tempOutputFile); + } finally { + // Clean up temporary files + Files.delete(tempOutputFile); + Files.delete(tempInputFile); + + if (fileName.endsWith(".zip")) { + GeneralUtils.deleteDirectory(tempInputFile.getParent()); + } + } + + return pdfBytes; + } + + + private static Path unzipAndGetMainHtml(byte[] fileBytes) throws IOException { + Path tempDirectory = Files.createTempDirectory("unzipped_"); + try (ZipInputStream zipIn = new ZipInputStream(new ByteArrayInputStream(fileBytes))) { + ZipEntry entry = zipIn.getNextEntry(); + while (entry != null) { + Path filePath = tempDirectory.resolve(entry.getName()); + if (entry.isDirectory()) { + Files.createDirectories(filePath); // Explicitly create the directory structure + } else { + Files.createDirectories(filePath.getParent()); // Create parent directories if they don't exist + Files.copy(zipIn, filePath); + } + zipIn.closeEntry(); + entry = zipIn.getNextEntry(); + } + } + + //search for the main HTML file. 
+ try (Stream walk = Files.walk(tempDirectory)) { + List htmlFiles = walk.filter(file -> file.toString().endsWith(".html")) + .collect(Collectors.toList()); + + if (htmlFiles.isEmpty()) { + throw new IOException("No HTML files found in the unzipped directory."); + } + + // Prioritize 'index.html' if it exists, otherwise use the first .html file + for (Path htmlFile : htmlFiles) { + if (htmlFile.getFileName().toString().equals("index.html")) { + return htmlFile; + } + } + + return htmlFiles.get(0); + } + } +} diff --git a/src/main/java/stirling/software/SPDF/utils/WebResponseUtils.java b/src/main/java/stirling/software/SPDF/utils/WebResponseUtils.java index 59c0b056..09a395ba 100644 --- a/src/main/java/stirling/software/SPDF/utils/WebResponseUtils.java +++ b/src/main/java/stirling/software/SPDF/utils/WebResponseUtils.java @@ -12,6 +12,9 @@ import org.springframework.http.MediaType; import org.springframework.http.ResponseEntity; import org.springframework.web.multipart.MultipartFile; +import com.itextpdf.kernel.pdf.PdfDocument; +import com.itextpdf.kernel.pdf.PdfWriter; + public class WebResponseUtils { public static ResponseEntity boasToWebResponse(ByteArrayOutputStream baos, String docName) throws IOException { @@ -57,5 +60,19 @@ public class WebResponseUtils { return boasToWebResponse(baos, docName); } + + public static ResponseEntity pdfDocToWebResponse(PdfDocument document, String docName) throws IOException { + + // Open Byte Array and save document to it + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + PdfWriter writer = new PdfWriter(baos); + PdfDocument newDocument = new PdfDocument(writer); + + document.copyPagesTo(1, document.getNumberOfPages(), newDocument); + newDocument.close(); + + return boasToWebResponse(baos, docName); + } + } diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index 0d881b65..807e8581 100644 --- a/src/main/resources/messages_en_GB.properties +++ 
b/src/main/resources/messages_en_GB.properties @@ -236,6 +236,25 @@ home.HTMLToPDF.desc=Converts any HTML file or zip to PDF HTMLToPDF.tags=markup,web-content,transformation,convert +home.MarkdownToPDF.title=Markdown to PDF +home.MarkdownToPDF.desc=Converts any Markdown fileto PDF +MarkdownToPDF.tags=markup,web-content,transformation,convert + + +home.getPdfInfo.title=Get ALL Info on PDF +home.getPdfInfo.desc=Grabs any and all information possible on PDFs +getPdfInfo.tags=information,data,stats,statistics + + +home.extractPage.title=Extract page(s) +home.extractPage.desc=Extracts select pages from PDF +extractPage.tags=extract + + +home.PdfToSinglePage.title=PDF to Single Large Page +home.PdfToSinglePage.desc=Merges all PDF pages into one large single page +PdfToSinglePage.tags=single page + ########################### # # # WEB PAGES # diff --git a/src/main/resources/static/images/extract.svg b/src/main/resources/static/images/extract.svg new file mode 100644 index 00000000..d21f03eb --- /dev/null +++ b/src/main/resources/static/images/extract.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/src/main/resources/static/images/info.svg b/src/main/resources/static/images/info.svg new file mode 100644 index 00000000..8f48f86c --- /dev/null +++ b/src/main/resources/static/images/info.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/src/main/resources/static/images/markdown.svg b/src/main/resources/static/images/markdown.svg new file mode 100644 index 00000000..ca5cd597 --- /dev/null +++ b/src/main/resources/static/images/markdown.svg @@ -0,0 +1,3 @@ + + + \ No newline at end of file diff --git a/src/main/resources/static/images/single-page.svg b/src/main/resources/static/images/single-page.svg new file mode 100644 index 00000000..4f57d79b --- /dev/null +++ b/src/main/resources/static/images/single-page.svg @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/src/main/resources/templates/auto-split-pdf.html
b/src/main/resources/templates/auto-split-pdf.html index 4f3045e0..d6f68fe1 100644 --- a/src/main/resources/templates/auto-split-pdf.html +++ b/src/main/resources/templates/auto-split-pdf.html @@ -22,7 +22,7 @@
  • -
    +

    diff --git a/src/main/resources/templates/convert/markdown-to-pdf.html b/src/main/resources/templates/convert/markdown-to-pdf.html new file mode 100644 index 00000000..4637d129 --- /dev/null +++ b/src/main/resources/templates/convert/markdown-to-pdf.html @@ -0,0 +1,30 @@ + + + + + + +
    +
    +
    +

    +
    +
    +
    +

    + +
    +
    + + + +

    +

    +
    +
    +
    +
    +
    +
    + + diff --git a/src/main/resources/templates/extract-page.html b/src/main/resources/templates/extract-page.html new file mode 100644 index 00000000..0a5fb158 --- /dev/null +++ b/src/main/resources/templates/extract-page.html @@ -0,0 +1,33 @@ + + + + + + + +
    +
    +
    +

    +
    +
    +
    +

    +
    +
    + +
    + + +
    + + +
    +
    +
    +
    +
    +
    +
    + + \ No newline at end of file diff --git a/src/main/resources/templates/home.html b/src/main/resources/templates/home.html index 8cbb6ca7..c5c8a0da 100644 --- a/src/main/resources/templates/home.html +++ b/src/main/resources/templates/home.html @@ -84,6 +84,13 @@
    +
    +
    +
    +
    + + +
    diff --git a/src/main/resources/templates/pdf-to-single-page.html b/src/main/resources/templates/pdf-to-single-page.html new file mode 100644 index 00000000..1f5b64c6 --- /dev/null +++ b/src/main/resources/templates/pdf-to-single-page.html @@ -0,0 +1,29 @@ + + + + + + + +
    +
    +
    +

    +
    +
    +
    +

    +
    +

    +
    + +
    +
    +
    +
    + +
    +
    +
    + + \ No newline at end of file From 96f05cd5186394b1633f6902bc8ce485f425f476 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Wed, 2 Aug 2023 22:49:43 +0100 Subject: [PATCH 4/9] get info changes --- .../{PDFExtractor.java => GetInfoOnPDF.java} | 175 +++++++++--------- src/main/resources/messages_en_GB.properties | 2 +- src/main/resources/static/js/fileInput.js | 59 +++--- .../resources/templates/fragments/common.html | 12 +- .../resources/templates/other/compare.html | 4 +- .../templates/security/get-info-on-pdf.html | 100 +++++++++- 6 files changed, 226 insertions(+), 126 deletions(-) rename src/main/java/stirling/software/SPDF/controller/api/security/{PDFExtractor.java => GetInfoOnPDF.java} (89%) diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java b/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java similarity index 89% rename from src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java rename to src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java index 2cd429b4..a167cb74 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/PDFExtractor.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java @@ -73,7 +73,7 @@ import java.util.Set; import java.util.HashSet; @RestController @Tag(name = "Security", description = "Security APIs") -public class PDFExtractor { +public class GetInfoOnPDF { static ObjectMapper objectMapper = new ObjectMapper(); @@ -94,6 +94,13 @@ public class PDFExtractor { // Metadata using PDFBox PDDocumentInformation info = pdfBoxDoc.getDocumentInformation(); ObjectNode metadata = objectMapper.createObjectNode(); + ObjectNode basicInfo = objectMapper.createObjectNode(); + ObjectNode docInfoNode = objectMapper.createObjectNode(); + ObjectNode compliancy = objectMapper.createObjectNode(); + ObjectNode encryption = 
objectMapper.createObjectNode(); + ObjectNode other = objectMapper.createObjectNode(); + + metadata.put("Title", info.getTitle()); metadata.put("Author", info.getAuthor()); metadata.put("Subject", info.getSubject()); @@ -102,25 +109,25 @@ public class PDFExtractor { metadata.put("Creator", info.getCreator()); metadata.put("CreationDate", formatDate(info.getCreationDate())); metadata.put("ModificationDate", formatDate(info.getModificationDate())); - metadata.put("Trapped", info.getTrapped()); jsonOutput.set("Metadata", metadata); - + + // Total file size of the PDF long fileSizeInBytes = inputFile.getSize(); - jsonOutput.put("FileSizeInBytes", fileSizeInBytes); + basicInfo.put("FileSizeInBytes", fileSizeInBytes); // Number of words, paragraphs, and images in the entire document String fullText = new PDFTextStripper().getText(pdfBoxDoc); String[] words = fullText.split("\\s+"); int wordCount = words.length; int paragraphCount = fullText.split("\r\n|\r|\n").length; - jsonOutput.put("WordCount", wordCount); - jsonOutput.put("ParagraphCount", paragraphCount); + basicInfo.put("WordCount", wordCount); + basicInfo.put("ParagraphCount", paragraphCount); // Number of characters in the entire document (including spaces and special characters) int charCount = fullText.length(); - jsonOutput.put("CharacterCount", charCount); + basicInfo.put("CharacterCount", charCount); // Initialize the flags and types @@ -142,22 +149,24 @@ public class PDFExtractor { hasCompression = true; compressionType = "Compressed Xref or Rebuilt Xref"; } - jsonOutput.put("Compression", hasCompression); + basicInfo.put("Compression", hasCompression); if(hasCompression) - jsonOutput.put("CompressionType", compressionType); + basicInfo.put("CompressionType", compressionType); String language = pdfBoxDoc.getDocumentCatalog().getLanguage(); - jsonOutput.put("Language", language); + basicInfo.put("Language", language); + basicInfo.put("Number of pages", pdfBoxDoc.getNumberOfPages()); - // Document Information 
using PDFBox - ObjectNode docInfoNode = objectMapper.createObjectNode(); - docInfoNode.put("Number of pages", pdfBoxDoc.getNumberOfPages()); - docInfoNode.put("PDF version", pdfBoxDoc.getVersion()); - // Page Mode using iText7 PdfCatalog catalog = itextDoc.getCatalog(); PdfName pageMode = catalog.getPdfObject().getAsName(PdfName.PageMode); + + // Document Information using PDFBox + docInfoNode.put("PDF version", pdfBoxDoc.getVersion()); + docInfoNode.put("Trapped", info.getTrapped()); + docInfoNode.put("Page Mode", getPageModeDescription(pageMode));; + @@ -193,7 +202,7 @@ public class PDFExtractor { } } - jsonOutput.set("EmbeddedFiles", embeddedFilesArray); + other.set("EmbeddedFiles", embeddedFilesArray); //attachments TODO size ArrayNode attachmentsArray = objectMapper.createArrayNode(); @@ -207,7 +216,7 @@ public class PDFExtractor { } } } - jsonOutput.set("Attachments", attachmentsArray); + other.set("Attachments", attachmentsArray); //Javascript PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names); @@ -226,7 +235,7 @@ public class PDFExtractor { } } - jsonOutput.set("JavaScript", javascriptArray); + other.set("JavaScript", javascriptArray); //TODO size PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false); @@ -240,7 +249,7 @@ public class PDFExtractor { } } - jsonOutput.set("Layers", layersArray); + other.set("Layers", layersArray); //TODO Security @@ -267,7 +276,7 @@ public class PDFExtractor { } } - jsonOutput.set("FormFields", formFieldsArray2); + jsonOutput.set("FormFields2", formFieldsArray2); PDStructureTreeRoot structureTreeRoot = pdfBoxDoc.getDocumentCatalog().getStructureTreeRoot(); @@ -275,19 +284,13 @@ public class PDFExtractor { try { if(structureTreeRoot != null) { structureTreeArray = exploreStructureTree(structureTreeRoot.getKids()); - jsonOutput.set("StructureTree", structureTreeArray); + other.set("StructureTree", structureTreeArray); } } catch (Exception e) { // TODO 
Auto-generated catch block e.printStackTrace(); } - - - - - - boolean isPdfACompliant = checkOutputIntent(itextDoc, "PDF/A"); boolean isPdfXCompliant = checkOutputIntent(itextDoc, "PDF/X"); @@ -297,7 +300,6 @@ public class PDFExtractor { boolean isPdfBCompliant = checkForStandard(itextDoc, "PDF/B"); // If you want to check for PDF/Broadcast, though this isn't an official ISO standard. boolean isPdfSECCompliant = checkForStandard(itextDoc, "PDF/SEC"); // This might not be effective since PDF/SEC was under development in 2021. - ObjectNode compliancy = objectMapper.createObjectNode(); compliancy.put("IsPDF/ACompliant", isPdfACompliant); compliancy.put("IsPDF/XCompliant", isPdfXCompliant); compliancy.put("IsPDF/ECompliant", isPdfECompliant); @@ -306,7 +308,6 @@ public class PDFExtractor { compliancy.put("IsPDF/BCompliant", isPdfBCompliant); compliancy.put("IsPDF/SECCompliant", isPdfSECCompliant); - jsonOutput.set("Compliancy", compliancy); @@ -318,7 +319,7 @@ public class PDFExtractor { addOutlinesToArray(child, bookmarksArray); } } - jsonOutput.set("Bookmarks/Outline/TOC", bookmarksArray); + other.set("Bookmarks/Outline/TOC", bookmarksArray); String xmpString = null; try { @@ -331,29 +332,27 @@ public class PDFExtractor { } catch (XMPException e) { e.printStackTrace(); } - jsonOutput.put("XMPMetadata", xmpString); + other.put("XMPMetadata", xmpString); - ObjectNode encryptionNode = objectMapper.createObjectNode(); if (pdfBoxDoc.isEncrypted()) { - encryptionNode.put("IsEncrypted", true); + encryption.put("IsEncrypted", true); // Retrieve encryption details using getEncryption() - PDEncryption encryption = pdfBoxDoc.getEncryption(); - encryptionNode.put("EncryptionAlgorithm", encryption.getFilter()); - encryptionNode.put("KeyLength", encryption.getLength()); - encryptionNode.put("Permissions", pdfBoxDoc.getCurrentAccessPermission().toString()); + PDEncryption pdfEncryption = pdfBoxDoc.getEncryption(); + encryption.put("EncryptionAlgorithm", pdfEncryption.getFilter()); 
+ encryption.put("KeyLength", pdfEncryption.getLength()); + encryption.put("Permissions", pdfBoxDoc.getCurrentAccessPermission().toString()); // Add other encryption-related properties as needed } else { - encryptionNode.put("IsEncrypted", false); + encryption.put("IsEncrypted", false); } - jsonOutput.set("Encryption", encryptionNode); - docInfoNode.put("Page Mode", getPageModeDescription(pageMode));; + + - jsonOutput.set("Document Information", docInfoNode); ObjectNode pageInfoParent = objectMapper.createObjectNode(); for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) { ObjectNode pageInfo = objectMapper.createObjectNode(); @@ -382,7 +381,6 @@ public class PDFExtractor { pageInfo.put("Text Characters Count", pageText.length()); // // Annotations - ArrayNode annotationsArray = objectMapper.createArrayNode(); List annotations = itextDoc.getPage(pageNum).getAnnotations(); int subtypeCount = 0; @@ -447,61 +445,57 @@ public class PDFExtractor { } pageInfo.set("Links", linksArray); - //Fonts + // Fonts ArrayNode fontsArray = objectMapper.createArrayNode(); PdfDictionary fontDicts = resources.getResource(PdfName.Font); - Set uniqueSubtypes = new HashSet<>(); // To store unique subtypes + Set uniqueSubtypes = new HashSet<>(); // To store unique subtypes + + // Map to store unique fonts and their counts + Map uniqueFontsMap = new HashMap<>(); - if (fontDicts != null) { for (PdfName key : fontDicts.keySet()) { - ObjectNode fontNode = objectMapper.createObjectNode(); // Create a new font node for each font + ObjectNode fontNode = objectMapper.createObjectNode(); // Create a new font node for each font PdfDictionary font = fontDicts.getAsDictionary(key); - - boolean isEmbedded = font.containsKey(PdfName.FontFile) || - font.containsKey(PdfName.FontFile2) || + + boolean isEmbedded = font.containsKey(PdfName.FontFile) || + font.containsKey(PdfName.FontFile2) || font.containsKey(PdfName.FontFile3); - fontNode.put("IsEmbedded", isEmbedded); - - - if 
(font.containsKey(PdfName.Encoding)) { - String encoding = font.getAsName(PdfName.Encoding).toString(); - fontNode.put("Encoding", encoding); - } - - - - if(font.getAsString(PdfName.BaseFont) != null) + fontNode.put("IsEmbedded", isEmbedded); + + if (font.containsKey(PdfName.Encoding)) { + String encoding = font.getAsName(PdfName.Encoding).toString(); + fontNode.put("Encoding", encoding); + } + + if (font.getAsString(PdfName.BaseFont) != null) { fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString()); - + } + String subtype = null; - // Font Subtype (e.g., Type1, TrueType) if (font.containsKey(PdfName.Subtype)) { subtype = font.getAsName(PdfName.Subtype).toString(); uniqueSubtypes.add(subtype); // Add to set to ensure uniqueness } fontNode.put("Subtype", subtype); - - // Font Descriptor + PdfDictionary fontDescriptor = font.getAsDictionary(PdfName.FontDescriptor); if (fontDescriptor != null) { - // Italic Angle if (fontDescriptor.containsKey(PdfName.ItalicAngle)) { fontNode.put("ItalicAngle", fontDescriptor.getAsNumber(PdfName.ItalicAngle).floatValue()); } - - // Flags (e.g., italic, bold) + if (fontDescriptor.containsKey(PdfName.Flags)) { int flags = fontDescriptor.getAsNumber(PdfName.Flags).intValue(); - fontNode.put("IsItalic", (flags & 64) != 0); // Existing italic flag - fontNode.put("IsBold", (flags & 1 << 16) != 0); // Existing bold flag + fontNode.put("IsItalic", (flags & 64) != 0); + fontNode.put("IsBold", (flags & 1 << 16) != 0); fontNode.put("IsFixedPitch", (flags & 1) != 0); fontNode.put("IsSerif", (flags & 2) != 0); fontNode.put("IsSymbolic", (flags & 4) != 0); fontNode.put("IsScript", (flags & 8) != 0); fontNode.put("IsNonsymbolic", (flags & 16) != 0); } - + if (fontDescriptor.containsKey(PdfName.FontFamily)) { String fontFamily = fontDescriptor.getAsString(PdfName.FontFamily).toString(); fontNode.put("FontFamily", fontFamily); @@ -511,34 +505,43 @@ public class PDFExtractor { String fontStretch = 
fontDescriptor.getAsName(PdfName.FontStretch).toString(); fontNode.put("FontStretch", fontStretch); } - - if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontBBox)) { + + if (fontDescriptor.containsKey(PdfName.FontBBox)) { PdfArray bbox = fontDescriptor.getAsArray(PdfName.FontBBox); fontNode.put("FontBoundingBox", bbox.toString()); } - if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontWeight)) { + + if (fontDescriptor.containsKey(PdfName.FontWeight)) { float fontWeight = fontDescriptor.getAsNumber(PdfName.FontWeight).floatValue(); fontNode.put("FontWeight", fontWeight); } - } + if (font.containsKey(PdfName.ToUnicode)) { - PdfStream toUnicodeStream = font.getAsStream(PdfName.ToUnicode); - // Handle the stream as needed, maybe extract some details or just note its existence fontNode.put("HasToUnicodeMap", true); } + if (fontNode.size() > 0) { - fontsArray.add(fontNode); // Add each font node to fontsArray + // Create a unique key for this font node based on its attributes + String uniqueKey = fontNode.toString(); + + // Increment count if this font exists, or initialize it if new + if (uniqueFontsMap.containsKey(uniqueKey)) { + ObjectNode existingFontNode = uniqueFontsMap.get(uniqueKey); + int count = existingFontNode.get("Count").asInt() + 1; + existingFontNode.put("Count", count); + } else { + fontNode.put("Count", 1); + uniqueFontsMap.put(uniqueKey, fontNode); + } } } } - // Add unique subtypes to fontsArray - ArrayNode subtypesArray = objectMapper.createArrayNode(); - for (String subtype : uniqueSubtypes) { - subtypesArray.add(subtype); + // Add unique font entries to fontsArray + for (ObjectNode uniqueFontNode : uniqueFontsMap.values()) { + fontsArray.add(uniqueFontNode); } - pageInfo.set("FontSubtypes", subtypesArray); // Changed from Fonts to FontSubtypes pageInfo.set("Fonts", fontsArray); @@ -605,8 +608,14 @@ public class PDFExtractor { pageInfoParent.set("Page " + pageNum, pageInfo); } + - jsonOutput.set("Per Page Info", 
pageInfoParent); + jsonOutput.set("BasicInfo", basicInfo); + jsonOutput.set("DocumentInfo", docInfoNode); + jsonOutput.set("Compliancy", compliancy); + jsonOutput.set("Encryption", encryption); + jsonOutput.set("Other", other); + jsonOutput.set("PerPageInfo", pageInfoParent); diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index 807e8581..5ae0c5a0 100644 --- a/src/main/resources/messages_en_GB.properties +++ b/src/main/resources/messages_en_GB.properties @@ -237,7 +237,7 @@ HTMLToPDF.tags=markup,web-content,transformation,convert home.MarkdownToPDF.title=Markdown to PDF -home.MarkdownToPDF.desc=Converts any Markdown fileto PDF +home.MarkdownToPDF.desc=Converts any Markdown file to PDF MarkdownToPDF.tags=markup,web-content,transformation,convert diff --git a/src/main/resources/static/js/fileInput.js b/src/main/resources/static/js/fileInput.js index 94b5294f..0842462e 100644 --- a/src/main/resources/static/js/fileInput.js +++ b/src/main/resources/static/js/fileInput.js @@ -1,12 +1,18 @@ document.addEventListener('DOMContentLoaded', function() { + document.querySelectorAll('.custom-file-chooser').forEach(setupFileInput); +}); +function setupFileInput(chooser) { + const elementId = chooser.getAttribute('data-element-id'); + const filesSelected = chooser.getAttribute('data-files-selected'); + const pdfPrompt = chooser.getAttribute('data-pdf-prompt'); + let overlay; let dragCounter = 0; const dragenterListener = function() { dragCounter++; if (!overlay) { - // Create and show the overlay overlay = document.createElement('div'); overlay.style.position = 'fixed'; overlay.style.top = 0; @@ -28,7 +34,6 @@ document.addEventListener('DOMContentLoaded', function() { const dragleaveListener = function() { dragCounter--; if (dragCounter === 0) { - // Hide and remove the overlay if (overlay) { overlay.remove(); overlay = null; @@ -40,24 +45,19 @@ document.addEventListener('DOMContentLoaded', function() { const dt = 
e.dataTransfer; const files = dt.files; - // Access the file input element and assign dropped files - const fileInput = document.getElementById(elementID); + const fileInput = document.getElementById(elementId); fileInput.files = files; - // Hide and remove the overlay if (overlay) { overlay.remove(); overlay = null; } - // Reset drag counter dragCounter = 0; - //handleFileInputChange(fileInput); fileInput.dispatchEvent(new Event('change', { bubbles: true })); }; - // Prevent default behavior for drag events ['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => { document.body.addEventListener(eventName, preventDefaults, false); }); @@ -69,29 +69,26 @@ document.addEventListener('DOMContentLoaded', function() { document.body.addEventListener('dragenter', dragenterListener); document.body.addEventListener('dragleave', dragleaveListener); - // Add drop event listener document.body.addEventListener('drop', dropListener); -}); + $("#" + elementId).on("change", function() { + handleFileInputChange(this); + }); -$("#"+elementID).on("change", function() { - handleFileInputChange(this); -}); - - -function handleFileInputChange(inputElement) { - const files = $(inputElement).get(0).files; - const fileNames = Array.from(files).map(f => f.name); - const selectedFilesContainer = $(inputElement).siblings(".selected-files"); - selectedFilesContainer.empty(); - fileNames.forEach(fileName => { - selectedFilesContainer.append("
    " + fileName + "
    "); - }); - if (fileNames.length === 1) { - $(inputElement).siblings(".custom-file-label").addClass("selected").html(fileNames[0]); - } else if (fileNames.length > 1) { - $(inputElement).siblings(".custom-file-label").addClass("selected").html(fileNames.length + " " + filesSelected); - } else { - $(inputElement).siblings(".custom-file-label").addClass("selected").html(pdfPrompt); - } -} \ No newline at end of file + function handleFileInputChange(inputElement) { + const files = $(inputElement).get(0).files; + const fileNames = Array.from(files).map(f => f.name); + const selectedFilesContainer = $(inputElement).siblings(".selected-files"); + selectedFilesContainer.empty(); + fileNames.forEach(fileName => { + selectedFilesContainer.append("
    " + fileName + "
    "); + }); + if (fileNames.length === 1) { + $(inputElement).siblings(".custom-file-label").addClass("selected").html(fileNames[0]); + } else if (fileNames.length > 1) { + $(inputElement).siblings(".custom-file-label").addClass("selected").html(fileNames.length + " " + filesSelected); + } else { + $(inputElement).siblings(".custom-file-label").addClass("selected").html(pdfPrompt); + } + } +} diff --git a/src/main/resources/templates/fragments/common.html b/src/main/resources/templates/fragments/common.html index eb079166..42477113 100644 --- a/src/main/resources/templates/fragments/common.html +++ b/src/main/resources/templates/fragments/common.html @@ -98,7 +98,10 @@ -
    +
    @@ -114,12 +117,7 @@
    - - + diff --git a/src/main/resources/templates/other/compare.html b/src/main/resources/templates/other/compare.html index 7c06062a..aeb83aee 100644 --- a/src/main/resources/templates/other/compare.html +++ b/src/main/resources/templates/other/compare.html @@ -15,8 +15,8 @@

    -
    -
    +
    +
    diff --git a/src/main/resources/templates/security/get-info-on-pdf.html b/src/main/resources/templates/security/get-info-on-pdf.html index 36a0bb6e..dc93186c 100644 --- a/src/main/resources/templates/security/get-info-on-pdf.html +++ b/src/main/resources/templates/security/get-info-on-pdf.html @@ -15,13 +15,109 @@

    -

    -
    + +

    +
    + +
    + +
    + + + Download JSON +
    +
    From b07437dbfa6907179917b505a6fcbe020fa88a97 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Wed, 2 Aug 2023 23:03:35 +0100 Subject: [PATCH 5/9] get info DONE! --- .../controller/api/security/GetInfoOnPDF.java | 18 +---------------- .../templates/security/get-info-on-pdf.html | 20 +++++++++++++------ 2 files changed, 15 insertions(+), 23 deletions(-) diff --git a/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java b/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java index a167cb74..fea57764 100644 --- a/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java +++ b/src/main/java/stirling/software/SPDF/controller/api/security/GetInfoOnPDF.java @@ -260,23 +260,7 @@ public class GetInfoOnPDF { // Digital Signatures using iText7 TODO - - - - PDAcroForm pdAcroForm = pdfBoxDoc.getDocumentCatalog().getAcroForm(); - ArrayNode formFieldsArray2 = objectMapper.createArrayNode(); - if (pdAcroForm != null) { - - for (PDField field : pdAcroForm.getFields()) { - ObjectNode fieldNode = objectMapper.createObjectNode(); - fieldNode.put("FieldName", field.getFullyQualifiedName()); - fieldNode.put("FieldType", field.getFieldType()); - // Add more attributes as needed... - formFieldsArray2.add(fieldNode); - } - - } - jsonOutput.set("FormFields2", formFieldsArray2); + PDStructureTreeRoot structureTreeRoot = pdfBoxDoc.getDocumentCatalog().getStructureTreeRoot(); diff --git a/src/main/resources/templates/security/get-info-on-pdf.html b/src/main/resources/templates/security/get-info-on-pdf.html index dc93186c..a77b5ae6 100644 --- a/src/main/resources/templates/security/get-info-on-pdf.html +++ b/src/main/resources/templates/security/get-info-on-pdf.html @@ -5,7 +5,6 @@ -
    @@ -71,14 +70,17 @@ function renderJsonSection(key, value, depth = 0) { + // Replace spaces and other non-alphanumeric characters with underscores for valid IDs + let safeKey = (typeof key === "string") ? key.replace(/[^a-zA-Z0-9]/g, '_') : key; + let output = `
    -
    +
    `; // Check if the value is an object and has children - if (value && typeof value === 'object' && Object.keys(value).length) { + if (value && typeof value === 'object' && (Object.keys(value).length || Array.isArray(value))) { output += ` - `; } else { @@ -89,7 +91,7 @@ output += `
    -
    `; +
    `; // Check if the value is a nested object if (typeof value === 'object' && !Array.isArray(value)) { @@ -99,9 +101,13 @@ } output += '
    '; } else if (typeof value === 'object' && Array.isArray(value) && value.length) { // Array values + output += '
    '; value.forEach((val, index) => { - output += renderJsonSection(index, val, depth + 1); + // For arrays, we're going to make the displayed key more descriptive. + const arrayKey = `${key}[${index}]`; + output += renderJsonSection(arrayKey, val, depth + 1); }); + output += '
    '; } output += '
    '; @@ -109,6 +115,8 @@ return output; } + + From 1e35556034d0ef87d0f21d660a1b402b7044eef2 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Sat, 5 Aug 2023 23:03:49 +0100 Subject: [PATCH 6/9] update --- .../api/converters/ConvertEpubToPdf.java | 139 +++++++++++++++++ src/main/resources/messages_en_GB.properties | 31 ++++ .../resources/templates/extract-page.html | 2 +- .../resources/templates/fragments/navbar.html | 5 +- .../templates/pdf-to-single-page.html | 1 - .../templates/security/get-info-on-pdf.html | 145 +++++++++--------- 6 files changed, 251 insertions(+), 72 deletions(-) create mode 100644 src/main/java/stirling/software/SPDF/controller/api/converters/ConvertEpubToPdf.java diff --git a/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertEpubToPdf.java b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertEpubToPdf.java new file mode 100644 index 00000000..6705bee8 --- /dev/null +++ b/src/main/java/stirling/software/SPDF/controller/api/converters/ConvertEpubToPdf.java @@ -0,0 +1,139 @@ +package stirling.software.SPDF.controller.api.converters; + +import java.io.IOException; + +import org.commonmark.node.Node; +import org.commonmark.parser.Parser; +import org.commonmark.renderer.html.HtmlRenderer; +import org.springframework.http.ResponseEntity; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestPart; +import org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; + +import io.swagger.v3.oas.annotations.Hidden; +import io.swagger.v3.oas.annotations.Operation; +import io.swagger.v3.oas.annotations.tags.Tag; +import stirling.software.SPDF.utils.FileToPdf; +import stirling.software.SPDF.utils.WebResponseUtils; +import org.springframework.web.bind.annotation.PostMapping; +import org.springframework.web.bind.annotation.RequestPart; +import 
org.springframework.web.bind.annotation.RestController; +import org.springframework.web.multipart.MultipartFile; +import org.springframework.http.ResponseEntity; + +import io.swagger.v3.oas.annotations.Operation; + +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.DocumentBuilder; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.StringReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +@RestController +@Tag(name = "Convert", description = "Convert APIs") +public class ConvertEpubToPdf { + //TODO + @PostMapping(consumes = "multipart/form-data", value = "/epub-to-single-pdf") + @Hidden + @Operation( + summary = "Convert an EPUB file to a single PDF", + description = "This endpoint takes an EPUB file input and converts it to a single PDF." + ) + public ResponseEntity epubToSinglePdf( + @RequestPart(required = true, value = "fileInput") MultipartFile fileInput) + throws Exception { + + if (fileInput == null) { + throw new IllegalArgumentException("Please provide an EPUB file for conversion."); + } + + String originalFilename = fileInput.getOriginalFilename(); + if (originalFilename == null || !originalFilename.endsWith(".epub")) { + throw new IllegalArgumentException("File must be in .epub format."); + } + + Map epubContents = extractEpubContent(fileInput); + List htmlFilesOrder = getHtmlFilesOrderFromOpf(epubContents); + + List individualPdfs = new ArrayList<>(); + + for (String htmlFile : htmlFilesOrder) { + byte[] htmlContent = epubContents.get(htmlFile); + byte[] pdfBytes = FileToPdf.convertHtmlToPdf(htmlContent, htmlFile.replace(".html", ".pdf")); + individualPdfs.add(pdfBytes); + } + + // Pseudo-code to merge individual PDFs into one. 
+ byte[] mergedPdfBytes = mergeMultiplePdfsIntoOne(individualPdfs); + + return WebResponseUtils.bytesToWebResponse(mergedPdfBytes, originalFilename.replace(".epub", ".pdf")); + } + + // Assuming a pseudo-code function that merges multiple PDFs into one. + private byte[] mergeMultiplePdfsIntoOne(List individualPdfs) { + // You can use a library such as iText or PDFBox to perform the merging here. + // Return the byte[] of the merged PDF. + return null; + } + + private Map extractEpubContent(MultipartFile fileInput) throws IOException { + Map contentMap = new HashMap<>(); + + try (ZipInputStream zis = new ZipInputStream(fileInput.getInputStream())) { + ZipEntry zipEntry = zis.getNextEntry(); + while (zipEntry != null) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + byte[] buffer = new byte[1024]; + int read = 0; + while ((read = zis.read(buffer)) != -1) { + baos.write(buffer, 0, read); + } + contentMap.put(zipEntry.getName(), baos.toByteArray()); + zipEntry = zis.getNextEntry(); + } + } + + return contentMap; + } + + private List getHtmlFilesOrderFromOpf(Map epubContents) throws Exception { + String opfContent = new String(epubContents.get("OEBPS/content.opf")); // Adjusting for given path + DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); + InputSource is = new InputSource(new StringReader(opfContent)); + Document doc = dBuilder.parse(is); + + NodeList itemRefs = doc.getElementsByTagName("itemref"); + List htmlFilesOrder = new ArrayList<>(); + + for (int i = 0; i < itemRefs.getLength(); i++) { + Element itemRef = (Element) itemRefs.item(i); + String idref = itemRef.getAttribute("idref"); + + NodeList items = doc.getElementsByTagName("item"); + for (int j = 0; j < items.getLength(); j++) { + Element item = (Element) items.item(j); + if (idref.equals(item.getAttribute("id"))) { + htmlFilesOrder.add(item.getAttribute("href")); // Fetching the actual href + break; + } + 
} + } + + return htmlFilesOrder; + } + + +} diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index 5ae0c5a0..d66ed9c5 100644 --- a/src/main/resources/messages_en_GB.properties +++ b/src/main/resources/messages_en_GB.properties @@ -260,6 +260,37 @@ PdfToSinglePage.tags=single page # WEB PAGES # # # ########################### + + + +#pdfToSinglePage +pdfToSinglePage.title=PDF To Single Page +pdfToSinglePage.header=PDF To Single Page +pdfToSinglePage.submit=Convert To Single Page + + +#pageExtracter +pageExtracter.title=Extract Pages +pageExtracter.header=Extract Pages +pageExtracter.submit=Extract + + +#getPdfInfo +getPdfInfo.title=Get Info on PDF +getPdfInfo.header=Get Info on PDF +getPdfInfo.submit=Get Info +getPdfInfo.downloadJson=Download JSON + + +#markdown-to-pdf +MarkdownToPDF.title=Markdown To PDF +MarkdownToPDF.header=Markdown To PDF +MarkdownToPDF.submit=Convert +MarkdownToPDF.help=Work in progress +MarkdownToPDF.credit=Uses WeasyPrint + + + #url-to-pdf URLToPDF.title=URL To PDF URLToPDF.header=URL To PDF diff --git a/src/main/resources/templates/extract-page.html b/src/main/resources/templates/extract-page.html index 0a5fb158..9dd1b9ed 100644 --- a/src/main/resources/templates/extract-page.html +++ b/src/main/resources/templates/extract-page.html @@ -17,7 +17,7 @@
    - +
    diff --git a/src/main/resources/templates/fragments/navbar.html b/src/main/resources/templates/fragments/navbar.html index 5b4b594a..ef84ef3e 100644 --- a/src/main/resources/templates/fragments/navbar.html +++ b/src/main/resources/templates/fragments/navbar.html @@ -57,6 +57,8 @@
    +
    +
    @@ -73,7 +75,7 @@
    - +
    @@ -102,6 +104,7 @@
    +
    diff --git a/src/main/resources/templates/pdf-to-single-page.html b/src/main/resources/templates/pdf-to-single-page.html index 1f5b64c6..75007224 100644 --- a/src/main/resources/templates/pdf-to-single-page.html +++ b/src/main/resources/templates/pdf-to-single-page.html @@ -14,7 +14,6 @@

    -

    diff --git a/src/main/resources/templates/security/get-info-on-pdf.html b/src/main/resources/templates/security/get-info-on-pdf.html index a77b5ae6..193056e4 100644 --- a/src/main/resources/templates/security/get-info-on-pdf.html +++ b/src/main/resources/templates/security/get-info-on-pdf.html @@ -1,35 +1,39 @@ - - - - - + + -
    -
    -
    -

    -
    -
    -
    -

    -

    -

    -
    -
    - +
    +
    +
    +

    +
    +
    +
    +

    + +
    +
    + - -
    - -
    - -
    - - - Download JSON + +
    + +
    + +
    + + +
    - -
    -
    -
    +
    +
    +
    -
    -
    -
    +
    +
    +
    \ No newline at end of file From a27ddb40beb3d124ab1b1101c4f9cc645bf5e728 Mon Sep 17 00:00:00 2001 From: Anthony Stirling <77850077+Frooodle@users.noreply.github.com> Date: Sun, 6 Aug 2023 12:34:26 +0100 Subject: [PATCH 7/9] navbar, blank page desc and drag drop append --- src/main/resources/messages_en_GB.properties | 6 +++--- src/main/resources/static/js/fileInput.js | 17 +++++++++++++---- .../resources/templates/fragments/navbar.html | 9 +++++---- 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/src/main/resources/messages_en_GB.properties b/src/main/resources/messages_en_GB.properties index d66ed9c5..bc01fb5f 100644 --- a/src/main/resources/messages_en_GB.properties +++ b/src/main/resources/messages_en_GB.properties @@ -403,10 +403,10 @@ certSign.submit=Sign PDF #removeBlanks removeBlanks.title=Remove Blanks removeBlanks.header=Remove Blank Pages -removeBlanks.threshold=Threshold: -removeBlanks.thresholdDesc=Threshold for determining how white a white pixel must be +removeBlanks.threshold=Pixel Whiteness Threshold: +removeBlanks.thresholdDesc=Threshold for determining how white a white pixel must be to be classed as 'White'. 0 = Black, 255 pure white. 
removeBlanks.whitePercent=White Percent (%): -removeBlanks.whitePercentDesc=Percent of page that must be white to be removed +removeBlanks.whitePercentDesc=Percent of page that must be 'white' pixels to be removed removeBlanks.submit=Remove Blanks diff --git a/src/main/resources/static/js/fileInput.js b/src/main/resources/static/js/fileInput.js index 0842462e..9379a7bf 100644 --- a/src/main/resources/static/js/fileInput.js +++ b/src/main/resources/static/js/fileInput.js @@ -6,7 +6,8 @@ function setupFileInput(chooser) { const elementId = chooser.getAttribute('data-element-id'); const filesSelected = chooser.getAttribute('data-files-selected'); const pdfPrompt = chooser.getAttribute('data-pdf-prompt'); - + + let allFiles = []; let overlay; let dragCounter = 0; @@ -42,11 +43,19 @@ function setupFileInput(chooser) { }; const dropListener = function(e) { + e.preventDefault(); const dt = e.dataTransfer; const files = dt.files; + for (let i = 0; i < files.length; i++) { + allFiles.push(files[i]); + } + + const dataTransfer = new DataTransfer(); + allFiles.forEach(file => dataTransfer.items.add(file)); + const fileInput = document.getElementById(elementId); - fileInput.files = files; + fileInput.files = dataTransfer.files; if (overlay) { overlay.remove(); @@ -76,8 +85,8 @@ function setupFileInput(chooser) { }); function handleFileInputChange(inputElement) { - const files = $(inputElement).get(0).files; - const fileNames = Array.from(files).map(f => f.name); + const files = allFiles; + const fileNames = files.map(f => f.name); const selectedFilesContainer = $(inputElement).siblings(".selected-files"); selectedFilesContainer.empty(); fileNames.forEach(fileName => { diff --git a/src/main/resources/templates/fragments/navbar.html b/src/main/resources/templates/fragments/navbar.html index ef84ef3e..c20a498c 100644 --- a/src/main/resources/templates/fragments/navbar.html +++ b/src/main/resources/templates/fragments/navbar.html @@ -40,7 +40,7 @@ --> -
    -