all inf
This commit is contained in:
parent
f98f089d63
commit
52a7885f3c
1 changed files with 335 additions and 0 deletions
|
@ -0,0 +1,335 @@
|
||||||
|
package stirling.software.SPDF.controller.api.security;
|
||||||
|
|
||||||
|
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.node.ArrayNode;
import com.fasterxml.jackson.databind.node.ObjectNode;
import com.itextpdf.forms.PdfAcroForm;
import com.itextpdf.forms.fields.PdfFormField;
import com.itextpdf.kernel.geom.Rectangle;
import com.itextpdf.kernel.pdf.PdfArray;
import com.itextpdf.kernel.pdf.PdfCatalog;
import com.itextpdf.kernel.pdf.PdfDictionary;
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfDocumentInfo;
import com.itextpdf.kernel.pdf.PdfEncryption;
import com.itextpdf.kernel.pdf.PdfName;
import com.itextpdf.kernel.pdf.PdfNumber;
import com.itextpdf.kernel.pdf.PdfObject;
import com.itextpdf.kernel.pdf.PdfPage;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfResources;
import com.itextpdf.kernel.pdf.PdfStream;
import com.itextpdf.kernel.pdf.PdfString;
import com.itextpdf.kernel.pdf.PdfViewerPreferences;
import com.itextpdf.kernel.pdf.PdfWriter;
import com.itextpdf.kernel.pdf.annot.PdfAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfFileAttachmentAnnotation;
import com.itextpdf.kernel.pdf.annot.PdfLinkAnnotation;
import com.itextpdf.kernel.pdf.layer.PdfLayer;
import com.itextpdf.kernel.pdf.layer.PdfOCProperties;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Calendar;
import java.util.List;
import java.util.Map;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.text.PDFTextStripper;
|
||||||
|
|
||||||
|
public class PDFExtractor {
|
||||||
|
public static void main(String[] args) {
|
||||||
|
try {
|
||||||
|
PDDocument pdfBoxDoc = PDDocument.load(new File("path_to_pdf.pdf"));
|
||||||
|
ObjectMapper objectMapper = new ObjectMapper();
|
||||||
|
ObjectNode jsonOutput = objectMapper.createObjectNode();
|
||||||
|
|
||||||
|
// Metadata using PDFBox
|
||||||
|
PDDocumentInformation info = pdfBoxDoc.getDocumentInformation();
|
||||||
|
ObjectNode metadata = objectMapper.createObjectNode();
|
||||||
|
metadata.put("Title", info.getTitle());
|
||||||
|
metadata.put("Author", info.getAuthor());
|
||||||
|
metadata.put("Subject", info.getSubject());
|
||||||
|
metadata.put("Keywords", info.getKeywords());
|
||||||
|
metadata.put("Producer", info.getProducer());
|
||||||
|
metadata.put("Creator", info.getCreator());
|
||||||
|
metadata.put("CreationDate", formatDate(info.getCreationDate()));
|
||||||
|
metadata.put("ModificationDate", formatDate(info.getModificationDate()));
|
||||||
|
metadata.put("Trapped", info.getTrapped());
|
||||||
|
jsonOutput.set("Metadata", metadata);
|
||||||
|
|
||||||
|
// Document Information using PDFBox
|
||||||
|
ObjectNode docInfoNode = objectMapper.createObjectNode();
|
||||||
|
docInfoNode.put("Number of pages", pdfBoxDoc.getNumberOfPages());
|
||||||
|
docInfoNode.put("PDF version", pdfBoxDoc.getVersion());
|
||||||
|
;
|
||||||
|
|
||||||
|
// Page Mode using iText7
|
||||||
|
PdfDocument itextDoc = new PdfDocument(new PdfReader("path_to_pdf.pdf"));
|
||||||
|
PdfCatalog catalog = itextDoc.getCatalog();
|
||||||
|
PdfName pageMode = catalog.getPdfObject().getAsName(PdfName.PageMode);
|
||||||
|
|
||||||
|
ObjectNode itextDocInfo = objectMapper.createObjectNode();
|
||||||
|
docInfoNode.put("Page Mode", getPageModeDescription(pageMode));;
|
||||||
|
|
||||||
|
jsonOutput.set("Document Information", docInfoNode);
|
||||||
|
|
||||||
|
for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) {
|
||||||
|
ObjectNode pageInfo = objectMapper.createObjectNode();
|
||||||
|
|
||||||
|
// Page-level Information
|
||||||
|
Rectangle pageSize = itextDoc.getPage(pageNum).getPageSize();
|
||||||
|
pageInfo.put("Width", pageSize.getWidth());
|
||||||
|
pageInfo.put("Height", pageSize.getHeight());
|
||||||
|
pageInfo.put("Rotation", itextDoc.getPage(pageNum).getRotation());
|
||||||
|
|
||||||
|
// Boxes
|
||||||
|
pageInfo.put("MediaBox", itextDoc.getPage(pageNum).getMediaBox().toString());
|
||||||
|
pageInfo.put("CropBox", itextDoc.getPage(pageNum).getCropBox().toString());
|
||||||
|
pageInfo.put("BleedBox", itextDoc.getPage(pageNum).getBleedBox().toString());
|
||||||
|
pageInfo.put("TrimBox", itextDoc.getPage(pageNum).getTrimBox().toString());
|
||||||
|
pageInfo.put("ArtBox", itextDoc.getPage(pageNum).getArtBox().toString());
|
||||||
|
|
||||||
|
// Content Extraction
|
||||||
|
PDFTextStripper textStripper = new PDFTextStripper();
|
||||||
|
textStripper.setStartPage(pageNum -1);
|
||||||
|
textStripper.setEndPage(pageNum - 1);
|
||||||
|
String pageText = textStripper.getText(pdfBoxDoc);
|
||||||
|
|
||||||
|
pageInfo.put("Text Characters Count", pageText.length()); //
|
||||||
|
|
||||||
|
// Annotations
|
||||||
|
ArrayNode annotationsArray = objectMapper.createArrayNode();
|
||||||
|
List<PdfAnnotation> annotations = itextDoc.getPage(pageNum).getAnnotations();
|
||||||
|
for (PdfAnnotation annotation : annotations) {
|
||||||
|
ObjectNode annotationNode = objectMapper.createObjectNode();
|
||||||
|
annotationNode.put("Subtype", annotation.getSubtype().toString());
|
||||||
|
annotationNode.put("Contents", annotation.getContents().getValue());
|
||||||
|
annotationsArray.add(annotationNode);
|
||||||
|
}
|
||||||
|
pageInfo.set("Annotations", annotationsArray);
|
||||||
|
|
||||||
|
// Images (simplified)
|
||||||
|
// This part is non-trivial as images can be embedded in multiple ways in a PDF.
|
||||||
|
// Here is a basic structure to recognize image XObjects on a page.
|
||||||
|
ArrayNode imagesArray = objectMapper.createArrayNode();
|
||||||
|
PdfResources resources = itextDoc.getPage(pageNum).getResources();
|
||||||
|
for (PdfName name : resources.getResourceNames()) {
|
||||||
|
PdfObject obj = resources.getResource(name);
|
||||||
|
if (obj instanceof PdfStream) {
|
||||||
|
PdfStream stream = (PdfStream) obj;
|
||||||
|
if (PdfName.Image.equals(stream.getAsName(PdfName.Subtype))) {
|
||||||
|
ObjectNode imageNode = objectMapper.createObjectNode();
|
||||||
|
imageNode.put("Width", stream.getAsNumber(PdfName.Width).intValue());
|
||||||
|
imageNode.put("Height", stream.getAsNumber(PdfName.Height).intValue());
|
||||||
|
PdfObject colorSpace = stream.get(PdfName.ColorSpace);
|
||||||
|
if (colorSpace != null) {
|
||||||
|
imageNode.put("ColorSpace", colorSpace.toString());
|
||||||
|
}
|
||||||
|
imagesArray.add(imageNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageInfo.set("Images", imagesArray);
|
||||||
|
|
||||||
|
// Links
|
||||||
|
ArrayNode linksArray = objectMapper.createArrayNode();
|
||||||
|
for (PdfAnnotation annotation : annotations) {
|
||||||
|
if (annotation instanceof PdfLinkAnnotation) {
|
||||||
|
PdfLinkAnnotation linkAnnotation = (PdfLinkAnnotation) annotation;
|
||||||
|
ObjectNode linkNode = objectMapper.createObjectNode();
|
||||||
|
linkNode.put("URI", linkAnnotation.getAction().toString()); // Basic, might not work for all links
|
||||||
|
linksArray.add(linkNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageInfo.set("Links", linksArray);
|
||||||
|
|
||||||
|
//Fonts
|
||||||
|
ArrayNode fontsArray = objectMapper.createArrayNode();
|
||||||
|
PdfDictionary fontDicts = resources.getResource(PdfName.Font);
|
||||||
|
if (fontDicts != null) {
|
||||||
|
for (PdfName key : fontDicts.keySet()) {
|
||||||
|
PdfDictionary font = fontDicts.getAsDictionary(key);
|
||||||
|
ObjectNode fontNode = objectMapper.createObjectNode();
|
||||||
|
fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString());
|
||||||
|
|
||||||
|
// Font Subtype (e.g., Type1, TrueType)
|
||||||
|
if (font.containsKey(PdfName.Subtype)) {
|
||||||
|
fontNode.put("Subtype", font.getAsName(PdfName.Subtype).toString());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Font Descriptor
|
||||||
|
PdfDictionary fontDescriptor = font.getAsDictionary(PdfName.FontDescriptor);
|
||||||
|
if (fontDescriptor != null) {
|
||||||
|
// Italic Angle
|
||||||
|
if (fontDescriptor.containsKey(PdfName.ItalicAngle)) {
|
||||||
|
fontNode.put("ItalicAngle", fontDescriptor.getAsNumber(PdfName.ItalicAngle).floatValue());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Flags (e.g., italic, bold)
|
||||||
|
if (fontDescriptor.containsKey(PdfName.Flags)) {
|
||||||
|
int flags = fontDescriptor.getAsNumber(PdfName.Flags).intValue();
|
||||||
|
fontNode.put("IsItalic", (flags & 64) != 0);
|
||||||
|
fontNode.put("IsBold", (flags & 1) != 0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fontsArray.add(fontNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageInfo.set("Fonts", fontsArray);
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Access resources dictionary
|
||||||
|
PdfDictionary resourcesDict = itextDoc.getPage(pageNum).getResources().getPdfObject();
|
||||||
|
|
||||||
|
// Color Spaces & ICC Profiles
|
||||||
|
ArrayNode colorSpacesArray = objectMapper.createArrayNode();
|
||||||
|
PdfDictionary colorSpaces = resourcesDict.getAsDictionary(PdfName.ColorSpace);
|
||||||
|
if (colorSpaces != null) {
|
||||||
|
for (PdfName name : colorSpaces.keySet()) {
|
||||||
|
PdfObject colorSpaceObject = colorSpaces.get(name);
|
||||||
|
if (colorSpaceObject instanceof PdfArray) {
|
||||||
|
PdfArray colorSpaceArray = (PdfArray) colorSpaceObject;
|
||||||
|
if (colorSpaceArray.size() > 1 && colorSpaceArray.get(0) instanceof PdfName && PdfName.ICCBased.equals(colorSpaceArray.get(0))) {
|
||||||
|
ObjectNode iccProfileNode = objectMapper.createObjectNode();
|
||||||
|
PdfStream iccStream = (PdfStream) colorSpaceArray.get(1);
|
||||||
|
byte[] iccData = iccStream.getBytes();
|
||||||
|
// TODO: Further decode and analyze the ICC data if needed
|
||||||
|
iccProfileNode.put("ICC Profile Length", iccData.length);
|
||||||
|
colorSpacesArray.add(iccProfileNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageInfo.set("Color Spaces & ICC Profiles", colorSpacesArray);
|
||||||
|
|
||||||
|
// Other XObjects
|
||||||
|
ArrayNode xObjectsArray = objectMapper.createArrayNode();
|
||||||
|
PdfDictionary xObjects = resourcesDict.getAsDictionary(PdfName.XObject);
|
||||||
|
if (xObjects != null) {
|
||||||
|
for (PdfName name : xObjects.keySet()) {
|
||||||
|
PdfStream xObjectStream = xObjects.getAsStream(name);
|
||||||
|
ObjectNode xObjectNode = objectMapper.createObjectNode();
|
||||||
|
xObjectNode.put("Type", xObjectStream.getAsName(PdfName.Subtype).toString());
|
||||||
|
// TODO: Extract further details depending on the XObject type
|
||||||
|
xObjectsArray.add(xObjectNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pageInfo.set("XObjects", xObjectsArray);
|
||||||
|
|
||||||
|
jsonOutput.set("Page " + pageNum, pageInfo);
|
||||||
|
}
|
||||||
|
|
||||||
|
PdfAcroForm acroForm = PdfAcroForm.getAcroForm(itextDoc, false);
|
||||||
|
if (acroForm != null) {
|
||||||
|
ObjectNode formFieldsNode = objectMapper.createObjectNode();
|
||||||
|
for (Map.Entry<String, PdfFormField> entry : acroForm.getFormFields().entrySet()) {
|
||||||
|
formFieldsNode.put(entry.getKey(), entry.getValue().getValueAsString());
|
||||||
|
}
|
||||||
|
jsonOutput.set("FormFields", formFieldsNode);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//TODO bookmarks here
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
//embeed files TODO size
|
||||||
|
PdfDictionary embeddedFiles = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names)
|
||||||
|
.getAsDictionary(PdfName.EmbeddedFiles);
|
||||||
|
if (embeddedFiles != null) {
|
||||||
|
ArrayNode embeddedFilesArray = objectMapper.createArrayNode();
|
||||||
|
PdfArray namesArray = embeddedFiles.getAsArray(PdfName.Names);
|
||||||
|
for (int i = 0; i < namesArray.size(); i += 2) {
|
||||||
|
ObjectNode embeddedFileNode = objectMapper.createObjectNode();
|
||||||
|
embeddedFileNode.put("Name", namesArray.getAsString(i).toString());
|
||||||
|
// Add other details if required
|
||||||
|
embeddedFilesArray.add(embeddedFileNode);
|
||||||
|
}
|
||||||
|
jsonOutput.set("EmbeddedFiles", embeddedFilesArray);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//attachments TODO size
|
||||||
|
ArrayNode attachmentsArray = objectMapper.createArrayNode();
|
||||||
|
for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) {
|
||||||
|
for (PdfAnnotation annotation : itextDoc.getPage(pageNum).getAnnotations()) {
|
||||||
|
if (annotation instanceof PdfFileAttachmentAnnotation) {
|
||||||
|
ObjectNode attachmentNode = objectMapper.createObjectNode();
|
||||||
|
attachmentNode.put("Name", ((PdfFileAttachmentAnnotation) annotation).getName().toString());
|
||||||
|
attachmentNode.put("Description", annotation.getContents().getValue());
|
||||||
|
attachmentsArray.add(attachmentNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
jsonOutput.set("Attachments", attachmentsArray);
|
||||||
|
|
||||||
|
//Javascript
|
||||||
|
PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names);
|
||||||
|
if (namesDict != null) {
|
||||||
|
PdfDictionary javascriptDict = namesDict.getAsDictionary(PdfName.JavaScript);
|
||||||
|
if (javascriptDict != null) {
|
||||||
|
ArrayNode javascriptArray = objectMapper.createArrayNode();
|
||||||
|
PdfArray namesArray = javascriptDict.getAsArray(PdfName.Names);
|
||||||
|
for (int i = 0; i < namesArray.size(); i += 2) {
|
||||||
|
ObjectNode jsNode = objectMapper.createObjectNode();
|
||||||
|
jsNode.put("JS Name", namesArray.getAsString(i).toString());
|
||||||
|
jsNode.put("JS Code", namesArray.getAsString(i + 1).toString());
|
||||||
|
javascriptArray.add(jsNode);
|
||||||
|
}
|
||||||
|
jsonOutput.set("JavaScripts", javascriptArray);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//TODO size
|
||||||
|
PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false);
|
||||||
|
if (ocProperties != null) {
|
||||||
|
ArrayNode layersArray = objectMapper.createArrayNode();
|
||||||
|
for (PdfLayer layer : ocProperties.getLayers()) {
|
||||||
|
ObjectNode layerNode = objectMapper.createObjectNode();
|
||||||
|
layerNode.put("Name", layer.getPdfObject().getAsString(PdfName.Name).toString());
|
||||||
|
layersArray.add(layerNode);
|
||||||
|
}
|
||||||
|
jsonOutput.set("Layers", layersArray);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
//TODO Security
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
// Digital Signatures using iText7 TODO
|
||||||
|
|
||||||
|
|
||||||
|
// Save JSON to file
|
||||||
|
try (FileWriter file = new FileWriter("output.json")) {
|
||||||
|
file.write(objectMapper.writerWithDefaultPrettyPrinter().writeValueAsString(jsonOutput));
|
||||||
|
file.flush();
|
||||||
|
}
|
||||||
|
|
||||||
|
pdfBoxDoc.close();
|
||||||
|
itextDoc.close();
|
||||||
|
} catch (Exception e) {
|
||||||
|
e.printStackTrace();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String formatDate(Calendar calendar) {
|
||||||
|
if (calendar != null) {
|
||||||
|
SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
|
||||||
|
return sdf.format(calendar.getTime());
|
||||||
|
} else {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
private static String getPageModeDescription(PdfName pageMode) {
|
||||||
|
return pageMode != null ? pageMode.toString().replaceFirst("/", "") : "Unknown";
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in a new issue