get info changes

This commit is contained in:
Anthony Stirling 2023-08-02 22:49:43 +01:00
parent 77411e94a4
commit 96f05cd518
6 changed files with 226 additions and 126 deletions

View file

@ -73,7 +73,7 @@ import java.util.Set;
import java.util.HashSet;
@RestController
@Tag(name = "Security", description = "Security APIs")
public class PDFExtractor {
public class GetInfoOnPDF {
static ObjectMapper objectMapper = new ObjectMapper();
@ -94,6 +94,13 @@ public class PDFExtractor {
// Metadata using PDFBox
PDDocumentInformation info = pdfBoxDoc.getDocumentInformation();
ObjectNode metadata = objectMapper.createObjectNode();
ObjectNode basicInfo = objectMapper.createObjectNode();
ObjectNode docInfoNode = objectMapper.createObjectNode();
ObjectNode compliancy = objectMapper.createObjectNode();
ObjectNode encryption = objectMapper.createObjectNode();
ObjectNode other = objectMapper.createObjectNode();
metadata.put("Title", info.getTitle());
metadata.put("Author", info.getAuthor());
metadata.put("Subject", info.getSubject());
@ -102,25 +109,25 @@ public class PDFExtractor {
metadata.put("Creator", info.getCreator());
metadata.put("CreationDate", formatDate(info.getCreationDate()));
metadata.put("ModificationDate", formatDate(info.getModificationDate()));
metadata.put("Trapped", info.getTrapped());
jsonOutput.set("Metadata", metadata);
// Total file size of the PDF
long fileSizeInBytes = inputFile.getSize();
jsonOutput.put("FileSizeInBytes", fileSizeInBytes);
basicInfo.put("FileSizeInBytes", fileSizeInBytes);
// Number of words, paragraphs, and images in the entire document
String fullText = new PDFTextStripper().getText(pdfBoxDoc);
String[] words = fullText.split("\\s+");
int wordCount = words.length;
int paragraphCount = fullText.split("\r\n|\r|\n").length;
jsonOutput.put("WordCount", wordCount);
jsonOutput.put("ParagraphCount", paragraphCount);
basicInfo.put("WordCount", wordCount);
basicInfo.put("ParagraphCount", paragraphCount);
// Number of characters in the entire document (including spaces and special characters)
int charCount = fullText.length();
jsonOutput.put("CharacterCount", charCount);
basicInfo.put("CharacterCount", charCount);
// Initialize the flags and types
@ -142,22 +149,24 @@ public class PDFExtractor {
hasCompression = true;
compressionType = "Compressed Xref or Rebuilt Xref";
}
jsonOutput.put("Compression", hasCompression);
basicInfo.put("Compression", hasCompression);
if(hasCompression)
jsonOutput.put("CompressionType", compressionType);
basicInfo.put("CompressionType", compressionType);
String language = pdfBoxDoc.getDocumentCatalog().getLanguage();
jsonOutput.put("Language", language);
basicInfo.put("Language", language);
basicInfo.put("Number of pages", pdfBoxDoc.getNumberOfPages());
// Document Information using PDFBox
ObjectNode docInfoNode = objectMapper.createObjectNode();
docInfoNode.put("Number of pages", pdfBoxDoc.getNumberOfPages());
docInfoNode.put("PDF version", pdfBoxDoc.getVersion());
// Page Mode using iText7
PdfCatalog catalog = itextDoc.getCatalog();
PdfName pageMode = catalog.getPdfObject().getAsName(PdfName.PageMode);
// Document Information using PDFBox
docInfoNode.put("PDF version", pdfBoxDoc.getVersion());
docInfoNode.put("Trapped", info.getTrapped());
docInfoNode.put("Page Mode", getPageModeDescription(pageMode));;
@ -193,7 +202,7 @@ public class PDFExtractor {
}
}
jsonOutput.set("EmbeddedFiles", embeddedFilesArray);
other.set("EmbeddedFiles", embeddedFilesArray);
//attachments TODO size
ArrayNode attachmentsArray = objectMapper.createArrayNode();
@ -207,7 +216,7 @@ public class PDFExtractor {
}
}
}
jsonOutput.set("Attachments", attachmentsArray);
other.set("Attachments", attachmentsArray);
//Javascript
PdfDictionary namesDict = itextDoc.getCatalog().getPdfObject().getAsDictionary(PdfName.Names);
@ -226,7 +235,7 @@ public class PDFExtractor {
}
}
jsonOutput.set("JavaScript", javascriptArray);
other.set("JavaScript", javascriptArray);
//TODO size
PdfOCProperties ocProperties = itextDoc.getCatalog().getOCProperties(false);
@ -240,7 +249,7 @@ public class PDFExtractor {
}
}
jsonOutput.set("Layers", layersArray);
other.set("Layers", layersArray);
//TODO Security
@ -267,7 +276,7 @@ public class PDFExtractor {
}
}
jsonOutput.set("FormFields", formFieldsArray2);
jsonOutput.set("FormFields2", formFieldsArray2);
PDStructureTreeRoot structureTreeRoot = pdfBoxDoc.getDocumentCatalog().getStructureTreeRoot();
@ -275,19 +284,13 @@ public class PDFExtractor {
try {
if(structureTreeRoot != null) {
structureTreeArray = exploreStructureTree(structureTreeRoot.getKids());
jsonOutput.set("StructureTree", structureTreeArray);
other.set("StructureTree", structureTreeArray);
}
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
boolean isPdfACompliant = checkOutputIntent(itextDoc, "PDF/A");
boolean isPdfXCompliant = checkOutputIntent(itextDoc, "PDF/X");
@ -297,7 +300,6 @@ public class PDFExtractor {
boolean isPdfBCompliant = checkForStandard(itextDoc, "PDF/B"); // If you want to check for PDF/Broadcast, though this isn't an official ISO standard.
boolean isPdfSECCompliant = checkForStandard(itextDoc, "PDF/SEC"); // This might not be effective since PDF/SEC was under development in 2021.
ObjectNode compliancy = objectMapper.createObjectNode();
compliancy.put("IsPDF/ACompliant", isPdfACompliant);
compliancy.put("IsPDF/XCompliant", isPdfXCompliant);
compliancy.put("IsPDF/ECompliant", isPdfECompliant);
@ -306,7 +308,6 @@ public class PDFExtractor {
compliancy.put("IsPDF/BCompliant", isPdfBCompliant);
compliancy.put("IsPDF/SECCompliant", isPdfSECCompliant);
jsonOutput.set("Compliancy", compliancy);
@ -318,7 +319,7 @@ public class PDFExtractor {
addOutlinesToArray(child, bookmarksArray);
}
}
jsonOutput.set("Bookmarks/Outline/TOC", bookmarksArray);
other.set("Bookmarks/Outline/TOC", bookmarksArray);
String xmpString = null;
try {
@ -331,29 +332,27 @@ public class PDFExtractor {
} catch (XMPException e) {
e.printStackTrace();
}
jsonOutput.put("XMPMetadata", xmpString);
other.put("XMPMetadata", xmpString);
ObjectNode encryptionNode = objectMapper.createObjectNode();
if (pdfBoxDoc.isEncrypted()) {
encryptionNode.put("IsEncrypted", true);
encryption.put("IsEncrypted", true);
// Retrieve encryption details using getEncryption()
PDEncryption encryption = pdfBoxDoc.getEncryption();
encryptionNode.put("EncryptionAlgorithm", encryption.getFilter());
encryptionNode.put("KeyLength", encryption.getLength());
encryptionNode.put("Permissions", pdfBoxDoc.getCurrentAccessPermission().toString());
PDEncryption pdfEncryption = pdfBoxDoc.getEncryption();
encryption.put("EncryptionAlgorithm", pdfEncryption.getFilter());
encryption.put("KeyLength", pdfEncryption.getLength());
encryption.put("Permissions", pdfBoxDoc.getCurrentAccessPermission().toString());
// Add other encryption-related properties as needed
} else {
encryptionNode.put("IsEncrypted", false);
encryption.put("IsEncrypted", false);
}
jsonOutput.set("Encryption", encryptionNode);
docInfoNode.put("Page Mode", getPageModeDescription(pageMode));;
jsonOutput.set("Document Information", docInfoNode);
ObjectNode pageInfoParent = objectMapper.createObjectNode();
for (int pageNum = 1; pageNum <= itextDoc.getNumberOfPages(); pageNum++) {
ObjectNode pageInfo = objectMapper.createObjectNode();
@ -382,7 +381,6 @@ public class PDFExtractor {
pageInfo.put("Text Characters Count", pageText.length()); //
// Annotations
ArrayNode annotationsArray = objectMapper.createArrayNode();
List<PdfAnnotation> annotations = itextDoc.getPage(pageNum).getAnnotations();
int subtypeCount = 0;
@ -447,61 +445,57 @@ public class PDFExtractor {
}
pageInfo.set("Links", linksArray);
//Fonts
// Fonts
ArrayNode fontsArray = objectMapper.createArrayNode();
PdfDictionary fontDicts = resources.getResource(PdfName.Font);
Set<String> uniqueSubtypes = new HashSet<>(); // To store unique subtypes
Set<String> uniqueSubtypes = new HashSet<>(); // To store unique subtypes
// Map to store unique fonts and their counts
Map<String, ObjectNode> uniqueFontsMap = new HashMap<>();
if (fontDicts != null) {
for (PdfName key : fontDicts.keySet()) {
ObjectNode fontNode = objectMapper.createObjectNode(); // Create a new font node for each font
ObjectNode fontNode = objectMapper.createObjectNode(); // Create a new font node for each font
PdfDictionary font = fontDicts.getAsDictionary(key);
boolean isEmbedded = font.containsKey(PdfName.FontFile) ||
font.containsKey(PdfName.FontFile2) ||
boolean isEmbedded = font.containsKey(PdfName.FontFile) ||
font.containsKey(PdfName.FontFile2) ||
font.containsKey(PdfName.FontFile3);
fontNode.put("IsEmbedded", isEmbedded);
if (font.containsKey(PdfName.Encoding)) {
String encoding = font.getAsName(PdfName.Encoding).toString();
fontNode.put("Encoding", encoding);
}
if(font.getAsString(PdfName.BaseFont) != null)
fontNode.put("IsEmbedded", isEmbedded);
if (font.containsKey(PdfName.Encoding)) {
String encoding = font.getAsName(PdfName.Encoding).toString();
fontNode.put("Encoding", encoding);
}
if (font.getAsString(PdfName.BaseFont) != null) {
fontNode.put("Name", font.getAsString(PdfName.BaseFont).toString());
}
String subtype = null;
// Font Subtype (e.g., Type1, TrueType)
if (font.containsKey(PdfName.Subtype)) {
subtype = font.getAsName(PdfName.Subtype).toString();
uniqueSubtypes.add(subtype); // Add to set to ensure uniqueness
}
fontNode.put("Subtype", subtype);
// Font Descriptor
PdfDictionary fontDescriptor = font.getAsDictionary(PdfName.FontDescriptor);
if (fontDescriptor != null) {
// Italic Angle
if (fontDescriptor.containsKey(PdfName.ItalicAngle)) {
fontNode.put("ItalicAngle", fontDescriptor.getAsNumber(PdfName.ItalicAngle).floatValue());
}
// Flags (e.g., italic, bold)
if (fontDescriptor.containsKey(PdfName.Flags)) {
int flags = fontDescriptor.getAsNumber(PdfName.Flags).intValue();
fontNode.put("IsItalic", (flags & 64) != 0); // Existing italic flag
fontNode.put("IsBold", (flags & 1 << 16) != 0); // Existing bold flag
fontNode.put("IsItalic", (flags & 64) != 0);
fontNode.put("IsBold", (flags & 1 << 16) != 0);
fontNode.put("IsFixedPitch", (flags & 1) != 0);
fontNode.put("IsSerif", (flags & 2) != 0);
fontNode.put("IsSymbolic", (flags & 4) != 0);
fontNode.put("IsScript", (flags & 8) != 0);
fontNode.put("IsNonsymbolic", (flags & 16) != 0);
}
if (fontDescriptor.containsKey(PdfName.FontFamily)) {
String fontFamily = fontDescriptor.getAsString(PdfName.FontFamily).toString();
fontNode.put("FontFamily", fontFamily);
@ -511,34 +505,43 @@ public class PDFExtractor {
String fontStretch = fontDescriptor.getAsName(PdfName.FontStretch).toString();
fontNode.put("FontStretch", fontStretch);
}
if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontBBox)) {
if (fontDescriptor.containsKey(PdfName.FontBBox)) {
PdfArray bbox = fontDescriptor.getAsArray(PdfName.FontBBox);
fontNode.put("FontBoundingBox", bbox.toString());
}
if (fontDescriptor != null && fontDescriptor.containsKey(PdfName.FontWeight)) {
if (fontDescriptor.containsKey(PdfName.FontWeight)) {
float fontWeight = fontDescriptor.getAsNumber(PdfName.FontWeight).floatValue();
fontNode.put("FontWeight", fontWeight);
}
}
if (font.containsKey(PdfName.ToUnicode)) {
PdfStream toUnicodeStream = font.getAsStream(PdfName.ToUnicode);
// Handle the stream as needed, maybe extract some details or just note its existence
fontNode.put("HasToUnicodeMap", true);
}
if (fontNode.size() > 0) {
fontsArray.add(fontNode); // Add each font node to fontsArray
// Create a unique key for this font node based on its attributes
String uniqueKey = fontNode.toString();
// Increment count if this font exists, or initialize it if new
if (uniqueFontsMap.containsKey(uniqueKey)) {
ObjectNode existingFontNode = uniqueFontsMap.get(uniqueKey);
int count = existingFontNode.get("Count").asInt() + 1;
existingFontNode.put("Count", count);
} else {
fontNode.put("Count", 1);
uniqueFontsMap.put(uniqueKey, fontNode);
}
}
}
}
// Add unique subtypes to fontsArray
ArrayNode subtypesArray = objectMapper.createArrayNode();
for (String subtype : uniqueSubtypes) {
subtypesArray.add(subtype);
// Add unique font entries to fontsArray
for (ObjectNode uniqueFontNode : uniqueFontsMap.values()) {
fontsArray.add(uniqueFontNode);
}
pageInfo.set("FontSubtypes", subtypesArray); // Changed from Fonts to FontSubtypes
pageInfo.set("Fonts", fontsArray);
@ -605,8 +608,14 @@ public class PDFExtractor {
pageInfoParent.set("Page " + pageNum, pageInfo);
}
jsonOutput.set("Per Page Info", pageInfoParent);
jsonOutput.set("BasicInfo", basicInfo);
jsonOutput.set("DocumentInfo", docInfoNode);
jsonOutput.set("Compliancy", compliancy);
jsonOutput.set("Encryption", encryption);
jsonOutput.set("Other", other);
jsonOutput.set("PerPageInfo", pageInfoParent);

View file

@ -237,7 +237,7 @@ HTMLToPDF.tags=markup,web-content,transformation,convert
home.MarkdownToPDF.title=Markdown to PDF
home.MarkdownToPDF.desc=Converts any Markdown fileto PDF
home.MarkdownToPDF.desc=Converts any Markdown file to PDF
MarkdownToPDF.tags=markup,web-content,transformation,convert

View file

@ -1,12 +1,18 @@
// Once the DOM is parsed, wire up every file chooser widget found on the page.
document.addEventListener('DOMContentLoaded', () => {
    const choosers = document.querySelectorAll('.custom-file-chooser');
    choosers.forEach(setupFileInput);
});
function setupFileInput(chooser) {
const elementId = chooser.getAttribute('data-element-id');
const filesSelected = chooser.getAttribute('data-files-selected');
const pdfPrompt = chooser.getAttribute('data-pdf-prompt');
let overlay;
let dragCounter = 0;
const dragenterListener = function() {
dragCounter++;
if (!overlay) {
// Create and show the overlay
overlay = document.createElement('div');
overlay.style.position = 'fixed';
overlay.style.top = 0;
@ -28,7 +34,6 @@ document.addEventListener('DOMContentLoaded', function() {
const dragleaveListener = function() {
dragCounter--;
if (dragCounter === 0) {
// Hide and remove the overlay
if (overlay) {
overlay.remove();
overlay = null;
@ -40,24 +45,19 @@ document.addEventListener('DOMContentLoaded', function() {
const dt = e.dataTransfer;
const files = dt.files;
// Access the file input element and assign dropped files
const fileInput = document.getElementById(elementID);
const fileInput = document.getElementById(elementId);
fileInput.files = files;
// Hide and remove the overlay
if (overlay) {
overlay.remove();
overlay = null;
}
// Reset drag counter
dragCounter = 0;
//handleFileInputChange(fileInput);
fileInput.dispatchEvent(new Event('change', { bubbles: true }));
};
// Prevent default behavior for drag events
['dragenter', 'dragover', 'dragleave', 'drop'].forEach(eventName => {
document.body.addEventListener(eventName, preventDefaults, false);
});
@ -69,29 +69,26 @@ document.addEventListener('DOMContentLoaded', function() {
document.body.addEventListener('dragenter', dragenterListener);
document.body.addEventListener('dragleave', dragleaveListener);
// Add drop event listener
document.body.addEventListener('drop', dropListener);
});
$("#" + elementId).on("change", function() {
handleFileInputChange(this);
});
$("#"+elementID).on("change", function() {
handleFileInputChange(this);
});
// Refreshes the UI next to a file input after its selection changed:
// rebuilds the ".selected-files" sibling list and updates the
// ".custom-file-label" sibling with the file name, a count, or the prompt.
// inputElement: the <input type="file"> whose `files` list is read.
function handleFileInputChange(inputElement) {
    const files = $(inputElement).get(0).files;
    const fileNames = Array.from(files).map(f => f.name);
    const selectedFilesContainer = $(inputElement).siblings(".selected-files");
    selectedFilesContainer.empty();
    fileNames.forEach(fileName => {
        // File names are user-controlled: insert them as text, never as markup.
        selectedFilesContainer.append($("<div>").text(fileName));
    });
    const label = $(inputElement).siblings(".custom-file-label").addClass("selected");
    if (fileNames.length === 1) {
        label.text(fileNames[0]);
    } else if (fileNames.length > 1) {
        label.html(fileNames.length + " " + filesSelected);
    } else {
        label.html(pdfPrompt);
    }
}
// Refreshes the UI next to a file input after its selection changed:
// rebuilds the ".selected-files" sibling list and updates the
// ".custom-file-label" sibling with the file name, a count, or the prompt.
// inputElement: the <input type="file"> whose `files` list is read.
function handleFileInputChange(inputElement) {
    const files = $(inputElement).get(0).files;
    const fileNames = Array.from(files).map(f => f.name);
    const selectedFilesContainer = $(inputElement).siblings(".selected-files");
    selectedFilesContainer.empty();
    fileNames.forEach(fileName => {
        // File names are user-controlled: insert them as text, never as markup.
        selectedFilesContainer.append($("<div>").text(fileName));
    });
    const label = $(inputElement).siblings(".custom-file-label").addClass("selected");
    if (fileNames.length === 1) {
        label.text(fileNames[0]);
    } else if (fileNames.length > 1) {
        label.html(fileNames.length + " " + filesSelected);
    } else {
        label.html(pdfPrompt);
    }
}
}

View file

@ -98,7 +98,10 @@
</script>
<script src="js/downloader.js"></script>
<div class="custom-file-chooser">
<div class="custom-file-chooser" th:attr="data-unique-id=${name},
data-element-id=${name+'-input'},
data-files-selected=#{filesSelected},
data-pdf-prompt=#{pdfPrompt}">
<div class="custom-file">
<input type="file" class="custom-file-input" th:name="${name}" th:id="${name}+'-input'" th:accept="${accept}" multiple th:classappend="${notRequired ? '' : 'required'}">
<label class="custom-file-label" th:for="${name}+'-input'" th:text="${inputText}"></label>
@ -114,12 +117,7 @@
</div>
<button type="button" class="btn btn-primary" id="show-game-btn" style="display:none;">Bored waiting?</button>
<script th:inline="javascript">
const elementID = /*[[${name+"-input"}]]*/ '';
const filesSelected = /*[[#{filesSelected}]]*/ '';
const pdfPrompt = /*[[#{pdfPrompt}]]*/ '';
</script>
<script src="js/fileInput.js"></script>
<link rel="stylesheet" href="css/fileSelect.css">

View file

@ -15,8 +15,8 @@
<div class="col-md-9">
<h2 th:text="#{compare.header}"></h2>
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf')}"></div>
<div th:replace="~{fragments/common :: fileSelector(name='fileInput2', multiple=false, accept='application/pdf')}"></div>
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, accept='application/pdf', remoteCall='false')}"></div>
<div th:replace="~{fragments/common :: fileSelector(name='fileInput2', multiple=false, accept='application/pdf', remoteCall='false')}"></div>
<button class="btn btn-primary" onclick="comparePDFs()" th:text="#{compare.submit}"></button>

View file

@ -15,13 +15,109 @@
<div class="col-md-6">
<h2 th:text="#{getPdfInfo.header}"></h2>
<p th:text="#{processTimeWarning}">
<form method="post" enctype="multipart/form-data" th:action="@{get-info-on-pdf}">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false)}"></div>
<form id="pdfInfoForm" method="post" enctype="multipart/form-data" th:action="@{get-info-on-pdf}">
<div th:replace="~{fragments/common :: fileSelector(name='fileInput', multiple=false, remoteCall='false')}"></div>
<br>
<button type="submit" id="submitBtn" class="btn btn-primary" th:text="#{getPdfInfo.submit}"></button>
</form>
<div class="container mt-5">
<!-- Iterate over each main section in the JSON -->
<div id="json-content">
<!-- JavaScript will populate this section -->
</div>
<!-- Button to download the JSON -->
<a href="#" id="downloadJson" class="btn btn-primary mt-3">Download JSON</a>
</div>
<script>
// Intercept the form submit and post it with fetch instead, so the page is
// not reloaded and the JSON reply can be rendered inline.
document.getElementById("pdfInfoForm").addEventListener("submit", function(event) {
    event.preventDefault();
    // Collect the form fields (including the selected file) for the request body
    const formData = new FormData(event.target);
    fetch('get-info-on-pdf', {
        method: 'POST',
        body: formData
    })
    .then(response => {
        // fetch only rejects on network failure; HTTP error statuses must be
        // checked explicitly before trying to parse the body as JSON.
        if (!response.ok) {
            throw new Error('Request failed with status ' + response.status);
        }
        return response.json();
    })
    .then(data => {
        displayJsonData(data); // Display the data
        setDownloadLink(data); // Set download link
    })
    .catch(error => console.error('Error:', error));
});
// Renders each enumerable key of jsonData as a section (via renderJsonSection)
// and writes the concatenated markup into the #json-content element.
function displayJsonData(jsonData) {
    const sections = [];
    for (const sectionName in jsonData) {
        sections.push(renderJsonSection(sectionName, jsonData[sectionName]));
    }
    document.getElementById('json-content').innerHTML = sections.join('');
}
// Points the #downloadJson anchor at a data: URI holding jsonData,
// pretty-printed with 4-space indentation, so clicking it saves "data.json".
function setDownloadLink(jsonData) {
    const downloadLink = document.getElementById('downloadJson');
    const serialized = JSON.stringify(jsonData, null, 4);
    // application/json is the registered media type for JSON (text/json is not).
    const dataStr = "data:application/json;charset=utf-8," + encodeURIComponent(serialized);
    downloadLink.setAttribute("href", dataStr);
    downloadLink.setAttribute("download", "data.json");
}
// Builds the HTML for one JSON entry as a Bootstrap card and recurses into
// nested objects and arrays.
//   key        - property name (or array index) shown as the card label
//   value      - the JSON value; non-empty objects/arrays get a collapse toggle,
//                everything else is shown inline as "key: value"
//   depth      - nesting level, 0 for top-level sections
//   parentPath - id-safe path of the ancestors; callers normally omit it, it is
//                filled in by the recursion to keep element ids unique
// Returns the markup as a string.
function renderJsonSection(key, value, depth = 0, parentPath = '') {
    // Keys and values originate from the uploaded document, so they must be
    // escaped before being placed into markup that is assigned to innerHTML.
    const escapeHtml = (text) => String(text).replace(/[&<>"']/g, (ch) => ({
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        "'": '&#39;'
    })[ch]);
    // Keys may contain spaces, slashes, etc., which are not usable in an id or
    // a "#id" selector; encode every non-alphanumeric character as _<hex>.
    const toIdSegment = (text) => String(text).replace(/[^A-Za-z0-9]/g,
        (ch) => '_' + ch.charCodeAt(0).toString(16));

    // Including the ancestor path keeps ids unique when the same key occurs
    // under different parents at the same depth.
    const path = parentPath + '-' + toIdSegment(key);
    const headingId = 'json-heading-' + depth + path;
    const contentId = 'json-content-' + depth + path;
    const label = escapeHtml(key);
    const isContainer = value !== null && typeof value === 'object';

    let output = `<div class="card mb-3">
        <div class="card-header" id="${headingId}">
            <h5 class="mb-0">`;

    if (isContainer && Object.keys(value).length) {
        // Non-empty object/array: the label becomes a toggle for the children.
        // The panel starts collapsed, hence aria-expanded="false".
        output += `
                <button class="btn btn-link" type="button" data-toggle="collapse" data-target="#${contentId}" aria-expanded="false" aria-controls="${contentId}">
                    ${label}
                </button>`;
    } else {
        // Display both key and value for simple entries
        output += `${label}: ${escapeHtml(value)}`;
    }

    output += `
            </h5>
        </div>
        <div id="${contentId}" class="collapse" aria-labelledby="${headingId}">`;

    if (isContainer && !Array.isArray(value)) {
        // Nested object: render each property as a child card
        output += '<div class="card-body">';
        for (const subKey in value) {
            output += renderJsonSection(subKey, value[subKey], depth + 1, path);
        }
        output += '</div>';
    } else if (Array.isArray(value) && value.length) {
        // Array: render each element, labelled by its index
        value.forEach((val, index) => {
            output += renderJsonSection(index, val, depth + 1, path);
        });
    }

    output += '</div></div>';
    return output;
}
</script>
</div>
</div>
</div>