diff --git a/.gitignore b/.gitignore
index 174eab39..1a241d6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -124,4 +124,7 @@ watchedFolders/
 
 # Ignore Mac DS_Store files
 .DS_Store
-**/.DS_Store
\ No newline at end of file
+**/.DS_Store
+
+#cucumber
+/cucumber/reports/**
\ No newline at end of file
diff --git a/cucumber/features/environment.py b/cucumber/features/environment.py
new file mode 100644
index 00000000..ec7f2c9f
--- /dev/null
+++ b/cucumber/features/environment.py
@@ -0,0 +1,24 @@
+import os
+
+
+def before_all(context):
+    """Initialise the request/response slots the step definitions read and write."""
+    context.endpoint = None
+    context.request_data = None
+    context.files = {}
+    context.response = None
+
+
+def after_scenario(context, scenario):
+    """Close the uploaded file handles and delete the scenario's temporary files."""
+    if hasattr(context, 'files'):
+        for handle in context.files.values():
+            handle.close()
+        # before_all creates context.files only once, so closed handles left in
+        # it would still be present (and be re-sent) in a later scenario that
+        # registers its upload under a different field name.
+        context.files.clear()
+    if os.path.exists('response_file'):
+        os.remove('response_file')
+    if hasattr(context, 'file_name') and os.path.exists(context.file_name):
+        os.remove(context.file_name)
diff --git a/cucumber/features/examples.feature b/cucumber/features/examples.feature
new file mode 100644
index 00000000..c18e0516
--- /dev/null
+++ b/cucumber/features/examples.feature
@@ -0,0 +1,242 @@
+Feature: API Validation
+
+  Scenario: Remove password
+    Given I generate a PDF file as "fileInput"
+    And the pdf contains 3 pages
+    And the pdf is encrypted with password "password123"
+    And the request data includes
+      | parameter | value |
+      | password | password123 |
+    When I send the API request to the endpoint "/api/v1/security/remove-password"
+    Then the response content type should be "application/pdf"
+    And the response file should have size greater than 0
+    And the response PDF is not passworded
+    And the response status code should be 200
+
+  Scenario: Remove password wrong password
+    Given I generate a PDF file as "fileInput"
+    And the pdf contains 3 pages
+    And the pdf is encrypted with password "password123"
+    And the request data includes
+      | parameter | value |
+      | password | wrongPassword |
+    When I send the API request to the
endpoint "/api/v1/security/remove-password" + Then the response status code should be 500 + And the response should contain error message "Internal Server Error" + + Scenario: Get info + Given I generate a PDF file as "fileInput" + When I send the API request to the endpoint "/api/v1/security/get-info-on-pdf" + Then the response content type should be "application/json" + And the response file should have size greater than 100 + And the response status code should be 200 + + Scenario: Add password + Given I generate a PDF file as "fileInput" + And the pdf contains 3 pages + And the request data includes + | parameter | value | + | password | password123 | + When I send the API request to the endpoint "/api/v1/security/add-password" + Then the response content type should be "application/pdf" + And the response file should have size greater than 100 + And the response PDF is passworded + And the response status code should be 200 + + Scenario: Add password with other params + Given I generate a PDF file as "fileInput" + And the pdf contains 3 pages + And the request data includes + | parameter | value | + | ownerPassword | ownerPass | + | password | password123 | + | keyLength | 256 | + | canPrint | true | + | canModify | false | + When I send the API request to the endpoint "/api/v1/security/add-password" + Then the response content type should be "application/pdf" + And the response file should have size greater than 100 + And the response PDF is passworded + And the response status code should be 200 + + + Scenario: Add watermark + Given I generate a PDF file as "fileInput" + And the pdf contains 3 pages + And the request data includes + | parameter | value | + | watermarkType | text | + | watermarkText | Sample Watermark | + | fontSize | 30 | + | rotation | 45 | + | opacity | 0.5 | + | widthSpacer | 50 | + | heightSpacer | 50 | + When I send the API request to the endpoint "/api/v1/security/add-watermark" + Then the response content type should be 
"application/pdf" + And the response file should have size greater than 100 + And the response status code should be 200 + + + + Scenario: Repair PDF + Given I generate a PDF file as "fileInput" + When I send the API request to the endpoint "/api/v1/misc/repair" + Then the response content type should be "application/pdf" + And the response file should have size greater than 0 + And the response status code should be 200 + + + + Scenario: Remove blank pages + Given I generate a PDF file as "fileInput" + And the pdf contains 3 blank pages + And the request data includes + | parameter | value | + | threshold | 90 | + | whitePercent | 99.9 | + When I send the API request to the endpoint "/api/v1/misc/remove-blanks" + Then the response content type should be "application/pdf" + And the response file should have size greater than 0 + And the response PDF should contain 0 pages + And the response status code should be 200 + + @ocr + Scenario: Process PDF with OCR + Given I generate a PDF file as "fileInput" + And the request data includes + | parameter | value | + | languages | eng | + | sidecar | false | + | deskew | true | + | clean | true | + | cleanFinal | true | + | ocrType | Normal | + | ocrRenderType | hocr | + | removeImagesAfter| false | + When I send the API request to the endpoint "/api/v1/misc/ocr-pdf" + Then the response content type should be "application/pdf" + And the response file should have size greater than 0 + And the response status code should be 200 + + @ocr + Scenario: Process PDF with text and OCR with type normal + Given I generate a PDF file as "fileInput" + And the pdf contains 3 pages with random text + And the request data includes + | parameter | value | + | languages | eng | + | sidecar | false | + | deskew | true | + | clean | true | + | cleanFinal | true | + | ocrType | Normal | + | ocrRenderType | hocr | + | removeImagesAfter| false | + When I send the API request to the endpoint "/api/v1/misc/ocr-pdf" + Then the response status code 
should be 500 + + @ocr + Scenario: Process PDF with OCR with type Force + Given I generate a PDF file as "fileInput" + And the request data includes + | parameter | value | + | languages | eng | + | sidecar | false | + | deskew | true | + | clean | true | + | cleanFinal | true | + | ocrType | Force | + | ocrRenderType | hocr | + | removeImagesAfter| false | + When I send the API request to the endpoint "/api/v1/misc/ocr-pdf" + Then the response content type should be "application/pdf" + And the response file should have size greater than 0 + And the response status code should be 200 + + @ocr + Scenario: Process PDF with OCR with sidecar + Given I generate a PDF file as "fileInput" + And the request data includes + | parameter | value | + | languages | eng | + | sidecar | true | + | deskew | true | + | clean | true | + | cleanFinal | true | + | ocrType | Force | + | ocrRenderType | hocr | + | removeImagesAfter| false | + When I send the API request to the endpoint "/api/v1/misc/ocr-pdf" + Then the response content type should be "application/octet-stream" + And the response file should have extension ".zip" + And the response file should have size greater than 0 + And the response status code should be 200 + + + Scenario: Flatten PDF + Given I generate a PDF file as "fileInput" + And the request data includes + | parameter | value | + | flattenOnlyForms | false | + When I send the API request to the endpoint "/api/v1/misc/flatten" + Then the response content type should be "application/pdf" + And the response file should have size greater than 0 + And the response status code should be 200 + + Scenario: Update metadata + Given I generate a PDF file as "fileInput" + And the request data includes + | parameter | value | + | author | John Doe | + | title | Sample Title | + | subject | Sample Subject | + | keywords | sample, test | + | producer | Test Producer | + When I send the API request to the endpoint "/api/v1/misc/update-metadata" + Then the response content type should be 
"application/pdf" + And the response file should have size greater than 0 + And the response PDF metadata should include "Author" as "John Doe" + And the response PDF metadata should include "Keywords" as "sample, test" + And the response PDF metadata should include "Subject" as "Sample Subject" + And the response PDF metadata should include "Title" as "Sample Title" + And the response status code should be 200 + + @libre + Scenario: Convert PDF to DOCX + Given I generate a PDF file as "fileInput" + And the pdf contains 3 pages with random text + And the request data includes + | parameter | value | + | outputFormat | docx | + When I send the API request to the endpoint "/api/v1/convert/pdf/word" + Then the response status code should be 200 + And the response file should have size greater than 100 + And the response file should have extension ".docx" +# And the response DOCX should contain 3 pages + + @libre + Scenario: Convert PDF to ODT + Given I generate a PDF file as "fileInput" + And the pdf contains 3 pages with random text + And the request data includes + | parameter | value | + | outputFormat | odt | + When I send the API request to the endpoint "/api/v1/convert/pdf/word" + Then the response status code should be 200 + And the response file should have size greater than 100 + And the response file should have extension ".odt" +# And the response ODT should contain 3 pages + + @libre + Scenario: Convert PDF to DOC + Given I generate a PDF file as "fileInput" + And the pdf contains 3 pages with random text + And the request data includes + | parameter | value | + | outputFormat | doc | + When I send the API request to the endpoint "/api/v1/convert/pdf/word" + Then the response status code should be 200 + And the response file should have extension ".doc" + And the response file should have size greater than 100 +# And the response DOC should contain 3 pages \ No newline at end of file diff --git a/cucumber/features/steps/step_definitions.py 
b/cucumber/features/steps/step_definitions.py
new file mode 100644
index 00000000..50022d4e
--- /dev/null
+++ b/cucumber/features/steps/step_definitions.py
@@ -0,0 +1,234 @@
+import ast
+import io
+import mimetypes
+import os
+import random
+import string
+
+import requests
+from behave import given, when, then
+from PyPDF2 import PdfWriter, PdfReader
+from PyPDF2.errors import PdfReadError
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+
+# Upper bound for a single API call, so a hung server fails the step instead
+# of blocking the whole test run.
+REQUEST_TIMEOUT_SECONDS = 300
+
+###########
+# HELPERS #
+###########
+
+def _reopen_generated_file(context):
+    """Replace the upload handle for context.param_name with a fresh one.
+
+    Closes the handle currently registered for the parameter (if any) and
+    reopens context.file_name for reading, so the next request uploads the
+    file as it was last written.
+    """
+    if not hasattr(context, 'files'):
+        context.files = {}
+    previous = context.files.get(context.param_name)
+    if previous is not None:
+        previous.close()
+    context.files[context.param_name] = open(context.file_name, 'rb')
+
+def _write_blank_pdf(context, page_count):
+    """Overwrite context.file_name with page_count blank 72x72 pages."""
+    writer = PdfWriter()
+    for _ in range(page_count):
+        writer.add_blank_page(width=72, height=72)
+    with open(context.file_name, 'wb') as f:
+        writer.write(f)
+    _reopen_generated_file(context)
+
+def _write_text_pdf(context, page_texts):
+    """Overwrite context.file_name with one letter-size page per given text."""
+    buffer = io.BytesIO()
+    c = canvas.Canvas(buffer, pagesize=letter)
+    _, height = letter
+    for text in page_texts:
+        c.drawString(100, height - 100, text)
+        c.showPage()
+    c.save()
+    with open(context.file_name, 'wb') as f:
+        f.write(buffer.getvalue())
+    _reopen_generated_file(context)
+
+#########
+# GIVEN #
+#########
+
+@given('I generate a PDF file as "{fileInput}"')
+def step_generate_pdf(context, fileInput):
+    """Create a single blank-page PDF and register it as upload field fileInput."""
+    context.param_name = fileInput
+    context.file_name = "genericNonCustomisableName.pdf"
+    _write_blank_pdf(context, 1)
+
+@given('the pdf contains {page_count:d} pages')
+def step_pdf_contains_pages(context, page_count):
+    """Regenerate the PDF with page_count blank pages."""
+    _write_blank_pdf(context, page_count)
+
+@given('the pdf contains {page_count:d} blank pages')
+def step_pdf_contains_blank_pages(context, page_count):
+    """Regenerate the PDF with page_count blank pages (same as the step above)."""
+    _write_blank_pdf(context, page_count)
+
+@given('the pdf contains {page_count:d} pages with random text')
+def step_pdf_contains_pages_with_random_text(context, page_count):
+    """Regenerate the PDF with page_count pages of 100 random alphanumerics each."""
+    alphabet = string.ascii_letters + string.digits
+    texts = [''.join(random.choices(alphabet, k=100)) for _ in range(page_count)]
+    _write_text_pdf(context, texts)
+
+@given('the pdf pages all contain the text "{text}"')
+def step_pdf_pages_contain_text(context, text):
+    """Regenerate the PDF, keeping its page count, with text drawn on every page."""
+    # Count the pages before the file is overwritten.
+    page_count = len(PdfReader(context.file_name).pages)
+    _write_text_pdf(context, [text] * page_count)
+
+@given('the pdf is encrypted with password "{password}"')
+def step_encrypt_pdf(context, password):
+    """Rewrite the generated PDF encrypted with password."""
+    reader = PdfReader(context.file_name)
+    writer = PdfWriter()
+    for page in reader.pages:
+        writer.add_page(page)
+    writer.encrypt(password)
+    with open(context.file_name, 'wb') as f:
+        writer.write(f)
+    _reopen_generated_file(context)
+
+@given('the request data is')
+def step_request_data(context):
+    """Parse the step's doc-string as a Python literal into context.request_data."""
+    # literal_eval only accepts literals (dict, list, str, numbers, ...), so
+    # feature text can no longer execute arbitrary code as it could with eval().
+    context.request_data = ast.literal_eval(context.text)
+
+@given('the request data includes')
+def step_request_data_table(context):
+    """Build context.request_data from a parameter/value table."""
+    context.request_data = {row['parameter']: row['value'] for row in context.table}
+
+@given('save the generated PDF file as "{filename}" for debugging')
+def save_generated_pdf(context, filename):
+    """Copy the generated upload file to filename without consuming the handle."""
+    upload = context.files[context.param_name]
+    upload.seek(0)
+    with open(filename, 'wb') as f:
+        f.write(upload.read())
+    # Rewind, otherwise the following request would upload an empty file.
+    upload.seek(0)
+    print(f"Saved generated PDF content to {filename}")
+
+########
+# WHEN #
+########
+
+@when('I send the API request to the endpoint "{endpoint}"')
+def step_send_api_request(context, endpoint):
+    """POST request_data and the registered files as multipart/form-data.
+
+    The response is stored in context.response for the Then steps.
+    """
+    url = f"http://localhost:8080{endpoint}"
+    files = context.files if hasattr(context, 'files') else {}
+
+    if not hasattr(context, 'request_data') or context.request_data is None:
+        context.request_data = {}
+
+    # A (None, value) tuple makes requests send a plain form field instead of
+    # a file part.
+    form_data = [(key, (None, value)) for key, value in context.request_data.items()]
+
+    for key, upload in files.items():
+        mime_type, _ = mimetypes.guess_type(upload.name)
+        mime_type = mime_type or 'application/octet-stream'
+        print("form_data " + upload.name + " with " + mime_type)
+        form_data.append((key, (upload.name, upload, mime_type)))
+
+    context.response = requests.post(url, files=form_data, timeout=REQUEST_TIMEOUT_SECONDS)
+
+########
+# THEN #
+########
+
+@then('the response content type should be "{content_type}"')
+def step_check_response_content_type(context, content_type):
+    """Assert that the Content-Type header starts with content_type."""
+    actual_content_type = context.response.headers.get('Content-Type', '')
+    assert actual_content_type.startswith(content_type), f"Expected {content_type} but got {actual_content_type}. Response content: {context.response.content}"
+
+@then('the response file should have size greater than {size:d}')
+def step_check_response_file_size(context, size):
+    """Assert that the response body is longer than size bytes."""
+    actual_size = len(context.response.content)
+    assert actual_size > size, f"Expected response size greater than {size} but got {actual_size}"
+
+@then('the response PDF is not passworded')
+def step_check_response_pdf_not_passworded(context):
+    """Assert that the response body is a PDF that is not encrypted."""
+    reader = PdfReader(io.BytesIO(context.response.content))
+    assert not reader.is_encrypted, "Expected an unencrypted PDF but the response PDF is encrypted"
+
+@then('the response PDF is passworded')
+def step_check_response_pdf_passworded(context):
+    """Assert that the response body is an encrypted PDF."""
+    try:
+        reader = PdfReader(io.BytesIO(context.response.content))
+    except PdfReadError as e:
+        raise AssertionError(f"Failed to read PDF: {str(e)}. Response content: {context.response.content}") from e
+    assert reader.is_encrypted, "Expected an encrypted PDF but the response PDF is not encrypted"
+
+@then('the response status code should be {status_code:d}')
+def step_check_response_status_code(context, status_code):
+    """Assert the HTTP status code of the response."""
+    assert context.response.status_code == status_code, f"Expected status code {status_code} but got {context.response.status_code}"
+
+@then('the response should contain error message "{message}"')
+def step_check_response_error_message(context, message):
+    """Assert that the JSON response body has an 'error' field equal to message."""
+    response_json = context.response.json()
+    assert response_json.get('error') == message, f"Expected error message '{message}' but got '{response_json.get('error')}'"
+
+@then('the response PDF should contain {page_count:d} pages')
+def step_check_response_pdf_page_count(context, page_count):
+    """Assert the number of pages in the response PDF."""
+    reader = PdfReader(io.BytesIO(context.response.content))
+    assert len(reader.pages) == page_count, f"Expected {page_count} pages but got {len(reader.pages)} pages"
+
+@then('the response PDF metadata should include "{metadata_key}" as "{metadata_value}"')
+def step_check_response_pdf_metadata(context, metadata_key, metadata_value):
+    """Assert that the response PDF's info entry /metadata_key equals metadata_value."""
+    reader = PdfReader(io.BytesIO(context.response.content))
+    # reader.metadata is None when the PDF carries no document information.
+    metadata = reader.metadata or {}
+    # Look the value up once with the "/" prefix so the failure message shows
+    # what was actually compared.
+    actual_value = metadata.get("/" + metadata_key)
+    assert actual_value == metadata_value, f"Expected {metadata_key} to be '{metadata_value}' but got '{actual_value}'"
+
+@then('the response file should have extension "{extension}"')
+def step_check_response_file_extension(context, extension):
+    """Assert that the Content-Disposition filename ends with extension."""
+    content_disposition = context.response.headers.get('Content-Disposition', '')
+    filename = ""
+    for part in content_disposition.split(';'):
+        # partition() tolerates parameters without '=' and values containing '='.
+        name, _, value = part.strip().partition('=')
+        if name.startswith('filename') and value:
+            filename = value.strip().strip('"')
+            break
+    assert filename.endswith(extension), f"Expected file extension {extension} but got {filename}. Response content: {context.response.content}"
+
+@then('save the response file as "{filename}" for debugging')
+def step_save_response_file(context, filename):
+    """Write the raw response body to filename."""
+    with open(filename, 'wb') as f:
+        f.write(context.response.content)
+    print(f"Saved response content to {filename}")
diff --git a/cucumber/requirements.txt b/cucumber/requirements.txt
new file mode 100644
index 00000000..52f20fd2
--- /dev/null
+++ b/cucumber/requirements.txt
@@ -0,0 +1,4 @@
+behave
+requests
+PyPDF2
+reportlab
\ No newline at end of file
diff --git a/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java b/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java
index eaac72a0..73dab071 100644
--- a/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java
+++ b/src/main/java/stirling/software/SPDF/controller/api/misc/BlankPageController.java
@@ -67,7 +67,7 @@ public class BlankPageController {
                 String pageText = textStripper.getText(document);
                 boolean hasText = !pageText.trim().isEmpty();
 
-                Boolean blank = false;
+                Boolean blank = true;
                 if (hasText) {
                     logger.info("page " + pageIndex + " has text, not blank");
                     blank = false;
diff
--git a/src/main/java/stirling/software/SPDF/utils/PDFToFile.java b/src/main/java/stirling/software/SPDF/utils/PDFToFile.java index e52344bf..38579aa8 100644 --- a/src/main/java/stirling/software/SPDF/utils/PDFToFile.java +++ b/src/main/java/stirling/software/SPDF/utils/PDFToFile.java @@ -34,7 +34,10 @@ public class PDFToFile { // Get the original PDF file name without the extension String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); - String pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.')); + String pdfBaseName = originalPdfFileName; + if (originalPdfFileName.contains(".")) { + pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.')); + } Path tempInputFile = null; Path tempOutputDir = null; @@ -100,8 +103,15 @@ public class PDFToFile { // Get the original PDF file name without the extension String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); - String pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.')); + if (originalPdfFileName == null || "".equals(originalPdfFileName.trim())) { + originalPdfFileName = "output.pdf"; + } + // Assume file is pdf if no extension + String pdfBaseName = originalPdfFileName; + if (originalPdfFileName.contains(".")) { + pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.')); + } // Validate output format List allowedFormats = Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text"); @@ -173,6 +183,7 @@ public class PDFToFile { if (tempInputFile != null) Files.delete(tempInputFile); if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); } + System.out.println("fileBytes=" + fileBytes.length); return WebResponseUtils.bytesToWebResponse( fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM); } diff --git a/src/main/resources/messages_fr_FR.properties 
b/src/main/resources/messages_fr_FR.properties index 9a0f51ae..ea7f00dd 100644 --- a/src/main/resources/messages_fr_FR.properties +++ b/src/main/resources/messages_fr_FR.properties @@ -17,7 +17,7 @@ true=Vrai false=Faux unknown=Inconnu save=Enregistrer -saveToBrowser=Enregistrer dans le navigateur +saveToBrowser=Enregistrer dans le navigateur close=Fermer filesSelected=fichiers sélectionnés noFavourites=Aucun favori ajouté diff --git a/src/main/resources/settings.yml.template b/src/main/resources/settings.yml.template index f8e9bbbd..b15ca94f 100644 --- a/src/main/resources/settings.yml.template +++ b/src/main/resources/settings.yml.template @@ -41,7 +41,7 @@ system: defaultLocale: 'en-US' # Set the default language (e.g. 'de-DE', 'fr-FR', etc) googlevisibility: false # 'true' to allow Google visibility (via robots.txt), 'false' to disallow enableAlphaFunctionality: false # Set to enable functionality which might need more testing before it fully goes live (This feature might make no changes) - showUpdate: true # see when a new update is available + showUpdate: false # see when a new update is available showUpdateOnlyAdmin: false # Only admins can see when a new update is available, depending on showUpdate it must be set to 'true' customHTMLFiles: false # enable to have files placed in /customFiles/templates override the existing template html files