Merge pull request #1297 from Stirling-Tools/cucumber

Cucumber testcases
This commit is contained in:
Anthony Stirling 2024-05-26 16:25:35 +01:00 committed by GitHub
commit b93bff5cad
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 489 additions and 6 deletions

3
.gitignore vendored
View file

@ -125,3 +125,6 @@ watchedFolders/
# Ignore Mac DS_Store files # Ignore Mac DS_Store files
.DS_Store .DS_Store
**/.DS_Store **/.DS_Store
#cucumber
/cucumber/reports/**

View file

@ -0,0 +1,16 @@
import os
def before_all(context):
    """Seed the shared context with empty request/response state.

    Named after behave's ``before_all`` hook; the step definitions later
    overwrite these attributes.

    Args:
        context: The context object the steps read from and write to.
    """
    # Nothing has been requested yet, so the scalar slots start out unset.
    for slot in ("endpoint", "request_data"):
        setattr(context, slot, None)
    # Open upload handles, keyed by the form field they are sent under.
    context.files = {}
    context.response = None
def after_scenario(context, scenario):
    """Release everything a scenario left behind: open handles and files on disk.

    Args:
        context: The context object; ``files`` (dict of open file objects) and
            ``file_name`` (path of the generated PDF) are used when present.
        scenario: The scenario that just finished (unused).
    """
    uploads = getattr(context, 'files', None)
    if uploads:
        for handle in list(uploads.values()):
            handle.close()
        # Clear in place: steps mutate this same dict object, so closed
        # handles left here would otherwise be picked up by a later scenario.
        uploads.clear()

    # 'response_file' is a fixed scratch name; file_name is the generated PDF.
    for path in ('response_file', getattr(context, 'file_name', None)):
        if path and os.path.exists(path):
            os.remove(path)

View file

@ -0,0 +1,242 @@
Feature: API Validation
Scenario: Remove password
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages
And the pdf is encrypted with password "password123"
And the request data includes
| parameter | value |
| password | password123 |
When I send the API request to the endpoint "/api/v1/security/remove-password"
Then the response content type should be "application/pdf"
And the response file should have size greater than 0
And the response PDF is not passworded
And the response status code should be 200
Scenario: Remove password wrong password
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages
And the pdf is encrypted with password "password123"
And the request data includes
| parameter | value |
| password | wrongPassword |
When I send the API request to the endpoint "/api/v1/security/remove-password"
Then the response status code should be 500
And the response should contain error message "Internal Server Error"
Scenario: Get info
Given I generate a PDF file as "fileInput"
When I send the API request to the endpoint "/api/v1/security/get-info-on-pdf"
Then the response content type should be "application/json"
And the response file should have size greater than 100
And the response status code should be 200
Scenario: Add password
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages
And the request data includes
| parameter | value |
| password | password123 |
When I send the API request to the endpoint "/api/v1/security/add-password"
Then the response content type should be "application/pdf"
And the response file should have size greater than 100
And the response PDF is passworded
And the response status code should be 200
Scenario: Add password with other params
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages
And the request data includes
| parameter | value |
| ownerPassword | ownerPass |
| password | password123 |
| keyLength | 256 |
| canPrint | true |
| canModify | false |
When I send the API request to the endpoint "/api/v1/security/add-password"
Then the response content type should be "application/pdf"
And the response file should have size greater than 100
And the response PDF is passworded
And the response status code should be 200
Scenario: Add watermark
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages
And the request data includes
| parameter | value |
| watermarkType | text |
| watermarkText | Sample Watermark |
| fontSize | 30 |
| rotation | 45 |
| opacity | 0.5 |
| widthSpacer | 50 |
| heightSpacer | 50 |
When I send the API request to the endpoint "/api/v1/security/add-watermark"
Then the response content type should be "application/pdf"
And the response file should have size greater than 100
And the response status code should be 200
Scenario: Repair PDF
Given I generate a PDF file as "fileInput"
When I send the API request to the endpoint "/api/v1/misc/repair"
Then the response content type should be "application/pdf"
And the response file should have size greater than 0
And the response status code should be 200
Scenario: Remove blank pages
Given I generate a PDF file as "fileInput"
And the pdf contains 3 blank pages
And the request data includes
| parameter | value |
| threshold | 90 |
| whitePercent | 99.9 |
When I send the API request to the endpoint "/api/v1/misc/remove-blanks"
Then the response content type should be "application/pdf"
And the response file should have size greater than 0
And the response PDF should contain 0 pages
And the response status code should be 200
@ocr
Scenario: Process PDF with OCR
Given I generate a PDF file as "fileInput"
And the request data includes
| parameter | value |
| languages | eng |
| sidecar | false |
| deskew | true |
| clean | true |
| cleanFinal | true |
| ocrType | Normal |
| ocrRenderType | hocr |
| removeImagesAfter| false |
When I send the API request to the endpoint "/api/v1/misc/ocr-pdf"
Then the response content type should be "application/pdf"
And the response file should have size greater than 0
And the response status code should be 200
@ocr
Scenario: Process PDF with text and OCR with type normal
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages with random text
And the request data includes
| parameter | value |
| languages | eng |
| sidecar | false |
| deskew | true |
| clean | true |
| cleanFinal | true |
| ocrType | Normal |
| ocrRenderType | hocr |
| removeImagesAfter| false |
When I send the API request to the endpoint "/api/v1/misc/ocr-pdf"
Then the response status code should be 500
@ocr
Scenario: Process PDF with OCR with type force
Given I generate a PDF file as "fileInput"
And the request data includes
| parameter | value |
| languages | eng |
| sidecar | false |
| deskew | true |
| clean | true |
| cleanFinal | true |
| ocrType | Force |
| ocrRenderType | hocr |
| removeImagesAfter| false |
When I send the API request to the endpoint "/api/v1/misc/ocr-pdf"
Then the response content type should be "application/pdf"
And the response file should have size greater than 0
And the response status code should be 200
@ocr
Scenario: Process PDF with OCR with sidecar
Given I generate a PDF file as "fileInput"
And the request data includes
| parameter | value |
| languages | eng |
| sidecar | true |
| deskew | true |
| clean | true |
| cleanFinal | true |
| ocrType | Force |
| ocrRenderType | hocr |
| removeImagesAfter| false |
When I send the API request to the endpoint "/api/v1/misc/ocr-pdf"
Then the response content type should be "application/octet-stream"
And the response file should have extension ".zip"
And the response file should have size greater than 0
And the response status code should be 200
Scenario: Flatten PDF
Given I generate a PDF file as "fileInput"
And the request data includes
| parameter | value |
| flattenOnlyForms | false |
When I send the API request to the endpoint "/api/v1/misc/flatten"
Then the response content type should be "application/pdf"
And the response file should have size greater than 0
And the response status code should be 200
Scenario: Update metadata
Given I generate a PDF file as "fileInput"
And the request data includes
| parameter | value |
| author | John Doe |
| title | Sample Title |
| subject | Sample Subject |
| keywords | sample, test |
| producer | Test Producer |
When I send the API request to the endpoint "/api/v1/misc/update-metadata"
Then the response content type should be "application/pdf"
And the response file should have size greater than 0
And the response PDF metadata should include "Author" as "John Doe"
And the response PDF metadata should include "Keywords" as "sample, test"
And the response PDF metadata should include "Subject" as "Sample Subject"
And the response PDF metadata should include "Title" as "Sample Title"
And the response status code should be 200
@libre
Scenario: Convert PDF to DOCX
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages with random text
And the request data includes
| parameter | value |
| outputFormat | docx |
When I send the API request to the endpoint "/api/v1/convert/pdf/word"
Then the response status code should be 200
And the response file should have size greater than 100
And the response file should have extension ".docx"
# And the response DOCX should contain 3 pages
@libre
Scenario: Convert PDF to ODT
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages with random text
And the request data includes
| parameter | value |
| outputFormat | odt |
When I send the API request to the endpoint "/api/v1/convert/pdf/word"
Then the response status code should be 200
And the response file should have size greater than 100
And the response file should have extension ".odt"
# And the response ODT should contain 3 pages
@libre
Scenario: Convert PDF to DOC
Given I generate a PDF file as "fileInput"
And the pdf contains 3 pages with random text
And the request data includes
| parameter | value |
| outputFormat | doc |
When I send the API request to the endpoint "/api/v1/convert/pdf/word"
Then the response status code should be 200
And the response file should have extension ".doc"
And the response file should have size greater than 100
# And the response DOC should contain 3 pages

View file

@ -0,0 +1,207 @@
import os
import requests
from behave import given, when, then
from PyPDF2 import PdfWriter, PdfReader
import io
import random
import string
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import mimetypes
import requests
#########
# GIVEN #
#########
@given('I generate a PDF file as "{fileInput}"')
def step_generate_pdf(context, fileInput):
    """Write a one-page blank PDF to disk and register it as an upload.

    Args:
        context: The behave context; ``param_name``, ``file_name`` and
            ``files`` are set/updated here.
        fileInput: Key under which the open file is stored in ``context.files``.
    """
    context.param_name = fileInput
    context.file_name = "genericNonCustomisableName.pdf"

    blank_pdf = PdfWriter()
    blank_pdf.add_blank_page(width=72, height=72)  # Single blank page
    with open(context.file_name, 'wb') as out:
        blank_pdf.write(out)

    # Create the upload registry on first use.
    try:
        uploads = context.files
    except AttributeError:
        uploads = context.files = {}
    uploads[context.param_name] = open(context.file_name, 'rb')
def _replace_with_blank_pages(context, page_count):
    """Overwrite the generated PDF with ``page_count`` blank pages and reopen it.

    Args:
        context: The behave context; ``file_name``, ``param_name`` and
            ``files`` must already have been set by the generate step.
        page_count: Number of blank pages to write.
    """
    writer = PdfWriter()
    for _ in range(page_count):
        writer.add_blank_page(width=72, height=72)

    # Close the stale read handle before truncating the file it points at.
    context.files[context.param_name].close()
    with open(context.file_name, 'wb') as f:
        writer.write(f)
    context.files[context.param_name] = open(context.file_name, 'rb')


@given('the pdf contains {page_count:d} pages')
def step_pdf_contains_pages(context, page_count):
    """Replace the generated PDF with ``page_count`` blank pages."""
    _replace_with_blank_pages(context, page_count)


@given('the pdf contains {page_count:d} blank pages')
def step_pdf_contains_blank_pages(context, page_count):
    """Alternative wording of the step above; the pages are blank either way."""
    _replace_with_blank_pages(context, page_count)
@given('the pdf contains {page_count:d} pages with random text')
def step_pdf_contains_pages_with_random_text(context, page_count):
    """Replace the generated PDF with pages that each carry one random line.

    Args:
        context: The behave context; ``file_name``, ``param_name`` and
            ``files`` must already be set.
        page_count: Number of pages to draw.
    """
    alphabet = string.ascii_letters + string.digits
    page_height = letter[1]

    pdf_bytes = io.BytesIO()
    pdf = canvas.Canvas(pdf_bytes, pagesize=letter)
    for _ in range(page_count):
        # 100 random alphanumeric characters, drawn 100 units below the top edge.
        line = ''.join(random.choices(alphabet, k=100))
        pdf.drawString(100, page_height - 100, line)
        pdf.showPage()
    pdf.save()

    with open(context.file_name, 'wb') as out:
        out.write(pdf_bytes.getvalue())

    # Swap the registered handle for one that reads the rewritten file.
    uploads = context.files
    uploads[context.param_name].close()
    uploads[context.param_name] = open(context.file_name, 'rb')
@given('the pdf pages all contain the text "{text}"')
def step_pdf_pages_contain_text(context, text):
    """Rewrite the generated PDF so every page shows ``text``.

    The page count of the existing file is kept; the pages themselves are
    redrawn from scratch.

    Args:
        context: The behave context; ``file_name``, ``param_name`` and
            ``files`` must already be set.
        text: The string drawn on each page.
    """
    page_height = letter[1]
    pdf_bytes = io.BytesIO()
    pdf = canvas.Canvas(pdf_bytes, pagesize=letter)

    existing_pages = len(PdfReader(context.file_name).pages)
    for _ in range(existing_pages):
        pdf.drawString(100, page_height - 100, text)
        pdf.showPage()
    pdf.save()

    with open(context.file_name, 'wb') as out:
        out.write(pdf_bytes.getvalue())

    # Swap the registered handle for one that reads the rewritten file.
    uploads = context.files
    uploads[context.param_name].close()
    uploads[context.param_name] = open(context.file_name, 'rb')
@given('the pdf is encrypted with password "{password}"')
def step_encrypt_pdf(context, password):
    """Re-save the generated PDF with all of its pages, encrypted with ``password``.

    Args:
        context: The behave context; ``file_name``, ``param_name`` and
            ``files`` must already be set.
        password: Password passed to ``PdfWriter.encrypt``.
    """
    encrypted = PdfWriter()
    source = PdfReader(context.file_name)
    # Copy every page across before applying the password.
    for page in source.pages:
        encrypted.add_page(page)
    encrypted.encrypt(password)

    with open(context.file_name, 'wb') as out:
        encrypted.write(out)

    # Swap the registered handle for one that reads the encrypted file.
    uploads = context.files
    uploads[context.param_name].close()
    uploads[context.param_name] = open(context.file_name, 'rb')
@given('the request data is')
def step_request_data(context):
    """Parse the step's multi-line text as a Python literal into ``request_data``.

    Args:
        context: The behave context; ``context.text`` holds the step's text block.

    Raises:
        ValueError, SyntaxError: If the text is not a valid Python literal.
    """
    import ast

    # NOTE(review): this used to be eval(context.text), which would execute
    # arbitrary code from the feature file. literal_eval only accepts plain
    # literals (dicts, lists, strings, numbers, ...).
    context.request_data = ast.literal_eval(context.text)
@given('the request data includes')
def step_request_data_table(context):
    """Build ``request_data`` from the step table's parameter/value columns.

    Args:
        context: The behave context; ``context.table`` rows must expose
            ``parameter`` and ``value`` cells.
    """
    params = {}
    for row in context.table:
        params[row['parameter']] = row['value']
    context.request_data = params
@given('save the generated PDF file as "{filename}" for debugging')
def save_generated_pdf(context, filename):
    """Copy the currently registered upload to ``filename`` for inspection.

    Args:
        context: The behave context; ``files`` and ``param_name`` must be set.
        filename: Destination path for the copy.
    """
    source = context.files[context.param_name]
    # This is the same handle that is later uploaded. Read it from the start
    # and put the position back afterwards, otherwise the upload would start
    # at EOF and send an empty body.
    resume_at = source.tell()
    try:
        source.seek(0)
        payload = source.read()
    finally:
        source.seek(resume_at)

    with open(filename, 'wb') as f:
        f.write(payload)
    print(f"Saved generated PDF content to {filename}")
########
# WHEN #
########
@when('I send the API request to the endpoint "{endpoint}"')
def step_send_api_request(context, endpoint):
    """POST the collected form fields and files to ``endpoint`` on localhost:8080.

    The response object is stored on ``context.response``.

    Args:
        context: The behave context; ``request_data`` and ``files`` are used
            when present.
        endpoint: Path appended to the fixed base URL.
    """
    url = f"http://localhost:8080{endpoint}"
    uploads = getattr(context, 'files', {})
    if getattr(context, 'request_data', None) is None:
        context.request_data = {}

    # Plain fields go first, each as (None, value), i.e. with no filename.
    parts = [(name, (None, value)) for name, value in context.request_data.items()]
    for field, handle in uploads.items():
        guessed, _ = mimetypes.guess_type(handle.name)
        content_type = guessed or 'application/octet-stream'
        print("form_data " + handle.name + " with " + content_type)
        parts.append((field, (handle.name, handle, content_type)))

    context.response = requests.post(url, files=parts)
########
# THEN #
########
@then('the response content type should be "{content_type}"')
def step_check_response_content_type(context, content_type):
    """Assert the response's Content-Type header starts with ``content_type``.

    A prefix match is used, so a header that continues after the expected
    value still passes.
    """
    received = context.response.headers.get('Content-Type', '')
    prefix_ok = received.startswith(content_type)
    assert prefix_ok, f"Expected {content_type} but got {received}. Response content: {context.response.content}"
@then('the response file should have size greater than {size:d}')
def step_check_response_file_size(context, size):
    """Assert the response body is strictly larger than ``size`` bytes.

    Args:
        context: The behave context holding ``response``.
        size: Exclusive lower bound on the body length, in bytes.
    """
    # Measure the body directly; no need to copy it into a buffer first.
    actual_size = len(context.response.content)
    assert actual_size > size, f"Expected response size greater than {size} bytes but got {actual_size} bytes"
@then('the response PDF is not passworded')
def step_check_response_pdf_not_passworded(context):
    """Assert the PDF in the response body is not encrypted."""
    pdf = PdfReader(io.BytesIO(context.response.content))
    encrypted = pdf.is_encrypted
    assert not encrypted
@then('the response PDF is passworded')
def step_check_response_pdf_passworded(context):
    """Assert the PDF in the response body is encrypted.

    Raises:
        AssertionError: If the body cannot be parsed as a PDF, or parses but
            is not encrypted.
    """
    # Imported here because the module never imported PdfReadError, so the
    # original `except PdfReadError` clause raised NameError whenever any
    # exception reached it.
    from PyPDF2.errors import PdfReadError

    response_file = io.BytesIO(context.response.content)
    try:
        reader = PdfReader(response_file)
        encrypted = reader.is_encrypted
    except PdfReadError as e:
        raise AssertionError(f"Failed to read PDF: {str(e)}. Response content: {context.response.content}") from e
    except Exception as e:
        raise AssertionError(f"An error occurred: {str(e)}. Response content: {context.response.content}") from e

    # Checked outside the try block so a failure is not re-wrapped by the
    # generic handler above with an empty message.
    assert encrypted, "Expected the response PDF to be encrypted but it is not"
@then('the response status code should be {status_code:d}')
def step_check_response_status_code(context, status_code):
    """Assert the response's HTTP status equals ``status_code``."""
    received = context.response.status_code
    assert received == status_code, f"Expected status code {status_code} but got {received}"
@then('the response should contain error message "{message}"')
def step_check_response_error_message(context, message):
    """Assert the JSON response's ``error`` field equals ``message``."""
    reported = context.response.json().get('error')
    assert reported == message, f"Expected error message '{message}' but got '{reported}'"
@then('the response PDF should contain {page_count:d} pages')
def step_check_response_pdf_page_count(context, page_count):
    """Assert the PDF in the response body has exactly ``page_count`` pages."""
    pdf = PdfReader(io.BytesIO(context.response.content))
    found = len(pdf.pages)
    assert found == page_count, f"Expected {page_count} pages but got {found} pages"
@then('the response PDF metadata should include "{metadata_key}" as "{metadata_value}"')
def step_check_response_pdf_metadata(context, metadata_key, metadata_value):
    """Assert a metadata entry of the response PDF has the expected value.

    Args:
        context: The behave context holding ``response``.
        metadata_key: Entry name without the leading slash, e.g. ``Author``.
        metadata_value: Expected value of that entry.
    """
    response_file = io.BytesIO(context.response.content)
    reader = PdfReader(response_file)
    # A PDF with no metadata yields None; treat it as empty so the step fails
    # with the assertion message instead of an AttributeError.
    metadata = reader.metadata or {}
    # Look the entry up once, with the "/" prefix, and report that same value.
    # The message previously looked up the bare key and always printed None.
    actual_value = metadata.get("/" + metadata_key)
    assert actual_value == metadata_value, f"Expected {metadata_key} to be '{metadata_value}' but got '{actual_value}'"
@then('the response file should have extension "{extension}"')
def step_check_response_file_extension(context, extension):
    """Assert the filename in Content-Disposition ends with ``extension``.

    Args:
        context: The behave context holding ``response``.
        extension: Expected suffix including the dot, e.g. ``.zip``.
    """
    content_disposition = context.response.headers.get('Content-Disposition', '')
    filename = ""
    for part in content_disposition.split(';'):
        # Split on the first "=" only, so a filename that itself contains "="
        # keeps its extension; a token with no "=" at all is skipped.
        key, sep, value = part.strip().partition('=')
        if sep and key.strip().lower().startswith('filename'):
            filename = value.strip().strip('"')
            break
    assert filename.endswith(extension), f"Expected file extension {extension} but got {filename}. Response content: {context.response.content}"
@then('save the response file as "{filename}" for debugging')
def step_save_response_file(context, filename):
    """Dump the raw response body to ``filename`` for inspection."""
    with open(filename, 'wb') as dump:
        body = context.response.content
        dump.write(body)
    print(f"Saved response content to {filename}")

View file

@ -0,0 +1,4 @@
behave
requests
PyPDF2
reportlab

View file

@ -67,7 +67,7 @@ public class BlankPageController {
String pageText = textStripper.getText(document); String pageText = textStripper.getText(document);
boolean hasText = !pageText.trim().isEmpty(); boolean hasText = !pageText.trim().isEmpty();
Boolean blank = false; Boolean blank = true;
if (hasText) { if (hasText) {
logger.info("page " + pageIndex + " has text, not blank"); logger.info("page " + pageIndex + " has text, not blank");
blank = false; blank = false;

View file

@ -34,7 +34,10 @@ public class PDFToFile {
// Get the original PDF file name without the extension // Get the original PDF file name without the extension
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
String pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.')); String pdfBaseName = originalPdfFileName;
if (originalPdfFileName.contains(".")) {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
Path tempInputFile = null; Path tempInputFile = null;
Path tempOutputDir = null; Path tempOutputDir = null;
@ -100,8 +103,15 @@ public class PDFToFile {
// Get the original PDF file name without the extension // Get the original PDF file name without the extension
String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename()); String originalPdfFileName = Filenames.toSimpleFileName(inputFile.getOriginalFilename());
String pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
if (originalPdfFileName == null || "".equals(originalPdfFileName.trim())) {
originalPdfFileName = "output.pdf";
}
// Assume file is pdf if no extension
String pdfBaseName = originalPdfFileName;
if (originalPdfFileName.contains(".")) {
pdfBaseName = originalPdfFileName.substring(0, originalPdfFileName.lastIndexOf('.'));
}
// Validate output format // Validate output format
List<String> allowedFormats = List<String> allowedFormats =
Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text"); Arrays.asList("doc", "docx", "odt", "ppt", "pptx", "odp", "rtf", "xml", "txt:Text");
@ -173,6 +183,7 @@ public class PDFToFile {
if (tempInputFile != null) Files.delete(tempInputFile); if (tempInputFile != null) Files.delete(tempInputFile);
if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile()); if (tempOutputDir != null) FileUtils.deleteDirectory(tempOutputDir.toFile());
} }
System.out.println("fileBytes=" + fileBytes.length);
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(
fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM); fileBytes, fileName, MediaType.APPLICATION_OCTET_STREAM);
} }

View file

@ -41,7 +41,7 @@ system:
defaultLocale: 'en-US' # Set the default language (e.g. 'de-DE', 'fr-FR', etc) defaultLocale: 'en-US' # Set the default language (e.g. 'de-DE', 'fr-FR', etc)
googlevisibility: false # 'true' to allow Google visibility (via robots.txt), 'false' to disallow googlevisibility: false # 'true' to allow Google visibility (via robots.txt), 'false' to disallow
enableAlphaFunctionality: false # Set to enable functionality which might need more testing before it fully goes live (This feature might make no changes) enableAlphaFunctionality: false # Set to enable functionality which might need more testing before it fully goes live (This feature might make no changes)
showUpdate: true # see when a new update is available showUpdate: false # see when a new update is available
showUpdateOnlyAdmin: false # Only admins can see when a new update is available, depending on showUpdate it must be set to 'true' showUpdateOnlyAdmin: false # Only admins can see when a new update is available, depending on showUpdate it must be set to 'true'
customHTMLFiles: false # enable to have files placed in /customFiles/templates override the existing template html files customHTMLFiles: false # enable to have files placed in /customFiles/templates override the existing template html files