This commit is contained in:
Anthony Stirling 2024-02-09 23:45:18 +00:00
parent 8a143d139c
commit 22343e507d
6 changed files with 86 additions and 84 deletions

View file

@ -1,5 +1,32 @@
# Use the base image # Main stage
FROM frooodle/stirling-pdf-base:version8 FROM alpine:3.19.0
# JDK for app
RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories && \
apk add --no-cache \
ca-certificates \
tzdata \
tini \
bash \
curl \
openjdk17-jre \
# Doc conversion
libreoffice@testing \
# OCR MY PDF (unpaper for descew and other advanced featues)
ocrmypdf \
tesseract-ocr-data-eng \
# CV
py3-opencv \
# python3/pip
python3 && \
wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
# uno unoconv and HTML
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
mv /usr/share/tessdata /usr/share/tessdata-original
ARG VERSION_TAG ARG VERSION_TAG
@ -24,7 +51,7 @@ COPY build/libs/*.jar app.jar
## useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \ ## useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \
## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME && \ ## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME && \
# Set up necessary directories and permissions # Set up necessary directories and permissions
RUN mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ RUN mkdir -p /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
##&& \ ##&& \
## chown -R stirlingpdfuser:stirlingpdfgroup /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /customFiles && \ ## chown -R stirlingpdfuser:stirlingpdfgroup /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /customFiles && \
## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original && \ ## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original && \

View file

@ -20,17 +20,19 @@ COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto
COPY build/libs/*.jar app.jar COPY build/libs/*.jar app.jar
RUN apk add --no-cache \ RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories && \
apk add --no-cache \
ca-certificates \ ca-certificates \
tzdata \ tzdata \
tini \ tini \
bash \ bash \
curl \ curl \
openjdk17-jre && \ openjdk17-jre \
# Doc conversion # Doc conversion
apk add --no-cache libreoffice --repository http://dl-cdn.alpinelinux.org/alpine/edge/community && \ libreoffice@testing \
# python and pip # python and pip
apk add --no-cache \
python3 && \ python3 && \
wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \ wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
# uno unoconv and HTML # uno unoconv and HTML
@ -40,14 +42,11 @@ RUN apk add --no-cache \
# useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \ # useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \
# mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME # mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME
# Set up necessary directories and permissions # Set up necessary directories and permissions
mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ mkdir -p /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
# chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/fonts/opentype/noto /configs /customFiles # chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/fonts/opentype/noto /configs /customFiles
# Set font cache and permissions # Set font cache and permissions
fc-cache -f -v && \ fc-cache -f -v && \
chmod +x /scripts/*.sh && \ chmod +x /scripts/*.sh
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories
# chown stirlingpdfuser:stirlingpdfgroup /app.jar # chown stirlingpdfuser:stirlingpdfgroup /app.jar
# Set environment variables # Set environment variables

View file

@ -1,28 +0,0 @@
# Main stage
FROM alpine:3.19.0
# JDK for app
RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories && \
apk add --no-cache \
ca-certificates \
tzdata \
tini \
bash \
curl \
openjdk17-jre \
# Doc conversion
libreoffice@testing \
# OCR MY PDF (unpaper for descew and other advanced featues)
ocrmypdf \
tesseract-ocr-data-eng \
# CV
py3-opencv \
# python3/pip
python3 && \
wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
# uno unoconv and HTML
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
mv /usr/share/tessdata /usr/share/tessdata-original

View file

@ -4,6 +4,7 @@ import java.io.IOException;
import org.apache.pdfbox.Loader; import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.http.MediaType; import org.springframework.http.MediaType;
import org.springframework.http.ResponseEntity; import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.ModelAttribute; import org.springframework.web.bind.annotation.ModelAttribute;
@ -11,7 +12,6 @@ import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController; import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile; import org.springframework.web.multipart.MultipartFile;
import org.apache.pdfbox.text.PDFTextStripper;
import io.github.pixee.security.Filenames; import io.github.pixee.security.Filenames;
import io.swagger.v3.oas.annotations.Operation; import io.swagger.v3.oas.annotations.Operation;
@ -65,16 +65,20 @@ public class ConvertPDFToOffice {
throws IOException, InterruptedException { throws IOException, InterruptedException {
MultipartFile inputFile = request.getFileInput(); MultipartFile inputFile = request.getFileInput();
String outputFormat = request.getOutputFormat(); String outputFormat = request.getOutputFormat();
if ("txt".equals(request.getOutputFormat())) { if ("txt".equals(request.getOutputFormat())) {
try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) { try (PDDocument document = Loader.loadPDF(inputFile.getBytes())) {
PDFTextStripper stripper = new PDFTextStripper(); PDFTextStripper stripper = new PDFTextStripper();
String text = stripper.getText(document); String text = stripper.getText(document);
return WebResponseUtils.bytesToWebResponse(text.getBytes(), Filenames.toSimpleFileName(inputFile.getOriginalFilename()).replaceFirst("[.][^.]+$", "") return WebResponseUtils.bytesToWebResponse(
+ ".txt" , MediaType.TEXT_PLAIN); text.getBytes(),
Filenames.toSimpleFileName(inputFile.getOriginalFilename())
.replaceFirst("[.][^.]+$", "")
+ ".txt",
MediaType.TEXT_PLAIN);
} }
} else { } else {
PDFToFile pdfToFile = new PDFToFile(); PDFToFile pdfToFile = new PDFToFile();
return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import"); return pdfToFile.processPdfToOfficeFormat(inputFile, outputFormat, "writer_pdf_import");
} }
} }

View file

@ -68,7 +68,9 @@ public class ShowJavascript {
if (script.isEmpty()) { if (script.isEmpty()) {
script = script =
"PDF '" + Filenames.toSimpleFileName(inputFile.getOriginalFilename()) + "' does not contain Javascript"; "PDF '"
+ Filenames.toSimpleFileName(inputFile.getOriginalFilename())
+ "' does not contain Javascript";
} }
return WebResponseUtils.bytesToWebResponse( return WebResponseUtils.bytesToWebResponse(

View file

@ -97,46 +97,44 @@ public class StampController {
PDRectangle pageSize = page.getMediaBox(); PDRectangle pageSize = page.getMediaBox();
float margin = marginFactor * (pageSize.getWidth() + pageSize.getHeight()) / 2; float margin = marginFactor * (pageSize.getWidth() + pageSize.getHeight()) / 2;
PDPageContentStream contentStream = PDPageContentStream contentStream =
new PDPageContentStream( new PDPageContentStream(
document, page, PDPageContentStream.AppendMode.APPEND, true, true); document, page, PDPageContentStream.AppendMode.APPEND, true, true);
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState();
graphicsState.setNonStrokingAlphaConstant(opacity);
contentStream.setGraphicsStateParameters(graphicsState);
PDExtendedGraphicsState graphicsState = new PDExtendedGraphicsState(); if ("text".equalsIgnoreCase(watermarkType)) {
graphicsState.setNonStrokingAlphaConstant(opacity); addTextStamp(
contentStream.setGraphicsStateParameters(graphicsState); contentStream,
watermarkText,
if ("text".equalsIgnoreCase(watermarkType)) { document,
addTextStamp( page,
contentStream, rotation,
watermarkText, position,
document, fontSize,
page, alphabet,
rotation, overrideX,
position, overrideY,
fontSize, margin,
alphabet, customColor);
overrideX, } else if ("image".equalsIgnoreCase(watermarkType)) {
overrideY, addImageStamp(
margin, contentStream,
customColor); watermarkImage,
} else if ("image".equalsIgnoreCase(watermarkType)) { document,
addImageStamp( page,
contentStream, rotation,
watermarkImage, position,
document, fontSize,
page, overrideX,
rotation, overrideY,
position, margin);
fontSize, }
overrideX,
overrideY, contentStream.close();
margin); }
}
contentStream.close();
}
} }
return WebResponseUtils.pdfDocToWebResponse( return WebResponseUtils.pdfDocToWebResponse(
document, document,