commit
f211eefc85
8 changed files with 107 additions and 127 deletions
31
Dockerfile
31
Dockerfile
|
@ -12,36 +12,29 @@ ENV DOCKER_ENABLE_SECURITY=false \
|
|||
# PGID=1000 \
|
||||
# UMASK=022 \
|
||||
|
||||
# Copy necessary files
|
||||
COPY scripts /scripts
|
||||
COPY pipeline /pipeline
|
||||
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto
|
||||
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto
|
||||
COPY build/libs/*.jar app.jar
|
||||
|
||||
# Create user and group
|
||||
##RUN groupadd -g $PGID stirlingpdfgroup && \
|
||||
## useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \
|
||||
## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME
|
||||
|
||||
## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME && \
|
||||
# Set up necessary directories and permissions
|
||||
RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /logs /customFiles /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
|
||||
RUN mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
|
||||
##&& \
|
||||
## chown -R stirlingpdfuser:stirlingpdfgroup /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /customFiles && \
|
||||
## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original
|
||||
|
||||
# Copy necessary files
|
||||
COPY ./scripts/* /scripts/
|
||||
COPY ./pipeline/ /pipeline/
|
||||
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto/
|
||||
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto/
|
||||
COPY build/libs/*.jar app.jar
|
||||
|
||||
## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original && \
|
||||
# Set font cache and permissions
|
||||
RUN fc-cache -f -v && chmod +x /scripts/*
|
||||
|
||||
##&& \
|
||||
fc-cache -f -v && \
|
||||
chmod +x /scripts/*
|
||||
## chown stirlingpdfuser:stirlingpdfgroup /app.jar && \
|
||||
## chmod +x /scripts/init.sh
|
||||
|
||||
# Expose necessary ports
|
||||
EXPOSE 8080
|
||||
|
||||
# Set user and run command
|
||||
##USER stirlingpdfuser
|
||||
ENTRYPOINT ["/scripts/init.sh"]
|
||||
ENTRYPOINT ["tini", "--", "/scripts/init.sh"]
|
||||
CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"]
|
||||
|
|
|
@ -1,65 +1,60 @@
|
|||
# Build jbig2enc in a separate stage
|
||||
FROM bellsoft/liberica-openjdk-debian:17
|
||||
# use alpine
|
||||
FROM alpine:3.19.0
|
||||
|
||||
ARG VERSION_TAG
|
||||
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
libreoffice-core \
|
||||
libreoffice-common \
|
||||
libreoffice-writer \
|
||||
libreoffice-calc \
|
||||
libreoffice-impress \
|
||||
unoconv && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
# Set Environment Variables
|
||||
ENV DOCKER_ENABLE_SECURITY=false \
|
||||
HOME=/home/stirlingpdfuser \
|
||||
VERSION_TAG=$VERSION_TAG \
|
||||
JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS -XX:MaxRAMPercentage=75"
|
||||
# PUID=1000 \
|
||||
# PUID=1000 \
|
||||
# PGID=1000 \
|
||||
# UMASK=022 \
|
||||
|
||||
# Copy necessary files
|
||||
COPY scripts/download-security-jar.sh /scripts/download-security-jar.sh
|
||||
COPY scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
|
||||
COPY pipeline /pipeline
|
||||
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto
|
||||
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto
|
||||
COPY build/libs/*.jar app.jar
|
||||
|
||||
RUN apk add --no-cache \
|
||||
ca-certificates \
|
||||
tzdata \
|
||||
tini \
|
||||
bash \
|
||||
curl \
|
||||
openjdk17-jre && \
|
||||
# Doc conversion
|
||||
apk add --no-cache libreoffice --repository http://dl-cdn.alpinelinux.org/alpine/edge/community && \
|
||||
# python and pip
|
||||
apk add --no-cache \
|
||||
python3 && \
|
||||
wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
|
||||
# uno unoconv and HTML
|
||||
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
|
||||
# Create user and group
|
||||
#RUN groupadd -g $PGID stirlingpdfgroup && \
|
||||
# useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \
|
||||
# mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME
|
||||
|
||||
# Set up necessary directories and permissions
|
||||
RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
|
||||
|
||||
mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
|
||||
# chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/fonts/opentype/noto /configs /customFiles
|
||||
|
||||
# Copy necessary files
|
||||
COPY ./scripts/download-security-jar.sh /scripts/download-security-jar.sh
|
||||
COPY ./scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
|
||||
COPY ./pipeline/ /pipeline/
|
||||
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto/
|
||||
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto/
|
||||
COPY build/libs/*.jar app.jar
|
||||
|
||||
# Set font cache and permissions
|
||||
RUN fc-cache -f -v && \
|
||||
chmod +x /scripts/init-without-ocr.sh && \
|
||||
chmod +x /scripts/download-security-jar.sh
|
||||
|
||||
|
||||
fc-cache -f -v && \
|
||||
chmod +x /scripts/*.sh && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories
|
||||
# chown stirlingpdfuser:stirlingpdfgroup /app.jar
|
||||
|
||||
|
||||
|
||||
|
||||
# Expose the application port
|
||||
EXPOSE 8080
|
||||
|
||||
# Set environment variables
|
||||
ENV ENDPOINTS_GROUPS_TO_REMOVE=Python,OpenCV,OCRmyPDF
|
||||
ENV ENDPOINTS_GROUPS_TO_REMOVE=OpenCV,OCRmyPDF
|
||||
ENV DOCKER_ENABLE_SECURITY=false
|
||||
|
||||
# Run the application
|
||||
#USER stirlingpdfuser
|
||||
ENTRYPOINT ["/scripts/init-without-ocr.sh"]
|
||||
ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"]
|
||||
CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"]
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
# Build jbig2enc in a separate stage
|
||||
FROM bellsoft/liberica-openjdk-alpine:17
|
||||
# use alpine
|
||||
FROM alpine:3.19.0
|
||||
|
||||
ARG VERSION_TAG
|
||||
|
||||
|
@ -8,39 +8,42 @@ ENV DOCKER_ENABLE_SECURITY=false \
|
|||
HOME=/home/stirlingpdfuser \
|
||||
VERSION_TAG=$VERSION_TAG \
|
||||
JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS -XX:MaxRAMPercentage=75"
|
||||
# PUID=1000 \
|
||||
# PUID=1000 \
|
||||
# PGID=1000 \
|
||||
# UMASK=022 \
|
||||
|
||||
# Copy necessary files
|
||||
COPY scripts/download-security-jar.sh /scripts/download-security-jar.sh
|
||||
COPY scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
|
||||
COPY pipeline /pipeline
|
||||
COPY build/libs/*.jar app.jar
|
||||
|
||||
# Create user and group using Alpine's addgroup and adduser
|
||||
#RUN addgroup -g $PGID stirlingpdfgroup && \
|
||||
# adduser -u $PUID -G stirlingpdfgroup -s /bin/sh -D stirlingpdfuser && \
|
||||
# mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME
|
||||
|
||||
# Set up necessary directories and permissions
|
||||
#RUN mkdir -p /scripts /configs /customFiles && \
|
||||
# chown -R stirlingpdfuser:stirlingpdfgroup /scripts /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
|
||||
|
||||
RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /configs /customFiles
|
||||
COPY ./scripts/download-security-jar.sh /scripts/download-security-jar.sh
|
||||
COPY ./scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
|
||||
COPY ./pipeline/ /pipeline/
|
||||
COPY build/libs/*.jar app.jar
|
||||
|
||||
# chown -R stirlingpdfuser:stirlingpdfgroup /scripts /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
|
||||
RUN mkdir /configs /logs /customFiles && \
|
||||
# Set font cache and permissions
|
||||
#RUN chown stirlingpdfuser:stirlingpdfgroup /app.jar
|
||||
|
||||
RUN chmod +x /scripts/init-without-ocr.sh && \
|
||||
chmod +x /scripts/download-security-jar.sh && \
|
||||
apk add --no-cache curl
|
||||
|
||||
# Expose the application port
|
||||
EXPOSE 8080
|
||||
chmod +x /scripts/*.sh && \
|
||||
apk add --no-cache \
|
||||
ca-certificates \
|
||||
tzdata \
|
||||
tini \
|
||||
bash \
|
||||
curl \
|
||||
openjdk17-jre && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories
|
||||
|
||||
# Set environment variables
|
||||
ENV ENDPOINTS_GROUPS_TO_REMOVE=CLI
|
||||
|
||||
ENTRYPOINT ["/scripts/init-without-ocr.sh"]
|
||||
ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"]
|
||||
|
||||
# Run the application
|
||||
CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"]
|
||||
|
|
|
@ -1,43 +1,28 @@
|
|||
# Main stage
|
||||
FROM ubuntu:latest AS base
|
||||
|
||||
|
||||
FROM alpine:3.19.0
|
||||
|
||||
# JDK for app
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
openjdk-17-jre && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
|
||||
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories && \
|
||||
apk add --no-cache \
|
||||
ca-certificates \
|
||||
tzdata \
|
||||
tini \
|
||||
bash \
|
||||
curl \
|
||||
openjdk17-jre \
|
||||
# Doc conversion
|
||||
RUN apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
libreoffice-core \
|
||||
libreoffice-common \
|
||||
libreoffice-writer \
|
||||
libreoffice-calc \
|
||||
libreoffice-impress \
|
||||
python3-uno \
|
||||
curl \
|
||||
unoconv && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
|
||||
libreoffice@testing \
|
||||
# OCR MY PDF (unpaper for descew and other advanced featues)
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common gnupg2 && \
|
||||
add-apt-repository ppa:alex-p/tesseract-ocr5 && apt install -y --no-install-recommends tesseract-ocr && \
|
||||
apt-get update && \
|
||||
apt-get install -y --no-install-recommends \
|
||||
ghostscript \
|
||||
python3-pip \
|
||||
ocrmypdf \
|
||||
unpaper && \
|
||||
rm -rf /var/lib/apt/lists/* && \
|
||||
mv /usr/share/tesseract-ocr /usr/share/tesseract-ocr-original && \
|
||||
pip install --no-cache-dir --upgrade pip && \
|
||||
pip install --no-cache-dir --upgrade ocrmypdf && \
|
||||
pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1
|
||||
tesseract-ocr-data-eng \
|
||||
# CV
|
||||
py3-opencv \
|
||||
# python3/pip
|
||||
python3 && \
|
||||
wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
|
||||
# uno unoconv and HTML
|
||||
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
|
||||
mv /usr/share/tessdata /usr/share/tessdata-original
|
||||
|
||||
|
||||
#CV and HTML
|
||||
RUN pip install --no-cache-dir opencv-python-headless WeasyPrint
|
||||
|
|
|
@ -21,7 +21,7 @@ Depending on your requirements, you can choose the appropriate language pack for
|
|||
### Installing Language Packs
|
||||
|
||||
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
|
||||
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata` (Debian) or `/usr/share/tesseract/tessdata` (Fedora)
|
||||
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata`
|
||||
|
||||
# DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.
|
||||
|
||||
|
@ -37,14 +37,14 @@ services:
|
|||
your_service_name:
|
||||
image: your_docker_image_name
|
||||
volumes:
|
||||
- /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata
|
||||
- /location/of/trainingData:/usr/share/tessdata
|
||||
```
|
||||
|
||||
|
||||
#### Docker run
|
||||
Add the following to your existing docker run command
|
||||
```bash
|
||||
-v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata
|
||||
-v /location/of/trainingData:/usr/share/tessdata
|
||||
```
|
||||
|
||||
#### Non-Docker
|
||||
|
|
|
@ -139,7 +139,7 @@ Easiest is to use the langpacks provided by your repositories. Skip the other st
|
|||
Manual:
|
||||
|
||||
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
|
||||
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata`
|
||||
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata`
|
||||
3.
|
||||
Please view [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for more info.
|
||||
**IMPORTANT:** DO NOT REMOVE EXISTING `eng.traineddata`, IT'S REQUIRED.
|
||||
|
|
|
@ -110,7 +110,7 @@ Docker Run
|
|||
```bash
|
||||
docker run -d \
|
||||
-p 8080:8080 \
|
||||
-v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata \
|
||||
-v /location/of/trainingData:/usr/share/tessdata \
|
||||
-v /location/of/extraConfigs:/configs \
|
||||
-v /location/of/logs:/logs \
|
||||
-e DOCKER_ENABLE_SECURITY=false \
|
||||
|
@ -131,7 +131,7 @@ services:
|
|||
ports:
|
||||
- '8080:8080'
|
||||
volumes:
|
||||
- /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata #Required for extra OCR languages
|
||||
- /location/of/trainingData:/usr/share/tessdata #Required for extra OCR languages
|
||||
- /location/of/extraConfigs:/configs
|
||||
# - /location/of/customFiles:/customFiles/
|
||||
# - /location/of/logs:/logs/
|
||||
|
|
|
@ -2,11 +2,15 @@
|
|||
|
||||
# Copy the original tesseract-ocr files to the volume directory without overwriting existing files
|
||||
echo "Copying original files without overwriting existing files"
|
||||
mkdir -p /usr/share/tesseract-ocr
|
||||
cp -rn /usr/share/tesseract-ocr-original/* /usr/share/tesseract-ocr
|
||||
mkdir -p /usr/share/tessdata
|
||||
cp -rn /usr/share/tessdata-original/* /usr/share/tessdata
|
||||
|
||||
if [ -d /usr/share/tesseract-ocr/4.00/tessdata ]; then
|
||||
cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tesseract-ocr/5/tessdata/ || true;
|
||||
cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tessdata || true;
|
||||
fi
|
||||
|
||||
if [ -d /usr/share/tesseract-ocr/5/tessdata ]; then
|
||||
cp -r /usr/share/tesseract-ocr/5/tessdata/* /usr/share/tessdata || true;
|
||||
fi
|
||||
|
||||
# Check if TESSERACT_LANGS environment variable is set and is not empty
|
||||
|
|
Loading…
Reference in a new issue