diff --git a/Dockerfile b/Dockerfile index f0eb592c..63bbc6b0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,37 +11,30 @@ ENV DOCKER_ENABLE_SECURITY=false \ # PUID=1000 \ # PGID=1000 \ # UMASK=022 \ - + +# Copy necessary files +COPY scripts /scripts +COPY pipeline /pipeline +COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto +COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto +COPY build/libs/*.jar app.jar # Create user and group ##RUN groupadd -g $PGID stirlingpdfgroup && \ ## useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \ -## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME - +## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME && \ # Set up necessary directories and permissions -RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /logs /customFiles /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders +RUN mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ ##&& \ ## chown -R stirlingpdfuser:stirlingpdfgroup /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /customFiles && \ -## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original - -# Copy necessary files -COPY ./scripts/* /scripts/ -COPY ./pipeline/ /pipeline/ -COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto/ -COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto/ -COPY build/libs/*.jar app.jar - +## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original && \ # Set font cache and permissions -RUN fc-cache -f -v && chmod +x /scripts/* - -##&& \ + fc-cache -f -v && \ + chmod +x /scripts/* ## chown stirlingpdfuser:stirlingpdfgroup /app.jar && \ ## chmod +x /scripts/init.sh -# Expose necessary ports -EXPOSE 8080 - # Set user and run command ##USER stirlingpdfuser -ENTRYPOINT ["/scripts/init.sh"] +ENTRYPOINT ["tini", "--", "/scripts/init.sh"] CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"] diff --git a/Dockerfile-lite b/Dockerfile-lite index 054c7639..982a53e7 100644 --- a/Dockerfile-lite +++ b/Dockerfile-lite @@ -1,65 +1,59 @@ -# Build jbig2enc in a separate stage -FROM bellsoft/liberica-openjdk-debian:17 +# use alpine +FROM alpine:latest ARG VERSION_TAG -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - libreoffice-core \ - libreoffice-common \ - libreoffice-writer \ - libreoffice-calc \ - libreoffice-impress \ - unoconv && \ - rm -rf /var/lib/apt/lists/* - - # Set Environment Variables ENV DOCKER_ENABLE_SECURITY=false \ HOME=/home/stirlingpdfuser \ VERSION_TAG=$VERSION_TAG \ JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS -XX:MaxRAMPercentage=75" -# PUID=1000 \ +# PUID=1000 \ # PGID=1000 \ # UMASK=022 \ +# Copy necessary files +COPY scripts/download-security-jar.sh /scripts/download-security-jar.sh +COPY scripts/init-without-ocr.sh /scripts/init-without-ocr.sh +COPY pipeline /pipeline +COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto +COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto +COPY build/libs/*.jar app.jar + +RUN apk add --no-cache \ + ca-certificates \ + tzdata \ + tini \ + bash \ + curl \ + openjdk17-jre \ +# Doc conversion + libreoffice-common \ + libreoffice-writer \ + libreoffice-calc \ + libreoffice-impress \ +# python and pip + python3 && \ + wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \ +# uno unoconv and HTML + pip install --break-system-packages --no-cache-dir --upgrade uno unoconv WeasyPrint && \ # Create user and group #RUN groupadd -g $PGID stirlingpdfgroup && \ # useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \ # mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME - # Set up necessary directories and permissions -RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders - + mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \ # chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/fonts/opentype/noto /configs /customFiles - -# Copy necessary files -COPY ./scripts/download-security-jar.sh /scripts/download-security-jar.sh -COPY ./scripts/init-without-ocr.sh /scripts/init-without-ocr.sh -COPY ./pipeline/ /pipeline/ -COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto/ -COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto/ -COPY build/libs/*.jar app.jar - # Set font cache and permissions -RUN fc-cache -f -v && \ -chmod +x /scripts/init-without-ocr.sh && \ -chmod +x /scripts/download-security-jar.sh - - + fc-cache -f -v && \ + chmod +x /scripts/*.sh # chown stirlingpdfuser:stirlingpdfgroup /app.jar - - - -# Expose the application port -EXPOSE 8080 - # Set environment variables ENV ENDPOINTS_GROUPS_TO_REMOVE=Python,OpenCV,OCRmyPDF ENV DOCKER_ENABLE_SECURITY=false # Run the application #USER stirlingpdfuser -ENTRYPOINT ["/scripts/init-without-ocr.sh"] +ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"] CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"] diff --git a/Dockerfile-ultra-lite b/Dockerfile-ultra-lite index d77c1b94..38eca953 100644 --- a/Dockerfile-ultra-lite +++ b/Dockerfile-ultra-lite @@ -1,5 +1,5 @@ -# Build jbig2enc in a separate stage -FROM bellsoft/liberica-openjdk-alpine:17 +# use alpine +FROM alpine:latest ARG VERSION_TAG @@ -8,39 +8,38 @@ ENV DOCKER_ENABLE_SECURITY=false \ HOME=/home/stirlingpdfuser \ VERSION_TAG=$VERSION_TAG \ JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS -XX:MaxRAMPercentage=75" -# PUID=1000 \ +# PUID=1000 \ # PGID=1000 \ # UMASK=022 \ +# Copy necessary files +COPY scripts/download-security-jar.sh /scripts/download-security-jar.sh +COPY scripts/init-without-ocr.sh /scripts/init-without-ocr.sh +COPY pipeline /pipeline +COPY build/libs/*.jar app.jar + # Create user and group using Alpine's addgroup and adduser #RUN addgroup -g $PGID stirlingpdfgroup && \ # adduser -u $PUID -G stirlingpdfgroup -s /bin/sh -D stirlingpdfuser && \ # mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME - # Set up necessary directories and permissions #RUN mkdir -p /scripts /configs /customFiles && \ -# chown -R stirlingpdfuser:stirlingpdfgroup /scripts /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders - -RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /configs /customFiles -COPY ./scripts/download-security-jar.sh /scripts/download-security-jar.sh -COPY ./scripts/init-without-ocr.sh /scripts/init-without-ocr.sh -COPY ./pipeline/ /pipeline/ -COPY build/libs/*.jar app.jar - +# chown -R stirlingpdfuser:stirlingpdfgroup /scripts /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders +RUN mkdir /configs /logs /customFiles && \ # Set font cache and permissions #RUN chown stirlingpdfuser:stirlingpdfgroup /app.jar - -RUN chmod +x /scripts/init-without-ocr.sh && \ -chmod +x /scripts/download-security-jar.sh && \ -apk add --no-cache curl - -# Expose the application port -EXPOSE 8080 + chmod +x /scripts/*.sh && \ + apk add --no-cache \ + ca-certificates \ + tzdata \ + tini \ + bash \ + curl # Set environment variables ENV ENDPOINTS_GROUPS_TO_REMOVE=CLI -ENTRYPOINT ["/scripts/init-without-ocr.sh"] +ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"] # Run the application CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"] diff --git a/DockerfileBase b/DockerfileBase index b37050c7..06832347 100644 --- a/DockerfileBase +++ b/DockerfileBase @@ -1,43 +1,24 @@ # Main stage -FROM ubuntu:latest AS base - - +FROM alpine:3.19.0 # JDK for app -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - openjdk-17-jre && \ - rm -rf /var/lib/apt/lists/* - +RUN apk add --no-cache \ + ca-certificates \ + tzdata \ + tini \ + bash \ + curl \ + openjdk17-jre \ # Doc conversion -RUN apt-get update && \ - apt-get install -y --no-install-recommends \ - libreoffice-core \ - libreoffice-common \ - libreoffice-writer \ - libreoffice-calc \ - libreoffice-impress \ - python3-uno \ - curl \ - unoconv && \ - rm -rf /var/lib/apt/lists/* - - + libreoffice \ # OCR MY PDF (unpaper for descew and other advanced featues) -RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common gnupg2 && \ -add-apt-repository ppa:alex-p/tesseract-ocr5 && apt install -y --no-install-recommends tesseract-ocr && \ -apt-get update && \ - apt-get install -y --no-install-recommends \ - ghostscript \ - python3-pip \ ocrmypdf \ - unpaper && \ - rm -rf /var/lib/apt/lists/* && \ - mv /usr/share/tesseract-ocr /usr/share/tesseract-ocr-original && \ - pip install --no-cache-dir --upgrade pip && \ - pip install --no-cache-dir --upgrade ocrmypdf && \ - pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1 - - -#CV and HTML -RUN pip install --no-cache-dir opencv-python-headless WeasyPrint + tesseract-ocr-data-eng \ +# CV + py3-opencv \ +# python3/pip + python3 && \ + wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \ +# uno unoconv and HTML + pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \ + mv /usr/share/tessdata /usr/share/tessdata-original diff --git a/HowToUseOCR.md b/HowToUseOCR.md index d83d0fd2..8fdf8fe4 100644 --- a/HowToUseOCR.md +++ b/HowToUseOCR.md @@ -21,7 +21,7 @@ Depending on your requirements, you can choose the appropriate language pack for ### Installing Language Packs 1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need. -2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata` (Debian) or `/usr/share/tesseract/tessdata` (Fedora) +2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata` # DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED. @@ -37,14 +37,14 @@ services: your_service_name: image: your_docker_image_name volumes: - - /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata + - /location/of/trainingData:/usr/share/tessdata ``` #### Docker run Add the following to your existing docker run command ```bash --v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata +-v /location/of/trainingData:/usr/share/tessdata ``` #### Non-Docker diff --git a/LocalRunGuide.md b/LocalRunGuide.md index 824c1ae7..75a53b80 100644 --- a/LocalRunGuide.md +++ b/LocalRunGuide.md @@ -139,7 +139,7 @@ Easiest is to use the langpacks provided by your repositories. Skip the other st Manual: 1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need. -2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata` +2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata` 3. Please view [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for more info. **IMPORTANT:** DO NOT REMOVE EXISTING `eng.traineddata`, IT'S REQUIRED. diff --git a/README.md b/README.md index e8814b80..5876a7e2 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ Docker Run ```bash docker run -d \ -p 8080:8080 \ - -v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata \ + -v /location/of/trainingData:/usr/share/tessdata \ -v /location/of/extraConfigs:/configs \ -v /location/of/logs:/logs \ -e DOCKER_ENABLE_SECURITY=false \ @@ -131,7 +131,7 @@ services: ports: - '8080:8080' volumes: - - /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata #Required for extra OCR languages + - /location/of/trainingData:/usr/share/tessdata #Required for extra OCR languages - /location/of/extraConfigs:/configs # - /location/of/customFiles:/customFiles/ # - /location/of/logs:/logs/ diff --git a/scripts/init.sh b/scripts/init.sh index 8f1f96db..7f5b19f0 100644 --- a/scripts/init.sh +++ b/scripts/init.sh @@ -3,10 +3,14 @@ # Copy the original tesseract-ocr files to the volume directory without overwriting existing files echo "Copying original files without overwriting existing files" mkdir -p /usr/share/tesseract-ocr -cp -rn /usr/share/tesseract-ocr-original/* /usr/share/tesseract-ocr +cp -rn /usr/share/tesseract-ocr-original/* /usr/share/tesseract-ocr -if [ -d /usr/share/tesseract-ocr/4.00/tessdata ]; then - cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tesseract-ocr/5/tessdata/ || true; +if [ -d /usr/share/tesseract-ocr/4.00/tessdata ]; then + cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tessdata || true; +fi + +if [ -d /usr/share/tesseract-ocr/5/tessdata ]; then + cp -r /usr/share/tesseract-ocr/5/tessdata/* /usr/share/tessdata || true; fi # Check if TESSERACT_LANGS environment variable is set and is not empty @@ -23,4 +27,4 @@ fi /scripts/download-security-jar.sh # Run the main command -exec "$@" \ No newline at end of file +exec "$@"