Merge pull request #624 from Zoey2936/main

switch images to alpine
This commit is contained in:
Anthony Stirling 2024-02-09 23:22:07 +00:00 committed by GitHub
commit f211eefc85
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 107 additions and 127 deletions

View file

@ -12,36 +12,29 @@ ENV DOCKER_ENABLE_SECURITY=false \
# PGID=1000 \
# UMASK=022 \
# Copy necessary files
COPY scripts /scripts
COPY pipeline /pipeline
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto
COPY build/libs/*.jar app.jar
# Create user and group
##RUN groupadd -g $PGID stirlingpdfgroup && \
## useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \
## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME
## mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME && \
# Set up necessary directories and permissions
RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /logs /customFiles /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
RUN mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
##&& \
## chown -R stirlingpdfuser:stirlingpdfgroup /scripts /usr/share/fonts/opentype/noto /usr/share/tesseract-ocr /configs /customFiles && \
## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original
# Copy necessary files
COPY ./scripts/* /scripts/
COPY ./pipeline/ /pipeline/
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto/
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto/
COPY build/libs/*.jar app.jar
## chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/tesseract-ocr-original && \
# Set font cache and permissions
RUN fc-cache -f -v && chmod +x /scripts/*
##&& \
fc-cache -f -v && \
chmod +x /scripts/*
## chown stirlingpdfuser:stirlingpdfgroup /app.jar && \
## chmod +x /scripts/init.sh
# Expose necessary ports
EXPOSE 8080
# Set user and run command
##USER stirlingpdfuser
ENTRYPOINT ["/scripts/init.sh"]
ENTRYPOINT ["tini", "--", "/scripts/init.sh"]
CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"]

View file

@ -1,65 +1,60 @@
# Build jbig2enc in a separate stage
FROM bellsoft/liberica-openjdk-debian:17
# use alpine
FROM alpine:3.19.0
ARG VERSION_TAG
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libreoffice-core \
libreoffice-common \
libreoffice-writer \
libreoffice-calc \
libreoffice-impress \
unoconv && \
rm -rf /var/lib/apt/lists/*
# Set Environment Variables
ENV DOCKER_ENABLE_SECURITY=false \
HOME=/home/stirlingpdfuser \
VERSION_TAG=$VERSION_TAG \
JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS -XX:MaxRAMPercentage=75"
# PUID=1000 \
# PUID=1000 \
# PGID=1000 \
# UMASK=022 \
# Copy necessary files
COPY scripts/download-security-jar.sh /scripts/download-security-jar.sh
COPY scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
COPY pipeline /pipeline
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto
COPY build/libs/*.jar app.jar
RUN apk add --no-cache \
ca-certificates \
tzdata \
tini \
bash \
curl \
openjdk17-jre && \
# Doc conversion
apk add --no-cache libreoffice --repository http://dl-cdn.alpinelinux.org/alpine/edge/community && \
# python and pip
apk add --no-cache \
python3 && \
wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
# uno unoconv and HTML
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
# Create user and group
#RUN groupadd -g $PGID stirlingpdfgroup && \
# useradd -u $PUID -g stirlingpdfgroup -s /bin/sh stirlingpdfuser && \
# mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME
# Set up necessary directories and permissions
RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
mkdir /configs /logs /customFiles /pipeline/watchedFolders /pipeline/finishedFolders && \
# chown -R stirlingpdfuser:stirlingpdfgroup /usr/share/fonts/opentype/noto /configs /customFiles
# Copy necessary files
COPY ./scripts/download-security-jar.sh /scripts/download-security-jar.sh
COPY ./scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
COPY ./pipeline/ /pipeline/
COPY src/main/resources/static/fonts/*.ttf /usr/share/fonts/opentype/noto/
COPY src/main/resources/static/fonts/*.otf /usr/share/fonts/opentype/noto/
COPY build/libs/*.jar app.jar
# Set font cache and permissions
RUN fc-cache -f -v && \
chmod +x /scripts/init-without-ocr.sh && \
chmod +x /scripts/download-security-jar.sh
fc-cache -f -v && \
chmod +x /scripts/*.sh && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories
# chown stirlingpdfuser:stirlingpdfgroup /app.jar
# Expose the application port
EXPOSE 8080
# Set environment variables
ENV ENDPOINTS_GROUPS_TO_REMOVE=Python,OpenCV,OCRmyPDF
ENV ENDPOINTS_GROUPS_TO_REMOVE=OpenCV,OCRmyPDF
ENV DOCKER_ENABLE_SECURITY=false
# Run the application
#USER stirlingpdfuser
ENTRYPOINT ["/scripts/init-without-ocr.sh"]
ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"]
CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"]

View file

@ -1,5 +1,5 @@
# Build jbig2enc in a separate stage
FROM bellsoft/liberica-openjdk-alpine:17
# use alpine
FROM alpine:3.19.0
ARG VERSION_TAG
@ -8,39 +8,42 @@ ENV DOCKER_ENABLE_SECURITY=false \
HOME=/home/stirlingpdfuser \
VERSION_TAG=$VERSION_TAG \
JAVA_TOOL_OPTIONS="$JAVA_TOOL_OPTIONS -XX:MaxRAMPercentage=75"
# PUID=1000 \
# PUID=1000 \
# PGID=1000 \
# UMASK=022 \
# Copy necessary files
COPY scripts/download-security-jar.sh /scripts/download-security-jar.sh
COPY scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
COPY pipeline /pipeline
COPY build/libs/*.jar app.jar
# Create user and group using Alpine's addgroup and adduser
#RUN addgroup -g $PGID stirlingpdfgroup && \
# adduser -u $PUID -G stirlingpdfgroup -s /bin/sh -D stirlingpdfuser && \
# mkdir -p $HOME && chown stirlingpdfuser:stirlingpdfgroup $HOME
# Set up necessary directories and permissions
#RUN mkdir -p /scripts /configs /customFiles && \
# chown -R stirlingpdfuser:stirlingpdfgroup /scripts /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
RUN mkdir -p /scripts /usr/share/fonts/opentype/noto /configs /customFiles
COPY ./scripts/download-security-jar.sh /scripts/download-security-jar.sh
COPY ./scripts/init-without-ocr.sh /scripts/init-without-ocr.sh
COPY ./pipeline/ /pipeline/
COPY build/libs/*.jar app.jar
# chown -R stirlingpdfuser:stirlingpdfgroup /scripts /configs /customFiles /logs /pipeline /pipeline/defaultWebUIConfigs /pipeline/watchedFolders /pipeline/finishedFolders
RUN mkdir /configs /logs /customFiles && \
# Set font cache and permissions
#RUN chown stirlingpdfuser:stirlingpdfgroup /app.jar
RUN chmod +x /scripts/init-without-ocr.sh && \
chmod +x /scripts/download-security-jar.sh && \
apk add --no-cache curl
# Expose the application port
EXPOSE 8080
chmod +x /scripts/*.sh && \
apk add --no-cache \
ca-certificates \
tzdata \
tini \
bash \
curl \
openjdk17-jre && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories
# Set environment variables
ENV ENDPOINTS_GROUPS_TO_REMOVE=CLI
ENTRYPOINT ["/scripts/init-without-ocr.sh"]
ENTRYPOINT ["tini", "--", "/scripts/init-without-ocr.sh"]
# Run the application
CMD ["java", "-Dfile.encoding=UTF-8", "-jar", "/app.jar"]

View file

@ -1,43 +1,28 @@
# Main stage
FROM ubuntu:latest AS base
FROM alpine:3.19.0
# JDK for app
RUN apt-get update && \
apt-get install -y --no-install-recommends \
openjdk-17-jre && \
rm -rf /var/lib/apt/lists/*
RUN echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/main" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/community" | tee -a /etc/apk/repositories && \
echo "@testing https://dl-cdn.alpinelinux.org/alpine/edge/testing" | tee -a /etc/apk/repositories && \
apk add --no-cache \
ca-certificates \
tzdata \
tini \
bash \
curl \
openjdk17-jre \
# Doc conversion
RUN apt-get update && \
apt-get install -y --no-install-recommends \
libreoffice-core \
libreoffice-common \
libreoffice-writer \
libreoffice-calc \
libreoffice-impress \
python3-uno \
curl \
unoconv && \
rm -rf /var/lib/apt/lists/*
libreoffice@testing \
# OCR MY PDF (unpaper for descew and other advanced featues)
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common gnupg2 && \
add-apt-repository ppa:alex-p/tesseract-ocr5 && apt install -y --no-install-recommends tesseract-ocr && \
apt-get update && \
apt-get install -y --no-install-recommends \
ghostscript \
python3-pip \
ocrmypdf \
unpaper && \
rm -rf /var/lib/apt/lists/* && \
mv /usr/share/tesseract-ocr /usr/share/tesseract-ocr-original && \
pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir --upgrade ocrmypdf && \
pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1
tesseract-ocr-data-eng \
# CV
py3-opencv \
# python3/pip
python3 && \
wget https://bootstrap.pypa.io/get-pip.py -qO - | python3 - --break-system-packages --no-cache-dir --upgrade && \
# uno unoconv and HTML
pip install --break-system-packages --no-cache-dir --upgrade unoconv WeasyPrint && \
mv /usr/share/tessdata /usr/share/tessdata-original
#CV and HTML
RUN pip install --no-cache-dir opencv-python-headless WeasyPrint

View file

@ -21,7 +21,7 @@ Depending on your requirements, you can choose the appropriate language pack for
### Installing Language Packs
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata` (Debian) or `/usr/share/tesseract/tessdata` (Fedora)
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata`
# DO NOT REMOVE EXISTING ENG.TRAINEDDATA, IT'S REQUIRED.
@ -37,14 +37,14 @@ services:
your_service_name:
image: your_docker_image_name
volumes:
- /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata
- /location/of/trainingData:/usr/share/tessdata
```
#### Docker run
Add the following to your existing docker run command
```bash
-v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata
-v /location/of/trainingData:/usr/share/tessdata
```
#### Non-Docker

View file

@ -139,7 +139,7 @@ Easiest is to use the langpacks provided by your repositories. Skip the other st
Manual:
1. Download the desired language pack(s) by selecting the `.traineddata` file(s) for the language(s) you need.
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tesseract-ocr/5/tessdata`
2. Place the `.traineddata` files in the Tesseract tessdata directory: `/usr/share/tessdata`
3.
Please view [OCRmyPDF install guide](https://ocrmypdf.readthedocs.io/en/latest/installation.html) for more info.
**IMPORTANT:** DO NOT REMOVE EXISTING `eng.traineddata`, IT'S REQUIRED.

View file

@ -110,7 +110,7 @@ Docker Run
```bash
docker run -d \
-p 8080:8080 \
-v /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata \
-v /location/of/trainingData:/usr/share/tessdata \
-v /location/of/extraConfigs:/configs \
-v /location/of/logs:/logs \
-e DOCKER_ENABLE_SECURITY=false \
@ -131,7 +131,7 @@ services:
ports:
- '8080:8080'
volumes:
- /location/of/trainingData:/usr/share/tesseract-ocr/5/tessdata #Required for extra OCR languages
- /location/of/trainingData:/usr/share/tessdata #Required for extra OCR languages
- /location/of/extraConfigs:/configs
# - /location/of/customFiles:/customFiles/
# - /location/of/logs:/logs/

View file

@ -2,11 +2,15 @@
# Copy the original tesseract-ocr files to the volume directory without overwriting existing files
echo "Copying original files without overwriting existing files"
mkdir -p /usr/share/tesseract-ocr
cp -rn /usr/share/tesseract-ocr-original/* /usr/share/tesseract-ocr
mkdir -p /usr/share/tessdata
cp -rn /usr/share/tessdata-original/* /usr/share/tessdata
if [ -d /usr/share/tesseract-ocr/4.00/tessdata ]; then
cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tesseract-ocr/5/tessdata/ || true;
cp -r /usr/share/tesseract-ocr/4.00/tessdata/* /usr/share/tessdata || true;
fi
if [ -d /usr/share/tesseract-ocr/5/tessdata ]; then
cp -r /usr/share/tesseract-ocr/5/tessdata/* /usr/share/tessdata || true;
fi
# Check if TESSERACT_LANGS environment variable is set and is not empty