2023-05-01 22:57:48 +02:00
|
|
|
# Main stage
# The base image is pinned to an explicit release instead of the moving
# `latest` tag so that rebuilds resolve to the same distribution every time.
# NOTE(review): 22.04 is presumably what `latest` pointed at when this file was
# written - confirm before bumping. Newer Ubuntu releases mark the system
# Python as externally managed (PEP 668), which would break the plain
# `pip install` steps used later in this stage.
FROM ubuntu:22.04 AS base
|
|
|
|
|
|
|
|
|
|
|
|
|
2023-12-11 00:06:35 +01:00
|
|
|
# Java 17 runtime for the app.
# Index refresh, install and list cleanup share a single layer so the apt
# package lists are never stored in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends openjdk-17-jre \
 && rm -rf /var/lib/apt/lists/*
|
2023-12-11 00:06:35 +01:00
|
|
|
|
|
|
|
# Document conversion: LibreOffice components together with unoconv and the
# python3-uno bindings; curl is installed in the same layer.
# Packages are listed alphabetically, one per line, to keep diffs small.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        curl \
        libreoffice-calc \
        libreoffice-common \
        libreoffice-core \
        libreoffice-impress \
        libreoffice-writer \
        python3-uno \
        unoconv \
 && rm -rf /var/lib/apt/lists/*
|
2023-12-11 00:06:35 +01:00
|
|
|
|
|
|
|
|
|
|
|
# OCRmyPDF toolchain (unpaper provides deskew and other advanced features).
#
# Steps, all in one layer so the apt lists removed at the end never persist:
#   1. install the tooling needed to register a PPA;
#   2. register the tesseract-ocr5 PPA (-y so the command can never prompt);
#   3. refresh the index, then install tesseract-ocr and the OCR packages in
#      one transaction. tesseract-ocr is installed only after the explicit
#      refresh so it is always resolved against the PPA's package lists;
#   4. upgrade pip and ocrmypdf through pip, then apply the pinned Python
#      package versions.
# apt-get is used throughout; `apt` is meant for interactive use and warns that
# its CLI is not stable for scripts.
# NOTE(review): ocrmypdf is installed from apt and then upgraded, unpinned, via
# pip - pin a version if reproducible builds are required.
# NOTE(review): the exact pins below are presumably security-driven; confirm
# before changing any of them.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        gnupg2 \
        software-properties-common \
 && add-apt-repository -y ppa:alex-p/tesseract-ocr5 \
 && apt-get update \
 && apt-get install -y --no-install-recommends \
        ghostscript \
        ocrmypdf \
        python3-pip \
        tesseract-ocr \
        unpaper \
 && rm -rf /var/lib/apt/lists/* \
 && pip install --no-cache-dir --upgrade pip \
 && pip install --no-cache-dir --upgrade ocrmypdf \
 && pip install --no-cache-dir --upgrade \
        cryptography==39.0.1 \
        pillow==10.0.1 \
        pyjwt==2.4.0 \
        reportlab==3.6.13 \
        setuptools==65.5.1 \
        wheel==0.38.1
|
|
|
|
|
|
|
|
|
2023-12-11 10:57:28 +01:00
|
|
|
# Computer vision (headless OpenCV build) and HTML rendering (WeasyPrint).
# NOTE(review): both packages are unpinned, so the installed versions depend on
# when the image is built.
RUN pip install --no-cache-dir \
        opencv-python-headless \
        WeasyPrint
|
2023-12-11 00:06:35 +01:00
|
|
|
|
|
|
|
|
|
|
|
# Relocate the tesseract data directory to /usr/share/tesseract-ocr-original,
# leaving /usr/share/tesseract-ocr absent from the image.
# A single `mv` replaces the mkdir / `cp -r dir/*` / `rm -rf` sequence: it also
# carries over dotfiles (which the `*` glob skipped) and preserves timestamps.
# As with the original mkdir, the step fails if the target directory is
# already populated.
# NOTE(review): presumably the original path is vacated so it can be provided
# at runtime (e.g. by a volume) - confirm against the entrypoint.
RUN mv /usr/share/tesseract-ocr /usr/share/tesseract-ocr-original
|
|
|
|
|
2023-12-10 23:02:30 +01:00
|
|
|
|
2023-12-10 15:09:28 +01:00
|
|
|
|
|
|
|
|