# Main stage
# The base image is pinned to an explicit release instead of the floating
# `latest` tag so that rebuilds are reproducible. Newer Ubuntu releases mark
# the distro Python as externally managed (PEP 668), which makes system-wide
# `pip install` calls fail unless extra flags are passed.
# NOTE(review): 22.04 is assumed to be the release this file was written
# against (it ships openjdk-17-jre and unoconv) - confirm before merging.
FROM ubuntu:22.04 AS base



# Java 17 runtime (JRE) used by the app.
# The apt package index is refreshed and removed again inside the same layer
# so that no stale or bulky index data is kept in the image.
RUN apt-get update \
 && apt-get install -y --no-install-recommends openjdk-17-jre \
 && rm -rf /var/lib/apt/lists/*
    	
# Document conversion: LibreOffice components, the Python UNO bridge and
# unoconv, plus curl. Packages are listed alphabetically, one per line, to
# keep diffs small. The apt index is cleaned up in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        curl \
        libreoffice-calc \
        libreoffice-common \
        libreoffice-core \
        libreoffice-impress \
        libreoffice-writer \
        python3-uno \
        unoconv \
 && rm -rf /var/lib/apt/lists/*
		
		
# OCRmyPDF (unpaper for deskew and other advanced features)
#
# Steps, all in one layer so the apt index cleanup actually shrinks the image:
#   1. Install the tooling needed to register a PPA.
#   2. Add the tesseract-ocr5 PPA (-y: never wait for interactive confirmation)
#      and refresh the package index so the PPA's packages are visible.
#   3. Install Tesseract, Ghostscript, pip, OCRmyPDF and unpaper with apt-get
#      (plain `apt` has no stable CLI and is not meant for scripts).
#   4. Move the packaged /usr/share/tesseract-ocr directory aside to
#      /usr/share/tesseract-ocr-original.
#      NOTE(review): presumably so the original path can be supplied at
#      runtime (e.g. a mounted volume) - confirm against the deployment setup.
#   5. Upgrade pip and OCRmyPDF from PyPI, then force the pinned versions of
#      the listed Python packages.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        gnupg2 \
        software-properties-common && \
    add-apt-repository -y ppa:alex-p/tesseract-ocr5 && \
    apt-get update && \
    apt-get install -y --no-install-recommends \
        ghostscript \
        ocrmypdf \
        python3-pip \
        tesseract-ocr \
        unpaper && \
    rm -rf /var/lib/apt/lists/* && \
    mv /usr/share/tesseract-ocr /usr/share/tesseract-ocr-original && \
    pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir --upgrade ocrmypdf && \
    pip install --no-cache-dir --upgrade \
        pillow==10.0.1 \
        reportlab==3.6.13 \
        wheel==0.38.1 \
        setuptools==65.5.1 \
        pyjwt==2.4.0 \
        cryptography==39.0.1
    
    
# CV and HTML: headless OpenCV build and WeasyPrint, installed from PyPI
# without keeping pip's download cache in the layer.
# NOTE(review): both packages are unpinned, so rebuilds may pick up different
# versions - consider pinning once the intended versions are known.
RUN pip install --no-cache-dir \
        opencv-python-headless \
        WeasyPrint