From 56afd35c823c863f7c936d037017b6d8c705d354 Mon Sep 17 00:00:00 2001 From: Peter Dave Hello <hsu@peterdavehello.org> Date: Thu, 4 Jan 2024 20:14:07 +0800 Subject: [PATCH 1/3] Refactor DockerfileBase to clean up apt cache after package installation Changes include: - Cleaning up the apt cache by adding `rm -rf /var/lib/apt/lists/*` after each package installation within the same RUN statement. - Ensuring the Docker image size is minimized by removing unnecessary files immediately after use. These adjustments will result in a more space-efficient Docker image. --- DockerfileBase | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/DockerfileBase b/DockerfileBase index c913635f..4e41d3e8 100644 --- a/DockerfileBase +++ b/DockerfileBase @@ -6,7 +6,8 @@ FROM ubuntu:latest AS base # JDK for app RUN apt-get update && \ apt-get install -y --no-install-recommends \ - openjdk-17-jre + openjdk-17-jre && \ + rm -rf /var/lib/apt/lists/* # Doc conversion RUN apt-get update && \ @@ -18,7 +19,8 @@ RUN apt-get update && \ libreoffice-impress \ python3-uno \ curl \ - unoconv + unoconv && \ + rm -rf /var/lib/apt/lists/* # OCR MY PDF (unpaper for descew and other advanced featues) @@ -30,6 +32,7 @@ apt-get update && \ python3-pip \ ocrmypdf \ unpaper && \ + rm -rf /var/lib/apt/lists/* && \ pip install --upgrade pip && \ pip install --no-cache-dir --upgrade ocrmypdf && \ pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1 @@ -40,8 +43,7 @@ RUN pip install --no-cache-dir opencv-python-headless WeasyPrint # cleanup and etc -RUN rm -rf /var/lib/apt/lists/* && \ - mkdir /usr/share/tesseract-ocr-original && \ +RUN mkdir /usr/share/tesseract-ocr-original && \ cp -r /usr/share/tesseract-ocr/* /usr/share/tesseract-ocr-original && \ rm -rf /usr/share/tesseract-ocr From 846ebe6dda3b5b907231f214ce96e74d300e6dd2 Mon Sep 17 00:00:00 2001 From: Peter Dave Hello <hsu@peterdavehello.org> Date: Thu, 4 Jan 2024 20:30:55 +0800 Subject: [PATCH 2/3] Refine Tesseract-OCR file backup process in DockerfileBase --- DockerfileBase | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/DockerfileBase b/DockerfileBase index 4e41d3e8..ffc8a89e 100644 --- a/DockerfileBase +++ b/DockerfileBase @@ -33,6 +33,7 @@ apt-get update && \ ocrmypdf \ unpaper && \ rm -rf /var/lib/apt/lists/* && \ + mv /usr/share/tesseract-ocr /usr/share/tesseract-ocr-original && \ pip install --upgrade pip && \ pip install --no-cache-dir --upgrade ocrmypdf && \ pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1 @@ -40,13 +41,3 @@ apt-get update && \ #CV and HTML RUN pip install --no-cache-dir opencv-python-headless WeasyPrint - - -# cleanup and etc -RUN mkdir /usr/share/tesseract-ocr-original && \ - cp -r /usr/share/tesseract-ocr/* /usr/share/tesseract-ocr-original && \ - rm -rf /usr/share/tesseract-ocr - - - - \ No newline at end of file From ecb62e0c94bc6711ed6afdb71ccbdd575081988e Mon Sep 17 00:00:00 2001 From: Peter Dave Hello <hsu@peterdavehello.org> Date: Thu, 4 Jan 2024 20:56:24 +0800 Subject: [PATCH 3/3] Apply `--no-cache-dir` to pip upgrade in DockerfileBase Aligned pip upgrade command with others by adding the `--no-cache-dir` flag to reduce image layer size. --- DockerfileBase | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DockerfileBase b/DockerfileBase index ffc8a89e..b37050c7 100644 --- a/DockerfileBase +++ b/DockerfileBase @@ -34,7 +34,7 @@ apt-get update && \ unpaper && \ rm -rf /var/lib/apt/lists/* && \ mv /usr/share/tesseract-ocr /usr/share/tesseract-ocr-original && \ - pip install --upgrade pip && \ + pip install --no-cache-dir --upgrade pip && \ pip install --no-cache-dir --upgrade ocrmypdf && \ pip install --no-cache-dir --upgrade pillow==10.0.1 reportlab==3.6.13 wheel==0.38.1 setuptools==65.5.1 pyjwt==2.4.0 cryptography==39.0.1