From 5f4d2dc4fef97184620a038037ac511c99f78ae2 Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Tue, 1 Oct 2024 17:41:38 +0800 Subject: [PATCH] Updated Dockerfile to use cache (#2703) ### What problem does this PR solve? Updated Dockerfile to use cache ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [ ] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [x] Other (please describe): CI --- .github/workflows/tests.yml | 2 +- Dockerfile | 35 ++++++++++++++--------- Dockerfile.slim | 35 ++++++++++++++--------- README.md | 4 +-- README_ja.md | 4 +-- README_ko.md | 4 +-- README_zh.md | 4 +-- docs/guides/develop/build_docker_image.md | 4 +-- download_deps.py | 9 ++++-- 9 files changed, 60 insertions(+), 41 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a556df6f5..822717b6c 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -48,7 +48,7 @@ jobs: - name: Build ragflow:dev-slim run: | RUNNER_WORKSPACE_PREFIX=${RUNNER_WORKSPACE_PREFIX:-$HOME} - cp -r ${RUNNER_WORKSPACE_PREFIX}/huggingface.co . + cp -r ${RUNNER_WORKSPACE_PREFIX}/huggingface.co ${RUNNER_WORKSPACE_PREFIX}/nltk_data . sudo docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . 
- name: Build ragflow:dev diff --git a/Dockerfile b/Dockerfile index c52ffa583..b3c1bfd46 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,13 +6,18 @@ ENV LIGHTEN=0 WORKDIR /ragflow -RUN apt update && apt --no-install-recommends install -y ca-certificates +RUN rm -f /etc/apt/apt.conf.d/docker-clean \ + && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache + +RUN --mount=type=cache,id=ragflow_base_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt-get --no-install-recommends install -y ca-certificates # if you located in China, you can use tsinghua mirror to speed up apt RUN sed -i 's|http://archive.ubuntu.com|https://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list.d/ubuntu.sources -RUN apt update && apt install -y curl libpython3-dev nginx libglib2.0-0 libglx-mesa0 pkg-config libicu-dev libgdiplus python3-poetry \ - && apt clean && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,id=ragflow_base_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt install -y curl libpython3-dev nginx libglib2.0-0 libglx-mesa0 pkg-config libicu-dev libgdiplus python3-poetry \ + && rm -rf /var/lib/apt/lists/* RUN curl -o libssl1.deb http://archive.ubuntu.com/ubuntu/pool/main/o/openssl1.0/libssl1.0.0_1.0.2n-1ubuntu5_amd64.deb && dpkg -i libssl1.deb && rm -f libssl1.deb @@ -22,7 +27,6 @@ ENV PYTHONDONTWRITEBYTECODE=1 DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 ENV POETRY_NO_INTERACTION=1 ENV POETRY_VIRTUALENVS_IN_PROJECT=true ENV POETRY_VIRTUALENVS_CREATE=true -ENV POETRY_KEYRING_ENABLED=false ENV POETRY_REQUESTS_TIMEOUT=15 # builder stage @@ -31,16 +35,18 @@ USER root WORKDIR /ragflow -RUN apt update && apt install -y nodejs npm cargo \ - && apt clean && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,id=ragflow_builder_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt install -y nodejs npm cargo && \ + rm -rf /var/lib/apt/lists/* COPY web web -RUN cd web && npm i --force && npm run build +RUN 
--mount=type=cache,id=ragflow_builder_npm,target=/root/.npm,sharing=locked \ + cd web && npm i --force && npm run build # install dependencies from poetry.lock file COPY pyproject.toml poetry.toml poetry.lock ./ -RUN --mount=type=cache,target=/root/.cache/pypoetry,sharing=locked \ +RUN --mount=type=cache,id=ragflow_builder_poetry,target=/root/.cache/pypoetry,sharing=locked \ if [ "$LIGHTEN" -eq 0 ]; then \ poetry install --sync --no-cache --no-root --with=full; \ else \ @@ -55,8 +61,9 @@ WORKDIR /ragflow # Install python packages' dependencies # cv2 requires libGL.so.1 -RUN apt update && apt install -y --no-install-recommends nginx libgl1 vim less \ - && apt clean && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,id=ragflow_production_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt install -y --no-install-recommends nginx libgl1 vim less && \ + rm -rf /var/lib/apt/lists/* COPY web web COPY api api @@ -82,16 +89,16 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \ /huggingface.co/maidalun1020/bce-reranker-base_v1 \ | tar -xf - --strip-components=2 -C /root/.ragflow +# Copy nltk data downloaded via download_deps.py +COPY nltk_data /root/nltk_data + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist # Copy Python environment and packages ENV VIRTUAL_ENV=/ragflow/.venv COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} -ENV PATH="${VIRTUAL_ENV}/bin:/root/.local/bin:${PATH}" - -# Download nltk data -RUN python3 -m nltk.downloader wordnet punkt punkt_tab +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PYTHONPATH=/ragflow/ diff --git a/Dockerfile.slim b/Dockerfile.slim index 6508f7dc8..8dba33382 100644 --- a/Dockerfile.slim +++ b/Dockerfile.slim @@ -6,13 +6,18 @@ ENV LIGHTEN=1 WORKDIR /ragflow -RUN apt update && apt --no-install-recommends install -y ca-certificates +RUN rm -f /etc/apt/apt.conf.d/docker-clean \ + && echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache 
+ +RUN --mount=type=cache,id=ragflow_base_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt-get --no-install-recommends install -y ca-certificates # if you located in China, you can use tsinghua mirror to speed up apt RUN sed -i 's|http://archive.ubuntu.com|https://mirrors.tuna.tsinghua.edu.cn|g' /etc/apt/sources.list.d/ubuntu.sources -RUN apt update && apt install -y curl libpython3-dev nginx libglib2.0-0 libglx-mesa0 pkg-config libicu-dev libgdiplus python3-poetry \ - && apt clean && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,id=ragflow_base_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt install -y curl libpython3-dev nginx libglib2.0-0 libglx-mesa0 pkg-config libicu-dev libgdiplus python3-poetry \ + && rm -rf /var/lib/apt/lists/* RUN curl -o libssl1.deb http://archive.ubuntu.com/ubuntu/pool/main/o/openssl1.0/libssl1.0.0_1.0.2n-1ubuntu5_amd64.deb && dpkg -i libssl1.deb && rm -f libssl1.deb @@ -22,7 +27,6 @@ ENV PYTHONDONTWRITEBYTECODE=1 DOTNET_SYSTEM_GLOBALIZATION_INVARIANT=1 ENV POETRY_NO_INTERACTION=1 ENV POETRY_VIRTUALENVS_IN_PROJECT=true ENV POETRY_VIRTUALENVS_CREATE=true -ENV POETRY_KEYRING_ENABLED=false ENV POETRY_REQUESTS_TIMEOUT=15 # builder stage @@ -31,16 +35,18 @@ USER root WORKDIR /ragflow -RUN apt update && apt install -y nodejs npm cargo \ - && apt clean && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,id=ragflow_builder_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt install -y nodejs npm cargo && \ + rm -rf /var/lib/apt/lists/* COPY web web -RUN cd web && npm i --force && npm run build +RUN --mount=type=cache,id=ragflow_builder_npm,target=/root/.npm,sharing=locked \ + cd web && npm i --force && npm run build # install dependencies from poetry.lock file COPY pyproject.toml poetry.toml poetry.lock ./ -RUN --mount=type=cache,target=/root/.cache/pypoetry,sharing=locked \ +RUN --mount=type=cache,id=ragflow_builder_poetry,target=/root/.cache/pypoetry,sharing=locked \ if [ "$LIGHTEN" -eq 0 ]; 
then \ poetry install --sync --no-cache --no-root --with=full; \ else \ @@ -55,8 +61,9 @@ WORKDIR /ragflow # Install python packages' dependencies # cv2 requires libGL.so.1 -RUN apt update && apt install -y --no-install-recommends nginx libgl1 vim less \ - && apt clean && rm -rf /var/lib/apt/lists/* +RUN --mount=type=cache,id=ragflow_production_apt,target=/var/cache/apt,sharing=locked \ + apt update && apt install -y --no-install-recommends nginx libgl1 vim less && \ + rm -rf /var/lib/apt/lists/* COPY web web COPY api api @@ -75,16 +82,16 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \ /huggingface.co/InfiniFlow/deepdoc \ | tar -xf - --strip-components=3 -C /ragflow/rag/res/deepdoc +# Copy nltk data downloaded via download_deps.py +COPY nltk_data /root/nltk_data + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist # Copy Python environment and packages ENV VIRTUAL_ENV=/ragflow/.venv COPY --from=builder ${VIRTUAL_ENV} ${VIRTUAL_ENV} -ENV PATH="${VIRTUAL_ENV}/bin:/root/.local/bin:${PATH}" - -# Download nltk data -RUN python3 -m nltk.downloader wordnet punkt punkt_tab +ENV PATH="${VIRTUAL_ENV}/bin:${PATH}" ENV PYTHONPATH=/ragflow/ diff --git a/README.md b/README.md index 72f165490..68aeed342 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ This image is approximately 1 GB in size and relies on external LLM and embeddin ```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . ``` @@ -232,7 +232,7 @@ This image is approximately 9 GB in size. As it includes embedding models, it re ```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile -t infiniflow/ragflow:dev . 
``` diff --git a/README_ja.md b/README_ja.md index 2d02e50d8..833c658af 100644 --- a/README_ja.md +++ b/README_ja.md @@ -202,7 +202,7 @@ ```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . ``` @@ -214,7 +214,7 @@ docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . ```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile -t infiniflow/ragflow:dev . ``` diff --git a/README_ko.md b/README_ko.md index b2593f36f..8fe0fe1bc 100644 --- a/README_ko.md +++ b/README_ko.md @@ -204,7 +204,7 @@ ```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . ``` @@ -216,7 +216,7 @@ docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . ```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile -t infiniflow/ragflow:dev . ``` diff --git a/README_zh.md b/README_zh.md index 853ab09c5..0284a5730 100644 --- a/README_zh.md +++ b/README_zh.md @@ -204,7 +204,7 @@ ```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . ``` @@ -216,7 +216,7 @@ docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . 
```bash git clone https://github.com/infiniflow/ragflow.git cd ragflow/ -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile -t infiniflow/ragflow:dev . ``` diff --git a/docs/guides/develop/build_docker_image.md b/docs/guides/develop/build_docker_image.md index 285ea0a04..648b2167c 100644 --- a/docs/guides/develop/build_docker_image.md +++ b/docs/guides/develop/build_docker_image.md @@ -64,7 +64,7 @@ This image is approximately 1 GB in size and relies on external LLM services, as On a `linux/amd64` host: ```bash -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile -t infiniflow/ragflow:dev-amd64 . docker push infiniflow/ragflow:dev-amd64 @@ -72,7 +72,7 @@ docker push infiniflow/ragflow:dev-amd64 On a `linux/arm64` host: ```bash -pip3 install huggingface-hub +pip3 install huggingface-hub nltk python3 download_deps.py docker build -f Dockerfile -t infiniflow/ragflow:dev-arm64 . docker push infiniflow/ragflow:dev-arm64 diff --git a/download_deps.py b/download_deps.py index 03f52c84c..6ff72f6ce 100644 --- a/download_deps.py +++ b/download_deps.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 from huggingface_hub import snapshot_download +import nltk import os repos = [ @@ -12,13 +13,17 @@ repos = [ "maidalun1020/bce-reranker-base_v1", ] - def download_model(repo_id): - local_dir = os.path.join("huggingface.co", repo_id) + local_dir = os.path.abspath(os.path.join("huggingface.co", repo_id)) os.makedirs(local_dir, exist_ok=True) snapshot_download(repo_id=repo_id, local_dir=local_dir) if __name__ == "__main__": + local_dir = os.path.abspath('nltk_data') + for data in ['wordnet', 'punkt', 'punkt_tab']: + print(f"Downloading nltk {data}...") + nltk.download(data, download_dir=local_dir) for repo_id in repos: + print(f"Downloading huggingface repo {repo_id}...") download_model(repo_id)