From c06e765a5b1ce410e2d2bf508c4946f403a9983e Mon Sep 17 00:00:00 2001 From: Zhichang Yu Date: Sun, 3 Nov 2024 00:20:26 +0800 Subject: [PATCH] Added tika jar into image to avoid downloading (#3167) ### What problem does this PR solve? Added tika jar into image to avoid downloading. Close #3017 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- .github/workflows/tests.yml | 2 +- Dockerfile | 5 +++++ Dockerfile.slim | 5 +++++ download_deps.py | 2 ++ 4 files changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 71538deff..b155f8fe0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -48,7 +48,7 @@ jobs: - name: Build ragflow:dev-slim run: | RUNNER_WORKSPACE_PREFIX=${RUNNER_WORKSPACE_PREFIX:-$HOME} - cp -r ${RUNNER_WORKSPACE_PREFIX}/huggingface.co ${RUNNER_WORKSPACE_PREFIX}/nltk_data ${RUNNER_WORKSPACE_PREFIX}/libssl*.deb . + cp -r ${RUNNER_WORKSPACE_PREFIX}/huggingface.co ${RUNNER_WORKSPACE_PREFIX}/nltk_data ${RUNNER_WORKSPACE_PREFIX}/libssl*.deb ${RUNNER_WORKSPACE_PREFIX}/tika-server*.jar* . sudo docker pull ubuntu:24.04 sudo docker build -f Dockerfile.slim -t infiniflow/ragflow:dev-slim . diff --git a/Dockerfile b/Dockerfile index 682336f58..b700cca34 100644 --- a/Dockerfile +++ b/Dockerfile @@ -104,6 +104,11 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \ # Copy nltk data downloaded via download_deps.py COPY nltk_data /root/nltk_data +# https://github.com/chrismattmann/tika-python +# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. 
+COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./ +ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard-3.0.0.jar" + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist diff --git a/Dockerfile.slim b/Dockerfile.slim index e2e35f326..f61c68b5f 100644 --- a/Dockerfile.slim +++ b/Dockerfile.slim @@ -97,6 +97,11 @@ RUN --mount=type=bind,source=huggingface.co,target=/huggingface.co \ # Copy nltk data downloaded via download_deps.py COPY nltk_data /root/nltk_data +# https://github.com/chrismattmann/tika-python +# This is the only way to run python-tika without internet access. Without this set, the default is to check the tika version and pull latest every time from Apache. +COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./ +ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard-3.0.0.jar" + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist diff --git a/download_deps.py b/download_deps.py index ab8fafb2f..58399293d 100644 --- a/download_deps.py +++ b/download_deps.py @@ -7,6 +7,8 @@ import urllib.request urls = [ "http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb", + "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar", + "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5", ] repos = [