diff --git a/Dockerfile b/Dockerfile index 6e0a1a831..a6c69800f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -119,6 +119,9 @@ COPY nltk_data /root/nltk_data COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./ ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard.jar" +# Copy cl100k_base +COPY 9b5ad71b2ce5302211f9c61530b329a4922fc6a4 ./ + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist diff --git a/Dockerfile.slim b/Dockerfile.slim index 6b093db87..c3059cb23 100644 --- a/Dockerfile.slim +++ b/Dockerfile.slim @@ -112,6 +112,9 @@ COPY nltk_data /root/nltk_data COPY tika-server-standard-3.0.0.jar tika-server-standard-3.0.0.jar.md5 ./ ENV TIKA_SERVER_JAR="file:///ragflow/tika-server-standard.jar" +# Copy cl100k_base +COPY 9b5ad71b2ce5302211f9c61530b329a4922fc6a4 ./ + # Copy compiled web pages COPY --from=builder /ragflow/web/dist /ragflow/web/dist diff --git a/download_deps.py b/download_deps.py index 58399293d..ac454a046 100644 --- a/download_deps.py +++ b/download_deps.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +from os import rename from huggingface_hub import snapshot_download import nltk @@ -9,6 +10,7 @@ urls = [ "http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb", "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar", "https://repo1.maven.org/maven2/org/apache/tika/tika-server-standard/3.0.0/tika-server-standard-3.0.0.jar.md5", + "https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken", ] repos = [ @@ -41,3 +43,5 @@ if __name__ == "__main__": for repo_id in repos: print(f"Downloading huggingface repo {repo_id}...") download_model(repo_id) + + rename("cl100k_base.tiktoken", "9b5ad71b2ce5302211f9c61530b329a4922fc6a4") diff --git a/rag/utils/__init__.py b/rag/utils/__init__.py index 7cd56e9f8..d75fb69c3 100644 --- a/rag/utils/__init__.py +++ b/rag/utils/__init__.py @@ -17,7 +17,7 @@ import os import re import tiktoken - +from api.utils.file_utils import get_project_base_directory def singleton(cls, *args, **kw): instances = {} @@ -71,9 +71,10 @@ def findMaxTm(fnm): pass return m - -encoder = tiktoken.encoding_for_model("gpt-3.5-turbo") - +tiktoken_cache_dir = get_project_base_directory() +os.environ["TIKTOKEN_CACHE_DIR"] = tiktoken_cache_dir +# encoder = tiktoken.encoding_for_model("gpt-3.5-turbo") +encoder = tiktoken.get_encoding("cl100k_base") def num_tokens_from_string(string: str) -> int: """Returns the number of tokens in a text string."""