From c255a20d7ce5c38a2fcc74035b04f8bb292ee891 Mon Sep 17 00:00:00 2001 From: Rain Chen <71397+rainchen@users.noreply.github.com> Date: Mon, 20 May 2024 13:20:27 +0800 Subject: [PATCH] allow to config max segmentation tokens length for RAG document using environment variable (#4375) --- api/.env.example | 3 +++ api/config.py | 6 ++++++ api/core/indexing_runner.py | 5 +++-- api/core/rag/index_processor/index_processor_base.py | 7 +++++-- docker/docker-compose.yaml | 4 ++++ 5 files changed, 21 insertions(+), 4 deletions(-) diff --git a/api/.env.example b/api/.env.example index 05d4f4a530..cadb476150 100644 --- a/api/.env.example +++ b/api/.env.example @@ -176,3 +176,6 @@ HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576 # 1MB # Log file path LOG_FILE= + +# Indexing configuration +INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000 diff --git a/api/config.py b/api/config.py index 4d1f905440..ed2ba41d4a 100644 --- a/api/config.py +++ b/api/config.py @@ -79,6 +79,7 @@ DEFAULTS = { 'KEYWORD_DATA_SOURCE_TYPE': 'database', 'INNER_API': 'False', 'ENTERPRISE_ENABLED': 'False', + 'INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH': 1000, } @@ -379,3 +380,8 @@ class Config: self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE') self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED') + + # ------------------------ + # Indexing Configurations. + # ------------------------ + self.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH = get_env('INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH') diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index b812999b87..d3886b8e77 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -411,8 +411,9 @@ class IndexingRunner: # The user-defined segmentation rule rules = json.loads(processing_rule.rules) segmentation = rules["segmentation"] - if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000: - raise ValueError("Custom segment length should be between 50 and 1000.") + max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH']) + if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length: + raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.") separator = segmentation["separator"] if separator: diff --git a/api/core/rag/index_processor/index_processor_base.py b/api/core/rag/index_processor/index_processor_base.py index 509a1a189b..c51634fee1 100644 --- a/api/core/rag/index_processor/index_processor_base.py +++ b/api/core/rag/index_processor/index_processor_base.py @@ -2,6 +2,8 @@ from abc import ABC, abstractmethod from typing import Optional +from flask import current_app + from core.model_manager import ModelInstance from core.rag.extractor.entity.extract_setting import ExtractSetting from core.rag.models.document import Document @@ -43,8 +45,9 @@ class BaseIndexProcessor(ABC): # The user-defined segmentation rule rules = processing_rule['rules'] segmentation = rules["segmentation"] - if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000: - raise ValueError("Custom segment length should be between 50 and 1000.") + max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH']) + if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length: + raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.") separator = segmentation["separator"] if separator: diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 8e7c94469d..d130527682 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -167,6 +167,8 @@ services: # SSRF Proxy server SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128' SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128' + # Indexing configuration + INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000 depends_on: - db - redis @@ -287,6 +289,8 @@ services: NOTION_CLIENT_SECRET: you-client-secret NOTION_CLIENT_ID: you-client-id NOTION_INTERNAL_SECRET: you-internal-secret + # Indexing configuration + INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000 depends_on: - db - redis