mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 18:49:02 +08:00
Allow configuring the maximum segmentation token length for RAG documents via an environment variable (#4375)
This commit is contained in:
parent
b5204111da
commit
c255a20d7c
@ -176,3 +176,6 @@ HTTP_REQUEST_NODE_MAX_TEXT_SIZE=1048576 # 1MB
|
|||||||
|
|
||||||
# Log file path
|
# Log file path
|
||||||
LOG_FILE=
|
LOG_FILE=
|
||||||
|
|
||||||
|
# Indexing configuration
|
||||||
|
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=1000
|
||||||
|
@ -79,6 +79,7 @@ DEFAULTS = {
|
|||||||
'KEYWORD_DATA_SOURCE_TYPE': 'database',
|
'KEYWORD_DATA_SOURCE_TYPE': 'database',
|
||||||
'INNER_API': 'False',
|
'INNER_API': 'False',
|
||||||
'ENTERPRISE_ENABLED': 'False',
|
'ENTERPRISE_ENABLED': 'False',
|
||||||
|
'INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH': 1000,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -379,3 +380,8 @@ class Config:
|
|||||||
|
|
||||||
self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE')
|
self.KEYWORD_DATA_SOURCE_TYPE = get_env('KEYWORD_DATA_SOURCE_TYPE')
|
||||||
self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED')
|
self.ENTERPRISE_ENABLED = get_bool_env('ENTERPRISE_ENABLED')
|
||||||
|
|
||||||
|
# ------------------------
|
||||||
|
# Indexing Configurations.
|
||||||
|
# ------------------------
|
||||||
|
self.INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH = get_env('INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH')
|
||||||
|
@ -411,8 +411,9 @@ class IndexingRunner:
|
|||||||
# The user-defined segmentation rule
|
# The user-defined segmentation rule
|
||||||
rules = json.loads(processing_rule.rules)
|
rules = json.loads(processing_rule.rules)
|
||||||
segmentation = rules["segmentation"]
|
segmentation = rules["segmentation"]
|
||||||
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000:
|
max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH'])
|
||||||
raise ValueError("Custom segment length should be between 50 and 1000.")
|
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length:
|
||||||
|
raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.")
|
||||||
|
|
||||||
separator = segmentation["separator"]
|
separator = segmentation["separator"]
|
||||||
if separator:
|
if separator:
|
||||||
|
@ -2,6 +2,8 @@
|
|||||||
from abc import ABC, abstractmethod
|
from abc import ABC, abstractmethod
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from flask import current_app
|
||||||
|
|
||||||
from core.model_manager import ModelInstance
|
from core.model_manager import ModelInstance
|
||||||
from core.rag.extractor.entity.extract_setting import ExtractSetting
|
from core.rag.extractor.entity.extract_setting import ExtractSetting
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -43,8 +45,9 @@ class BaseIndexProcessor(ABC):
|
|||||||
# The user-defined segmentation rule
|
# The user-defined segmentation rule
|
||||||
rules = processing_rule['rules']
|
rules = processing_rule['rules']
|
||||||
segmentation = rules["segmentation"]
|
segmentation = rules["segmentation"]
|
||||||
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > 1000:
|
max_segmentation_tokens_length = int(current_app.config['INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH'])
|
||||||
raise ValueError("Custom segment length should be between 50 and 1000.")
|
if segmentation["max_tokens"] < 50 or segmentation["max_tokens"] > max_segmentation_tokens_length:
|
||||||
|
raise ValueError(f"Custom segment length should be between 50 and {max_segmentation_tokens_length}.")
|
||||||
|
|
||||||
separator = segmentation["separator"]
|
separator = segmentation["separator"]
|
||||||
if separator:
|
if separator:
|
||||||
|
@ -167,6 +167,8 @@ services:
|
|||||||
# SSRF Proxy server
|
# SSRF Proxy server
|
||||||
SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128'
|
SSRF_PROXY_HTTP_URL: 'http://ssrf_proxy:3128'
|
||||||
SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128'
|
SSRF_PROXY_HTTPS_URL: 'http://ssrf_proxy:3128'
|
||||||
|
# Indexing configuration
|
||||||
|
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000
|
||||||
depends_on:
|
depends_on:
|
||||||
- db
|
- db
|
||||||
- redis
|
- redis
|
||||||
@ -287,6 +289,8 @@ services:
|
|||||||
NOTION_CLIENT_SECRET: you-client-secret
|
NOTION_CLIENT_SECRET: you-client-secret
|
||||||
NOTION_CLIENT_ID: you-client-id
|
NOTION_CLIENT_ID: you-client-id
|
||||||
NOTION_INTERNAL_SECRET: you-internal-secret
|
NOTION_INTERNAL_SECRET: you-internal-secret
|
||||||
|
# Indexing configuration
|
||||||
|
INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: 1000
|
||||||
depends_on:
|
depends_on:
|
||||||
- db
|
- db
|
||||||
- redis
|
- redis
|
||||||
|
Loading…
x
Reference in New Issue
Block a user