From e09f6e49879bbab8c01a90870629820757568d95 Mon Sep 17 00:00:00 2001 From: Joel Date: Wed, 22 Jan 2025 10:43:40 +0800 Subject: [PATCH] feat: support config chunk length by env (#12925) --- docker/docker-compose-template.yaml | 3 ++- docker/docker-compose.yaml | 1 + web/.env.example | 3 +++ web/app/components/datasets/create/step-two/index.tsx | 9 +++++---- web/app/components/datasets/create/step-two/inputs.tsx | 6 ++++-- web/app/layout.tsx | 1 + web/docker/entrypoint.sh | 1 + 7 files changed, 17 insertions(+), 7 deletions(-) diff --git a/docker/docker-compose-template.yaml b/docker/docker-compose-template.yaml index e2daead92e..c2984fe868 100644 --- a/docker/docker-compose-template.yaml +++ b/docker/docker-compose-template.yaml @@ -1,4 +1,4 @@ -x-shared-env: &shared-api-worker-env +x-shared-env: &shared-api-worker-env services: # API service api: @@ -57,6 +57,7 @@ services: TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000} CSP_WHITELIST: ${CSP_WHITELIST:-} TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-} + INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-} # The postgres database. db: diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index f60fcdbcfc..21e72a4cd6 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -448,6 +448,7 @@ services: TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000} CSP_WHITELIST: ${CSP_WHITELIST:-} TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-} + INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-} # The postgres database. db: diff --git a/web/.env.example b/web/.env.example index 2decef02fa..e2117ddfd8 100644 --- a/web/.env.example +++ b/web/.env.example @@ -28,3 +28,6 @@ NEXT_PUBLIC_CSP_WHITELIST= # The maximum number of top-k value for RAG. NEXT_PUBLIC_TOP_K_MAX_VALUE=10 + +# The maximum number of tokens for segmentation +NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000 diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index d6fa45b4fb..ec9b3a5b42 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -98,6 +98,7 @@ export enum IndexingType { const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n' const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500 const DEFAULT_OVERLAP = 50 +const MAXIMUM_CHUNK_TOKEN_LENGTH = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10) type ParentChildConfig = { chunkForContext: ParentMode @@ -163,7 +164,7 @@ const StepTwo = ({ doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER)) }, []) const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length - const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000) + const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH) const [overlap, setOverlap] = useState(DEFAULT_OVERLAP) const [rules, setRules] = useState([]) const [defaultConfig, setDefaultConfig] = useState() @@ -342,8 +343,8 @@ const StepTwo = ({ } const updatePreview = () => { - if (segmentationType === ProcessMode.general && maxChunkLength > 4000) { - Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') }) + if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) { + Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) }) return } fetchEstimate() @@ -393,7 +394,7 @@ const StepTwo = ({ score_threshold_enabled: false, score_threshold: 0.5, }) - // eslint-disable-next-line react-hooks/exhaustive-deps + // eslint-disable-next-line react-hooks/exhaustive-deps }, [rerankDefaultModel, isRerankDefaultModelValid]) const getCreationParams = () => { diff --git a/web/app/components/datasets/create/step-two/inputs.tsx b/web/app/components/datasets/create/step-two/inputs.tsx index 4231f6242d..acd4e3ae43 100644 --- a/web/app/components/datasets/create/step-two/inputs.tsx +++ b/web/app/components/datasets/create/step-two/inputs.tsx @@ -39,6 +39,8 @@ export const DelimiterInput: FC = (props) => } export const MaxLengthInput: FC = (props) => { + const maxValue = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10) + const { t } = useTranslation() return {t('datasetCreation.stepTwo.maxLength')} @@ -46,8 +48,8 @@ export const MaxLengthInput: FC = (props) => { diff --git a/web/app/layout.tsx b/web/app/layout.tsx index b52c904561..da659e6467 100644 --- a/web/app/layout.tsx +++ b/web/app/layout.tsx @@ -45,6 +45,7 @@ const LocaleLayout = ({ data-public-site-about={process.env.NEXT_PUBLIC_SITE_ABOUT} data-public-text-generation-timeout-ms={process.env.NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS} data-public-top-k-max-value={process.env.NEXT_PUBLIC_TOP_K_MAX_VALUE} + data-public-indexing-max-segmentation-tokens-length={process.env.NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH} > diff --git a/web/docker/entrypoint.sh b/web/docker/entrypoint.sh index 8c6de0eb79..bad95b6cbe 100755 --- a/web/docker/entrypoint.sh +++ b/web/docker/entrypoint.sh @@ -24,5 +24,6 @@ export NEXT_TELEMETRY_DISABLED=${NEXT_TELEMETRY_DISABLED} export NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS=${TEXT_GENERATION_TIMEOUT_MS} export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST} export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE} +export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH} pm2 start ./pm2.json --no-daemon