feat: support config chunk length by env (#12925)

Joel authored on 2025-01-22 10:43:40 +08:00, committed by GitHub
parent e23f4b0265
commit e09f6e4987
7 changed files with 17 additions and 7 deletions
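
In short, this change lets operators cap the indexing chunk length through an environment variable instead of the previously hard-coded 4000: docker-compose passes INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH into the web container, entrypoint.sh re-exports it as NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH, the locale layout writes it onto <body> as a data attribute, and the client parses it back with a 4000 fallback. A condensed, illustrative sketch of the client-side read follows (not the actual source files; the variable name is for illustration only):

// The locale layout (a server component) exposes the value to the browser roughly like this:
//   <body data-public-indexing-max-segmentation-tokens-length={process.env.NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}>
// Client components then read it back; document is undefined during SSR, hence the optional chaining.
const maxSegmentationTokens = parseInt(
  globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000',
  10,
)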

View File

@@ -57,6 +57,7 @@ services:
       TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000}
       CSP_WHITELIST: ${CSP_WHITELIST:-}
       TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-}
+      INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-}
   # The postgres database.
   db:

View File

@@ -448,6 +448,7 @@ services:
       TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000}
       CSP_WHITELIST: ${CSP_WHITELIST:-}
       TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-}
+      INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-}
   # The postgres database.
   db:

View File

@@ -28,3 +28,6 @@ NEXT_PUBLIC_CSP_WHITELIST=
 # The maximum number of top-k value for RAG.
 NEXT_PUBLIC_TOP_K_MAX_VALUE=10
+# The maximum number of tokens for segmentation
+NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000

View File

@@ -98,6 +98,7 @@ export enum IndexingType {
 const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
 const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
 const DEFAULT_OVERLAP = 50
+const MAXIMUM_CHUNK_TOKEN_LENGTH = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)

 type ParentChildConfig = {
   chunkForContext: ParentMode
@@ -163,7 +164,7 @@ const StepTwo = ({
     doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
   }, [])
   const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
-  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
+  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
   const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
   const [rules, setRules] = useState<PreProcessingRule[]>([])
   const [defaultConfig, setDefaultConfig] = useState<Rules>()
@@ -342,8 +343,8 @@ const StepTwo = ({
   }

   const updatePreview = () => {
-    if (segmentationType === ProcessMode.general && maxChunkLength > 4000) {
-      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
+    if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
+      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) })
       return
     }
     fetchEstimate()
@@ -393,7 +394,7 @@ const StepTwo = ({
       score_threshold_enabled: false,
       score_threshold: 0.5,
     })
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [rerankDefaultModel, isRerankDefaultModelValid])

   const getCreationParams = () => {
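
Note that the error toast above now passes { limit: MAXIMUM_CHUNK_TOKEN_LENGTH } into the translation call, which assumes the datasetCreation.stepTwo.maxLengthCheck copy carries an interpolation placeholder. A hypothetical en locale entry (the actual wording is not part of this diff) might look like:

// Hypothetical i18n entry with an i18next-style placeholder; the real copy may differ.
const stepTwo = {
  maxLengthCheck: 'Custom maximum chunk length should be less than {{limit}}',
}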

View File

@@ -39,6 +39,8 @@ export const DelimiterInput: FC<InputProps & { tooltip?: string }> = (props) =>
 }

 export const MaxLengthInput: FC<InputNumberProps> = (props) => {
+  const maxValue = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
   const { t } = useTranslation()
   return <FormField label={<div className='system-sm-semibold mb-1'>
     {t('datasetCreation.stepTwo.maxLength')}
@@ -46,8 +48,8 @@ export const MaxLengthInput: FC<InputNumberProps> = (props) => {
     <InputNumber
       type="number"
       className='h-9'
-      placeholder={'≤ 4000'}
-      max={4000}
+      placeholder={`${maxValue}`}
+      max={maxValue}
       min={1}
       {...props}
     />
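
The same getAttribute/parseInt line now appears both in the step-two component (MAXIMUM_CHUNK_TOKEN_LENGTH) and in MaxLengthInput (maxValue). If more call sites appear, a small shared helper could keep the attribute name and fallback in one place; a possible sketch, not part of this PR:

// Hypothetical shared helper; this function does not exist in the codebase.
export function getIndexingMaxSegmentationTokens(fallback = 4000): number {
  const raw = globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length')
  const value = parseInt(raw ?? String(fallback), 10)
  return Number.isNaN(value) ? fallback : value
}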

View File

@@ -45,6 +45,7 @@ const LocaleLayout = ({
         data-public-site-about={process.env.NEXT_PUBLIC_SITE_ABOUT}
         data-public-text-generation-timeout-ms={process.env.NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS}
         data-public-top-k-max-value={process.env.NEXT_PUBLIC_TOP_K_MAX_VALUE}
+        data-public-indexing-max-segmentation-tokens-length={process.env.NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
       >
         <BrowserInitor>
           <SentryInitor>

View File

@@ -24,5 +24,6 @@ export NEXT_TELEMETRY_DISABLED=${NEXT_TELEMETRY_DISABLED}
 export NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS=${TEXT_GENERATION_TIMEOUT_MS}
 export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST}
 export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE}
+export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}

 pm2 start ./pm2.json --no-daemon