Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-12 10:48:59 +08:00)

feat: support config chunk length by env (#12925)

parent e23f4b0265
commit e09f6e4987
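This change makes the maximum chunk length used during dataset indexing configurable instead of hard-coded at 4000: the INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH environment variable is forwarded by Docker Compose, re-exported as NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH by the web image's entrypoint, exposed on <body> as data-public-indexing-max-segmentation-tokens-length by LocaleLayout, and finally parsed in the browser. A minimal TypeScript sketch of that last step, which the diff inlines in the StepTwo module and in MaxLengthInput (the wrapper function is illustrative, not part of the commit):

// Sketch of the lookup this commit adds; the helper name is illustrative only.
const readIndexingMaxSegmentationTokensLength = (): number => {
  const raw = globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length')
  // `raw` is undefined without a DOM (SSR) and null or '' when the attribute is missing
  // or the variable was left empty, so all of those cases fall back to 4000.
  return parseInt(raw || '4000', 10)
}

console.log(readIndexingMaxSegmentationTokensLength()) // 4000 unless a value is configured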
@@ -1,4 +1,4 @@
 x-shared-env: &shared-api-worker-env
 services:
   # API service
   api:
@@ -57,6 +57,7 @@ services:
       TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000}
       CSP_WHITELIST: ${CSP_WHITELIST:-}
       TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-}
+      INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-}

   # The postgres database.
   db:
@@ -448,6 +448,7 @@ services:
       TEXT_GENERATION_TIMEOUT_MS: ${TEXT_GENERATION_TIMEOUT_MS:-60000}
       CSP_WHITELIST: ${CSP_WHITELIST:-}
       TOP_K_MAX_VALUE: ${TOP_K_MAX_VALUE:-}
+      INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH: ${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH:-}

   # The postgres database.
   db:
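Both Compose hunks add the variable to the same service environment (alongside TEXT_GENERATION_TIMEOUT_MS, CSP_WHITELIST and TOP_K_MAX_VALUE) with an empty default, so existing deployments keep the previous 4000 limit until INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH is set in the environment Compose reads from. Nothing in the diff validates the value; a non-numeric setting would reach the client-side parseInt and come out as NaN. A deployment-side guard is not part of the commit, but a sketch of one could look like this (hypothetical helper):

// Hypothetical guard, not part of this commit: clamp a configured limit to a usable number.
const clampConfiguredLimit = (raw: string | undefined, fallback = 4000): number => {
  const parsed = Number.parseInt(raw ?? '', 10)
  return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback
}

console.log(clampConfiguredLimit('8000'))   // 8000
console.log(clampConfiguredLimit(''))       // 4000 (variable forwarded but left empty)
console.log(clampConfiguredLimit('banana')) // 4000 (non-numeric input falls back)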
@@ -28,3 +28,6 @@ NEXT_PUBLIC_CSP_WHITELIST=

 # The maximum number of top-k value for RAG.
 NEXT_PUBLIC_TOP_K_MAX_VALUE=10
+
+# The maximum number of tokens for segmentation
+NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=4000
@@ -98,6 +98,7 @@ export enum IndexingType {
 const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
 const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
 const DEFAULT_OVERLAP = 50
+const MAXIMUM_CHUNK_TOKEN_LENGTH = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)

 type ParentChildConfig = {
   chunkForContext: ParentMode
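MAXIMUM_CHUNK_TOKEN_LENGTH is a module-level constant, so it is evaluated once when the StepTwo module loads; in the browser the server-rendered <body> attribute is already present at that point, and during server-side rendering (no document) the 4000 fallback applies. A small illustrative snippet of the read-once behaviour, assuming a browser context:

// Illustrative only: the value is captured at module-evaluation time, so later changes
// to the attribute (which should not happen in practice) are not picked up.
const ATTR = 'data-public-indexing-max-segmentation-tokens-length'
document.body.setAttribute(ATTR, '6000')
const limitAtImportTime = parseInt(document.body.getAttribute(ATTR) || '4000', 10)
document.body.setAttribute(ATTR, '9000')
console.log(limitAtImportTime) // still 6000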
@@ -163,7 +164,7 @@ const StepTwo = ({
     doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
   }, [])
   const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
-  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
+  const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(MAXIMUM_CHUNK_TOKEN_LENGTH)
   const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
   const [rules, setRules] = useState<PreProcessingRule[]>([])
   const [defaultConfig, setDefaultConfig] = useState<Rules>()
@@ -342,8 +343,8 @@ const StepTwo = ({
   }

   const updatePreview = () => {
-    if (segmentationType === ProcessMode.general && maxChunkLength > 4000) {
-      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
+    if (segmentationType === ProcessMode.general && maxChunkLength > MAXIMUM_CHUNK_TOKEN_LENGTH) {
+      Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: MAXIMUM_CHUNK_TOKEN_LENGTH }) })
       return
     }
     fetchEstimate()
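Passing { limit: MAXIMUM_CHUNK_TOKEN_LENGTH } to t() means the datasetCreation.stepTwo.maxLengthCheck message is now expected to interpolate the configured limit instead of spelling out 4000. For reference, this is how i18next interpolation behaves with an options object; the resource string below is a stand-in, not copied from the project's translation files:

// Stand-alone i18next sketch; the message text is illustrative.
import i18next from 'i18next'

await i18next.init({
  lng: 'en',
  resources: {
    en: {
      translation: {
        maxLengthCheck: 'Custom maximum chunk length should be no more than {{limit}} tokens.',
      },
    },
  },
})

console.log(i18next.t('maxLengthCheck', { limit: 4000 }))
// "Custom maximum chunk length should be no more than 4000 tokens."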
@@ -393,7 +394,7 @@ const StepTwo = ({
       score_threshold_enabled: false,
       score_threshold: 0.5,
     })
   // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [rerankDefaultModel, isRerankDefaultModelValid])

   const getCreationParams = () => {
@@ -39,6 +39,8 @@ export const DelimiterInput: FC<InputProps & { tooltip?: string }> = (props) =>
 }

 export const MaxLengthInput: FC<InputNumberProps> = (props) => {
+  const maxValue = parseInt(globalThis.document?.body?.getAttribute('data-public-indexing-max-segmentation-tokens-length') || '4000', 10)
+
   const { t } = useTranslation()
   return <FormField label={<div className='system-sm-semibold mb-1'>
     {t('datasetCreation.stepTwo.maxLength')}
@@ -46,8 +48,8 @@ export const MaxLengthInput: FC<InputNumberProps> = (props) => {
     <InputNumber
       type="number"
       className='h-9'
-      placeholder={'≤ 4000'}
-      max={4000}
+      placeholder={`≤ ${maxValue}`}
+      max={maxValue}
       min={1}
       {...props}
     />
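Unlike the module-level constant in StepTwo, MaxLengthInput re-reads the attribute on every render, though both reads resolve to the same configured value. Note that the new max prop does not by itself reject typed input: assuming the underlying InputNumber renders a native number input and forwards max to it, an over-limit value can still be entered and is only flagged as invalid, which is why the explicit updatePreview check above remains the effective guard. A quick DOM illustration, independent of the component:

// Illustrative only: `max` on a number input marks an oversized value invalid
// but does not prevent it from being set or typed.
const probe = document.createElement('input')
probe.type = 'number'
probe.max = '4000'
probe.value = '5000'
console.log(probe.value)                  // "5000"
console.log(probe.validity.rangeOverflow) // true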
@@ -45,6 +45,7 @@ const LocaleLayout = ({
         data-public-site-about={process.env.NEXT_PUBLIC_SITE_ABOUT}
         data-public-text-generation-timeout-ms={process.env.NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS}
         data-public-top-k-max-value={process.env.NEXT_PUBLIC_TOP_K_MAX_VALUE}
+        data-public-indexing-max-segmentation-tokens-length={process.env.NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}
       >
         <BrowserInitor>
           <SentryInitor>
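LocaleLayout is a server component: it reads process.env on the server and hands the value to the browser through a data attribute on <body>, the same pattern already used for the other data-public-* values, and the entrypoint change below is what maps the plain container variable onto the NEXT_PUBLIC_ name read here. A stripped-down sketch of the pattern (simplified, not the project's actual layout file):

// Simplified illustration of the server-to-client hand-off, not the real LocaleLayout.
import type { ReactNode } from 'react'

const RootLayout = ({ children }: { children: ReactNode }) => (
  <html lang="en">
    <body data-public-indexing-max-segmentation-tokens-length={process.env.NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}>
      {children}
    </body>
  </html>
)

export default RootLayout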
@@ -24,5 +24,6 @@ export NEXT_TELEMETRY_DISABLED=${NEXT_TELEMETRY_DISABLED}
 export NEXT_PUBLIC_TEXT_GENERATION_TIMEOUT_MS=${TEXT_GENERATION_TIMEOUT_MS}
 export NEXT_PUBLIC_CSP_WHITELIST=${CSP_WHITELIST}
 export NEXT_PUBLIC_TOP_K_MAX_VALUE=${TOP_K_MAX_VALUE}
+export NEXT_PUBLIC_INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH=${INDEXING_MAX_SEGMENTATION_TOKENS_LENGTH}

 pm2 start ./pm2.json --no-daemon