From 89fcf4ea7c2e31f232c0375b62a0c98125a614e8 Mon Sep 17 00:00:00 2001 From: KVOJJJin Date: Fri, 26 Jan 2024 13:24:40 +0800 Subject: [PATCH] Feat: chunk overlap supported (#2209) Co-authored-by: jyong --- api/core/indexing_runner.py | 4 +- api/models/dataset.py | 3 +- api/services/dataset_service.py | 3 +- .../datasets/create/step-two/index.module.css | 2 +- .../datasets/create/step-two/index.tsx | 39 +++++++++++++++++-- web/i18n/lang/dataset-creation.en.ts | 3 ++ web/i18n/lang/dataset-creation.pt.ts | 3 ++ web/i18n/lang/dataset-creation.zh.ts | 3 ++ web/models/datasets.ts | 1 + 9 files changed, 53 insertions(+), 8 deletions(-) diff --git a/api/core/indexing_runner.py b/api/core/indexing_runner.py index 836cf3e671..42c43a6643 100644 --- a/api/core/indexing_runner.py +++ b/api/core/indexing_runner.py @@ -562,7 +562,7 @@ class IndexingRunner: character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder( chunk_size=segmentation["max_tokens"], - chunk_overlap=0, + chunk_overlap=segmentation.get('chunk_overlap', 0), fixed_separator=separator, separators=["\n\n", "。", ".", " ", ""], embedding_model_instance=embedding_model_instance @@ -571,7 +571,7 @@ class IndexingRunner: # Automatic segmentation character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder( chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'], - chunk_overlap=0, + chunk_overlap=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['chunk_overlap'], separators=["\n\n", "。", ".", " ", ""], embedding_model_instance=embedding_model_instance ) diff --git a/api/models/dataset.py b/api/models/dataset.py index b9f8eacca6..06fd55aeca 100644 --- a/api/models/dataset.py +++ b/api/models/dataset.py @@ -134,7 +134,8 @@ class DatasetProcessRule(db.Model): ], 'segmentation': { 'delimiter': '\n', - 'max_tokens': 1000 + 'max_tokens': 500, + 'chunk_overlap': 50 } } diff --git a/api/services/dataset_service.py b/api/services/dataset_service.py index 236ffe7008..7cdded1ae8 100644 --- 
a/api/services/dataset_service.py +++ b/api/services/dataset_service.py @@ -241,7 +241,8 @@ class DocumentService: ], 'segmentation': { 'delimiter': '\n', - 'max_tokens': 500 + 'max_tokens': 500, + 'chunk_overlap': 50 } } } diff --git a/web/app/components/datasets/create/step-two/index.module.css b/web/app/components/datasets/create/step-two/index.module.css index 7ad0bbc81f..9c7dee5781 100644 --- a/web/app/components/datasets/create/step-two/index.module.css +++ b/web/app/components/datasets/create/step-two/index.module.css @@ -18,7 +18,7 @@ } .form .label { - @apply pt-6 pb-2; + @apply pt-6 pb-2 flex items-center; font-weight: 500; font-size: 16px; line-height: 24px; diff --git a/web/app/components/datasets/create/step-two/index.tsx b/web/app/components/datasets/create/step-two/index.tsx index 6ffa94e822..c894f13747 100644 --- a/web/app/components/datasets/create/step-two/index.tsx +++ b/web/app/components/datasets/create/step-two/index.tsx @@ -33,13 +33,14 @@ import { DataSourceType, DocForm } from '@/models/datasets' import NotionIcon from '@/app/components/base/notion-icon' import Switch from '@/app/components/base/switch' import { MessageChatSquare } from '@/app/components/base/icons/src/public/common' -import { XClose } from '@/app/components/base/icons/src/vender/line/general' +import { HelpCircle, XClose } from '@/app/components/base/icons/src/vender/line/general' import { useDatasetDetailContext } from '@/context/dataset-detail' import I18n from '@/context/i18n' import { IS_CE_EDITION } from '@/config' import { RETRIEVE_METHOD } from '@/types/app' import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints' import Tooltip from '@/app/components/base/tooltip' +import TooltipPlus from '@/app/components/base/tooltip-plus' import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks' import { LanguagesSupportedUnderscore, getModelRuntimeSupported } from '@/utils/language' @@ 
-99,7 +100,8 @@ const StepTwo = ({ const [previewScrolled, setPreviewScrolled] = useState(false) const [segmentationType, setSegmentationType] = useState(SegmentType.AUTO) const [segmentIdentifier, setSegmentIdentifier] = useState('\\n') - const [max, setMax] = useState(1000) + const [max, setMax] = useState(500) + const [overlap, setOverlap] = useState(50) const [rules, setRules] = useState([]) const [defaultConfig, setDefaultConfig] = useState() const hasSetIndexType = !!indexingType @@ -171,6 +173,7 @@ const StepTwo = ({ if (defaultConfig) { setSegmentIdentifier((defaultConfig.segmentation.separator === '\n' ? '\\n' : defaultConfig.segmentation.separator) || '\\n') setMax(defaultConfig.segmentation.max_tokens) + setOverlap(defaultConfig.segmentation.chunk_overlap) setRules(defaultConfig.pre_processing_rules) } } @@ -207,6 +210,7 @@ const StepTwo = ({ segmentation: { separator: segmentIdentifier === '\\n' ? '\n' : segmentIdentifier, max_tokens: max, + chunk_overlap: overlap, }, } processRule.rules = ruleObj @@ -275,6 +279,10 @@ const StepTwo = ({ } = useModelListAndDefaultModelAndCurrentProviderAndModel(3) const getCreationParams = () => { let params + if (segmentationType === SegmentType.CUSTOM && overlap > max) { + Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') }) + return + } if (isSetting) { params = { original_document_id: documentDetail?.id, @@ -337,6 +345,7 @@ const StepTwo = ({ const separator = res.rules.segmentation.separator setSegmentIdentifier((separator === '\n' ? 
'\\n' : separator) || '\\n') setMax(res.rules.segmentation.max_tokens) + setOverlap(res.rules.segmentation.chunk_overlap) setRules(res.rules.pre_processing_rules) setDefaultConfig(res.rules) } @@ -350,8 +359,10 @@ const StepTwo = ({ const rules = documentDetail.dataset_process_rule.rules const separator = rules.segmentation.separator const max = rules.segmentation.max_tokens + const overlap = rules.segmentation.chunk_overlap setSegmentIdentifier((separator === '\n' ? '\\n' : separator) || '\\n') setMax(max) + setOverlap(overlap) setRules(rules.pre_processing_rules) setDefaultConfig(rules) } @@ -569,13 +580,35 @@ const StepTwo = ({ setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))} /> +
+
+
+ {t('datasetCreation.stepTwo.overlap')} + + {t('datasetCreation.stepTwo.overlapTip')} +
+ }> + + +
+ setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))} + /> +
+
{t('datasetCreation.stepTwo.rules')}
diff --git a/web/i18n/lang/dataset-creation.en.ts b/web/i18n/lang/dataset-creation.en.ts index 7bb268218b..3568b8e84e 100644 --- a/web/i18n/lang/dataset-creation.en.ts +++ b/web/i18n/lang/dataset-creation.en.ts @@ -59,6 +59,9 @@ const translation = { separator: 'Segment identifier', separatorPlaceholder: 'For example, newline (\\\\n) or special separator (such as "***")', maxLength: 'Maximum chunk length', + overlap: 'Chunk overlap', + overlapTip: 'Setting the chunk overlap can maintain the semantic relevance between them, enhancing the retrieval effect. It is recommended to set 10%-25% of the maximum chunk size.', + overlapCheck: 'chunk overlap should not be bigger than maximum chunk length', rules: 'Text preprocessing rules', removeExtraSpaces: 'Replace consecutive spaces, newlines and tabs', removeUrlEmails: 'Delete all URLs and email addresses', diff --git a/web/i18n/lang/dataset-creation.pt.ts b/web/i18n/lang/dataset-creation.pt.ts index 871819fa15..6a5a91a945 100644 --- a/web/i18n/lang/dataset-creation.pt.ts +++ b/web/i18n/lang/dataset-creation.pt.ts @@ -59,6 +59,9 @@ const translation = { separator: 'Identificador de segmento', separatorPlaceholder: 'Por exemplo, nova linha (\\\\n) ou separador especial (como "***")', maxLength: 'Comprimento máximo do fragmento', + overlap: 'Sobreposição de blocos', + overlapTip: 'Configurar a sobreposição de blocos pode manter a relevância semântica entre eles, melhorando o efeito de recuperação. 
É recomendado definir de 10% a 25% do tamanho máximo do bloco.', + overlapCheck: 'a sobreposição de blocos não deve ser maior que o comprimento máximo do bloco', rules: 'Regras de pré-processamento de texto', removeExtraSpaces: 'Substituir espaços consecutivos, quebras de linha e tabulações', removeUrlEmails: 'Excluir todos os URLs e endereços de e-mail', diff --git a/web/i18n/lang/dataset-creation.zh.ts b/web/i18n/lang/dataset-creation.zh.ts index d9f1734799..d96ed45cc7 100644 --- a/web/i18n/lang/dataset-creation.zh.ts +++ b/web/i18n/lang/dataset-creation.zh.ts @@ -59,6 +59,9 @@ const translation = { separator: '分段标识符', separatorPlaceholder: '例如换行符(\n)或特定的分隔符(如 "***")', maxLength: '分段最大长度', + overlap: '分段重叠长度', + overlapTip: '设置分段之间的重叠长度可以保留分段之间的语义关系,提升召回效果。建议设置为最大分段长度的10%-25%', + overlapCheck: '分段重叠长度不能大于分段最大长度', rules: '文本预处理规则', removeExtraSpaces: '替换掉连续的空格、换行符和制表符', removeUrlEmails: '删除所有 URL 和电子邮件地址', diff --git a/web/models/datasets.ts b/web/models/datasets.ts index 8940af00ba..bc0053e086 100644 --- a/web/models/datasets.ts +++ b/web/models/datasets.ts @@ -108,6 +108,7 @@ export type PreProcessingRule = { export type Segmentation = { separator: string max_tokens: number + chunk_overlap: number } export const DocumentIndexingStatusList = [