From ed184ed87e45e3dd38894bdb2b63102eaaae957d Mon Sep 17 00:00:00 2001 From: balibabu Date: Wed, 22 May 2024 18:04:18 +0800 Subject: [PATCH] Implements RAPTOR for better chunking #882 (#883) ### What problem does this PR solve? Implements RAPTOR for better chunking #882 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- .../components/chunk-method-modal/index.tsx | 7 + .../components/parse-configuration/index.tsx | 206 ++++++++++++++++++ web/src/locales/en.ts | 20 ++ web/src/locales/zh-traditional.ts | 19 ++ web/src/locales/zh.ts | 19 ++ .../knowledge-setting/configuration.tsx | 25 ++- .../components/knowledge-setting/hooks.ts | 2 +- 7 files changed, 288 insertions(+), 10 deletions(-) create mode 100644 web/src/components/parse-configuration/index.tsx diff --git a/web/src/components/chunk-method-modal/index.tsx b/web/src/components/chunk-method-modal/index.tsx index 21dde83d4..df6e3c25b 100644 --- a/web/src/components/chunk-method-modal/index.tsx +++ b/web/src/components/chunk-method-modal/index.tsx @@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks'; import { useTranslate } from '@/hooks/commonHooks'; import LayoutRecognize from '../layout-recognize'; +import ParseConfiguration, { + showRaptorParseConfiguration, +} from '../parse-configuration'; import styles from './index.less'; interface IProps extends Omit { @@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC = ({ onCancel={hideModal} afterClose={afterClose} confirmLoading={loading} + width={700} > @@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC = ({ )} {showMaxTokenNumber && } + {showRaptorParseConfiguration(selectedTag) && ( + + )} ); diff --git a/web/src/components/parse-configuration/index.tsx b/web/src/components/parse-configuration/index.tsx new file mode 100644 index 000000000..45e45bcdf --- /dev/null +++ b/web/src/components/parse-configuration/index.tsx @@ -0,0 +1,206 @@ +import { useTranslate } from '@/hooks/commonHooks'; +import { 
PlusOutlined } from '@ant-design/icons'; +import { + Button, + Divider, + Flex, + Form, + Input, + InputNumber, + Slider, + Switch, +} from 'antd'; +import random from 'lodash/random'; + +export const excludedParseMethods = ['table', 'resume', 'one']; + +export const showRaptorParseConfiguration = (parserId: string) => { + return !excludedParseMethods.includes(parserId); +}; + +// The three types "table", "resume" and "one" do not display this configuration. +const ParseConfiguration = () => { + const form = Form.useFormInstance(); + const { t } = useTranslate('knowledgeConfiguration'); + + const handleGenerate = () => { + form.setFieldValue( + ['parser_config', 'raptor', 'random_seed'], + random(10000), + ); + }; + + return ( + <> + + + + + + prevValues.parser_config.raptor.use_raptor !== + curValues.parser_config.raptor.use_raptor + } + > + {({ getFieldValue }) => { + const useRaptor = getFieldValue([ + 'parser_config', + 'raptor', + 'use_raptor', + ]); + + return ( + useRaptor && ( + <> + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ) + ); + }} + + + ); +}; + +export default ParseConfiguration; diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index a941b9663..aa3395f62 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -265,6 +265,26 @@ export default {

If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.

`, + useRaptor: 'Use RAPTOR to enhance retrieval', + useRaptorTip: + 'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059', + prompt: 'Prompt', + promptTip: 'LLM prompt used for summarization.', + promptMessage: 'Prompt is required', + promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following: + {cluster_content} + The above is the content you need to summarize.`, + maxToken: 'Max token', + maxTokenTip: 'Maximum token number for summarization.', + maxTokenMessage: 'Max token is required', + threshold: 'Threshold', + thresholdTip: 'The greater the threshold, the fewer clusters there will be.', + thresholdMessage: 'Threshold is required', + maxCluster: 'Max cluster', + maxClusterTip: 'Maximum cluster number.', + maxClusterMessage: 'Max cluster is required', + randomSeed: 'Random seed', + randomSeedMessage: 'Random seed is required', }, chunk: { chunk: 'Chunk', diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts index 8b7d7708c..3f5b91436 100644 --- a/web/src/locales/zh-traditional.ts +++ b/web/src/locales/zh-traditional.ts @@ -238,6 +238,25 @@ export default {

如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。

`, + useRaptor: '使用RAPTOR文件增強策略', + useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059', + prompt: '提示詞', + promptMessage: '提示詞是必填項', + promptText: `請總結以下段落。 小心數字,不要編造。 段落如下: + {cluster_content} + 以上就是你需要總結的內容。`, + maxToken: '最大token數', + maxTokenMessage: '最大token數是必填項', + threshold: '臨界點', + thresholdMessage: '臨界點是必填項', + maxCluster: '最大聚類數', + maxClusterMessage: '最大聚類數是必填項', + randomSeed: '隨機種子', + randomSeedMessage: '隨機種子是必填項', + promptTip: 'LLM提示用於總結。', + maxTokenTip: '用於匯總的最大token數。', + thresholdTip: '閾值越大,聚類越少。', + maxClusterTip: '最大聚類數。', }, chunk: { chunk: '解析塊', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index b85d4b031..71e5f08bf 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -255,6 +255,25 @@ export default {

如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。

`, + useRaptor: '使用召回增强RAPTOR策略', + useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059', + prompt: '提示词', + promptMessage: '提示词是必填项', + promptText: `请总结以下段落。 小心数字,不要编造。 段落如下: + {cluster_content} + 以上就是你需要总结的内容。`, + maxToken: '最大token数', + maxTokenMessage: '最大token数是必填项', + threshold: '临界点', + thresholdMessage: '临界点是必填项', + maxCluster: '最大聚类数', + maxClusterMessage: '最大聚类数是必填项', + randomSeed: '随机种子', + randomSeedMessage: '随机种子是必填项', + promptTip: 'LLM提示用于总结。', + maxTokenTip: '用于汇总的最大token数。', + thresholdTip: '阈值越大,聚类越少。', + maxClusterTip: '最大聚类数。', }, chunk: { chunk: '解析块', diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx b/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx index 80c9140ba..6141d1b30 100644 --- a/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx +++ b/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx @@ -8,6 +8,9 @@ import { import LayoutRecognize from '@/components/layout-recognize'; import MaxTokenNumber from '@/components/max-token-number'; +import ParseConfiguration, { + showRaptorParseConfiguration, +} from '@/components/parse-configuration'; import { useTranslate } from '@/hooks/commonHooks'; import { FormInstance } from 'antd/lib'; import styles from './index.less'; @@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => { {({ getFieldValue }) => { const parserId = getFieldValue('parser_id'); - if (parserId === 'naive') { - return ( - <> - - - - ); - } - return null; + return ( + <> + {parserId === 'naive' && ( + <> + + + + )} + {showRaptorParseConfiguration(parserId) && ( + + )} + + ); }} diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts b/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts index 14b773f1f..d1199ef70 100644 --- a/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts +++ 
b/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts @@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => { 'embd_id', 'parser_id', 'language', - 'parser_config.chunk_token_num', + 'parser_config', ]), avatar: fileList, });