From ed184ed87e45e3dd38894bdb2b63102eaaae957d Mon Sep 17 00:00:00 2001
From: balibabu
Date: Wed, 22 May 2024 18:04:18 +0800
Subject: [PATCH] Implements RAPTOR for better chunking #882 (#883)
### What problem does this PR solve?
Implements RAPTOR for better chunking #882
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---
.../components/chunk-method-modal/index.tsx | 7 +
.../components/parse-configuration/index.tsx | 206 ++++++++++++++++++
web/src/locales/en.ts | 20 ++
web/src/locales/zh-traditional.ts | 19 ++
web/src/locales/zh.ts | 19 ++
.../knowledge-setting/configuration.tsx | 25 ++-
.../components/knowledge-setting/hooks.ts | 2 +-
7 files changed, 288 insertions(+), 10 deletions(-)
create mode 100644 web/src/components/parse-configuration/index.tsx
diff --git a/web/src/components/chunk-method-modal/index.tsx b/web/src/components/chunk-method-modal/index.tsx
index 21dde83d4..df6e3c25b 100644
--- a/web/src/components/chunk-method-modal/index.tsx
+++ b/web/src/components/chunk-method-modal/index.tsx
@@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';
import { useTranslate } from '@/hooks/commonHooks';
import LayoutRecognize from '../layout-recognize';
+import ParseConfiguration, {
+ showRaptorParseConfiguration,
+} from '../parse-configuration';
import styles from './index.less';
interface IProps extends Omit {
@@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC = ({
onCancel={hideModal}
afterClose={afterClose}
confirmLoading={loading}
+ width={700}
>
@@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC = ({
)}
{showMaxTokenNumber && }
+ {showRaptorParseConfiguration(selectedTag) && (
+
+ )}
);
diff --git a/web/src/components/parse-configuration/index.tsx b/web/src/components/parse-configuration/index.tsx
new file mode 100644
index 000000000..45e45bcdf
--- /dev/null
+++ b/web/src/components/parse-configuration/index.tsx
@@ -0,0 +1,206 @@
+import { useTranslate } from '@/hooks/commonHooks';
+import { PlusOutlined } from '@ant-design/icons';
+import {
+ Button,
+ Divider,
+ Flex,
+ Form,
+ Input,
+ InputNumber,
+ Slider,
+ Switch,
+} from 'antd';
+import random from 'lodash/random';
+
+// Chunk methods for which the RAPTOR options are not displayed;
+// showRaptorParseConfiguration returns false for any id listed here.
+export const excludedParseMethods = ['table', 'resume', 'one'];
+
+/**
+ * Decides whether the RAPTOR settings should be rendered for a chunk method.
+ *
+ * @param parserId - Identifier of the currently selected chunk method.
+ * @returns `false` when `parserId` is listed in `excludedParseMethods`,
+ *   `true` for every other value.
+ */
+export const showRaptorParseConfiguration = (parserId: string) =>
+  excludedParseMethods.every((excluded) => excluded !== parserId);
+
+// The three types "table", "resume" and "one" do not display this configuration.
+const ParseConfiguration = () => {
+ const form = Form.useFormInstance();
+ const { t } = useTranslate('knowledgeConfiguration');
+
+ // Writes a freshly drawn random seed into the form's
+ // parser_config.raptor.random_seed field.
+ const handleGenerate = () => {
+   const seedPath = ['parser_config', 'raptor', 'random_seed'];
+   // lodash `random(10000)` draws from the range 0..10000.
+   const seed = random(10000);
+   form.setFieldValue(seedPath, seed);
+ };
+
+ return (
+ <>
+
+
+
+
+
+ prevValues.parser_config.raptor.use_raptor !==
+ curValues.parser_config.raptor.use_raptor
+ }
+ >
+ {({ getFieldValue }) => {
+ const useRaptor = getFieldValue([
+ 'parser_config',
+ 'raptor',
+ 'use_raptor',
+ ]);
+
+ return (
+ useRaptor && (
+ <>
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ >
+ )
+ );
+ }}
+
+ >
+ );
+};
+
+export default ParseConfiguration;
diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts
index a941b9663..aa3395f62 100644
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@@ -265,6 +265,26 @@ export default {
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
`,
+ useRaptor: 'Use RAPTOR to enhance retrieval',
+ useRaptorTip:
+ 'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
+ prompt: 'Prompt',
+ promptTip: 'LLM prompt used for summarization.',
+ promptMessage: 'Prompt is required',
+ promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
+ {cluster_content}
+ The above is the content you need to summarize.`,
+ maxToken: 'Max token',
+ maxTokenTip: 'Maximum token number for summarization.',
+ maxTokenMessage: 'Max token is required',
+ threshold: 'Threshold',
+ thresholdTip: 'The bigger the threshold is the less cluster will be.',
+ thresholdMessage: 'Threshold is required',
+ maxCluster: 'Max cluster',
+ maxClusterTip: 'Maximum cluster number.',
+ maxClusterMessage: 'Max cluster is required',
+ randomSeed: 'Random seed',
+ randomSeedMessage: 'Random seed is required',
},
chunk: {
chunk: 'Chunk',
diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts
index 8b7d7708c..3f5b91436 100644
--- a/web/src/locales/zh-traditional.ts
+++ b/web/src/locales/zh-traditional.ts
@@ -238,6 +238,25 @@ export default {
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
`,
+ useRaptor: '使用RAPTOR文件增強策略',
+ useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
+ prompt: '提示詞',
+ promptMessage: '提示詞是必填項',
+ // Keep the {cluster_content} token untranslated: it is the same placeholder
+ // the English default prompt uses, and only the surrounding text is localized.
+ // NOTE(review): presumably substituted by the summarization backend - confirm.
+ promptText: `請總結以下段落。 小心數字,不要編造。 段落如下:
+ {cluster_content}
+ 以上就是你需要總結的內容。`,
+ maxToken: '最大token數',
+ maxTokenMessage: '最大token數是必填項',
+ threshold: '臨界點',
+ thresholdMessage: '臨界點是必填項',
+ maxCluster: '最大聚類數',
+ maxClusterMessage: '最大聚類數是必填項',
+ randomSeed: '隨機種子',
+ randomSeedMessage: '隨機種子是必填項',
+ promptTip: 'LLM提示用於總結。',
+ maxTokenTip: '用於匯總的最大token數。',
+ thresholdTip: '閾值越大,聚類越少。',
+ maxClusterTip: '最大聚類數。',
},
chunk: {
chunk: '解析塊',
diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts
index b85d4b031..71e5f08bf 100644
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@@ -255,6 +255,25 @@ export default {
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
`,
+ useRaptor: '使用召回增强RAPTOR策略',
+ useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
+ prompt: '提示词',
+ promptMessage: '提示词是必填项',
+ // Keep the {cluster_content} token untranslated: it is the same placeholder
+ // the English default prompt uses, and only the surrounding text is localized.
+ // NOTE(review): presumably substituted by the summarization backend - confirm.
+ promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
+ {cluster_content}
+ 以上就是你需要总结的内容。`,
+ maxToken: '最大token数',
+ maxTokenMessage: '最大token数是必填项',
+ threshold: '临界点',
+ thresholdMessage: '临界点是必填项',
+ maxCluster: '最大聚类数',
+ maxClusterMessage: '最大聚类数是必填项',
+ randomSeed: '随机种子',
+ randomSeedMessage: '随机种子是必填项',
+ promptTip: 'LLM提示用于总结。',
+ maxTokenTip: '用于汇总的最大token数。',
+ thresholdTip: '阈值越大,聚类越少。',
+ maxClusterTip: '最大聚类数。',
},
chunk: {
chunk: '解析块',
diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx b/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
index 80c9140ba..6141d1b30 100644
--- a/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
+++ b/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
@@ -8,6 +8,9 @@ import {
import LayoutRecognize from '@/components/layout-recognize';
import MaxTokenNumber from '@/components/max-token-number';
+import ParseConfiguration, {
+ showRaptorParseConfiguration,
+} from '@/components/parse-configuration';
import { useTranslate } from '@/hooks/commonHooks';
import { FormInstance } from 'antd/lib';
import styles from './index.less';
@@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
{({ getFieldValue }) => {
const parserId = getFieldValue('parser_id');
- if (parserId === 'naive') {
- return (
- <>
-
-
- >
- );
- }
- return null;
+ return (
+ <>
+ {parserId === 'naive' && (
+ <>
+
+
+ >
+ )}
+ {showRaptorParseConfiguration(parserId) && (
+
+ )}
+ >
+ );
}}
diff --git a/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts b/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
index 14b773f1f..d1199ef70 100644
--- a/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
+++ b/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
@@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
'embd_id',
'parser_id',
'language',
- 'parser_config.chunk_token_num',
+ 'parser_config',
]),
avatar: fileList,
});