Implements RAPTOR for better chunking #882 (#883)

### What problem does this PR solve? Implements RAPTOR for better chunking #882 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2025-08-13 20:25:55 +08:00 · 2024-05-22 18:04:18 +08:00 · 2024-05-22 18:04:18 +08:00 · ed184ed87e
commit ed184ed87e
parent 43412571f7
7 changed files with 288 additions and 10 deletions
--- a/web/src/components/chunk-method-modal/index.tsx
+++ b/web/src/components/chunk-method-modal/index.tsx
@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';

 import { useTranslate } from '@/hooks/commonHooks';
 import LayoutRecognize from '../layout-recognize';
+import ParseConfiguration, {
+  showRaptorParseConfiguration,
+} from '../parse-configuration';
 import styles from './index.less';

 interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
      onCancel={hideModal}
      afterClose={afterClose}
      confirmLoading={loading}
+      width={700}
    >
      <Space size={[0, 8]} wrap>
        <Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
          </Form.Item>
        )}
        {showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
+        {showRaptorParseConfiguration(selectedTag) && (
+          <ParseConfiguration></ParseConfiguration>
+        )}
      </Form>
    </Modal>
  );
--- a/web/src/components/parse-configuration/index.tsx
+++ b/web/src/components/parse-configuration/index.tsx
@ -0,0 +1,206 @@
+import { useTranslate } from '@/hooks/commonHooks';
+import { PlusOutlined } from '@ant-design/icons';
+import {
+  Button,
+  Divider,
+  Flex,
+  Form,
+  Input,
+  InputNumber,
+  Slider,
+  Switch,
+} from 'antd';
+import random from 'lodash/random';
+
+// Parser ids for which the RAPTOR parse configuration is not displayed.
+export const excludedParseMethods = ['table', 'resume', 'one'];
+
+/**
+ * Tells whether the RAPTOR parse configuration should be shown for a parser.
+ *
+ * @param parserId - Parser id to check.
+ * @returns `false` when `parserId` is listed in `excludedParseMethods`,
+ * `true` otherwise.
+ */
+export const showRaptorParseConfiguration = (parserId: string) =>
+  !excludedParseMethods.includes(parserId);
+
+/**
+ * RAPTOR settings section of a parser-configuration form.
+ *
+ * Renders a `use_raptor` switch stored under `parser_config.raptor`; while the
+ * switch is on it also renders the prompt, max token, threshold, max cluster
+ * and random seed fields. It must be rendered inside an antd `Form`, because it
+ * obtains the form through `Form.useFormInstance()`.
+ *
+ * The three types "table", "resume" and "one" do not display this
+ * configuration (see `showRaptorParseConfiguration`).
+ */
+const ParseConfiguration = () => {
+  const form = Form.useFormInstance();
+  const { t } = useTranslate('knowledgeConfiguration');
+
+  // Writes a fresh random integer (lodash `random(10000)`) into the seed field.
+  const handleGenerate = () => {
+    form.setFieldValue(
+      ['parser_config', 'raptor', 'random_seed'],
+      random(10000),
+    );
+  };
+
+  return (
+    <>
+      <Divider></Divider>
+      <Form.Item
+        name={['parser_config', 'raptor', 'use_raptor']}
+        label={t('useRaptor')}
+        initialValue={false}
+        valuePropName="checked"
+        tooltip={t('useRaptorTip')}
+      >
+        <Switch />
+      </Form.Item>
+      <Form.Item
+        shouldUpdate={(prevValues, curValues) =>
+          // Optional chaining: `parser_config` or `raptor` may be missing from
+          // the form values, and reading through undefined would throw.
+          prevValues.parser_config?.raptor?.use_raptor !==
+          curValues.parser_config?.raptor?.use_raptor
+        }
+      >
+        {({ getFieldValue }) => {
+          const useRaptor = getFieldValue([
+            'parser_config',
+            'raptor',
+            'use_raptor',
+          ]);
+
+          // The detail fields are only rendered while the switch is on.
+          return (
+            useRaptor && (
+              <>
+                <Form.Item
+                  name={['parser_config', 'raptor', 'prompt']}
+                  label={t('prompt')}
+                  initialValue={t('promptText')}
+                  tooltip={t('promptTip')}
+                  rules={[
+                    {
+                      required: true,
+                      message: t('promptMessage'),
+                    },
+                  ]}
+                >
+                  <Input.TextArea rows={8} />
+                </Form.Item>
+                {/* In the groups below, the Slider and the InputNumber share one field name, so both edit the same value. */}
+                <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
+                  <Flex gap={20} align="center">
+                    <Flex flex={1}>
+                      <Form.Item
+                        name={['parser_config', 'raptor', 'max_token']}
+                        noStyle
+                        initialValue={128}
+                        rules={[
+                          {
+                            required: true,
+                            message: t('maxTokenMessage'),
+                          },
+                        ]}
+                      >
+                        <Slider max={2048} style={{ width: '100%' }} />
+                      </Form.Item>
+                    </Flex>
+                    <Form.Item
+                      name={['parser_config', 'raptor', 'max_token']}
+                      noStyle
+                      rules={[
+                        {
+                          required: true,
+                          message: t('maxTokenMessage'),
+                        },
+                      ]}
+                    >
+                      <InputNumber max={2048} min={0} />
+                    </Form.Item>
+                  </Flex>
+                </Form.Item>
+                <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
+                  <Flex gap={20} align="center">
+                    <Flex flex={1}>
+                      <Form.Item
+                        name={['parser_config', 'raptor', 'threshold']}
+                        noStyle
+                        initialValue={0.1}
+                        rules={[
+                          {
+                            required: true,
+                            message: t('thresholdMessage'),
+                          },
+                        ]}
+                      >
+                        <Slider
+                          min={0}
+                          max={1}
+                          style={{ width: '100%' }}
+                          step={0.01}
+                        />
+                      </Form.Item>
+                    </Flex>
+                    <Form.Item
+                      name={['parser_config', 'raptor', 'threshold']}
+                      noStyle
+                      rules={[
+                        {
+                          required: true,
+                          message: t('thresholdMessage'),
+                        },
+                      ]}
+                    >
+                      <InputNumber max={1} min={0} step={0.01} />
+                    </Form.Item>
+                  </Flex>
+                </Form.Item>
+                <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
+                  <Flex gap={20} align="center">
+                    <Flex flex={1}>
+                      <Form.Item
+                        name={['parser_config', 'raptor', 'max_cluster']}
+                        noStyle
+                        initialValue={64}
+                        rules={[
+                          {
+                            required: true,
+                            message: t('maxClusterMessage'),
+                          },
+                        ]}
+                      >
+                        <Slider min={1} max={1024} style={{ width: '100%' }} />
+                      </Form.Item>
+                    </Flex>
+                    <Form.Item
+                      name={['parser_config', 'raptor', 'max_cluster']}
+                      noStyle
+                      rules={[
+                        {
+                          required: true,
+                          message: t('maxClusterMessage'),
+                        },
+                      ]}
+                    >
+                      <InputNumber max={1024} min={1} />
+                    </Form.Item>
+                  </Flex>
+                </Form.Item>
+                <Form.Item label={t('randomSeed')}>
+                  <Flex gap={20} align="center">
+                    <Flex flex={1}>
+                      <Form.Item
+                        name={['parser_config', 'raptor', 'random_seed']}
+                        noStyle
+                        initialValue={0}
+                        rules={[
+                          {
+                            required: true,
+                            message: t('randomSeedMessage'),
+                          },
+                        ]}
+                      >
+                        <InputNumber style={{ width: '100%' }} />
+                      </Form.Item>
+                    </Flex>
+                    <Form.Item noStyle>
+                      <Button type="primary" onClick={handleGenerate}>
+                        <PlusOutlined />
+                      </Button>
+                    </Form.Item>
+                  </Flex>
+                </Form.Item>
+              </>
+            )
+          );
+        }}
+      </Form.Item>
+    </>
+  );
+};
+
+export default ParseConfiguration;
--- a/web/src/locales/en.ts
+++ b/web/src/locales/en.ts
@ -265,6 +265,26 @@ export default {
    </p><p>
    If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
    </p>`,
+      useRaptor: 'Use RAPTOR to enhance retrieval',
+      useRaptorTip:
+        'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
+      prompt: 'Prompt',
+      promptTip: 'LLM prompt used for summarization.',
+      promptMessage: 'Prompt is required',
+      promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
+      {cluster_content}
+      The above is the content you need to summarize.`,
+      maxToken: 'Max token',
+      maxTokenTip: 'Maximum token number for summarization.',
+      maxTokenMessage: 'Max token is required',
+      threshold: 'Threshold',
+      thresholdTip: 'The bigger the threshold is the less cluster will be.',
+      thresholdMessage: 'Threshold is required',
+      maxCluster: 'Max cluster',
+      maxClusterTip: 'Maximum cluster number.',
+      maxClusterMessage: 'Max cluster is required',
+      randomSeed: 'Random seed',
+      randomSeedMessage: 'Random seed is required',
    },
    chunk: {
      chunk: 'Chunk',
--- a/web/src/locales/zh-traditional.ts
+++ b/web/src/locales/zh-traditional.ts
@ -238,6 +238,25 @@ export default {
        </p><p>
        如果你要總結的東西需要一篇文章的全部上下文，並且所選LLM的上下文長度覆蓋了文檔長度，你可以嘗試這種方法。
        </p>`,
+      useRaptor: '使用RAPTOR文件增強策略',
+      useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
+      prompt: '提示詞',
+      promptMessage: '提示詞是必填項',
+      // NOTE(review): the {cluster_content} placeholder is kept verbatim, as in
+      // the English prompt; presumably it is substituted at runtime — confirm.
+      promptText: `請總結以下段落。 小心數字，不要編造。 段落如下：
+      {cluster_content}
+      以上就是你需要總結的內容。`,
+      maxToken: '最大token數',
+      maxTokenMessage: '最大token數是必填項',
+      threshold: '臨界點',
+      thresholdMessage: '臨界點是必填項',
+      maxCluster: '最大聚類數',
+      maxClusterMessage: '最大聚類數是必填項',
+      randomSeed: '隨機種子',
+      randomSeedMessage: '隨機種子是必填項',
+      promptTip: 'LLM提示用於總結。',
+      maxTokenTip: '用於匯總的最大token數。',
+      thresholdTip: '閾值越大，聚類越少。',
+      maxClusterTip: '最大聚類數。',
    },
    chunk: {
      chunk: '解析塊',
--- a/web/src/locales/zh.ts
+++ b/web/src/locales/zh.ts
@ -255,6 +255,25 @@ export default {
      </p><p>
      如果你要总结的东西需要一篇文章的全部上下文，并且所选LLM的上下文长度覆盖了文档长度，你可以尝试这种方法。
      </p>`,
+      useRaptor: '使用召回增强RAPTOR策略',
+      useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
+      prompt: '提示词',
+      promptMessage: '提示词是必填项',
+      // NOTE(review): the {cluster_content} placeholder is kept verbatim, as in
+      // the English prompt; presumably it is substituted at runtime — confirm.
+      promptText: `请总结以下段落。 小心数字，不要编造。 段落如下：
+      {cluster_content}
+      以上就是你需要总结的内容。`,
+      maxToken: '最大token数',
+      maxTokenMessage: '最大token数是必填项',
+      threshold: '临界点',
+      thresholdMessage: '临界点是必填项',
+      maxCluster: '最大聚类数',
+      maxClusterMessage: '最大聚类数是必填项',
+      randomSeed: '随机种子',
+      randomSeedMessage: '随机种子是必填项',
+      promptTip: 'LLM提示用于总结。',
+      maxTokenTip: '用于汇总的最大token数。',
+      thresholdTip: '阈值越大，聚类越少。',
+      maxClusterTip: '最大聚类数。',
    },
    chunk: {
      chunk: '解析块',
--- a/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
+++ b/web/src/pages/add-knowledge/components/knowledge-setting/configuration.tsx
@ -8,6 +8,9 @@ import {

 import LayoutRecognize from '@/components/layout-recognize';
 import MaxTokenNumber from '@/components/max-token-number';
+import ParseConfiguration, {
+  showRaptorParseConfiguration,
+} from '@/components/parse-configuration';
 import { useTranslate } from '@/hooks/commonHooks';
 import { FormInstance } from 'antd/lib';
 import styles from './index.less';
@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
        {({ getFieldValue }) => {
          const parserId = getFieldValue('parser_id');

-          if (parserId === 'naive') {
-            return (
-              <>
-                <MaxTokenNumber></MaxTokenNumber>
-                <LayoutRecognize></LayoutRecognize>
-              </>
-            );
-          }
-          return null;
+          return (
+            <>
+              {parserId === 'naive' && (
+                <>
+                  <MaxTokenNumber></MaxTokenNumber>
+                  <LayoutRecognize></LayoutRecognize>
+                </>
+              )}
+              {showRaptorParseConfiguration(parserId) && (
+                <ParseConfiguration></ParseConfiguration>
+              )}
+            </>
+          );
        }}
      </Form.Item>

--- a/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
+++ b/web/src/pages/add-knowledge/components/knowledge-setting/hooks.ts
@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
        'embd_id',
        'parser_id',
        'language',
-        'parser_config.chunk_token_num',
+        'parser_config',
      ]),
      avatar: fileList,
    });