Implements RAPTOR for better chunking #882 (#883)

### What problem does this PR solve?

Implements RAPTOR for better chunking #882

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
balibabu 2024-05-22 18:04:18 +08:00 committed by GitHub
parent 43412571f7
commit ed184ed87e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 288 additions and 10 deletions

View File

@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';
import { useTranslate } from '@/hooks/commonHooks'; import { useTranslate } from '@/hooks/commonHooks';
import LayoutRecognize from '../layout-recognize'; import LayoutRecognize from '../layout-recognize';
import ParseConfiguration, {
showRaptorParseConfiguration,
} from '../parse-configuration';
import styles from './index.less'; import styles from './index.less';
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> { interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
onCancel={hideModal} onCancel={hideModal}
afterClose={afterClose} afterClose={afterClose}
confirmLoading={loading} confirmLoading={loading}
width={700}
> >
<Space size={[0, 8]} wrap> <Space size={[0, 8]} wrap>
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}> <Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
</Form.Item> </Form.Item>
)} )}
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>} {showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
{showRaptorParseConfiguration(selectedTag) && (
<ParseConfiguration></ParseConfiguration>
)}
</Form> </Form>
</Modal> </Modal>
); );

View File

@ -0,0 +1,206 @@
import { useTranslate } from '@/hooks/commonHooks';
import { PlusOutlined } from '@ant-design/icons';
import {
Button,
Divider,
Flex,
Form,
Input,
InputNumber,
Slider,
Switch,
} from 'antd';
import random from 'lodash/random';
/**
 * Parser ids for which the RAPTOR configuration section is not displayed.
 */
export const excludedParseMethods = ['table', 'resume', 'one'];

/**
 * Tells whether the RAPTOR configuration section applies to a parser.
 *
 * @param parserId - Identifier of the selected parsing (chunking) method.
 * @returns `false` when `parserId` is listed in `excludedParseMethods`,
 *   `true` for every other value.
 */
export const showRaptorParseConfiguration = (parserId: string) =>
  excludedParseMethods.every((excludedId) => excludedId !== parserId);
/**
 * RAPTOR section of a parser configuration form.
 *
 * The three parser types "table", "resume" and "one" do not display this
 * configuration.
 *
 * Renders a switch bound to `parser_config.raptor.use_raptor`; while the
 * switch is on it additionally renders the prompt, max token, threshold,
 * max cluster and random seed fields, all stored under
 * `parser_config.raptor`.
 *
 * The component reads the surrounding form through `Form.useFormInstance()`,
 * so it has to be rendered inside an antd `<Form>`.
 */
const ParseConfiguration = () => {
  const form = Form.useFormInstance();
  const { t } = useTranslate('knowledgeConfiguration');

  // Writes a fresh value produced by lodash `random(10000)` into the
  // random seed field.
  const handleGenerate = () => {
    form.setFieldValue(
      ['parser_config', 'raptor', 'random_seed'],
      random(10000),
    );
  };

  return (
    <>
      <Divider></Divider>
      <Form.Item
        name={['parser_config', 'raptor', 'use_raptor']}
        label={t('useRaptor')}
        initialValue={false}
        valuePropName="checked"
        tooltip={t('useRaptorTip')}
      >
        <Switch />
      </Form.Item>
      <Form.Item
        // Re-render the dependent fields only when the switch value changes.
        // `parser_config` / `raptor` may be missing from either snapshot
        // (for example before the RAPTOR fields have put their initial
        // values into the store), so every level is accessed defensively
        // instead of throwing a TypeError.
        shouldUpdate={(prevValues, curValues) =>
          prevValues?.parser_config?.raptor?.use_raptor !==
          curValues?.parser_config?.raptor?.use_raptor
        }
      >
        {({ getFieldValue }) => {
          const useRaptor = getFieldValue([
            'parser_config',
            'raptor',
            'use_raptor',
          ]);

          return (
            useRaptor && (
              <>
                {/* Prompt used for summarization */}
                <Form.Item
                  name={['parser_config', 'raptor', 'prompt']}
                  label={t('prompt')}
                  initialValue={t('promptText')}
                  tooltip={t('promptTip')}
                  rules={[
                    {
                      required: true,
                      message: t('promptMessage'),
                    },
                  ]}
                >
                  <Input.TextArea rows={8} />
                </Form.Item>

                {/* Max token: slider and number input are bound to the same field name */}
                <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_token']}
                        noStyle
                        initialValue={128}
                        rules={[
                          {
                            required: true,
                            message: t('maxTokenMessage'),
                          },
                        ]}
                      >
                        <Slider max={2048} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_token']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxTokenMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={2048} min={0} />
                    </Form.Item>
                  </Flex>
                </Form.Item>

                {/* Threshold: slider and number input are bound to the same field name */}
                <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'threshold']}
                        noStyle
                        initialValue={0.1}
                        rules={[
                          {
                            required: true,
                            message: t('thresholdMessage'),
                          },
                        ]}
                      >
                        <Slider
                          min={0}
                          max={1}
                          style={{ width: '100%' }}
                          step={0.01}
                        />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'threshold']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('thresholdMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1} min={0} step={0.01} />
                    </Form.Item>
                  </Flex>
                </Form.Item>

                {/* Max cluster: slider and number input are bound to the same field name */}
                <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_cluster']}
                        noStyle
                        initialValue={64}
                        rules={[
                          {
                            required: true,
                            message: t('maxClusterMessage'),
                          },
                        ]}
                      >
                        <Slider min={1} max={1024} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_cluster']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxClusterMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1024} min={1} />
                    </Form.Item>
                  </Flex>
                </Form.Item>

                {/* Random seed: number input plus a button that generates a new value */}
                <Form.Item label={t('randomSeed')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'random_seed']}
                        noStyle
                        initialValue={0}
                        rules={[
                          {
                            required: true,
                            message: t('randomSeedMessage'),
                          },
                        ]}
                      >
                        <InputNumber style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item noStyle>
                      <Button type="primary" onClick={handleGenerate}>
                        <PlusOutlined />
                      </Button>
                    </Form.Item>
                  </Flex>
                </Form.Item>
              </>
            )
          );
        }}
      </Form.Item>
    </>
  );
};
export default ParseConfiguration;

View File

@ -265,6 +265,26 @@ export default {
</p><p> </p><p>
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method. If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
</p>`, </p>`,
useRaptor: 'Use RAPTOR to enhance retrieval',
useRaptorTip:
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
prompt: 'Prompt',
promptTip: 'LLM prompt used for summarization.',
promptMessage: 'Prompt is required',
promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
{cluster_content}
The above is the content you need to summarize.`,
maxToken: 'Max token',
maxTokenTip: 'Maximum token number for summarization.',
maxTokenMessage: 'Max token is required',
threshold: 'Threshold',
thresholdTip: 'The bigger the threshold is the less cluster will be.',
thresholdMessage: 'Threshold is required',
maxCluster: 'Max cluster',
maxClusterTip: 'Maximum cluster number.',
maxClusterMessage: 'Max cluster is required',
randomSeed: 'Random seed',
randomSeedMessage: 'Random seed is required',
}, },
chunk: { chunk: {
chunk: 'Chunk', chunk: 'Chunk',

View File

@ -238,6 +238,25 @@ export default {
</p><p> </p><p>
西LLM的上下文長度覆蓋了文檔長度 西LLM的上下文長度覆蓋了文檔長度
</p>`, </p>`,
useRaptor: '使用RAPTOR文件增強策略',
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
prompt: '提示詞',
promptMessage: '提示詞是必填項',
promptText: `请請總結以下段落。 小心數字,不要編造。 段落如下:
{}
`,
maxToken: '最大token數',
maxTokenMessage: '最大token數是必填項',
threshold: '臨界點',
thresholdMessage: '臨界點是必填項',
maxCluster: '最大聚類數',
maxClusterMessage: '最大聚類數是必填項',
randomSeed: '隨機種子',
randomSeedMessage: '隨機種子是必填項',
promptTip: 'LLM提示用於總結。',
maxTokenTip: '用於匯總的最大token數。',
thresholdTip: '閾值越大,聚類越少。',
maxClusterTip: '最大聚類數。',
}, },
chunk: { chunk: {
chunk: '解析塊', chunk: '解析塊',

View File

@ -255,6 +255,25 @@ export default {
</p><p> </p><p>
西LLM的上下文长度覆盖了文档长度 西LLM的上下文长度覆盖了文档长度
</p>`, </p>`,
useRaptor: '使用召回增强RAPTOR策略',
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
prompt: '提示词',
promptMessage: '提示词是必填项',
promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
{}
`,
maxToken: '最大token数',
maxTokenMessage: '最大token数是必填项',
threshold: '临界点',
thresholdMessage: '临界点是必填项',
maxCluster: '最大聚类数',
maxClusterMessage: '最大聚类数是必填项',
randomSeed: '随机种子',
randomSeedMessage: '随机种子是必填项',
promptTip: 'LLM提示用于总结。',
maxTokenTip: '用于汇总的最大token数。',
thresholdTip: '阈值越大,聚类越少。',
maxClusterTip: '最大聚类数。',
}, },
chunk: { chunk: {
chunk: '解析块', chunk: '解析块',

View File

@ -8,6 +8,9 @@ import {
import LayoutRecognize from '@/components/layout-recognize'; import LayoutRecognize from '@/components/layout-recognize';
import MaxTokenNumber from '@/components/max-token-number'; import MaxTokenNumber from '@/components/max-token-number';
import ParseConfiguration, {
showRaptorParseConfiguration,
} from '@/components/parse-configuration';
import { useTranslate } from '@/hooks/commonHooks'; import { useTranslate } from '@/hooks/commonHooks';
import { FormInstance } from 'antd/lib'; import { FormInstance } from 'antd/lib';
import styles from './index.less'; import styles from './index.less';
@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
{({ getFieldValue }) => { {({ getFieldValue }) => {
const parserId = getFieldValue('parser_id'); const parserId = getFieldValue('parser_id');
if (parserId === 'naive') { return (
return ( <>
<> {parserId === 'naive' && (
<MaxTokenNumber></MaxTokenNumber> <>
<LayoutRecognize></LayoutRecognize> <MaxTokenNumber></MaxTokenNumber>
</> <LayoutRecognize></LayoutRecognize>
); </>
} )}
return null; {showRaptorParseConfiguration(parserId) && (
<ParseConfiguration></ParseConfiguration>
)}
</>
);
}} }}
</Form.Item> </Form.Item>

View File

@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
'embd_id', 'embd_id',
'parser_id', 'parser_id',
'language', 'language',
'parser_config.chunk_token_num', 'parser_config',
]), ]),
avatar: fileList, avatar: fileList,
}); });