Implements RAPTOR for better chunking #882 (#883)

### What problem does this PR solve?

Implements RAPTOR for better chunking #882

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
balibabu 2024-05-22 18:04:18 +08:00 committed by GitHub
parent 43412571f7
commit ed184ed87e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 288 additions and 10 deletions

View File

@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';
import { useTranslate } from '@/hooks/commonHooks';
import LayoutRecognize from '../layout-recognize';
import ParseConfiguration, {
showRaptorParseConfiguration,
} from '../parse-configuration';
import styles from './index.less';
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
onCancel={hideModal}
afterClose={afterClose}
confirmLoading={loading}
width={700}
>
<Space size={[0, 8]} wrap>
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
</Form.Item>
)}
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
{showRaptorParseConfiguration(selectedTag) && (
<ParseConfiguration></ParseConfiguration>
)}
</Form>
</Modal>
);

View File

@ -0,0 +1,206 @@
import { useTranslate } from '@/hooks/commonHooks';
import { PlusOutlined } from '@ant-design/icons';
import {
Button,
Divider,
Flex,
Form,
Input,
InputNumber,
Slider,
Switch,
} from 'antd';
import random from 'lodash/random';
/**
 * Parser ids for which the RAPTOR configuration section is hidden:
 * the three types "table", "resume" and "one" do not display it.
 */
export const excludedParseMethods = ['table', 'resume', 'one'];

/**
 * Tells whether the RAPTOR configuration section should be rendered for the
 * given parser.
 *
 * @param parserId - id of the currently selected chunk/parse method.
 * @returns `false` when `parserId` is listed in `excludedParseMethods`,
 *   `true` for every other value.
 */
export const showRaptorParseConfiguration = (parserId: string) => {
  const isExcluded = excludedParseMethods.some(
    (method) => method === parserId,
  );
  return !isExcluded;
};
/**
 * Form section holding the RAPTOR settings stored under
 * `parser_config.raptor`.
 *
 * Renders a "use RAPTOR" switch and, while that switch is on, the prompt,
 * max-token, threshold, max-cluster and random-seed fields. It reads the
 * surrounding form through `Form.useFormInstance()`, so it has to be rendered
 * inside an antd `Form`.
 */
const ParseConfiguration = () => {
  const form = Form.useFormInstance();
  const { t } = useTranslate('knowledgeConfiguration');

  // Fills the random-seed field with a fresh value from lodash `random(10000)`.
  const handleGenerate = () => {
    form.setFieldValue(
      ['parser_config', 'raptor', 'random_seed'],
      random(10000),
    );
  };

  return (
    <>
      <Divider></Divider>
      <Form.Item
        name={['parser_config', 'raptor', 'use_raptor']}
        label={t('useRaptor')}
        initialValue={false}
        valuePropName="checked"
        tooltip={t('useRaptorTip')}
      >
        <Switch />
      </Form.Item>
      <Form.Item
        // Optional chaining: the form values may not contain `parser_config`
        // or `parser_config.raptor` yet; a plain property chain would throw
        // a TypeError inside the form's update cycle in that case.
        shouldUpdate={(prevValues, curValues) =>
          prevValues?.parser_config?.raptor?.use_raptor !==
          curValues?.parser_config?.raptor?.use_raptor
        }
      >
        {({ getFieldValue }) => {
          const useRaptor = getFieldValue([
            'parser_config',
            'raptor',
            'use_raptor',
          ]);

          // Detailed settings are only shown while the switch is on.
          return useRaptor ? (
            <>
              <Form.Item
                name={['parser_config', 'raptor', 'prompt']}
                label={t('prompt')}
                initialValue={t('promptText')}
                tooltip={t('promptTip')}
                rules={[
                  {
                    required: true,
                    message: t('promptMessage'),
                  },
                ]}
              >
                <Input.TextArea rows={8} />
              </Form.Item>
              {/* Slider and number input share one field name, so they edit the same value. */}
              <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
                <Flex gap={20} align="center">
                  <Flex flex={1}>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_token']}
                      noStyle
                      initialValue={128}
                      rules={[
                        {
                          required: true,
                          message: t('maxTokenMessage'),
                        },
                      ]}
                    >
                      <Slider max={2048} style={{ width: '100%' }} />
                    </Form.Item>
                  </Flex>
                  <Form.Item
                    name={['parser_config', 'raptor', 'max_token']}
                    noStyle
                    rules={[
                      {
                        required: true,
                        message: t('maxTokenMessage'),
                      },
                    ]}
                  >
                    <InputNumber max={2048} min={0} />
                  </Form.Item>
                </Flex>
              </Form.Item>
              <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
                <Flex gap={20} align="center">
                  <Flex flex={1}>
                    <Form.Item
                      name={['parser_config', 'raptor', 'threshold']}
                      noStyle
                      initialValue={0.1}
                      rules={[
                        {
                          required: true,
                          message: t('thresholdMessage'),
                        },
                      ]}
                    >
                      <Slider
                        min={0}
                        max={1}
                        style={{ width: '100%' }}
                        step={0.01}
                      />
                    </Form.Item>
                  </Flex>
                  <Form.Item
                    name={['parser_config', 'raptor', 'threshold']}
                    noStyle
                    rules={[
                      {
                        required: true,
                        message: t('thresholdMessage'),
                      },
                    ]}
                  >
                    <InputNumber max={1} min={0} step={0.01} />
                  </Form.Item>
                </Flex>
              </Form.Item>
              <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
                <Flex gap={20} align="center">
                  <Flex flex={1}>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_cluster']}
                      noStyle
                      initialValue={64}
                      rules={[
                        {
                          required: true,
                          message: t('maxClusterMessage'),
                        },
                      ]}
                    >
                      <Slider min={1} max={1024} style={{ width: '100%' }} />
                    </Form.Item>
                  </Flex>
                  <Form.Item
                    name={['parser_config', 'raptor', 'max_cluster']}
                    noStyle
                    rules={[
                      {
                        required: true,
                        message: t('maxClusterMessage'),
                      },
                    ]}
                  >
                    <InputNumber max={1024} min={1} />
                  </Form.Item>
                </Flex>
              </Form.Item>
              <Form.Item label={t('randomSeed')}>
                <Flex gap={20} align="center">
                  <Flex flex={1}>
                    <Form.Item
                      name={['parser_config', 'raptor', 'random_seed']}
                      noStyle
                      initialValue={0}
                      rules={[
                        {
                          required: true,
                          message: t('randomSeedMessage'),
                        },
                      ]}
                    >
                      <InputNumber style={{ width: '100%' }} />
                    </Form.Item>
                  </Flex>
                  {/* Button that overwrites the seed with a newly generated value. */}
                  <Form.Item noStyle>
                    <Button type="primary" onClick={handleGenerate}>
                      <PlusOutlined />
                    </Button>
                  </Form.Item>
                </Flex>
              </Form.Item>
            </>
          ) : null;
        }}
      </Form.Item>
    </>
  );
};
export default ParseConfiguration;

View File

@ -265,6 +265,26 @@ export default {
</p><p>
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
</p>`,
useRaptor: 'Use RAPTOR to enhance retrieval',
useRaptorTip:
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
prompt: 'Prompt',
promptTip: 'LLM prompt used for summarization.',
promptMessage: 'Prompt is required',
promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
{cluster_content}
The above is the content you need to summarize.`,
maxToken: 'Max token',
maxTokenTip: 'Maximum token number for summarization.',
maxTokenMessage: 'Max token is required',
threshold: 'Threshold',
thresholdTip: 'The bigger the threshold is the less cluster will be.',
thresholdMessage: 'Threshold is required',
maxCluster: 'Max cluster',
maxClusterTip: 'Maximum cluster number.',
maxClusterMessage: 'Max cluster is required',
randomSeed: 'Random seed',
randomSeedMessage: 'Random seed is required',
},
chunk: {
chunk: 'Chunk',

View File

@ -238,6 +238,25 @@ export default {
</p><p>
西LLM的上下文長度覆蓋了文檔長度
</p>`,
useRaptor: '使用RAPTOR文件增強策略',
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
prompt: '提示詞',
promptMessage: '提示詞是必填項',
// Default RAPTOR summarization prompt. `{cluster_content}` is the same
// placeholder the English locale uses; presumably it is replaced with the
// clustered paragraphs before the prompt is sent — confirm against the backend.
promptText: `請總結以下段落。 小心數字,不要編造。 段落如下:
{cluster_content}
`,
maxToken: '最大token數',
maxTokenMessage: '最大token數是必填項',
threshold: '臨界點',
thresholdMessage: '臨界點是必填項',
maxCluster: '最大聚類數',
maxClusterMessage: '最大聚類數是必填項',
randomSeed: '隨機種子',
randomSeedMessage: '隨機種子是必填項',
promptTip: 'LLM提示用於總結。',
maxTokenTip: '用於匯總的最大token數。',
thresholdTip: '閾值越大,聚類越少。',
maxClusterTip: '最大聚類數。',
},
chunk: {
chunk: '解析塊',

View File

@ -255,6 +255,25 @@ export default {
</p><p>
西LLM的上下文长度覆盖了文档长度
</p>`,
useRaptor: '使用召回增强RAPTOR策略',
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
prompt: '提示词',
promptMessage: '提示词是必填项',
// Default RAPTOR summarization prompt. `{cluster_content}` is the same
// placeholder the English locale uses; presumably it is replaced with the
// clustered paragraphs before the prompt is sent — confirm against the backend.
promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
{cluster_content}
`,
maxToken: '最大token数',
maxTokenMessage: '最大token数是必填项',
threshold: '临界点',
thresholdMessage: '临界点是必填项',
maxCluster: '最大聚类数',
maxClusterMessage: '最大聚类数是必填项',
randomSeed: '随机种子',
randomSeedMessage: '随机种子是必填项',
promptTip: 'LLM提示用于总结。',
maxTokenTip: '用于汇总的最大token数。',
thresholdTip: '阈值越大,聚类越少。',
maxClusterTip: '最大聚类数。',
},
chunk: {
chunk: '解析块',

View File

@ -8,6 +8,9 @@ import {
import LayoutRecognize from '@/components/layout-recognize';
import MaxTokenNumber from '@/components/max-token-number';
import ParseConfiguration, {
showRaptorParseConfiguration,
} from '@/components/parse-configuration';
import { useTranslate } from '@/hooks/commonHooks';
import { FormInstance } from 'antd/lib';
import styles from './index.less';
@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
{({ getFieldValue }) => {
const parserId = getFieldValue('parser_id');
if (parserId === 'naive') {
return (
<>
<MaxTokenNumber></MaxTokenNumber>
<LayoutRecognize></LayoutRecognize>
</>
);
}
return null;
return (
<>
{parserId === 'naive' && (
<>
<MaxTokenNumber></MaxTokenNumber>
<LayoutRecognize></LayoutRecognize>
</>
)}
{showRaptorParseConfiguration(parserId) && (
<ParseConfiguration></ParseConfiguration>
)}
</>
);
}}
</Form.Item>

View File

@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
'embd_id',
'parser_id',
'language',
'parser_config.chunk_token_num',
'parser_config',
]),
avatar: fileList,
});