mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 23:56:01 +08:00
### What problem does this PR solve?

Implements RAPTOR for better chunking (#882).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
43412571f7
commit
ed184ed87e
@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';
|
|||||||
|
|
||||||
import { useTranslate } from '@/hooks/commonHooks';
|
import { useTranslate } from '@/hooks/commonHooks';
|
||||||
import LayoutRecognize from '../layout-recognize';
|
import LayoutRecognize from '../layout-recognize';
|
||||||
|
import ParseConfiguration, {
|
||||||
|
showRaptorParseConfiguration,
|
||||||
|
} from '../parse-configuration';
|
||||||
import styles from './index.less';
|
import styles from './index.less';
|
||||||
|
|
||||||
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
|
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
|
||||||
@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
|
|||||||
onCancel={hideModal}
|
onCancel={hideModal}
|
||||||
afterClose={afterClose}
|
afterClose={afterClose}
|
||||||
confirmLoading={loading}
|
confirmLoading={loading}
|
||||||
|
width={700}
|
||||||
>
|
>
|
||||||
<Space size={[0, 8]} wrap>
|
<Space size={[0, 8]} wrap>
|
||||||
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
|
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
|
||||||
@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
|
|||||||
</Form.Item>
|
</Form.Item>
|
||||||
)}
|
)}
|
||||||
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
|
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
|
||||||
|
{showRaptorParseConfiguration(selectedTag) && (
|
||||||
|
<ParseConfiguration></ParseConfiguration>
|
||||||
|
)}
|
||||||
</Form>
|
</Form>
|
||||||
</Modal>
|
</Modal>
|
||||||
);
|
);
|
||||||
|
206
web/src/components/parse-configuration/index.tsx
Normal file
206
web/src/components/parse-configuration/index.tsx
Normal file
@ -0,0 +1,206 @@
|
|||||||
|
import { useTranslate } from '@/hooks/commonHooks';
|
||||||
|
import { PlusOutlined } from '@ant-design/icons';
|
||||||
|
import {
|
||||||
|
Button,
|
||||||
|
Divider,
|
||||||
|
Flex,
|
||||||
|
Form,
|
||||||
|
Input,
|
||||||
|
InputNumber,
|
||||||
|
Slider,
|
||||||
|
Switch,
|
||||||
|
} from 'antd';
|
||||||
|
import random from 'lodash/random';
|
||||||
|
|
||||||
|
/** Parser ids for which the RAPTOR settings section is not offered. */
export const excludedParseMethods = ['table', 'resume', 'one'];

/**
 * Decides whether the RAPTOR settings section should be rendered for a parser.
 *
 * @param parserId - Identifier of the currently selected chunk method.
 * @returns `false` when `parserId` is listed in `excludedParseMethods`,
 *   `true` for every other value.
 */
export const showRaptorParseConfiguration = (parserId: string) =>
  excludedParseMethods.every((excludedId) => excludedId !== parserId);
|
||||||
|
|
||||||
|
/**
 * RAPTOR settings section of a chunk-method form.
 *
 * Must be rendered inside an antd `<Form>`: the component picks up the
 * surrounding form through `Form.useFormInstance()` and registers all of its
 * fields under the `parser_config.raptor` name path (`use_raptor`, `prompt`,
 * `max_token`, `threshold`, `max_cluster`, `random_seed`).
 *
 * The three parser types "table", "resume" and "one" do not display this
 * configuration; callers gate rendering with `showRaptorParseConfiguration`.
 */
const ParseConfiguration = () => {
  const form = Form.useFormInstance();
  const { t } = useTranslate('knowledgeConfiguration');

  // Overwrites the seed field with a new value produced by lodash `random(10000)`.
  const handleGenerate = () => {
    form.setFieldValue(
      ['parser_config', 'raptor', 'random_seed'],
      random(10000),
    );
  };

  return (
    <>
      <Divider></Divider>
      {/* Master switch; the detailed settings below are shown only when it is on. */}
      <Form.Item
        name={['parser_config', 'raptor', 'use_raptor']}
        label={t('useRaptor')}
        initialValue={false}
        valuePropName="checked"
        tooltip={t('useRaptorTip')}
      >
        <Switch />
      </Form.Item>
      {/*
        Re-render the dependent section only when the switch value changes.
        Optional chaining keeps the comparison from throwing when either
        snapshot of the form values has no `parser_config` or `raptor` object.
      */}
      <Form.Item
        shouldUpdate={(prevValues, curValues) =>
          prevValues?.parser_config?.raptor?.use_raptor !==
          curValues?.parser_config?.raptor?.use_raptor
        }
      >
        {({ getFieldValue }) => {
          const useRaptor = getFieldValue([
            'parser_config',
            'raptor',
            'use_raptor',
          ]);

          return (
            useRaptor && (
              <>
                <Form.Item
                  name={['parser_config', 'raptor', 'prompt']}
                  label={t('prompt')}
                  initialValue={t('promptText')}
                  tooltip={t('promptTip')}
                  rules={[
                    {
                      required: true,
                      message: t('promptMessage'),
                    },
                  ]}
                >
                  <Input.TextArea rows={8} />
                </Form.Item>
                {/*
                  Each numeric setting below is edited through a Slider and an
                  InputNumber bound to the same field name; the initial value
                  is declared once, on the Slider's Form.Item.
                */}
                <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_token']}
                        noStyle
                        initialValue={128}
                        rules={[
                          {
                            required: true,
                            message: t('maxTokenMessage'),
                          },
                        ]}
                      >
                        <Slider max={2048} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_token']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxTokenMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={2048} min={0} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'threshold']}
                        noStyle
                        initialValue={0.1}
                        rules={[
                          {
                            required: true,
                            message: t('thresholdMessage'),
                          },
                        ]}
                      >
                        <Slider
                          min={0}
                          max={1}
                          style={{ width: '100%' }}
                          step={0.01}
                        />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'threshold']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('thresholdMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1} min={0} step={0.01} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_cluster']}
                        noStyle
                        initialValue={64}
                        rules={[
                          {
                            required: true,
                            message: t('maxClusterMessage'),
                          },
                        ]}
                      >
                        <Slider min={1} max={1024} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_cluster']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxClusterMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1024} min={1} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('randomSeed')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'random_seed']}
                        noStyle
                        initialValue={0}
                        rules={[
                          {
                            required: true,
                            message: t('randomSeedMessage'),
                          },
                        ]}
                      >
                        <InputNumber style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    {/* Button that fills the seed field via handleGenerate. */}
                    <Form.Item noStyle>
                      <Button type="primary" onClick={handleGenerate}>
                        <PlusOutlined />
                      </Button>
                    </Form.Item>
                  </Flex>
                </Form.Item>
              </>
            )
          );
        }}
      </Form.Item>
    </>
  );
};
|
||||||
|
|
||||||
|
export default ParseConfiguration;
|
@ -265,6 +265,26 @@ export default {
|
|||||||
</p><p>
|
</p><p>
|
||||||
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
||||||
</p>`,
|
</p>`,
|
||||||
|
useRaptor: 'Use RAPTOR to enhance retrieval',
|
||||||
|
useRaptorTip:
|
||||||
|
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
|
||||||
|
prompt: 'Prompt',
|
||||||
|
promptTip: 'LLM prompt used for summarization.',
|
||||||
|
promptMessage: 'Prompt is required',
|
||||||
|
promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
|
||||||
|
{cluster_content}
|
||||||
|
The above is the content you need to summarize.`,
|
||||||
|
maxToken: 'Max token',
|
||||||
|
maxTokenTip: 'Maximum token number for summarization.',
|
||||||
|
maxTokenMessage: 'Max token is required',
|
||||||
|
threshold: 'Threshold',
|
||||||
|
thresholdTip: 'The bigger the threshold is the less cluster will be.',
|
||||||
|
thresholdMessage: 'Threshold is required',
|
||||||
|
maxCluster: 'Max cluster',
|
||||||
|
maxClusterTip: 'Maximum cluster number.',
|
||||||
|
maxClusterMessage: 'Max cluster is required',
|
||||||
|
randomSeed: 'Random seed',
|
||||||
|
randomSeedMessage: 'Random seed is required',
|
||||||
},
|
},
|
||||||
chunk: {
|
chunk: {
|
||||||
chunk: 'Chunk',
|
chunk: 'Chunk',
|
||||||
|
@ -238,6 +238,25 @@ export default {
|
|||||||
</p><p>
|
</p><p>
|
||||||
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
||||||
</p>`,
|
</p>`,
|
||||||
|
useRaptor: '使用RAPTOR文件增強策略',
|
||||||
|
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
|
||||||
|
prompt: '提示詞',
|
||||||
|
promptMessage: '提示詞是必填項',
|
||||||
|
promptText: `请請總結以下段落。 小心數字,不要編造。 段落如下:
|
||||||
|
{集群內容}
|
||||||
|
以上就是你需要總結的內容。`,
|
||||||
|
maxToken: '最大token數',
|
||||||
|
maxTokenMessage: '最大token數是必填項',
|
||||||
|
threshold: '臨界點',
|
||||||
|
thresholdMessage: '臨界點是必填項',
|
||||||
|
maxCluster: '最大聚類數',
|
||||||
|
maxClusterMessage: '最大聚類數是必填項',
|
||||||
|
randomSeed: '隨機種子',
|
||||||
|
randomSeedMessage: '隨機種子是必填項',
|
||||||
|
promptTip: 'LLM提示用於總結。',
|
||||||
|
maxTokenTip: '用於匯總的最大token數。',
|
||||||
|
thresholdTip: '閾值越大,聚類越少。',
|
||||||
|
maxClusterTip: '最大聚類數。',
|
||||||
},
|
},
|
||||||
chunk: {
|
chunk: {
|
||||||
chunk: '解析塊',
|
chunk: '解析塊',
|
||||||
|
@ -255,6 +255,25 @@ export default {
|
|||||||
</p><p>
|
</p><p>
|
||||||
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
||||||
</p>`,
|
</p>`,
|
||||||
|
useRaptor: '使用召回增强RAPTOR策略',
|
||||||
|
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
|
||||||
|
prompt: '提示词',
|
||||||
|
promptMessage: '提示词是必填项',
|
||||||
|
promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
|
||||||
|
{集群内容}
|
||||||
|
以上就是你需要总结的内容。`,
|
||||||
|
maxToken: '最大token数',
|
||||||
|
maxTokenMessage: '最大token数是必填项',
|
||||||
|
threshold: '临界点',
|
||||||
|
thresholdMessage: '临界点是必填项',
|
||||||
|
maxCluster: '最大聚类数',
|
||||||
|
maxClusterMessage: '最大聚类数是必填项',
|
||||||
|
randomSeed: '随机种子',
|
||||||
|
randomSeedMessage: '随机种子是必填项',
|
||||||
|
promptTip: 'LLM提示用于总结。',
|
||||||
|
maxTokenTip: '用于汇总的最大token数。',
|
||||||
|
thresholdTip: '阈值越大,聚类越少。',
|
||||||
|
maxClusterTip: '最大聚类数。',
|
||||||
},
|
},
|
||||||
chunk: {
|
chunk: {
|
||||||
chunk: '解析块',
|
chunk: '解析块',
|
||||||
|
@ -8,6 +8,9 @@ import {
|
|||||||
|
|
||||||
import LayoutRecognize from '@/components/layout-recognize';
|
import LayoutRecognize from '@/components/layout-recognize';
|
||||||
import MaxTokenNumber from '@/components/max-token-number';
|
import MaxTokenNumber from '@/components/max-token-number';
|
||||||
|
import ParseConfiguration, {
|
||||||
|
showRaptorParseConfiguration,
|
||||||
|
} from '@/components/parse-configuration';
|
||||||
import { useTranslate } from '@/hooks/commonHooks';
|
import { useTranslate } from '@/hooks/commonHooks';
|
||||||
import { FormInstance } from 'antd/lib';
|
import { FormInstance } from 'antd/lib';
|
||||||
import styles from './index.less';
|
import styles from './index.less';
|
||||||
@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
|
|||||||
{({ getFieldValue }) => {
|
{({ getFieldValue }) => {
|
||||||
const parserId = getFieldValue('parser_id');
|
const parserId = getFieldValue('parser_id');
|
||||||
|
|
||||||
if (parserId === 'naive') {
|
return (
|
||||||
return (
|
<>
|
||||||
<>
|
{parserId === 'naive' && (
|
||||||
<MaxTokenNumber></MaxTokenNumber>
|
<>
|
||||||
<LayoutRecognize></LayoutRecognize>
|
<MaxTokenNumber></MaxTokenNumber>
|
||||||
</>
|
<LayoutRecognize></LayoutRecognize>
|
||||||
);
|
</>
|
||||||
}
|
)}
|
||||||
return null;
|
{showRaptorParseConfiguration(parserId) && (
|
||||||
|
<ParseConfiguration></ParseConfiguration>
|
||||||
|
)}
|
||||||
|
</>
|
||||||
|
);
|
||||||
}}
|
}}
|
||||||
</Form.Item>
|
</Form.Item>
|
||||||
|
|
||||||
|
@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
|||||||
'embd_id',
|
'embd_id',
|
||||||
'parser_id',
|
'parser_id',
|
||||||
'language',
|
'language',
|
||||||
'parser_config.chunk_token_num',
|
'parser_config',
|
||||||
]),
|
]),
|
||||||
avatar: fileList,
|
avatar: fileList,
|
||||||
});
|
});
|
||||||
|
Loading…
x
Reference in New Issue
Block a user