mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-13 20:25:55 +08:00
### What problem does this PR solve? Implements RAPTOR for better chunking #882 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
43412571f7
commit
ed184ed87e
@ -23,6 +23,9 @@ import { useFetchParserListOnMount } from './hooks';
|
||||
|
||||
import { useTranslate } from '@/hooks/commonHooks';
|
||||
import LayoutRecognize from '../layout-recognize';
|
||||
import ParseConfiguration, {
|
||||
showRaptorParseConfiguration,
|
||||
} from '../parse-configuration';
|
||||
import styles from './index.less';
|
||||
|
||||
interface IProps extends Omit<IModalManagerChildrenProps, 'showModal'> {
|
||||
@ -111,6 +114,7 @@ const ChunkMethodModal: React.FC<IProps> = ({
|
||||
onCancel={hideModal}
|
||||
afterClose={afterClose}
|
||||
confirmLoading={loading}
|
||||
width={700}
|
||||
>
|
||||
<Space size={[0, 8]} wrap>
|
||||
<Form.Item label={t('chunkMethod')} className={styles.chunkMethod}>
|
||||
@ -255,6 +259,9 @@ const ChunkMethodModal: React.FC<IProps> = ({
|
||||
</Form.Item>
|
||||
)}
|
||||
{showMaxTokenNumber && <MaxTokenNumber></MaxTokenNumber>}
|
||||
{showRaptorParseConfiguration(selectedTag) && (
|
||||
<ParseConfiguration></ParseConfiguration>
|
||||
)}
|
||||
</Form>
|
||||
</Modal>
|
||||
);
|
||||
|
206
web/src/components/parse-configuration/index.tsx
Normal file
206
web/src/components/parse-configuration/index.tsx
Normal file
@ -0,0 +1,206 @@
|
||||
import { useTranslate } from '@/hooks/commonHooks';
|
||||
import { PlusOutlined } from '@ant-design/icons';
|
||||
import {
|
||||
Button,
|
||||
Divider,
|
||||
Flex,
|
||||
Form,
|
||||
Input,
|
||||
InputNumber,
|
||||
Slider,
|
||||
Switch,
|
||||
} from 'antd';
|
||||
import random from 'lodash/random';
|
||||
|
||||
/** Parser ids for which the RAPTOR configuration section is not displayed. */
export const excludedParseMethods = ['table', 'resume', 'one'];

/**
 * Tells whether the RAPTOR configuration section should be rendered for the
 * given parser.
 *
 * @param parserId - Identifier of the selected parsing (chunking) method.
 * @returns `false` when `parserId` is listed in `excludedParseMethods`,
 *   `true` for every other id.
 */
export const showRaptorParseConfiguration = (parserId: string): boolean => {
  return !excludedParseMethods.includes(parserId);
};
|
||||
|
||||
/**
 * RAPTOR settings section rendered inside an antd `Form`.
 *
 * All fields are stored under `parser_config.raptor`. The detailed options
 * (prompt, max token, threshold, max cluster, random seed) are rendered only
 * while the `use_raptor` switch is on.
 *
 * The three types "table", "resume" and "one" do not display this
 * configuration (see `excludedParseMethods`).
 */
const ParseConfiguration = () => {
  const form = Form.useFormInstance();
  const { t } = useTranslate('knowledgeConfiguration');

  // Overwrite the seed field with a random number between 0 and 10000.
  const handleGenerate = () => {
    form.setFieldValue(
      ['parser_config', 'raptor', 'random_seed'],
      random(10000),
    );
  };

  return (
    <>
      <Divider></Divider>
      <Form.Item
        name={['parser_config', 'raptor', 'use_raptor']}
        label={t('useRaptor')}
        initialValue={false}
        valuePropName="checked"
        tooltip={t('useRaptorTip')}
      >
        <Switch />
      </Form.Item>
      {/*
        Re-render only when the switch value changes. Optional chaining keeps
        the comparison from throwing when `parser_config` or `raptor` is not
        present in the form store.
      */}
      <Form.Item
        shouldUpdate={(prevValues, curValues) =>
          prevValues.parser_config?.raptor?.use_raptor !==
          curValues.parser_config?.raptor?.use_raptor
        }
      >
        {({ getFieldValue }) => {
          const useRaptor = getFieldValue([
            'parser_config',
            'raptor',
            'use_raptor',
          ]);

          return (
            useRaptor && (
              <>
                <Form.Item
                  name={['parser_config', 'raptor', 'prompt']}
                  label={t('prompt')}
                  initialValue={t('promptText')}
                  tooltip={t('promptTip')}
                  rules={[
                    {
                      required: true,
                      message: t('promptMessage'),
                    },
                  ]}
                >
                  <Input.TextArea rows={8} />
                </Form.Item>
                {/* Slider and InputNumber share one field name, so they edit the same value. */}
                <Form.Item label={t('maxToken')} tooltip={t('maxTokenTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_token']}
                        noStyle
                        initialValue={128}
                        rules={[
                          {
                            required: true,
                            message: t('maxTokenMessage'),
                          },
                        ]}
                      >
                        <Slider max={2048} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_token']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxTokenMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={2048} min={0} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('threshold')} tooltip={t('thresholdTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'threshold']}
                        noStyle
                        initialValue={0.1}
                        rules={[
                          {
                            required: true,
                            message: t('thresholdMessage'),
                          },
                        ]}
                      >
                        <Slider
                          min={0}
                          max={1}
                          style={{ width: '100%' }}
                          step={0.01}
                        />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'threshold']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('thresholdMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1} min={0} step={0.01} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('maxCluster')} tooltip={t('maxClusterTip')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'max_cluster']}
                        noStyle
                        initialValue={64}
                        rules={[
                          {
                            required: true,
                            message: t('maxClusterMessage'),
                          },
                        ]}
                      >
                        <Slider min={1} max={1024} style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    <Form.Item
                      name={['parser_config', 'raptor', 'max_cluster']}
                      noStyle
                      rules={[
                        {
                          required: true,
                          message: t('maxClusterMessage'),
                        },
                      ]}
                    >
                      <InputNumber max={1024} min={1} />
                    </Form.Item>
                  </Flex>
                </Form.Item>
                <Form.Item label={t('randomSeed')}>
                  <Flex gap={20} align="center">
                    <Flex flex={1}>
                      <Form.Item
                        name={['parser_config', 'raptor', 'random_seed']}
                        noStyle
                        initialValue={0}
                        rules={[
                          {
                            required: true,
                            message: t('randomSeedMessage'),
                          },
                        ]}
                      >
                        <InputNumber style={{ width: '100%' }} />
                      </Form.Item>
                    </Flex>
                    {/* Button that generates a new random seed. */}
                    <Form.Item noStyle>
                      <Button type="primary" onClick={handleGenerate}>
                        <PlusOutlined />
                      </Button>
                    </Form.Item>
                  </Flex>
                </Form.Item>
              </>
            )
          );
        }}
      </Form.Item>
    </>
  );
};
|
||||
|
||||
export default ParseConfiguration;
|
@ -265,6 +265,26 @@ export default {
|
||||
</p><p>
|
||||
If you want to summarize something that needs all the context of an article and the selected LLM's context length covers the document length, you can try this method.
|
||||
</p>`,
|
||||
useRaptor: 'Use RAPTOR to enhance retrieval',
|
||||
useRaptorTip:
|
||||
'Recursive Abstractive Processing for Tree-Organized Retrieval, please refer to https://huggingface.co/papers/2401.18059',
|
||||
prompt: 'Prompt',
|
||||
promptTip: 'LLM prompt used for summarization.',
|
||||
promptMessage: 'Prompt is required',
|
||||
promptText: `Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:
|
||||
{cluster_content}
|
||||
The above is the content you need to summarize.`,
|
||||
maxToken: 'Max token',
|
||||
maxTokenTip: 'Maximum token number for summarization.',
|
||||
maxTokenMessage: 'Max token is required',
|
||||
threshold: 'Threshold',
|
||||
thresholdTip: 'The bigger the threshold is the less cluster will be.',
|
||||
thresholdMessage: 'Threshold is required',
|
||||
maxCluster: 'Max cluster',
|
||||
maxClusterTip: 'Maximum cluster number.',
|
||||
maxClusterMessage: 'Max cluster is required',
|
||||
randomSeed: 'Random seed',
|
||||
randomSeedMessage: 'Random seed is required',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'Chunk',
|
||||
|
@ -238,6 +238,25 @@ export default {
|
||||
</p><p>
|
||||
如果你要總結的東西需要一篇文章的全部上下文,並且所選LLM的上下文長度覆蓋了文檔長度,你可以嘗試這種方法。
|
||||
</p>`,
|
||||
useRaptor: '使用RAPTOR文件增強策略',
|
||||
useRaptorTip: '請參考 https://huggingface.co/papers/2401.18059',
|
||||
prompt: '提示詞',
|
||||
promptMessage: '提示詞是必填項',
|
||||
// NOTE(review): `{cluster_content}` is kept untranslated so it matches the
// English default prompt; presumably it is a template variable filled in
// with the clustered text — confirm against the consumer of this prompt.
promptText: `請總結以下段落。 小心數字,不要編造。 段落如下:
{cluster_content}
以上就是你需要總結的內容。`,
|
||||
maxToken: '最大token數',
|
||||
maxTokenMessage: '最大token數是必填項',
|
||||
threshold: '臨界點',
|
||||
thresholdMessage: '臨界點是必填項',
|
||||
maxCluster: '最大聚類數',
|
||||
maxClusterMessage: '最大聚類數是必填項',
|
||||
randomSeed: '隨機種子',
|
||||
randomSeedMessage: '隨機種子是必填項',
|
||||
promptTip: 'LLM提示用於總結。',
|
||||
maxTokenTip: '用於匯總的最大token數。',
|
||||
thresholdTip: '閾值越大,聚類越少。',
|
||||
maxClusterTip: '最大聚類數。',
|
||||
},
|
||||
chunk: {
|
||||
chunk: '解析塊',
|
||||
|
@ -255,6 +255,25 @@ export default {
|
||||
</p><p>
|
||||
如果你要总结的东西需要一篇文章的全部上下文,并且所选LLM的上下文长度覆盖了文档长度,你可以尝试这种方法。
|
||||
</p>`,
|
||||
useRaptor: '使用召回增强RAPTOR策略',
|
||||
useRaptorTip: '请参考 https://huggingface.co/papers/2401.18059',
|
||||
prompt: '提示词',
|
||||
promptMessage: '提示词是必填项',
|
||||
// NOTE(review): `{cluster_content}` is kept untranslated so it matches the
// English default prompt; presumably it is a template variable filled in
// with the clustered text — confirm against the consumer of this prompt.
promptText: `请总结以下段落。 小心数字,不要编造。 段落如下:
{cluster_content}
以上就是你需要总结的内容。`,
|
||||
maxToken: '最大token数',
|
||||
maxTokenMessage: '最大token数是必填项',
|
||||
threshold: '临界点',
|
||||
thresholdMessage: '临界点是必填项',
|
||||
maxCluster: '最大聚类数',
|
||||
maxClusterMessage: '最大聚类数是必填项',
|
||||
randomSeed: '随机种子',
|
||||
randomSeedMessage: '随机种子是必填项',
|
||||
promptTip: 'LLM提示用于总结。',
|
||||
maxTokenTip: '用于汇总的最大token数。',
|
||||
thresholdTip: '阈值越大,聚类越少。',
|
||||
maxClusterTip: '最大聚类数。',
|
||||
},
|
||||
chunk: {
|
||||
chunk: '解析块',
|
||||
|
@ -8,6 +8,9 @@ import {
|
||||
|
||||
import LayoutRecognize from '@/components/layout-recognize';
|
||||
import MaxTokenNumber from '@/components/max-token-number';
|
||||
import ParseConfiguration, {
|
||||
showRaptorParseConfiguration,
|
||||
} from '@/components/parse-configuration';
|
||||
import { useTranslate } from '@/hooks/commonHooks';
|
||||
import { FormInstance } from 'antd/lib';
|
||||
import styles from './index.less';
|
||||
@ -99,15 +102,19 @@ const ConfigurationForm = ({ form }: { form: FormInstance }) => {
|
||||
{({ getFieldValue }) => {
|
||||
const parserId = getFieldValue('parser_id');
|
||||
|
||||
if (parserId === 'naive') {
|
||||
return (
|
||||
<>
|
||||
<MaxTokenNumber></MaxTokenNumber>
|
||||
<LayoutRecognize></LayoutRecognize>
|
||||
</>
|
||||
);
|
||||
}
|
||||
return null;
|
||||
return (
|
||||
<>
|
||||
{parserId === 'naive' && (
|
||||
<>
|
||||
<MaxTokenNumber></MaxTokenNumber>
|
||||
<LayoutRecognize></LayoutRecognize>
|
||||
</>
|
||||
)}
|
||||
{showRaptorParseConfiguration(parserId) && (
|
||||
<ParseConfiguration></ParseConfiguration>
|
||||
)}
|
||||
</>
|
||||
);
|
||||
}}
|
||||
</Form.Item>
|
||||
|
||||
|
@ -62,7 +62,7 @@ export const useFetchKnowledgeConfigurationOnMount = (form: FormInstance) => {
|
||||
'embd_id',
|
||||
'parser_id',
|
||||
'language',
|
||||
'parser_config.chunk_token_num',
|
||||
'parser_config',
|
||||
]),
|
||||
avatar: fileList,
|
||||
});
|
||||
|
Loading…
x
Reference in New Issue
Block a user