From 86b546f657d8b5774284f814f4a507f5c8abf5eb Mon Sep 17 00:00:00 2001 From: writinwaters <93570324+writinwaters@users.noreply.github.com> Date: Wed, 30 Oct 2024 15:33:36 +0800 Subject: [PATCH] Updated parser_config description (#3104) ### What problem does this PR solve? ### Type of change - [x] Documentation Update --- api/http_api_reference.md | 47 ++++++++++++++-------- api/python_api_reference.md | 77 ++++++++++++++++++++++++++++++------- 2 files changed, 94 insertions(+), 30 deletions(-) diff --git a/api/http_api_reference.md b/api/http_api_reference.md index 156b8c230..8fe0a5215 100644 --- a/api/http_api_reference.md +++ b/api/http_api_reference.md @@ -78,7 +78,7 @@ curl --request POST \ - `"chunk_method"`: (*Body parameter*), `enum` The chunking method of the dataset to create. Available options: - `"naive"`: General (default) - - `"manual`: Manual + - `"manual"`: Manual - `"qa"`: Q&A - `"table"`: Table - `"paper"`: Paper @@ -88,16 +88,23 @@ curl --request POST \ - `"picture"`: Picture - `"one"`: One - `"knowledge_graph"`: Knowledge Graph - - `"email"`: Email - `"parser_config"`: (*Body parameter*), `object` - The configuration settings for the dataset parser, a JSON object containing the following attributes: - - `"chunk_token_count"`: Defaults to `128`. - - `"layout_recognize"`: Defaults to `true`. - - `"html4excel"`: Indicates whether to convert Excel documents into HTML format. Defaults to `false`. - - `"delimiter"`: Defaults to `"\n!?。;!?"`. - - `"task_page_size"`: Defaults to `12`. For PDF only. - - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`. + The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: + - If `"chunk_method"` is `"naive"`, the `"parser_config"` object contains the following attributes: + - `"chunk_token_count"`: Defaults to `128`. + - `"layout_recognize"`: Defaults to `true`. 
+ - `"html4excel"`: Indicates whether to convert Excel documents into HTML format. Defaults to `false`. + - `"delimiter"`: Defaults to `"\n!?。;!?"`. + - `"task_page_size"`: Defaults to `12`. For PDF only. + - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`. + - If `"chunk_method"` is `"qa"`, `"manual"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: + - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`. + - If `"chunk_method"` is `"table"` or `"one"`, `"parser_config"` is an empty JSON object. + - If `"chunk_method"` is `"knowledge_graph"`, the `"parser_config"` object contains the following attributes: + - `"chunk_token_count"`: Defaults to `128`. + - `"delimiter"`: Defaults to `"\n!?。;!?"`. + - `"entity_types"`: Defaults to `["organization","person","location","event","time"]` ### Response @@ -256,7 +263,6 @@ curl --request PUT \ - `"picture"`: Picture - `"one"`:One - `"knowledge_graph"`: Knowledge Graph - - `"email"`: Email ### Response @@ -511,13 +517,22 @@ curl --request PUT \ - `"picture"`: Picture - `"one"`: One - `"knowledge_graph"`: Knowledge Graph - - `"email"`: Email - `"parser_config"`: (*Body parameter*), `object` - The parsing configuration for the document: - - `"chunk_token_count"`: Defaults to `128`. - - `"layout_recognize"`: Defaults to `true`. - - `"delimiter"`: Defaults to `"\n!?。;!?"`. - - `"task_page_size"`: Defaults to `12`. For PDF only. + The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`: + - If `"chunk_method"` is `"naive"`, the `"parser_config"` object contains the following attributes: + - `"chunk_token_count"`: Defaults to `128`. + - `"layout_recognize"`: Defaults to `true`. + - `"html4excel"`: Indicates whether to convert Excel documents into HTML format. Defaults to `false`. + - `"delimiter"`: Defaults to `"\n!?。;!?"`. 
+ - `"task_page_size"`: Defaults to `12`. For PDF only. + - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`. + - If `"chunk_method"` is `"qa"`, `"manual"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute: + - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`. + - If `"chunk_method"` is `"table"` or `"one"`, `"parser_config"` is an empty JSON object. + - If `"chunk_method"` is `"knowledge_graph"`, the `"parser_config"` object contains the following attributes: + - `"chunk_token_count"`: Defaults to `128`. + - `"delimiter"`: Defaults to `"\n!?。;!?"`. + - `"entity_types"`: Defaults to `["organization","person","location","event","time"]` ### Response diff --git a/api/python_api_reference.md b/api/python_api_reference.md index bc6f4d306..be73a277f 100644 --- a/api/python_api_reference.md +++ b/api/python_api_reference.md @@ -75,16 +75,31 @@ The chunking method of the dataset to create. Available options: - `"picture"`: Picture - `"one"`: One - `"knowledge_graph"`: Knowledge Graph -- `"email"`: Email #### parser_config -The parser configuration of the dataset. A `ParserConfig` object contains the following attributes: +The parser configuration of the dataset. A `ParserConfig` object's attributes vary based on the selected `"chunk_method"`: -- `chunk_token_count`: Defaults to `128`. -- `layout_recognize`: Defaults to `True`. -- `delimiter`: Defaults to `"\n!?。;!?"`. -- `task_page_size`: Defaults to `12`. +- `"chunk_method"`=`"naive"`: + `{"chunk_token_num":128,"delimiter":"\\n!?;。;!?","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`. 
+- `chunk_method`=`"qa"`: + `{"raptor": {"use_raptor": False}}` +- `chunk_method`=`"manual"`: + `{"raptor": {"use_raptor": False}}` +- `chunk_method`=`"table"`: + `None` +- `chunk_method`=`"paper"`: + `{"raptor": {"use_raptor": False}}` +- `chunk_method`=`"book"`: + `{"raptor": {"use_raptor": False}}` +- `chunk_method`=`"laws"`: + `{"raptor": {"use_raptor": False}}` +- `chunk_method`=`"presentation"`: + `{"raptor": {"use_raptor": False}}` +- `chunk_method`=`"one"`: + `None` +- `chunk_method`=`"knowledge_graph"`: + `{"chunk_token_num":128,"delimiter":"\\n!?;。;!?","entity_types":["organization","person","location","event","time"]}` ### Returns @@ -225,7 +240,6 @@ A dictionary representing the attributes to update, with the following keys: - `"picture"`: Picture - `"one"`: One - `"knowledge_graph"`: Knowledge Graph - - `"email"`: Email ### Returns @@ -296,11 +310,6 @@ Updates configurations for the current document. A dictionary representing the attributes to update, with the following keys: - `"display_name"`: `str` The name of the document to update. -- `"parser_config"`: `dict[str, Any]` The parsing configuration for the document: - - `"chunk_token_count"`: Defaults to `128`. - - `"layout_recognize"`: Defaults to `True`. - - `"delimiter"`: Defaults to `'\n!?。;!?'`. - - `"task_page_size"`: Defaults to `12`. - `"chunk_method"`: `str` The parsing method to apply to the document. - `"naive"`: General - `"manual`: Manual @@ -313,7 +322,27 @@ A dictionary representing the attributes to update, with the following keys: - `"picture"`: Picture - `"one"`: One - `"knowledge_graph"`: Knowledge Graph - - `"email"`: Email +- `"parser_config"`: `dict[str, Any]` The parsing configuration for the document. Its attributes vary based on the selected `"chunk_method"`: + - `"chunk_method"`=`"naive"`: + `{"chunk_token_num":128,"delimiter":"\\n!?;。;!?","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`. 
+ - `chunk_method`=`"qa"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"manual"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"table"`: + `None` + - `chunk_method`=`"paper"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"book"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"laws"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"presentation"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"one"`: + `None` + - `chunk_method`=`"knowledge_graph"`: + `{"chunk_token_num":128,"delimiter":"\\n!?;。;!?","entity_types":["organization","person","location","event","time"]}` ### Returns @@ -412,7 +441,6 @@ A `Document` object contains the following attributes: - `thumbnail`: The thumbnail image of the document. Defaults to `None`. - `dataset_id`: The dataset ID associated with the document. Defaults to `None`. - `chunk_method` The chunk method name. Defaults to `"naive"`. -- `parser_config`: `ParserConfig` Configuration object for the parser. Defaults to `{"pages": [[1, 1000000]]}`. - `source_type`: The source type of the document. Defaults to `"local"`. - `type`: Type or category of the document. Defaults to `""`. Reserved for future use. - `created_by`: `str` The creator of the document. Defaults to `""`. @@ -430,6 +458,27 @@ A `Document` object contains the following attributes: - `"DONE"` - `"FAIL"` - `status`: `str` Reserved for future use. +- `parser_config`: `ParserConfig` Configuration object for the parser. Its attributes vary based on the selected `chunk_method`: + - `chunk_method`=`"naive"`: + `{"chunk_token_num":128,"delimiter":"\\n!?;。;!?","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`. 
+ - `chunk_method`=`"qa"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"manual"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"table"`: + `None` + - `chunk_method`=`"paper"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"book"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"laws"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"presentation"`: + `{"raptor": {"use_raptor": False}}` + - `chunk_method`=`"one"`: + `None` + - `chunk_method`=`"knowledge_graph"`: + `{"chunk_token_num":128,"delimiter": "\\n!?;。;!?","entity_types":["organization","person","location","event","time"]}` ### Examples