From 86b546f657d8b5774284f814f4a507f5c8abf5eb Mon Sep 17 00:00:00 2001
From: writinwaters <93570324+writinwaters@users.noreply.github.com>
Date: Wed, 30 Oct 2024 15:33:36 +0800
Subject: [PATCH] Updated parser_config description (#3104)

### What problem does this PR solve?



### Type of change


- [x] Documentation Update
---
 api/http_api_reference.md   | 47 ++++++++++++++--------
 api/python_api_reference.md | 77 ++++++++++++++++++++++++++++++-------
 2 files changed, 94 insertions(+), 30 deletions(-)
diff --git a/api/http_api_reference.md b/api/http_api_reference.md
index 156b8c230..8fe0a5215 100644
--- a/api/http_api_reference.md
+++ b/api/http_api_reference.md
@@ -78,7 +78,7 @@ curl --request POST \
 - `"chunk_method"`: (*Body parameter*), `enum<string>`  
   The chunking method of the dataset to create. Available options:  
   - `"naive"`: General (default)
-  - `"manual`: Manual
+  - `"manual"`: Manual
   - `"qa"`: Q&A
   - `"table"`: Table
   - `"paper"`: Paper
@@ -88,16 +88,23 @@ curl --request POST \
   - `"picture"`: Picture
   - `"one"`: One
   - `"knowledge_graph"`: Knowledge Graph
-  - `"email"`: Email
 
 - `"parser_config"`: (*Body parameter*), `object`  
-  The configuration settings for the dataset parser, a JSON object containing the following attributes:
-  - `"chunk_token_count"`: Defaults to `128`.
-  - `"layout_recognize"`: Defaults to `true`.
-  - `"html4excel"`: Indicates whether to convert Excel documents into HTML format. Defaults to `false`.
-  - `"delimiter"`: Defaults to `"\n!?。；！？"`.
-  - `"task_page_size"`: Defaults to `12`. For PDF only.
-  - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`.
+  The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`:  
+  - If `"chunk_method"` is `"naive"`, the `"parser_config"` object contains the following attributes:
+    - `"chunk_token_count"`: Defaults to `128`.
+    - `"layout_recognize"`: Defaults to `true`.
+    - `"html4excel"`: Indicates whether to convert Excel documents into HTML format. Defaults to `false`.
+    - `"delimiter"`: Defaults to `"\n!?。；！？"`.
+    - `"task_page_size"`: Defaults to `12`. For PDF only.
+    - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`.
+  - If `"chunk_method"` is `"qa"`, `"manual"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute:  
+    - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`.
+  - If `"chunk_method"` is `"table"` or `"one"`, `"parser_config"` is an empty JSON object.
+  - If `"chunk_method"` is `"knowledge_graph"`, the `"parser_config"` object contains the following attributes:  
+    - `"chunk_token_count"`: Defaults to `128`.
+    - `"delimiter"`: Defaults to `"\n!?。；！？"`.
+    - `"entity_types"`: Defaults to `["organization","person","location","event","time"]`.
 
 ### Response
 
@@ -256,7 +263,6 @@ curl --request PUT \
   - `"picture"`: Picture
   - `"one"`:One
   - `"knowledge_graph"`: Knowledge Graph
-  - `"email"`: Email
 
 ### Response
 
@@ -511,13 +517,22 @@ curl --request PUT \
   - `"picture"`: Picture
   - `"one"`: One
   - `"knowledge_graph"`: Knowledge Graph
-  - `"email"`: Email
 - `"parser_config"`: (*Body parameter*), `object`  
-  The parsing configuration for the document:  
-  - `"chunk_token_count"`: Defaults to `128`.
-  - `"layout_recognize"`: Defaults to `true`.
-  - `"delimiter"`: Defaults to `"\n!?。；！？"`.
-  - `"task_page_size"`: Defaults to `12`. For PDF only.
+  The configuration settings for the dataset parser. The attributes in this JSON object vary with the selected `"chunk_method"`:  
+  - If `"chunk_method"` is `"naive"`, the `"parser_config"` object contains the following attributes:
+    - `"chunk_token_count"`: Defaults to `128`.
+    - `"layout_recognize"`: Defaults to `true`.
+    - `"html4excel"`: Indicates whether to convert Excel documents into HTML format. Defaults to `false`.
+    - `"delimiter"`: Defaults to `"\n!?。；！？"`.
+    - `"task_page_size"`: Defaults to `12`. For PDF only.
+    - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`.
+  - If `"chunk_method"` is `"qa"`, `"manual"`, `"paper"`, `"book"`, `"laws"`, or `"presentation"`, the `"parser_config"` object contains the following attribute:
+    - `"raptor"`: Raptor-specific settings. Defaults to: `{"use_raptor": false}`.
+  - If `"chunk_method"` is `"table"` or `"one"`, `"parser_config"` is an empty JSON object.
+  - If `"chunk_method"` is `"knowledge_graph"`, the `"parser_config"` object contains the following attributes:
+    - `"chunk_token_count"`: Defaults to `128`.
+    - `"delimiter"`: Defaults to `"\n!?。；！？"`.
+    - `"entity_types"`: Defaults to `["organization","person","location","event","time"]`.
 
 ### Response
 
diff --git a/api/python_api_reference.md b/api/python_api_reference.md
index bc6f4d306..be73a277f 100644
--- a/api/python_api_reference.md
+++ b/api/python_api_reference.md
@@ -75,16 +75,31 @@ The chunking method of the dataset to create. Available options:
 - `"picture"`: Picture
 - `"one"`: One
 - `"knowledge_graph"`: Knowledge Graph
-- `"email"`: Email
 
 #### parser_config
 
-The parser configuration of the dataset. A `ParserConfig` object contains the following attributes:
+The parser configuration of the dataset. A `ParserConfig` object's attributes vary based on the selected `"chunk_method"`:
 
-- `chunk_token_count`: Defaults to `128`.
-- `layout_recognize`: Defaults to `True`.
-- `delimiter`: Defaults to `"\n!?。；！？"`.
-- `task_page_size`: Defaults to `12`.
+- `"chunk_method"`=`"naive"`:  
+  `{"chunk_token_num":128,"delimiter":"\\n!?;。；！？","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`.
+- `chunk_method`=`"qa"`:  
+  `{"raptor": {"use_raptor": False}}`
+- `chunk_method`=`"manual"`:  
+  `{"raptor": {"use_raptor": False}}`
+- `chunk_method`=`"table"`:  
+  `None`
+- `chunk_method`=`"paper"`:  
+  `{"raptor": {"use_raptor": False}}`
+- `chunk_method`=`"book"`:  
+  `{"raptor": {"use_raptor": False}}`
+- `chunk_method`=`"laws"`:  
+  `{"raptor": {"use_raptor": False}}`
+- `chunk_method`=`"presentation"`:  
+  `{"raptor": {"use_raptor": False}}`
+- `chunk_method`=`"one"`:  
+  `None`
+- `chunk_method`=`"knowledge_graph"`:  
+  `{"chunk_token_num":128,"delimiter":"\\n!?;。；！？","entity_types":["organization","person","location","event","time"]}`
 
 ### Returns
 
@@ -225,7 +240,6 @@ A dictionary representing the attributes to update, with the following keys:
   - `"picture"`: Picture
   - `"one"`: One
   - `"knowledge_graph"`: Knowledge Graph
-  - `"email"`: Email
 
 ### Returns
 
@@ -296,11 +310,6 @@ Updates configurations for the current document.
 A dictionary representing the attributes to update, with the following keys:
 
 - `"display_name"`: `str` The name of the document to update.
-- `"parser_config"`: `dict[str, Any]` The parsing configuration for the document:
-  - `"chunk_token_count"`: Defaults to `128`.
-  - `"layout_recognize"`: Defaults to `True`.
-  - `"delimiter"`: Defaults to `'\n!?。；！？'`.
-  - `"task_page_size"`: Defaults to `12`.
 - `"chunk_method"`: `str` The parsing method to apply to the document.
   - `"naive"`: General
   - `"manual`: Manual
@@ -313,7 +322,27 @@ A dictionary representing the attributes to update, with the following keys:
   - `"picture"`: Picture
   - `"one"`: One
   - `"knowledge_graph"`: Knowledge Graph
-  - `"email"`: Email
+- `"parser_config"`: `dict[str, Any]` The parsing configuration for the document. Its attributes vary based on the selected `"chunk_method"`:
+  - `"chunk_method"`=`"naive"`:  
+    `{"chunk_token_num":128,"delimiter":"\\n!?;。；！？","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`.
+  - `chunk_method`=`"qa"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"manual"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"table"`:  
+    `None`
+  - `chunk_method`=`"paper"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"book"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"laws"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"presentation"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"one"`:  
+    `None`
+  - `chunk_method`=`"knowledge_graph"`:  
+    `{"chunk_token_num":128,"delimiter":"\\n!?;。；！？","entity_types":["organization","person","location","event","time"]}`
 
 ### Returns
 
@@ -412,7 +441,6 @@ A `Document` object contains the following attributes:
 - `thumbnail`: The thumbnail image of the document. Defaults to `None`.
 - `dataset_id`: The dataset ID associated with the document. Defaults to `None`.
 - `chunk_method` The chunk method name. Defaults to `"naive"`.
-- `parser_config`: `ParserConfig` Configuration object for the parser. Defaults to `{"pages": [[1, 1000000]]}`.
 - `source_type`: The source type of the document. Defaults to `"local"`.
 - `type`: Type or category of the document. Defaults to `""`. Reserved for future use.
 - `created_by`: `str` The creator of the document. Defaults to `""`.
@@ -430,6 +458,27 @@ A `Document` object contains the following attributes:
   - `"DONE"`
   - `"FAIL"`
 - `status`: `str` Reserved for future use.
+- `parser_config`: `ParserConfig` Configuration object for the parser. Its attributes vary based on the selected `chunk_method`:
+  - `chunk_method`=`"naive"`:  
+    `{"chunk_token_num":128,"delimiter":"\\n!?;。；！？","html4excel":False,"layout_recognize":True,"raptor":{"use_raptor":False}}`.
+  - `chunk_method`=`"qa"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"manual"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"table"`:  
+    `None`
+  - `chunk_method`=`"paper"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"book"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"laws"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"presentation"`:  
+    `{"raptor": {"use_raptor": False}}`
+  - `chunk_method`=`"one"`:  
+    `None`
+  - `chunk_method`=`"knowledge_graph"`:  
+    `{"chunk_token_num":128,"delimiter": "\\n!?;。；！？","entity_types":["organization","person","location","event","time"]}`
 
 ### Examples