diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index e247dc956..e17667e4f 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -353,7 +353,7 @@ def get_parser_config(chunk_method, parser_config): if not chunk_method: chunk_method = "naive" key_mapping = { - "naive": {"chunk_token_num": 128, "delimiter": "\\n!?;。;!?", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, + "naive": {"chunk_token_num": 128, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, "qa": {"raptor": {"use_raptor": False}}, "tag": None, "resume": None, @@ -364,7 +364,7 @@ def get_parser_config(chunk_method, parser_config): "laws": {"raptor": {"use_raptor": False}}, "presentation": {"raptor": {"use_raptor": False}}, "one": None, - "knowledge_graph": {"chunk_token_num": 8192, "delimiter": "\\n!?;。;!?", "entity_types": ["organization", "person", "location", "event", "time"]}, + "knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]}, "email": None, "picture": None, } diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index c95381a5d..dea0ddb87 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -96,7 +96,7 @@ class ParserConfig(Base): auto_keywords: int = Field(default=0, ge=0, le=32) auto_questions: int = Field(default=0, ge=0, le=10) chunk_token_num: int = Field(default=128, ge=1, le=2048) - delimiter: str = Field(default=r"\n!?;。;!?", min_length=1) + delimiter: str = Field(default=r"\n", min_length=1) graphrag: Optional[GraphragConfig] = None html4excel: bool = False layout_recognize: str = "DeepDOC" diff --git a/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py b/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py index bb2c1c60a..42db8ca99 100644 --- a/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py +++ b/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py @@ -400,7 +400,7 @@ class TestDatasetCreation: if parser_config is None: assert res["data"]["parser_config"] == { "chunk_token_num": 128, - "delimiter": r"\n!?;。;!?", + "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, @@ -410,7 +410,7 @@ class TestDatasetCreation: "auto_keywords": 0, "auto_questions": 0, "chunk_token_num": 128, - "delimiter": r"\n!?;。;!?", + "delimiter": r"\n", "filename_embd_weight": None, "graphrag": None, "html4excel": False, diff --git a/sdk/python/test/test_http_api/test_file_management_within_dataset/test_update_document.py b/sdk/python/test/test_http_api/test_file_management_within_dataset/test_update_document.py index 56814bc80..122661b9e 100644 --- a/sdk/python/test/test_http_api/test_file_management_within_dataset/test_update_document.py +++ b/sdk/python/test/test_http_api/test_file_management_within_dataset/test_update_document.py @@ -303,7 +303,7 @@ class TestUpdateDocumentParserConfig: "chunk_token_num": 128, "layout_recognize": "DeepDOC", "html4excel": False, - "delimiter": "\\n!?;。;!?", + "delimiter": r"\n", "task_page_size": 12, "raptor": {"use_raptor": False}, }, @@ -530,7 +530,7 @@ class TestUpdateDocumentParserConfig: else: assert res["data"]["docs"][0]["parser_config"] == { "chunk_token_num": 128, - "delimiter": "\\n!?;。;!?", + "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False},