Fix: change create dataset HTTP API delimiter default value to r'\n' (#7434)

### What problem does this PR solve?

Change the default `delimiter` value used by the create-dataset HTTP API from r'\n!?;。;!?' to r'\n', and update the affected tests to match.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
liu an 2025-04-30 17:43:42 +08:00 committed by GitHub
parent fea9d970ec
commit fc379e90d1
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 7 additions and 7 deletions

View File

@ -353,7 +353,7 @@ def get_parser_config(chunk_method, parser_config):
if not chunk_method: if not chunk_method:
chunk_method = "naive" chunk_method = "naive"
key_mapping = { key_mapping = {
"naive": {"chunk_token_num": 128, "delimiter": "\\n!?;。;!?", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, "naive": {"chunk_token_num": 128, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
"qa": {"raptor": {"use_raptor": False}}, "qa": {"raptor": {"use_raptor": False}},
"tag": None, "tag": None,
"resume": None, "resume": None,
@ -364,7 +364,7 @@ def get_parser_config(chunk_method, parser_config):
"laws": {"raptor": {"use_raptor": False}}, "laws": {"raptor": {"use_raptor": False}},
"presentation": {"raptor": {"use_raptor": False}}, "presentation": {"raptor": {"use_raptor": False}},
"one": None, "one": None,
"knowledge_graph": {"chunk_token_num": 8192, "delimiter": "\\n!?;。;!?", "entity_types": ["organization", "person", "location", "event", "time"]}, "knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]},
"email": None, "email": None,
"picture": None, "picture": None,
} }

View File

@ -96,7 +96,7 @@ class ParserConfig(Base):
auto_keywords: int = Field(default=0, ge=0, le=32) auto_keywords: int = Field(default=0, ge=0, le=32)
auto_questions: int = Field(default=0, ge=0, le=10) auto_questions: int = Field(default=0, ge=0, le=10)
chunk_token_num: int = Field(default=128, ge=1, le=2048) chunk_token_num: int = Field(default=128, ge=1, le=2048)
delimiter: str = Field(default=r"\n!?;。;!?", min_length=1) delimiter: str = Field(default=r"\n", min_length=1)
graphrag: Optional[GraphragConfig] = None graphrag: Optional[GraphragConfig] = None
html4excel: bool = False html4excel: bool = False
layout_recognize: str = "DeepDOC" layout_recognize: str = "DeepDOC"

View File

@ -400,7 +400,7 @@ class TestDatasetCreation:
if parser_config is None: if parser_config is None:
assert res["data"]["parser_config"] == { assert res["data"]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 128,
"delimiter": r"\n!?;。;!?", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False}, "raptor": {"use_raptor": False},
@ -410,7 +410,7 @@ class TestDatasetCreation:
"auto_keywords": 0, "auto_keywords": 0,
"auto_questions": 0, "auto_questions": 0,
"chunk_token_num": 128, "chunk_token_num": 128,
"delimiter": r"\n!?;。;!?", "delimiter": r"\n",
"filename_embd_weight": None, "filename_embd_weight": None,
"graphrag": None, "graphrag": None,
"html4excel": False, "html4excel": False,

View File

@ -303,7 +303,7 @@ class TestUpdateDocumentParserConfig:
"chunk_token_num": 128, "chunk_token_num": 128,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
"html4excel": False, "html4excel": False,
"delimiter": "\\n!?;。;!?", "delimiter": r"\n",
"task_page_size": 12, "task_page_size": 12,
"raptor": {"use_raptor": False}, "raptor": {"use_raptor": False},
}, },
@ -530,7 +530,7 @@ class TestUpdateDocumentParserConfig:
else: else:
assert res["data"]["docs"][0]["parser_config"] == { assert res["data"]["docs"][0]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 128,
"delimiter": "\\n!?;。;!?", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False}, "raptor": {"use_raptor": False},