mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-10 21:09:03 +08:00
Fix: change create dataset HTTP API delimiter default value to r'\n' (#7434)
### What problem does this PR solve?
Changes the default value of the create-dataset delimiter to r'\n'.
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
fea9d970ec
commit
fc379e90d1
@ -353,7 +353,7 @@ def get_parser_config(chunk_method, parser_config):
|
|||||||
if not chunk_method:
|
if not chunk_method:
|
||||||
chunk_method = "naive"
|
chunk_method = "naive"
|
||||||
key_mapping = {
|
key_mapping = {
|
||||||
"naive": {"chunk_token_num": 128, "delimiter": "\\n!?;。;!?", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
|
"naive": {"chunk_token_num": 128, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
|
||||||
"qa": {"raptor": {"use_raptor": False}},
|
"qa": {"raptor": {"use_raptor": False}},
|
||||||
"tag": None,
|
"tag": None,
|
||||||
"resume": None,
|
"resume": None,
|
||||||
@ -364,7 +364,7 @@ def get_parser_config(chunk_method, parser_config):
|
|||||||
"laws": {"raptor": {"use_raptor": False}},
|
"laws": {"raptor": {"use_raptor": False}},
|
||||||
"presentation": {"raptor": {"use_raptor": False}},
|
"presentation": {"raptor": {"use_raptor": False}},
|
||||||
"one": None,
|
"one": None,
|
||||||
"knowledge_graph": {"chunk_token_num": 8192, "delimiter": "\\n!?;。;!?", "entity_types": ["organization", "person", "location", "event", "time"]},
|
"knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]},
|
||||||
"email": None,
|
"email": None,
|
||||||
"picture": None,
|
"picture": None,
|
||||||
}
|
}
|
||||||
|
@ -96,7 +96,7 @@ class ParserConfig(Base):
|
|||||||
auto_keywords: int = Field(default=0, ge=0, le=32)
|
auto_keywords: int = Field(default=0, ge=0, le=32)
|
||||||
auto_questions: int = Field(default=0, ge=0, le=10)
|
auto_questions: int = Field(default=0, ge=0, le=10)
|
||||||
chunk_token_num: int = Field(default=128, ge=1, le=2048)
|
chunk_token_num: int = Field(default=128, ge=1, le=2048)
|
||||||
delimiter: str = Field(default=r"\n!?;。;!?", min_length=1)
|
delimiter: str = Field(default=r"\n", min_length=1)
|
||||||
graphrag: Optional[GraphragConfig] = None
|
graphrag: Optional[GraphragConfig] = None
|
||||||
html4excel: bool = False
|
html4excel: bool = False
|
||||||
layout_recognize: str = "DeepDOC"
|
layout_recognize: str = "DeepDOC"
|
||||||
|
@ -400,7 +400,7 @@ class TestDatasetCreation:
|
|||||||
if parser_config is None:
|
if parser_config is None:
|
||||||
assert res["data"]["parser_config"] == {
|
assert res["data"]["parser_config"] == {
|
||||||
"chunk_token_num": 128,
|
"chunk_token_num": 128,
|
||||||
"delimiter": r"\n!?;。;!?",
|
"delimiter": r"\n",
|
||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
@ -410,7 +410,7 @@ class TestDatasetCreation:
|
|||||||
"auto_keywords": 0,
|
"auto_keywords": 0,
|
||||||
"auto_questions": 0,
|
"auto_questions": 0,
|
||||||
"chunk_token_num": 128,
|
"chunk_token_num": 128,
|
||||||
"delimiter": r"\n!?;。;!?",
|
"delimiter": r"\n",
|
||||||
"filename_embd_weight": None,
|
"filename_embd_weight": None,
|
||||||
"graphrag": None,
|
"graphrag": None,
|
||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
|
@ -303,7 +303,7 @@ class TestUpdateDocumentParserConfig:
|
|||||||
"chunk_token_num": 128,
|
"chunk_token_num": 128,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"delimiter": "\\n!?;。;!?",
|
"delimiter": r"\n",
|
||||||
"task_page_size": 12,
|
"task_page_size": 12,
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
},
|
},
|
||||||
@ -530,7 +530,7 @@ class TestUpdateDocumentParserConfig:
|
|||||||
else:
|
else:
|
||||||
assert res["data"]["docs"][0]["parser_config"] == {
|
assert res["data"]["docs"][0]["parser_config"] == {
|
||||||
"chunk_token_num": 128,
|
"chunk_token_num": 128,
|
||||||
"delimiter": "\\n!?;。;!?",
|
"delimiter": r"\n",
|
||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
Loading…
x
Reference in New Issue
Block a user