diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py index 7dec0b9bc..574683aa2 100644 --- a/api/apps/sdk/dataset.py +++ b/api/apps/sdk/dataset.py @@ -344,8 +344,11 @@ def update(tenant_id, dataset_id): if req.get("parser_config"): req["parser_config"] = deep_merge(kb.parser_config, req["parser_config"]) - if (chunk_method := req.get("parser_id")) and chunk_method != kb.parser_id and req.get("parser_config") is None: - req["parser_config"] = get_parser_config(chunk_method, None) + if (chunk_method := req.get("parser_id")) and chunk_method != kb.parser_id: + if not req.get("parser_config"): + req["parser_config"] = get_parser_config(chunk_method, None) + elif "parser_config" in req and not req["parser_config"]: + del req["parser_config"] if "name" in req and req["name"].lower() != kb.name.lower(): try: diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index c752fbcd9..8e6c2e01f 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -218,7 +218,7 @@ class CreateDatasetReq(Base): permission: Annotated[PermissionEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=16), Field(default=PermissionEnum.me)] chunk_method: Annotated[ChunkMethodnEnum, StringConstraints(strip_whitespace=True, min_length=1, max_length=32), Field(default=ChunkMethodnEnum.naive, serialization_alias="parser_id")] pagerank: int = Field(default=0, ge=0, le=100) - parser_config: ParserConfig = Field(default_factory=dict) + parser_config: ParserConfig | None = Field(default=None) @field_validator("avatar") @classmethod @@ -330,15 +330,42 @@ class CreateDatasetReq(Base): """ return v.lower() if isinstance(v, str) else v + @field_validator("parser_config", mode="before") + @classmethod + def normalize_empty_parser_config(cls, v: Any) -> Any: + """ + Normalizes empty parser configuration by converting empty dictionaries to None. + + This validator ensures consistent handling of empty parser configurations across + the application by converting empty dicts to None values. + + Args: + v (Any): Raw input value for the parser config field + + Returns: + Any: Returns None if input is an empty dict, otherwise returns the original value + + Example: + >>> normalize_empty_parser_config({}) + None + + >>> normalize_empty_parser_config({"key": "value"}) + {"key": "value"} + """ + if v == {}: + return None + return v + @field_validator("parser_config", mode="after") @classmethod - def validate_parser_config_json_length(cls, v: ParserConfig) -> ParserConfig: + def validate_parser_config_json_length(cls, v: ParserConfig | None) -> ParserConfig | None: """ Validates serialized JSON length constraints for parser configuration. Implements a two-stage validation workflow: - 1. Model serialization - convert Pydantic model to JSON string - 2. Size verification - enforce maximum allowed payload size + 1. Null check - bypass validation for empty configurations + 2. Model serialization - convert Pydantic model to JSON string + 3. Size verification - enforce maximum allowed payload size Args: v (ParserConfig | None): Raw parser configuration object @@ -349,6 +376,9 @@ class CreateDatasetReq(Base): Raises: ValueError: When serialized JSON exceeds 65,535 characters """ + if v is None: + return None + if (json_str := v.model_dump_json()) and len(json_str) > 65535: raise ValueError(f"Parser config exceeds size limit (max 65,535 characters). Current size: {len(json_str):,}") return v diff --git a/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py b/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py index 3a70d248d..e72fbed64 100644 --- a/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py +++ b/sdk/python/test/test_http_api/test_dataset_mangement/test_create_dataset.py @@ -141,7 +141,7 @@ class TestDatasetCreate: def test_avatar(self, get_http_api_auth, tmp_path): fn = create_image_file(tmp_path / "ragflow_test.png") payload = { - "name": "avatar_test", + "name": "avatar", "avatar": f"data:image/png;base64,{encode_avatar(fn)}", } res = create_dataset(get_http_api_auth, payload) @@ -149,14 +149,14 @@ class TestDatasetCreate: @pytest.mark.p2 def test_avatar_exceeds_limit_length(self, get_http_api_auth): - payload = {"name": "exceeds_limit_length_avatar", "avatar": "a" * 65536} + payload = {"name": "avatar_exceeds_limit_length", "avatar": "a" * 65536} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 101, res assert "String should have at most 65535 characters" in res["message"], res @pytest.mark.p3 @pytest.mark.parametrize( - "name, avatar_prefix, expected_message", + "name, prefix, expected_message", [ ("empty_prefix", "", "Missing MIME prefix. Expected format: data:;base64,"), ("missing_comma", "data:image/png;base64", "Missing MIME prefix. Expected format: data:;base64,"), @@ -165,11 +165,11 @@ class TestDatasetCreate: ], ids=["empty_prefix", "missing_comma", "unsupported_mine_type", "invalid_mine_type"], ) - def test_avatar_invalid_prefix(self, get_http_api_auth, tmp_path, name, avatar_prefix, expected_message): + def test_avatar_invalid_prefix(self, get_http_api_auth, tmp_path, name, prefix, expected_message): fn = create_image_file(tmp_path / "ragflow_test.png") payload = { "name": name, - "avatar": f"{avatar_prefix}{encode_avatar(fn)}", + "avatar": f"{prefix}{encode_avatar(fn)}", } res = create_dataset(get_http_api_auth, payload) assert res["code"] == 101, res @@ -177,42 +177,42 @@ class TestDatasetCreate: @pytest.mark.p3 def test_avatar_unset(self, get_http_api_auth): - payload = {"name": "test_avatar_unset"} + payload = {"name": "avatar_unset"} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["avatar"] is None, res @pytest.mark.p3 def test_avatar_none(self, get_http_api_auth): - payload = {"name": "test_avatar_none", "avatar": None} + payload = {"name": "avatar_none", "avatar": None} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["avatar"] is None, res @pytest.mark.p2 def test_description(self, get_http_api_auth): - payload = {"name": "test_description", "description": "description"} + payload = {"name": "description", "description": "description"} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["description"] == "description", res @pytest.mark.p2 def test_description_exceeds_limit_length(self, get_http_api_auth): - payload = {"name": "exceeds_limit_length_description", "description": "a" * 65536} + payload = {"name": "description_exceeds_limit_length", "description": "a" * 65536} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 101, res assert "String should have at most 65535 characters" in res["message"], res @pytest.mark.p3 def test_description_unset(self, get_http_api_auth): - payload = {"name": "test_description_unset"} + payload = {"name": "description_unset"} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["description"] is None, res @pytest.mark.p3 def test_description_none(self, get_http_api_auth): - payload = {"name": "test_description_none", "description": None} + payload = {"name": "description_none", "description": None} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["description"] is None, res @@ -283,7 +283,7 @@ class TestDatasetCreate: @pytest.mark.p2 def test_embedding_model_none(self, get_http_api_auth): - payload = {"name": "test_embedding_model_none", "embedding_model": None} + payload = {"name": "embedding_model_none", "embedding_model": None} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 101, res assert "Input should be a valid string" in res["message"], res @@ -323,14 +323,14 @@ class TestDatasetCreate: @pytest.mark.p2 def test_permission_unset(self, get_http_api_auth): - payload = {"name": "test_permission_unset"} + payload = {"name": "permission_unset"} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["permission"] == "me", res @pytest.mark.p3 def test_permission_none(self, get_http_api_auth): - payload = {"name": "test_permission_none", "permission": None} + payload = {"name": "permission_none", "permission": None} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 101, res assert "Input should be 'me' or 'team'" in res["message"], res @@ -378,7 +378,7 @@ class TestDatasetCreate: @pytest.mark.p2 def test_chunk_method_unset(self, get_http_api_auth): - payload = {"name": "test_chunk_method_unset"} + payload = {"name": "chunk_method_unset"} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["chunk_method"] == "naive", res @@ -674,28 +674,20 @@ class TestDatasetCreate: @pytest.mark.p2 def test_parser_config_empty(self, get_http_api_auth): - payload = {"name": "default_empty", "parser_config": {}} + payload = {"name": "parser_config_empty", "parser_config": {}} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["parser_config"] == { - "auto_keywords": 0, - "auto_questions": 0, "chunk_token_num": 128, "delimiter": r"\n", - "filename_embd_weight": None, - "graphrag": None, "html4excel": False, "layout_recognize": "DeepDOC", - "pages": None, - "raptor": None, - "tag_kb_ids": [], - "task_page_size": None, - "topn_tags": 1, - } + "raptor": {"use_raptor": False}, + }, res @pytest.mark.p2 def test_parser_config_unset(self, get_http_api_auth): - payload = {"name": "default_unset"} + payload = {"name": "parser_config_unset"} res = create_dataset(get_http_api_auth, payload) assert res["code"] == 0, res assert res["data"]["parser_config"] == { @@ -708,10 +700,16 @@ class TestDatasetCreate: @pytest.mark.p3 def test_parser_config_none(self, get_http_api_auth): - payload = {"name": "default_none", "parser_config": None} + payload = {"name": "parser_config_none", "parser_config": None} res = create_dataset(get_http_api_auth, payload) - assert res["code"] == 101, res - assert "Input should be a valid dictionary or instance of ParserConfig" in res["message"], res + assert res["code"] == 0, res + assert res["data"]["parser_config"] == { + "chunk_token_num": 128, + "delimiter": "\\n", + "html4excel": False, + "layout_recognize": "DeepDOC", + "raptor": {"use_raptor": False}, + }, res @pytest.mark.p2 @pytest.mark.parametrize( diff --git a/sdk/python/test/test_http_api/test_dataset_mangement/test_update_dataset.py b/sdk/python/test/test_http_api/test_dataset_mangement/test_update_dataset.py index 07485609d..1ff6a7595 100644 --- a/sdk/python/test/test_http_api/test_dataset_mangement/test_update_dataset.py +++ b/sdk/python/test/test_http_api/test_dataset_mangement/test_update_dataset.py @@ -91,7 +91,7 @@ class TestCapability: class TestDatasetUpdate: @pytest.mark.p3 def test_dataset_id_not_uuid(self, get_http_api_auth): - payload = {"name": "dataset_id_not_uuid"} + payload = {"name": "not_uuid"} res = update_dataset(get_http_api_auth, "not_uuid", payload) assert res["code"] == 101, res assert "Input should be a valid UUID" in res["message"], res @@ -178,22 +178,19 @@ class TestDatasetUpdate: @pytest.mark.p3 @pytest.mark.parametrize( - "name, avatar_prefix, expected_message", + "avatar_prefix, expected_message", [ - ("empty_prefix", "", "Missing MIME prefix. Expected format: data:;base64,"), - ("missing_comma", "data:image/png;base64", "Missing MIME prefix. Expected format: data:;base64,"), - ("unsupported_mine_type", "invalid_mine_prefix:image/png;base64,", "Invalid MIME prefix format. Must start with 'data:'"), - ("invalid_mine_type", "data:unsupported_mine_type;base64,", "Unsupported MIME type. Allowed: ['image/jpeg', 'image/png']"), + ("", "Missing MIME prefix. Expected format: data:;base64,"), + ("data:image/png;base64", "Missing MIME prefix. Expected format: data:;base64,"), + ("invalid_mine_prefix:image/png;base64,", "Invalid MIME prefix format. Must start with 'data:'"), + ("data:unsupported_mine_type;base64,", "Unsupported MIME type. Allowed: ['image/jpeg', 'image/png']"), ], ids=["empty_prefix", "missing_comma", "unsupported_mine_type", "invalid_mine_type"], ) - def test_avatar_invalid_prefix(self, get_http_api_auth, add_dataset_func, tmp_path, name, avatar_prefix, expected_message): + def test_avatar_invalid_prefix(self, get_http_api_auth, add_dataset_func, tmp_path, avatar_prefix, expected_message): dataset_id = add_dataset_func fn = create_image_file(tmp_path / "ragflow_test.png") - payload = { - "name": name, - "avatar": f"{avatar_prefix}{encode_avatar(fn)}", - } + payload = {"avatar": f"{avatar_prefix}{encode_avatar(fn)}"} res = update_dataset(get_http_api_auth, dataset_id, payload) assert res["code"] == 101, res assert expected_message in res["message"], res @@ -312,18 +309,18 @@ class TestDatasetUpdate: @pytest.mark.p1 @pytest.mark.parametrize( - "name, permission", + "permission", [ - ("me", "me"), - ("team", "team"), - ("me_upercase", "ME"), - ("team_upercase", "TEAM"), + "me", + "team", + "ME", + "TEAM", ], ids=["me", "team", "me_upercase", "team_upercase"], ) - def test_permission(self, get_http_api_auth, add_dataset_func, name, permission): + def test_permission(self, get_http_api_auth, add_dataset_func, permission): dataset_id = add_dataset_func - payload = {"name": name, "permission": permission} + payload = {"permission": permission} res = update_dataset(get_http_api_auth, dataset_id, payload) assert res["code"] == 0, res @@ -351,7 +348,7 @@ class TestDatasetUpdate: @pytest.mark.p3 def test_permission_none(self, get_http_api_auth, add_dataset_func): dataset_id = add_dataset_func - payload = {"name": "test_permission_none", "permission": None} + payload = {"permission": None} res = update_dataset(get_http_api_auth, dataset_id, payload) assert res["code"] == 101, res assert "Input should be 'me' or 'team'" in res["message"], res @@ -697,32 +694,65 @@ class TestDatasetUpdate: res = list_datasets(get_http_api_auth) assert res["code"] == 0, res - assert res["data"][0]["parser_config"] == {} - - # @pytest.mark.p2 - # def test_parser_config_unset(self, get_http_api_auth, add_dataset_func): - # dataset_id = add_dataset_func - # payload = {"name": "default_unset"} - # res = update_dataset(get_http_api_auth, dataset_id, payload) - # assert res["code"] == 0, res - - # res = list_datasets(get_http_api_auth) - # assert res["code"] == 0, res - # assert res["data"][0]["parser_config"] == { - # "chunk_token_num": 128, - # "delimiter": r"\n", - # "html4excel": False, - # "layout_recognize": "DeepDOC", - # "raptor": {"use_raptor": False}, - # }, res + assert res["data"][0]["parser_config"] == { + "chunk_token_num": 128, + "delimiter": r"\n", + "html4excel": False, + "layout_recognize": "DeepDOC", + "raptor": {"use_raptor": False}, + }, res @pytest.mark.p3 def test_parser_config_none(self, get_http_api_auth, add_dataset_func): dataset_id = add_dataset_func payload = {"parser_config": None} res = update_dataset(get_http_api_auth, dataset_id, payload) - assert res["code"] == 101, res - assert "Input should be a valid dictionary or instance of ParserConfig" in res["message"], res + assert res["code"] == 0, res + + res = list_datasets(get_http_api_auth, {"id": dataset_id}) + assert res["code"] == 0, res + assert res["data"][0]["parser_config"] == { + "chunk_token_num": 128, + "delimiter": r"\n", + "html4excel": False, + "layout_recognize": "DeepDOC", + "raptor": {"use_raptor": False}, + }, res + + @pytest.mark.p3 + def test_parser_config_empty_with_chunk_method_change(self, get_http_api_auth, add_dataset_func): + dataset_id = add_dataset_func + payload = {"chunk_method": "qa", "parser_config": {}} + res = update_dataset(get_http_api_auth, dataset_id, payload) + assert res["code"] == 0, res + + res = list_datasets(get_http_api_auth) + print(res) + assert res["code"] == 0, res + assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res + + @pytest.mark.p3 + def test_parser_config_unset_with_chunk_method_change(self, get_http_api_auth, add_dataset_func): + dataset_id = add_dataset_func + payload = {"chunk_method": "qa"} + res = update_dataset(get_http_api_auth, dataset_id, payload) + assert res["code"] == 0, res + + res = list_datasets(get_http_api_auth) + assert res["code"] == 0, res + assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res + + @pytest.mark.p3 + def test_parser_config_none_with_chunk_method_change(self, get_http_api_auth, add_dataset_func): + dataset_id = add_dataset_func + payload = {"chunk_method": "qa", "parser_config": None} + res = update_dataset(get_http_api_auth, dataset_id, payload) + assert res["code"] == 0, res + + res = list_datasets(get_http_api_auth, {"id": dataset_id}) + print(res) + assert res["code"] == 0, res + assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res @pytest.mark.p2 @pytest.mark.parametrize( @@ -742,8 +772,35 @@ class TestDatasetUpdate: {"unknown_field": "unknown_field"}, ], ) - def test_unsupported_field(self, get_http_api_auth, add_dataset_func, payload): + def test_field_unsupported(self, get_http_api_auth, add_dataset_func, payload): dataset_id = add_dataset_func res = update_dataset(get_http_api_auth, dataset_id, payload) assert res["code"] == 101, res assert "Extra inputs are not permitted" in res["message"], res + + @pytest.mark.p2 + def test_field_unset(self, get_http_api_auth, add_dataset_func): + dataset_id = add_dataset_func + res = list_datasets(get_http_api_auth) + assert res["code"] == 0, res + original_data = res["data"][0] + + payload = {"name": "default_unset"} + res = update_dataset(get_http_api_auth, dataset_id, payload) + assert res["code"] == 0, res + + res = list_datasets(get_http_api_auth) + assert res["code"] == 0, res + assert res["data"][0]["avatar"] == original_data["avatar"], res + assert res["data"][0]["description"] == original_data["description"], res + assert res["data"][0]["embedding_model"] == original_data["embedding_model"], res + assert res["data"][0]["permission"] == original_data["permission"], res + assert res["data"][0]["chunk_method"] == original_data["chunk_method"], res + assert res["data"][0]["pagerank"] == original_data["pagerank"], res + assert res["data"][0]["parser_config"] == { + "chunk_token_num": 128, + "delimiter": r"\n", + "html4excel": False, + "layout_recognize": "DeepDOC", + "raptor": {"use_raptor": False}, + }, res