From ff35c140dc33e6b50e814a0737be790da365d22e Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Thu, 6 Mar 2025 17:08:28 +0800 Subject: [PATCH] Refa: remove dataset language and validate dataset name length. (#5707) ### What problem does this PR solve? #5686 #5702 ### Type of change - [x] Refactoring --- api/apps/sdk/dataset.py | 24 ++++++++--------------- api/utils/api_utils.py | 4 +--- docs/references/http_api_reference.md | 6 ------ docs/references/python_api_reference.md | 7 ------- sdk/python/ragflow_sdk/modules/dataset.py | 1 - 5 files changed, 9 insertions(+), 33 deletions(-) diff --git a/api/apps/sdk/dataset.py b/api/apps/sdk/dataset.py index 3fdcfebc6..4cd48fe27 100644 --- a/api/apps/sdk/dataset.py +++ b/api/apps/sdk/dataset.py @@ -66,10 +66,6 @@ def create(tenant_id): type: string enum: ['me', 'team'] description: Dataset permission. - language: - type: string - enum: ['Chinese', 'English'] - description: Language of the dataset. chunk_method: type: string enum: ["naive", "manual", "qa", "table", "paper", "book", "laws", @@ -91,11 +87,9 @@ def create(tenant_id): req = request.json e, t = TenantService.get_by_id(tenant_id) permission = req.get("permission") - language = req.get("language") chunk_method = req.get("chunk_method") parser_config = req.get("parser_config") valid_permission = ["me", "team"] - valid_language = ["Chinese", "English"] valid_chunk_method = [ "naive", "manual", @@ -114,8 +108,6 @@ def create(tenant_id): check_validation = valid( permission, valid_permission, - language, - valid_language, chunk_method, valid_chunk_method, ) @@ -134,6 +126,10 @@ def create(tenant_id): req["name"] = req["name"].strip() if req["name"] == "": return get_error_data_result(message="`name` is not empty string!") + if len(req["name"]) >= 128: + return get_error_data_result( + message="Dataset name should not be longer than 128 characters." + ) if KnowledgebaseService.query( name=req["name"], tenant_id=tenant_id, status=StatusEnum.VALID.value ): @@ -297,10 +293,6 @@ def update(tenant_id, dataset_id): type: string enum: ['me', 'team'] description: Updated permission. - language: - type: string - enum: ['Chinese', 'English'] - description: Updated language. chunk_method: type: string enum: ["naive", "manual", "qa", "table", "paper", "book", "laws", @@ -324,11 +316,9 @@ def update(tenant_id, dataset_id): if any(key in req for key in invalid_keys): return get_error_data_result(message="The input parameters are invalid.") permission = req.get("permission") - language = req.get("language") chunk_method = req.get("chunk_method") parser_config = req.get("parser_config") valid_permission = ["me", "team"] - valid_language = ["Chinese", "English"] valid_chunk_method = [ "naive", "manual", @@ -347,8 +337,6 @@ def update(tenant_id, dataset_id): check_validation = valid( permission, valid_permission, - language, - valid_language, chunk_method, valid_chunk_method, ) @@ -416,6 +404,10 @@ def update(tenant_id, dataset_id): req["embd_id"] = req.pop("embedding_model") if "name" in req: req["name"] = req["name"].strip() + if len(req["name"]) >= 128: + return get_error_data_result( + message="Dataset name should not be longer than 128 characters." + ) if ( req["name"].lower() != kb.name.lower() and len( diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index f36919850..070cf397d 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -335,11 +335,9 @@ def generate_confirmation_token(tenent_id): return "ragflow-" + serializer.dumps(get_uuid(), salt=tenent_id)[2:34] -def valid(permission, valid_permission, language, valid_language, chunk_method, valid_chunk_method): +def valid(permission, valid_permission, chunk_method, valid_chunk_method): if valid_parameter(permission, valid_permission): return valid_parameter(permission, valid_permission) - if valid_parameter(language, valid_language): - return valid_parameter(language, valid_language) if valid_parameter(chunk_method, valid_chunk_method): return valid_parameter(chunk_method, valid_chunk_method) diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index a0982f015..1781ad8b6 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -178,7 +178,6 @@ Creates a dataset. - `"name"`: `string` - `"avatar"`: `string` - `"description"`: `string` - - `"language"`: `string` - `"embedding_model"`: `string` - `"permission"`: `string` - `"chunk_method"`: `string` @@ -214,11 +213,6 @@ curl --request POST \ - `"description"`: (*Body parameter*), `string` A brief description of the dataset to create. -- `"language"`: (*Body parameter*), `string` - The language setting of the dataset to create. Available options: - - `"English"` (default) - - `"Chinese"` - - `"embedding_model"`: (*Body parameter*), `string` The name of the embedding model to use. For example: `"BAAI/bge-zh-v1.5"` diff --git a/docs/references/python_api_reference.md b/docs/references/python_api_reference.md index 33666c782..533ef8020 100644 --- a/docs/references/python_api_reference.md +++ b/docs/references/python_api_reference.md @@ -82,7 +82,6 @@ RAGFlow.create_dataset( avatar: str = "", description: str = "", embedding_model: str = "BAAI/bge-large-zh-v1.5", - language: str = "English", permission: str = "me", chunk_method: str = "naive", parser_config: DataSet.ParserConfig = None @@ -108,12 +107,6 @@ Base64 encoding of the avatar. Defaults to `""` A brief description of the dataset to create. Defaults to `""`. -##### language: `str` - -The language setting of the dataset to create. Available options: - -- `"English"` (default) -- `"Chinese"` ##### permission diff --git a/sdk/python/ragflow_sdk/modules/dataset.py b/sdk/python/ragflow_sdk/modules/dataset.py index 57d76d3ae..fdecde4a5 100644 --- a/sdk/python/ragflow_sdk/modules/dataset.py +++ b/sdk/python/ragflow_sdk/modules/dataset.py @@ -30,7 +30,6 @@ class DataSet(Base): self.avatar = "" self.tenant_id = None self.description = "" - self.language = "English" self.embedding_model = "" self.permission = "me" self.document_count = 0