diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 697f8fa99..9772a2ed9 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -13,13 +13,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import os
+import re
+import warnings
+
 from flask import request
 from flask_login import login_required, current_user
 from httpx import HTTPError
 
 from api.contants import NAME_LENGTH_LIMIT
-from api.db import FileSource, StatusEnum
+from api.db import FileType, ParserType, FileSource
+from api.db import StatusEnum
 from api.db.db_models import File
 from api.db.services import duplicate_name
 from api.db.services.document_service import DocumentService
@@ -29,8 +33,12 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.user_service import TenantService
 from api.settings import RetCode
 from api.utils import get_uuid
-from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
+from api.utils.api_utils import construct_json_result, construct_error_response
+from api.utils.api_utils import construct_result, validate_request
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils.minio_conn import MINIO
 
+MAXIMUM_OF_UPLOADING_FILES = 256
 
 # ------------------------------ create a dataset ---------------------------------------
@@ -253,3 +261,216 @@ def update_dataset(dataset_id):
         return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS)
     except Exception as e:
         return construct_error_response(e)
+
+# --------------------------------content management ----------------------------------------------------
+
+# ----------------------------upload files-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/', methods=['POST'])
+@login_required
+def upload_documents(dataset_id):
+    # no files
+    if not request.files:
+        return construct_json_result(
+            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
+
+    # the number of uploading files exceeds the limit
+    file_objs = request.files.getlist('file')
+    num_file_objs = len(file_objs)
+
+    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
+        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
+                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
+
+    for file_obj in file_objs:
+        # the content of the file
+        file_content = file_obj.read()
+        file_name = file_obj.filename
+        # no name
+        if not file_name:
+            return construct_json_result(
+                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
+
+        # TODO: support remote files
+        if 'http' in file_name:
+            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files are not supported yet.")
+
+        # the content is empty; raise a warning
+        if file_content == b'':
+            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
+
+    # no such dataset
+    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+    if not exist:
+        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
+
+    # get the root_folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # get the id of the root_folder
+    parent_file_id = root_folder["id"]  # document id
+    # this is for the new user, create '.knowledgebase' file
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # go inside this folder, get the kb_root_folder
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    # link the file management to the kb_folder
+    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
+
+    # grab all the errors
+    err = []
+    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+    uploaded_docs_json = []
+    for file in file_objs:
+        try:
+            # TODO: get this value from the database as some tenants have this limit while others don't
+            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="Exceed the maximum file number of a free user!")
+            # deal with the duplicate name
+            filename = duplicate_name(
+                DocumentService.query,
+                name=file.filename,
+                kb_id=dataset.id)
+
+            # deal with the unsupported type
+            filetype = filename_type(filename)
+            if filetype == FileType.OTHER.value:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="This type of file has not been supported yet!")
+
+            # upload to the minio
+            location = filename
+            while MINIO.obj_exist(dataset_id, location):
+                location += "_"
+            blob = file.read()
+            MINIO.put(dataset_id, location, blob)
+            doc = {
+                "id": get_uuid(),
+                "kb_id": dataset.id,
+                "parser_id": dataset.parser_id,
+                "parser_config": dataset.parser_config,
+                "created_by": current_user.id,
+                "type": filetype,
+                "name": filename,
+                "location": location,
+                "size": len(blob),
+                "thumbnail": thumbnail(filename, blob)
+            }
+            if doc["type"] == FileType.VISUAL:
+                doc["parser_id"] = ParserType.PICTURE.value
+            if re.search(r"\.(ppt|pptx|pages)$", filename):
+                doc["parser_id"] = ParserType.PRESENTATION.value
+            DocumentService.insert(doc)
+
+            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
+            uploaded_docs_json.append(doc)
+        except Exception as e:
+            err.append(file.filename + ": " + str(e))
+
+    if err:
+        # return all the errors
+        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
+    # success
+    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
+
+
+# ----------------------------delete a file-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/<document_id>', methods=['DELETE'])
+@login_required
+def delete_document(document_id, dataset_id):  # string
+    # get the root folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # parent file's id
+    parent_file_id = root_folder["id"]
+    # consider the new user
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # store any errors that may occur
+    errors = ""
+    try:
+        # whether this document exists
+        exist, doc = DocumentService.get_by_id(document_id)
+        if not exist:
+            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
+        # whether this doc is authorized by this tenant
+        tenant_id = DocumentService.get_tenant_id(document_id)
+        if not tenant_id:
+            return construct_json_result(
+                message=f"You cannot delete this document {document_id} due to the authorization"
+                        f" reason!", code=RetCode.AUTHENTICATION_ERROR)
+
+        # get the doc's id and location
+        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
+
+        if real_dataset_id != dataset_id:
+            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
+                                                 f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
+
+        # there is an issue when removing
+        if not DocumentService.remove_document(doc, tenant_id):
+            return construct_json_result(
+                message="There was an error during the document removal process. Please check the status of the "
+                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
+
+        # fetch the File2Document record associated with the provided document ID.
+        file_to_doc = File2DocumentService.get_by_document_id(document_id)
+        # delete the associated File record.
+        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
+        # delete the File2Document record itself using the document ID. This removes the
+        # association between the document and the file after the File record has been deleted.
+        File2DocumentService.delete_by_document_id(document_id)
+
+        # delete it from minio
+        MINIO.rm(dataset_id, location)
+    except Exception as e:
+        errors += str(e)
+    if errors:
+        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
+
+    return construct_json_result(data=True, code=RetCode.SUCCESS)
+
+
+# ----------------------------list files-----------------------------------------------------
+@manager.route('/<dataset_id>/documents/', methods=['GET'])
+@login_required
+def list_documents(dataset_id):
+    if not dataset_id:
+        return construct_json_result(
+            data=False, message='Lack of "dataset_id"', code=RetCode.ARGUMENT_ERROR)
+
+    # searching keywords
+    keywords = request.args.get("keywords", "")
+
+    offset = request.args.get("offset", 0)
+    count = request.args.get("count", -1)
+    order_by = request.args.get("order_by", "create_time")
+    descend = request.args.get("descend", True)
+    try:
+        docs, total = DocumentService.list_documents_in_dataset(dataset_id, int(offset), int(count), order_by,
+                                                                descend, keywords)
+
+        return construct_json_result(data={"total": total, "docs": docs}, code=RetCode.SUCCESS)
+    except Exception as e:
+        return construct_error_response(e)
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
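For orientation, here is a minimal client-side sketch of the three endpoints this file now exposes. The host, port, and API key are placeholders, and the route shapes follow the `@manager.route` declarations above; the response fields (`data` as the uploaded-doc list, `data.docs`/`data.total` for listing) come straight from the handlers in this diff.

```python
import requests

BASE_URL = "http://127.0.0.1:9380/api/v1"    # placeholder host/port
HEADERS = {"Authorization": "your-api-key"}  # placeholder API key
dataset_id = "your-dataset-id"               # as returned by dataset creation

# upload two local files (multipart form, field name 'file')
with open("test.txt", "rb") as f1, open("test1.txt", "rb") as f2:
    res = requests.post(f"{BASE_URL}/dataset/{dataset_id}/documents/",
                        files=[("file", f1), ("file", f2)], headers=HEADERS)
uploaded = res.json()["data"]

# list what was uploaded, newest first
res = requests.get(f"{BASE_URL}/dataset/{dataset_id}/documents/",
                   params={"offset": 0, "count": -1}, headers=HEADERS)
print(res.json()["data"]["total"])

# delete the first uploaded document
doc_id = uploaded[0]["id"]
requests.delete(f"{BASE_URL}/dataset/{dataset_id}/documents/{doc_id}", headers=HEADERS)
```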
diff --git a/api/apps/documents_api.py b/api/apps/documents_api.py
deleted file mode 100644
index 44e8a4160..000000000
--- a/api/apps/documents_api.py
+++ /dev/null
@@ -1,228 +0,0 @@
-#
-# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License
-#
-
-import os
-import re
-import warnings
-
-from flask import request
-from flask_login import login_required, current_user
-
-from api.db import FileType, ParserType
-from api.db.services import duplicate_name
-from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
-from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.settings import RetCode
-from api.utils import get_uuid
-from api.utils.api_utils import construct_json_result
-from api.utils.file_utils import filename_type, thumbnail
-from rag.utils.minio_conn import MINIO
-from api.db.db_models import Task, File
-from api.db import FileType, TaskStatus, ParserType, FileSource
-
-
-MAXIMUM_OF_UPLOADING_FILES = 256
-
-
-# ----------------------------upload local files-----------------------------------------------------
-@manager.route('/<dataset_id>', methods=['POST'])
-@login_required
-def upload(dataset_id):
-    # no files
-    if not request.files:
-        return construct_json_result(
-            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
-
-    # the number of uploading files exceeds the limit
-    file_objs = request.files.getlist('file')
-    num_file_objs = len(file_objs)
-
-    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
-        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
-                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
-
-    for file_obj in file_objs:
-        # the content of the file
-        file_content = file_obj.read()
-        file_name = file_obj.filename
-        # no name
-        if not file_name:
-            return construct_json_result(
-                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
-
-        # TODO: support the remote files
-        if 'http' in file_name:
-            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
-
-        # the content is empty, raising a warning
-        if file_content == b'':
-            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
-
-    # no dataset
-    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
-    if not exist:
-        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
-
-    # get the root_folder
-    root_folder = FileService.get_root_folder(current_user.id)
-    # get the id of the root_folder
-    parent_file_id = root_folder["id"]  # document id
-    # this is for the new user, create '.knowledgebase' file
-    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
-    # go inside this folder, get the kb_root_folder
-    kb_root_folder = FileService.get_kb_folder(current_user.id)
-    # link the file management to the kb_folder
-    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
-
-    # grab all the errs
-    err = []
-    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
-    uploaded_docs_json = []
-    for file in file_objs:
-        try:
-            # TODO: get this value from the database as some tenants have this limit while others don't
-            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
-                return construct_json_result(code=RetCode.DATA_ERROR,
-                                             message="Exceed the maximum file number of a free user!")
-            # deal with the duplicate name
-            filename = duplicate_name(
-                DocumentService.query,
-                name=file.filename,
-                kb_id=dataset.id)
-
-            # deal with the unsupported type
-            filetype = filename_type(filename)
-            if filetype == FileType.OTHER.value:
-                return construct_json_result(code=RetCode.DATA_ERROR,
-                                             message="This type of file has not been supported yet!")
-
-            # upload to the minio
-            location = filename
-            while MINIO.obj_exist(dataset_id, location):
-                location += "_"
-            blob = file.read()
-            MINIO.put(dataset_id, location, blob)
-            doc = {
-                "id": get_uuid(),
-                "kb_id": dataset.id,
-                "parser_id": dataset.parser_id,
-                "parser_config": dataset.parser_config,
-                "created_by": current_user.id,
-                "type": filetype,
-                "name": filename,
-                "location": location,
-                "size": len(blob),
-                "thumbnail": thumbnail(filename, blob)
-            }
-            if doc["type"] == FileType.VISUAL:
-                doc["parser_id"] = ParserType.PICTURE.value
-            if re.search(r"\.(ppt|pptx|pages)$", filename):
-                doc["parser_id"] = ParserType.PRESENTATION.value
-            DocumentService.insert(doc)
-
-            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
-            uploaded_docs_json.append(doc)
-        except Exception as e:
-            err.append(file.filename + ": " + str(e))
-
-    if err:
-        # return all the errors
-        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
-    # success
-    return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS)
-
-
-# ----------------------------delete a file-----------------------------------------------------
-@manager.route('/<dataset_id>/<document_id>', methods=['DELETE'])
-@login_required
-def delete(document_id, dataset_id):  # string
-    # get the root folder
-    root_folder = FileService.get_root_folder(current_user.id)
-    # parent file's id
-    parent_file_id = root_folder["id"]
-    # consider the new user
-    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
-    # store all the errors that may have
-    errors = ""
-    try:
-        # whether there is this document
-        exist, doc = DocumentService.get_by_id(document_id)
-        if not exist:
-            return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR)
-        # whether this doc is authorized by this tenant
-        tenant_id = DocumentService.get_tenant_id(document_id)
-        if not tenant_id:
-            return construct_json_result(message=f"You cannot delete this document {document_id} due to the authorization"
-                                         f" reason!", code=RetCode.AUTHENTICATION_ERROR)
-
-        # get the doc's id and location
-        real_dataset_id, location = File2DocumentService.get_minio_address(doc_id=document_id)
-
-        if real_dataset_id != dataset_id:
-            return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, "
-                                         f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR)
-
-        # there is an issue when removing
-        if not DocumentService.remove_document(doc, tenant_id):
-            return construct_json_result(
-                message="There was an error during the document removal process. Please check the status of the "
-                        "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR)
-
-        # fetch the File2Document record associated with the provided document ID.
-        file_to_doc = File2DocumentService.get_by_document_id(document_id)
-        # delete the associated File record.
-        FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id])
-        # delete the File2Document record itself using the document ID. This removes the
-        # association between the document and the file after the File record has been deleted.
-        File2DocumentService.delete_by_document_id(document_id)
-
-        # delete it from minio
-        MINIO.rm(dataset_id, location)
-    except Exception as e:
-        errors += str(e)
-    if errors:
-        return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR)
-
-    return construct_json_result(data=True, code=RetCode.SUCCESS)
-
-
-# ----------------------------upload online files------------------------------------------------
-
-# ----------------------------download a file-----------------------------------------------------
-
-# ----------------------------enable rename-----------------------------------------------------
-
-# ----------------------------list files-----------------------------------------------------
-
-# ----------------------------start parsing-----------------------------------------------------
-
-# ----------------------------stop parsing-----------------------------------------------------
-
-# ----------------------------show the status of the file-----------------------------------------------------
-
-# ----------------------------list the chunks of the file-----------------------------------------------------
-
-# ----------------------------delete the chunk-----------------------------------------------------
-
-# ----------------------------edit the status of the chunk-----------------------------------------------------
-
-# ----------------------------insert a new chunk-----------------------------------------------------
-
-# ----------------------------upload a file-----------------------------------------------------
-
-# ----------------------------get a specific chunk-----------------------------------------------------
-
-# ----------------------------retrieval test-----------------------------------------------------
diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py
index ddbc5b606..6a54d22c7 100644
--- a/api/db/services/document_service.py
+++ b/api/db/services/document_service.py
@@ -59,6 +59,35 @@ class DocumentService(CommonService):
 
         return list(docs.dicts()), count
 
+    @classmethod
+    @DB.connection_context()
+    def list_documents_in_dataset(cls, dataset_id, offset, count, order_by, descend, keywords):
+        if keywords:
+            docs = cls.model.select().where(
+                (cls.model.kb_id == dataset_id),
+                (fn.LOWER(cls.model.name).contains(keywords.lower()))
+            )
+        else:
+            docs = cls.model.select().where(cls.model.kb_id == dataset_id)
+
+        total = docs.count()
+
+        if descend == 'True':
+            docs = docs.order_by(cls.model.getter_by(order_by).desc())
+        if descend == 'False':
+            docs = docs.order_by(cls.model.getter_by(order_by).asc())
+
+        docs = list(docs.dicts())
+        docs_length = len(docs)
+
+        if offset < 0 or offset > docs_length:
+            raise IndexError("Offset is out of the valid range.")
+
+        if count == -1:
+            return docs[offset:], total
+
+        return docs[offset:offset + count], total
+
     @classmethod
     @DB.connection_context()
     def insert(cls, doc):
diff --git a/api/db/services/knowledgebase_service.py b/api/db/services/knowledgebase_service.py
index 8ad9bce22..b9c12ef71 100644
--- a/api/db/services/knowledgebase_service.py
+++ b/api/db/services/knowledgebase_service.py
@@ -60,6 +60,9 @@
         if offset < 0 or offset > kbs_length:
             raise IndexError("Offset is out of the valid range.")
 
+        if count == -1:
+            return kbs[offset:]
+
         return kbs[offset:offset+count]
 
     @classmethod
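The pagination contract that `list_documents_in_dataset` and the knowledgebase listing now share is easy to sanity-check in isolation. Below is a plain-Python sketch of just the slicing rules (the Peewee query is replaced by a list); it mirrors, but is not, the service code:

```python
def paginate(items, offset, count):
    # offset must land inside [0, len(items)]; count == -1 means "rest of the list"
    if offset < 0 or offset > len(items):
        raise IndexError("Offset is out of the valid range.")
    if count == -1:
        return items[offset:]
    return items[offset:offset + count]

docs = [f"doc{i}" for i in range(5)]
assert paginate(docs, 2, -1) == ["doc2", "doc3", "doc4"]  # everything from offset 2
assert paginate(docs, 2, 2) == ["doc2", "doc3"]           # a page of two
assert paginate(docs, 5, -1) == []                        # offset == length is allowed
```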
diff --git a/docs/references/ragflow_api.md b/docs/references/ragflow_api.md
index 9e260b8e5..fba8d0cc9 100644
--- a/docs/references/ragflow_api.md
+++ b/docs/references/ragflow_api.md
@@ -274,4 +274,6 @@ You are required to input at least one parameter.
   "code": 102,
   "message": "Please input at least one parameter that you want to update!"
 }
-```
\ No newline at end of file
+```
+
+
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index 1e4ec332a..c6c54668d 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -26,12 +26,11 @@ class RAGFlow:
         '''
         api_url: http://<host_address>/api/v1
         dataset_url: http://<host_address>/api/v1/dataset
-        document_url: http://<host_address>/api/v1/documents
+        document_url: http://<host_address>/api/v1/dataset/{dataset_id}/documents
         '''
         self.user_key = user_key
         self.api_url = f"{base_url}/api/{version}"
         self.dataset_url = f"{self.api_url}/dataset"
-        self.document_url = f"{self.api_url}/documents"
         self.authorization_header = {"Authorization": "{}".format(self.user_key)}
 
     def create_dataset(self, dataset_name):
@@ -79,7 +78,7 @@
         response = requests.put(endpoint, json=params, headers=self.authorization_header)
         return response.json()
 
-# -------------------- content management -----------------------------------------------------
+    # -------------------- content management -----------------------------------------------------
 
     # ----------------------------upload local files-----------------------------------------------------
     def upload_local_file(self, dataset_id, file_paths):
@@ -95,7 +94,7 @@
             else:
                 return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
 
-        res = requests.request('POST', url=f"{self.document_url}/{dataset_id}", files=files,
+        res = requests.request('POST', url=f"{self.dataset_url}/{dataset_id}/documents", files=files,
                                headers=self.authorization_header)
 
         result_dict = json.loads(res.text)
@@ -103,16 +102,27 @@
 
     # ----------------------------delete a file-----------------------------------------------------
     def delete_files(self, document_id, dataset_id):
-        endpoint = f"{self.document_url}/{dataset_id}/{document_id}"
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
         res = requests.delete(endpoint, headers=self.authorization_header)
         return res.json()
 
+    # ----------------------------list files-----------------------------------------------------
+    def list_files(self, dataset_id, offset=0, count=-1, order_by="create_time", descend=True, keywords=""):
+        params = {
+            "offset": offset,
+            "count": count,
+            "order_by": order_by,
+            "descend": descend,
+            "keywords": keywords
+        }
+        endpoint = f"{self.dataset_url}/{dataset_id}/documents/"
+        res = requests.get(endpoint, params=params, headers=self.authorization_header)
+        return res.json()
+
     # ----------------------------download a file-----------------------------------------------------
 
     # ----------------------------enable rename-----------------------------------------------------
 
-    # ----------------------------list files-----------------------------------------------------
-
     # ----------------------------start parsing-----------------------------------------------------
 
     # ----------------------------stop parsing-----------------------------------------------------
diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py
index a1f34895c..342763537 100644
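Assuming a running RAGFlow server and a valid key (both placeholders below), the new `list_files` method composes with the existing SDK calls as follows; the response fields (`data`, `docs`, `total`) follow the `list_documents` endpoint above, and the per-document keys come from the `Document` model dicts the service returns:

```python
from ragflow.ragflow import RAGFlow  # import path as used by the SDK tests

ragflow = RAGFlow("your-api-key", "http://127.0.0.1:9380")  # placeholder credentials
dataset_id = ragflow.create_dataset("demo")["data"]["dataset_id"]
ragflow.upload_local_file(dataset_id, ["test_data/test.txt"])

# newest first, first page of 10, filtered by a keyword in the document name
page = ragflow.list_files(dataset_id, offset=0, count=10,
                          order_by="create_time", descend=True, keywords="test")
for doc in page["data"]["docs"]:
    print(doc["name"], doc["size"])
```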
--- a/sdk/python/test/test_document.py
+++ b/sdk/python/test/test_document.py
@@ -37,7 +37,7 @@
         dataset_id = created_res['data']['dataset_id']
         file_paths = ["test_data/test.txt", "test_data/test1.txt"]
         res = ragflow.upload_local_file(dataset_id, file_paths)
-        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'
 
     def test_upload_one_file(self):
         """
@@ -48,7 +48,7 @@
         dataset_id = created_res['data']['dataset_id']
         file_paths = ["test_data/test.txt"]
         res = ragflow.upload_local_file(dataset_id, file_paths)
-        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+        assert res['code'] == RetCode.SUCCESS and res['message'] == 'success'
 
     def test_upload_nonexistent_files(self):
         """
@@ -237,12 +237,143 @@
         assert (deleted_res['code'] == RetCode.ARGUMENT_ERROR and deleted_res['message'] ==
                 f'The document {doc_id} is not in the dataset: {other_dataset_id}, '
                 f'but in the dataset: {created_res_id}.')
 
+# ----------------------------list files-----------------------------------------------------
+    def test_list_documents_with_success(self):
+        """
+        Test listing documents with a successful outcome.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload a document
+        created_res = ragflow.create_dataset("test_list_documents_with_success")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # call the list_files method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1
+
+    def test_list_documents_with_checking_size(self):
+        """
+        Test listing documents, verifying the number of returned documents.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 10 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_checking_size")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"] * 10
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # call the list_files method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10
+
+    def test_list_documents_with_getting_empty_result(self):
+        """
+        Test listing documents in an empty dataset.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 0 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_getting_empty_result")
+        created_res_id = created_res['data']['dataset_id']
+        # call the list_files method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 0
+
+    def test_list_documents_with_creating_100_documents(self):
+        """
+        Test listing 100 documents, verifying the number of returned documents.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # upload 100 documents
+        created_res = ragflow.create_dataset("test_list_documents_with_creating_100_documents")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"] * 100
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # call the list_files method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 100
+
+    def test_list_document_with_failure(self):
+        """
+        Test that listing documents with a negative offset raises IndexError.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_failure")
+        created_res_id = created_res['data']['dataset_id']
+        response = ragflow.list_files(created_res_id, offset=-1, count=-1)
+        assert "IndexError" in response['message'] and response['code'] == RetCode.EXCEPTION_ERROR
+
+    def test_list_document_with_verifying_offset_and_count(self):
+        """
+        Test listing documents, verifying the offset and count parameters.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_offset_and_count")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"] * 10
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # call the list_files method
+        response = ragflow.list_files(created_res_id, offset=2, count=10)
+
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 10
+
+    def test_list_document_with_verifying_keywords(self):
+        """
+        Test listing documents, verifying keyword search.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_keywords")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # call the list_files method
+        response = ragflow.list_files(created_res_id, keywords="empty")
+
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 1
+
+    def test_list_document_with_verifying_order_by_and_descend(self):
+        """
+        Test listing documents, verifying order_by with descend=True.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_descend")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # call the list_files method
+        response = ragflow.list_files(created_res_id)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 2
+        docs = response['data']['docs']
+        # documents are returned in reverse upload order
+        i = 1
+        for doc in docs:
+            assert doc['name'] in file_paths[i]
+            i -= 1
+
+    def test_list_document_with_verifying_order_by_and_ascend(self):
+        """
+        Test listing documents, verifying order_by with descend=False.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_ascend")
+        created_res_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"]
+        ragflow.upload_local_file(created_res_id, file_paths)
+        # call the list_files method
+        response = ragflow.list_files(created_res_id, descend=False)
+        assert response['code'] == RetCode.SUCCESS and len(response['data']['docs']) == 3
+
+        docs = response['data']['docs']
+
+        # documents are returned in upload order
+        i = 0
+        for doc in docs:
+            assert doc['name'] in file_paths[i]
+            i += 1
+
+
+# TODO: enforce a limit on the number of documents
 
 # ----------------------------download a file-----------------------------------------------------
 
 # ----------------------------enable rename-----------------------------------------------------
 
-# ----------------------------list files-----------------------------------------------------
-
 # ----------------------------start parsing-----------------------------------------------------
 
 # ----------------------------stop parsing-----------------------------------------------------
@@ -257,8 +388,6 @@
 
 # ----------------------------insert a new chunk-----------------------------------------------------
 
-# ----------------------------upload a file-----------------------------------------------------
-
 # ----------------------------get a specific chunk-----------------------------------------------------
 
 # ----------------------------retrieval test-----------------------------------------------------