diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
deleted file mode 100644
index f8e2d1930..000000000
--- a/api/apps/dataset_api.py
+++ /dev/null
@@ -1,880 +0,0 @@
-#
-# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import os
-import pathlib
-import re
-import warnings
-from functools import partial
-from io import BytesIO
-
-from elasticsearch_dsl import Q
-from flask import request, send_file
-from flask_login import login_required, current_user
-from httpx import HTTPError
-
-from api.contants import NAME_LENGTH_LIMIT
-from api.db import FileType, ParserType, FileSource, TaskStatus
-from api.db import StatusEnum
-from api.db.db_models import File
-from api.db.services import duplicate_name
-from api.db.services.document_service import DocumentService
-from api.db.services.file2document_service import File2DocumentService
-from api.db.services.file_service import FileService
-from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.user_service import TenantService
-from api.settings import RetCode
-from api.utils import get_uuid
-from api.utils.api_utils import construct_json_result, construct_error_response
-from api.utils.api_utils import construct_result, validate_request
-from api.utils.file_utils import filename_type, thumbnail
-from rag.app import book, laws, manual, naive, one, paper, presentation, qa, resume, table, picture, audio, email
-from rag.nlp import search
-from rag.utils.es_conn import ELASTICSEARCH
-from rag.utils.storage_factory import STORAGE_IMPL
-
-MAXIMUM_OF_UPLOADING_FILES = 256
-
-
-# ------------------------------ create a dataset ---------------------------------------
-
-@manager.route("/", methods=["POST"])
-@login_required  # use login
-@validate_request("name")  # check name key
-def create_dataset():
-    # Check if Authorization header is present
-    authorization_token = request.headers.get("Authorization")
-    if not authorization_token:
-        return construct_json_result(code=RetCode.AUTHENTICATION_ERROR, message="Authorization header is missing.")
-
-    # TODO: Login or API key
-    # objs = APIToken.query(token=authorization_token)
-    #
-    # # Authorization error
-    # if not objs:
-    #     return construct_json_result(code=RetCode.AUTHENTICATION_ERROR, message="Token is invalid.")
-    #
-    # tenant_id = objs[0].tenant_id
-
-    tenant_id = current_user.id
-    request_body = request.json
-
-    # In case that there's no name
-    if "name" not in request_body:
-        return construct_json_result(code=RetCode.DATA_ERROR, message="Expected 'name' field in request body")
-
-    dataset_name = request_body["name"]
-
-    # empty dataset_name
-    if not dataset_name:
-        return construct_json_result(code=RetCode.DATA_ERROR, message="Empty dataset name")
-
-    # In case that there's space in the head or the tail
-    dataset_name = dataset_name.strip()
-
-    # In case that the length of the name exceeds the limit
-    dataset_name_length = len(dataset_name)
-    if dataset_name_length > NAME_LENGTH_LIMIT:
-        return 
construct_json_result( - code=RetCode.DATA_ERROR, - message=f"Dataset name: {dataset_name} with length {dataset_name_length} exceeds {NAME_LENGTH_LIMIT}!") - - # In case that there are other fields in the data-binary - if len(request_body.keys()) > 1: - name_list = [] - for key_name in request_body.keys(): - if key_name != "name": - name_list.append(key_name) - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"fields: {name_list}, are not allowed in request body.") - - # If there is a duplicate name, it will modify it to make it unique - request_body["name"] = duplicate_name( - KnowledgebaseService.query, - name=dataset_name, - tenant_id=tenant_id, - status=StatusEnum.VALID.value) - try: - request_body["id"] = get_uuid() - request_body["tenant_id"] = tenant_id - request_body["created_by"] = tenant_id - exist, t = TenantService.get_by_id(tenant_id) - if not exist: - return construct_result(code=RetCode.AUTHENTICATION_ERROR, message="Tenant not found.") - request_body["embd_id"] = t.embd_id - if not KnowledgebaseService.save(**request_body): - # failed to create new dataset - return construct_result() - return construct_json_result(code=RetCode.SUCCESS, - data={"dataset_name": request_body["name"], "dataset_id": request_body["id"]}) - except Exception as e: - return construct_error_response(e) - - -# -----------------------------list datasets------------------------------------------------------- - -@manager.route("/", methods=["GET"]) -@login_required -def list_datasets(): - offset = request.args.get("offset", 0) - count = request.args.get("count", -1) - orderby = request.args.get("orderby", "create_time") - desc = request.args.get("desc", True) - try: - tenants = TenantService.get_joined_tenants_by_user_id(current_user.id) - datasets = KnowledgebaseService.get_by_tenant_ids_by_offset( - [m["tenant_id"] for m in tenants], current_user.id, int(offset), int(count), orderby, desc) - return construct_json_result(data=datasets, code=RetCode.SUCCESS, message=f"List datasets successfully!") - except Exception as e: - return construct_error_response(e) - except HTTPError as http_err: - return construct_json_result(http_err) - - -# ---------------------------------delete a dataset ---------------------------- - -@manager.route("/", methods=["DELETE"]) -@login_required -def remove_dataset(dataset_id): - try: - datasets = KnowledgebaseService.query(created_by=current_user.id, id=dataset_id) - - # according to the id, searching for the dataset - if not datasets: - return construct_json_result(message=f"The dataset cannot be found for your current account.", - code=RetCode.OPERATING_ERROR) - - # Iterating the documents inside the dataset - for doc in DocumentService.query(kb_id=dataset_id): - if not DocumentService.remove_document(doc, datasets[0].tenant_id): - # the process of deleting failed - return construct_json_result(code=RetCode.DATA_ERROR, - message="There was an error during the document removal process. " - "Please check the status of the RAGFlow server and try the removal again.") - # delete the other files - f2d = File2DocumentService.get_by_document_id(doc.id) - FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == f2d[0].file_id]) - File2DocumentService.delete_by_document_id(doc.id) - - # delete the dataset - if not KnowledgebaseService.delete_by_id(dataset_id): - return construct_json_result(code=RetCode.DATA_ERROR, - message="There was an error during the dataset removal process. 
" - "Please check the status of the RAGFlow server and try the removal again.") - # success - return construct_json_result(code=RetCode.SUCCESS, message=f"Remove dataset: {dataset_id} successfully") - except Exception as e: - return construct_error_response(e) - - -# ------------------------------ get details of a dataset ---------------------------------------- - -@manager.route("/", methods=["GET"]) -@login_required -def get_dataset(dataset_id): - try: - dataset = KnowledgebaseService.get_detail(dataset_id) - if not dataset: - return construct_json_result(code=RetCode.DATA_ERROR, message="Can't find this dataset!") - return construct_json_result(data=dataset, code=RetCode.SUCCESS) - except Exception as e: - return construct_json_result(e) - - -# ------------------------------ update a dataset -------------------------------------------- - -@manager.route("/", methods=["PUT"]) -@login_required -def update_dataset(dataset_id): - req = request.json - try: - # the request cannot be empty - if not req: - return construct_json_result(code=RetCode.DATA_ERROR, message="Please input at least one parameter that " - "you want to update!") - # check whether the dataset can be found - if not KnowledgebaseService.query(created_by=current_user.id, id=dataset_id): - return construct_json_result(message=f"Only the owner of knowledgebase is authorized for this operation!", - code=RetCode.OPERATING_ERROR) - - exist, dataset = KnowledgebaseService.get_by_id(dataset_id) - # check whether there is this dataset - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message="This dataset cannot be found!") - - if "name" in req: - name = req["name"].strip() - # check whether there is duplicate name - if name.lower() != dataset.name.lower() \ - and len(KnowledgebaseService.query(name=name, tenant_id=current_user.id, - status=StatusEnum.VALID.value)) > 1: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"The name: {name.lower()} is already used by other " - f"datasets. 
Please choose a different name.") - - dataset_updating_data = {} - chunk_num = req.get("chunk_num") - # modify the value of 11 parameters - - # 2 parameters: embedding id and chunk method - # only if chunk_num is 0, the user can update the embedding id - if req.get("embedding_model_id"): - if chunk_num == 0: - dataset_updating_data["embd_id"] = req["embedding_model_id"] - else: - return construct_json_result(code=RetCode.DATA_ERROR, - message="You have already parsed the document in this " - "dataset, so you cannot change the embedding " - "model.") - # only if chunk_num is 0, the user can update the chunk_method - if "chunk_method" in req: - type_value = req["chunk_method"] - if is_illegal_value_for_enum(type_value, ParserType): - return construct_json_result(message=f"Illegal value {type_value} for 'chunk_method' field.", - code=RetCode.DATA_ERROR) - if chunk_num != 0: - construct_json_result(code=RetCode.DATA_ERROR, message="You have already parsed the document " - "in this dataset, so you cannot " - "change the chunk method.") - dataset_updating_data["parser_id"] = req["template_type"] - - # convert the photo parameter to avatar - if req.get("photo"): - dataset_updating_data["avatar"] = req["photo"] - - # layout_recognize - if "layout_recognize" in req: - if "parser_config" not in dataset_updating_data: - dataset_updating_data['parser_config'] = {} - dataset_updating_data['parser_config']['layout_recognize'] = req['layout_recognize'] - - # TODO: updating use_raptor needs to construct a class - - # 6 parameters - for key in ["name", "language", "description", "permission", "id", "token_num"]: - if key in req: - dataset_updating_data[key] = req.get(key) - - # update - if not KnowledgebaseService.update_by_id(dataset.id, dataset_updating_data): - return construct_json_result(code=RetCode.OPERATING_ERROR, message="Failed to update! 
" - "Please check the status of RAGFlow " - "server and try again!") - - exist, dataset = KnowledgebaseService.get_by_id(dataset.id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message="Failed to get the dataset " - "using the dataset ID.") - - return construct_json_result(data=dataset.to_json(), code=RetCode.SUCCESS) - except Exception as e: - return construct_error_response(e) - - -# --------------------------------content management ---------------------------------------------- - -# ----------------------------upload files----------------------------------------------------- -@manager.route("//documents/", methods=["POST"]) -@login_required -def upload_documents(dataset_id): - # no files - if not request.files: - return construct_json_result( - message="There is no file!", code=RetCode.ARGUMENT_ERROR) - - # the number of uploading files exceeds the limit - file_objs = request.files.getlist("file") - num_file_objs = len(file_objs) - - if num_file_objs > MAXIMUM_OF_UPLOADING_FILES: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, " - f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}") - - # no dataset - exist, dataset = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR) - - for file_obj in file_objs: - file_name = file_obj.filename - # no name - if not file_name: - return construct_json_result( - message="There is a file without name!", code=RetCode.ARGUMENT_ERROR) - - # TODO: support the remote files - if 'http' in file_name: - return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.") - - # get the root_folder - root_folder = FileService.get_root_folder(current_user.id) - # get the id of the root_folder - parent_file_id = root_folder["id"] # document id - # this is for the new user, create '.knowledgebase' file - FileService.init_knowledgebase_docs(parent_file_id, current_user.id) - # go inside this folder, get the kb_root_folder - kb_root_folder = FileService.get_kb_folder(current_user.id) - # link the file management to the kb_folder - kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"]) - - # grab all the errs - err = [] - MAX_FILE_NUM_PER_USER = int(os.environ.get("MAX_FILE_NUM_PER_USER", 0)) - uploaded_docs_json = [] - for file in file_objs: - try: - # TODO: get this value from the database as some tenants have this limit while others don't - if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER: - return construct_json_result(code=RetCode.DATA_ERROR, - message="Exceed the maximum file number of a free user!") - # deal with the duplicate name - filename = duplicate_name( - DocumentService.query, - name=file.filename, - kb_id=dataset.id) - - # deal with the unsupported type - filetype = filename_type(filename) - if filetype == FileType.OTHER.value: - return construct_json_result(code=RetCode.DATA_ERROR, - message="This type of file has not been supported yet!") - - # upload to the minio - location = filename - while STORAGE_IMPL.obj_exist(dataset_id, location): - location += "_" - - blob = file.read() - - # the content is empty, raising a warning - if blob == b'': - warnings.warn(f"[WARNING]: The content of the file {filename} is empty.") - - STORAGE_IMPL.put(dataset_id, location, blob) - - doc = { - "id": get_uuid(), - 
"kb_id": dataset.id, - "parser_id": dataset.parser_id, - "parser_config": dataset.parser_config, - "created_by": current_user.id, - "type": filetype, - "name": filename, - "location": location, - "size": len(blob), - "thumbnail": thumbnail(filename, blob) - } - if doc["type"] == FileType.VISUAL: - doc["parser_id"] = ParserType.PICTURE.value - if doc["type"] == FileType.AURAL: - doc["parser_id"] = ParserType.AUDIO.value - if re.search(r"\.(ppt|pptx|pages)$", filename): - doc["parser_id"] = ParserType.PRESENTATION.value - if re.search(r"\.(eml)$", filename): - doc["parser_id"] = ParserType.EMAIL.value - DocumentService.insert(doc) - - FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id) - uploaded_docs_json.append(doc) - except Exception as e: - err.append(file.filename + ": " + str(e)) - - if err: - # return all the errors - return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR) - # success - return construct_json_result(data=uploaded_docs_json, code=RetCode.SUCCESS) - - -# ----------------------------delete a file----------------------------------------------------- -@manager.route("//documents/", methods=["DELETE"]) -@login_required -def delete_document(document_id, dataset_id): # string - # get the root folder - root_folder = FileService.get_root_folder(current_user.id) - # parent file's id - parent_file_id = root_folder["id"] - # consider the new user - FileService.init_knowledgebase_docs(parent_file_id, current_user.id) - # store all the errors that may have - errors = "" - try: - # whether there is this document - exist, doc = DocumentService.get_by_id(document_id) - if not exist: - return construct_json_result(message=f"Document {document_id} not found!", code=RetCode.DATA_ERROR) - # whether this doc is authorized by this tenant - tenant_id = DocumentService.get_tenant_id(document_id) - if not tenant_id: - return construct_json_result( - message=f"You cannot delete this document {document_id} due to the authorization" - f" reason!", code=RetCode.AUTHENTICATION_ERROR) - - # get the doc's id and location - real_dataset_id, location = File2DocumentService.get_storage_address(doc_id=document_id) - - if real_dataset_id != dataset_id: - return construct_json_result(message=f"The document {document_id} is not in the dataset: {dataset_id}, " - f"but in the dataset: {real_dataset_id}.", code=RetCode.ARGUMENT_ERROR) - - # there is an issue when removing - if not DocumentService.remove_document(doc, tenant_id): - return construct_json_result( - message="There was an error during the document removal process. Please check the status of the " - "RAGFlow server and try the removal again.", code=RetCode.OPERATING_ERROR) - - # fetch the File2Document record associated with the provided document ID. - file_to_doc = File2DocumentService.get_by_document_id(document_id) - # delete the associated File record. - FileService.filter_delete([File.source_type == FileSource.KNOWLEDGEBASE, File.id == file_to_doc[0].file_id]) - # delete the File2Document record itself using the document ID. This removes the - # association between the document and the file after the File record has been deleted. 
- File2DocumentService.delete_by_document_id(document_id) - - # delete it from minio - STORAGE_IMPL.rm(dataset_id, location) - except Exception as e: - errors += str(e) - if errors: - return construct_json_result(data=False, message=errors, code=RetCode.SERVER_ERROR) - - return construct_json_result(data=True, code=RetCode.SUCCESS) - - -# ----------------------------list files----------------------------------------------------- -@manager.route('//documents/', methods=['GET']) -@login_required -def list_documents(dataset_id): - if not dataset_id: - return construct_json_result( - data=False, message="Lack of 'dataset_id'", code=RetCode.ARGUMENT_ERROR) - - # searching keywords - keywords = request.args.get("keywords", "") - - offset = request.args.get("offset", 0) - count = request.args.get("count", -1) - order_by = request.args.get("order_by", "create_time") - descend = request.args.get("descend", True) - try: - docs, total = DocumentService.list_documents_in_dataset(dataset_id, int(offset), int(count), order_by, - descend, keywords) - - return construct_json_result(data={"total": total, "docs": docs}, message=RetCode.SUCCESS) - except Exception as e: - return construct_error_response(e) - - -# ----------------------------update: enable rename----------------------------------------------------- -@manager.route("//documents/", methods=["PUT"]) -@login_required -def update_document(dataset_id, document_id): - req = request.json - try: - legal_parameters = set() - legal_parameters.add("name") - legal_parameters.add("enable") - legal_parameters.add("template_type") - - for key in req.keys(): - if key not in legal_parameters: - return construct_json_result(code=RetCode.ARGUMENT_ERROR, message=f"{key} is an illegal parameter.") - - # The request body cannot be empty - if not req: - return construct_json_result( - code=RetCode.DATA_ERROR, - message="Please input at least one parameter that you want to update!") - - # Check whether there is this dataset - exist, dataset = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, message=f"This dataset {dataset_id} cannot be found!") - - # The document does not exist - exist, document = DocumentService.get_by_id(document_id) - if not exist: - return construct_json_result(message=f"This document {document_id} cannot be found!", - code=RetCode.ARGUMENT_ERROR) - - # Deal with the different keys - updating_data = {} - if "name" in req: - new_name = req["name"] - updating_data["name"] = new_name - # Check whether the new_name is suitable - # 1. no name value - if not new_name: - return construct_json_result(code=RetCode.DATA_ERROR, message="There is no new name.") - - # 2. In case that there's space in the head or the tail - new_name = new_name.strip() - - # 3. Check whether the new_name has the same extension of file as before - if pathlib.Path(new_name.lower()).suffix != pathlib.Path( - document.name.lower()).suffix: - return construct_json_result( - data=False, - message="The extension of file cannot be changed", - code=RetCode.ARGUMENT_ERROR) - - # 4. 
Check whether the new name has already been occupied by other file - for d in DocumentService.query(name=new_name, kb_id=document.kb_id): - if d.name == new_name: - return construct_json_result( - message="Duplicated document name in the same dataset.", - code=RetCode.ARGUMENT_ERROR) - - if "enable" in req: - enable_value = req["enable"] - if is_illegal_value_for_enum(enable_value, StatusEnum): - return construct_json_result(message=f"Illegal value {enable_value} for 'enable' field.", - code=RetCode.DATA_ERROR) - updating_data["status"] = enable_value - - # TODO: Chunk-method - update parameters inside the json object parser_config - if "template_type" in req: - type_value = req["template_type"] - if is_illegal_value_for_enum(type_value, ParserType): - return construct_json_result(message=f"Illegal value {type_value} for 'template_type' field.", - code=RetCode.DATA_ERROR) - updating_data["parser_id"] = req["template_type"] - - # The process of updating - if not DocumentService.update_by_id(document_id, updating_data): - return construct_json_result( - code=RetCode.OPERATING_ERROR, - message="Failed to update document in the database! " - "Please check the status of RAGFlow server and try again!") - - # name part: file service - if "name" in req: - # Get file by document id - file_information = File2DocumentService.get_by_document_id(document_id) - if file_information: - exist, file = FileService.get_by_id(file_information[0].file_id) - FileService.update_by_id(file.id, {"name": req["name"]}) - - exist, document = DocumentService.get_by_id(document_id) - - # Success - return construct_json_result(data=document.to_json(), message="Success", code=RetCode.SUCCESS) - except Exception as e: - return construct_error_response(e) - - -# Helper method to judge whether it's an illegal value -def is_illegal_value_for_enum(value, enum_class): - return value not in enum_class.__members__.values() - - -# ----------------------------download a file----------------------------------------------------- -@manager.route("//documents/", methods=["GET"]) -@login_required -def download_document(dataset_id, document_id): - try: - # Check whether there is this dataset - exist, _ = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This dataset '{dataset_id}' cannot be found!") - - # Check whether there is this document - exist, document = DocumentService.get_by_id(document_id) - if not exist: - return construct_json_result(message=f"This document '{document_id}' cannot be found!", - code=RetCode.ARGUMENT_ERROR) - - # The process of downloading - doc_id, doc_location = File2DocumentService.get_storage_address(doc_id=document_id) # minio address - file_stream = STORAGE_IMPL.get(doc_id, doc_location) - if not file_stream: - return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR) - - file = BytesIO(file_stream) - - # Use send_file with a proper filename and MIME type - return send_file( - file, - as_attachment=True, - download_name=document.name, - mimetype='application/octet-stream' # Set a default MIME type - ) - - # Error - except Exception as e: - return construct_error_response(e) - - -# ----------------------------start parsing a document----------------------------------------------------- -# helper method for parsing -# callback method -def doc_parse_callback(doc_id, prog=None, msg=""): - cancel = DocumentService.do_cancel(doc_id) - if cancel: - raise Exception("The parsing process has been cancelled!") - 
-""" -def doc_parse(binary, doc_name, parser_name, tenant_id, doc_id): - match parser_name: - case "book": - book.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "laws": - laws.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "manual": - manual.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "naive": - # It's the mode by default, which is general in the front-end - naive.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "one": - one.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "paper": - paper.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "picture": - picture.chunk(doc_name, binary=binary, tenant_id=tenant_id, lang="Chinese", - callback=partial(doc_parse_callback, doc_id)) - case "presentation": - presentation.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "qa": - qa.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "resume": - resume.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "table": - table.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "audio": - audio.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case "email": - email.chunk(doc_name, binary=binary, callback=partial(doc_parse_callback, doc_id)) - case _: - return False - - return True - """ - - -@manager.route("//documents//status", methods=["POST"]) -@login_required -def parse_document(dataset_id, document_id): - try: - # valid dataset - exist, _ = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This dataset '{dataset_id}' cannot be found!") - - return parsing_document_internal(document_id) - - except Exception as e: - return construct_error_response(e) - - -# ----------------------------start parsing documents----------------------------------------------------- -@manager.route("//documents/status", methods=["POST"]) -@login_required -def parse_documents(dataset_id): - doc_ids = request.json["doc_ids"] - try: - exist, _ = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This dataset '{dataset_id}' cannot be found!") - # two conditions - if not doc_ids: - # documents inside the dataset - docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", - True, "") - doc_ids = [doc["id"] for doc in docs] - - message = "" - # for loop - for id in doc_ids: - res = parsing_document_internal(id) - res_body = res.json - if res_body["code"] == RetCode.SUCCESS: - message += res_body["message"] - else: - return res - return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) - - except Exception as e: - return construct_error_response(e) - - -# helper method for parsing the document -def parsing_document_internal(id): - message = "" - try: - # Check whether there is this document - exist, document = DocumentService.get_by_id(id) - if not exist: - return construct_json_result(message=f"This document '{id}' cannot be found!", - code=RetCode.ARGUMENT_ERROR) - - tenant_id = DocumentService.get_tenant_id(id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - info = {"run": "1", "progress": 0} 
- info["progress_msg"] = "" - info["chunk_num"] = 0 - info["token_num"] = 0 - - DocumentService.update_by_id(id, info) - - ELASTICSEARCH.deleteByQuery(Q("match", doc_id=id), idxnm=search.index_name(tenant_id)) - - _, doc_attributes = DocumentService.get_by_id(id) - doc_attributes = doc_attributes.to_dict() - doc_id = doc_attributes["id"] - - bucket, doc_name = File2DocumentService.get_storage_address(doc_id=doc_id) - binary = STORAGE_IMPL.get(bucket, doc_name) - parser_name = doc_attributes["parser_id"] - if binary: - res = doc_parse(binary, doc_name, parser_name, tenant_id, doc_id) - if res is False: - message += f"The parser id: {parser_name} of the document {doc_id} is not supported; " - else: - message += f"Empty data in the document: {doc_name}; " - # failed in parsing - if doc_attributes["status"] == TaskStatus.FAIL.value: - message += f"Failed in parsing the document: {doc_id}; " - return construct_json_result(code=RetCode.SUCCESS, message=message) - except Exception as e: - return construct_error_response(e) - - -# ----------------------------stop parsing a doc----------------------------------------------------- -@manager.route("/documents//status", methods=["DELETE"]) -@login_required -def stop_parsing_document(dataset_id, document_id): - try: - # valid dataset - exist, _ = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This dataset '{dataset_id}' cannot be found!") - - return stop_parsing_document_internal(document_id) - - except Exception as e: - return construct_error_response(e) - - -# ----------------------------stop parsing docs----------------------------------------------------- -@manager.route("/documents/status", methods=["DELETE"]) -@login_required -def stop_parsing_documents(dataset_id): - doc_ids = request.json["doc_ids"] - try: - # valid dataset? - exist, _ = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This dataset '{dataset_id}' cannot be found!") - if not doc_ids: - # documents inside the dataset - docs, total = DocumentService.list_documents_in_dataset(dataset_id, 0, -1, "create_time", - True, "") - doc_ids = [doc["id"] for doc in docs] - - message = "" - # for loop - for id in doc_ids: - res = stop_parsing_document_internal(id) - res_body = res.json - if res_body["code"] == RetCode.SUCCESS: - message += res_body["message"] - else: - return res - return construct_json_result(data=True, code=RetCode.SUCCESS, message=message) - - except Exception as e: - return construct_error_response(e) - - -# Helper method -def stop_parsing_document_internal(document_id): - try: - # valid doc? - exist, doc = DocumentService.get_by_id(document_id) - if not exist: - return construct_json_result(message=f"This document '{document_id}' cannot be found!", - code=RetCode.ARGUMENT_ERROR) - doc_attributes = doc.to_dict() - - # only when the status is parsing, we need to stop it - if doc_attributes["status"] == TaskStatus.RUNNING.value: - tenant_id = DocumentService.get_tenant_id(document_id) - if not tenant_id: - return construct_json_result(message="Tenant not found!", code=RetCode.AUTHENTICATION_ERROR) - - # update successfully? - if not DocumentService.update_by_id(document_id, {"status": "2"}): # cancel - return construct_json_result( - code=RetCode.OPERATING_ERROR, - message="There was an error during the stopping parsing the document process. " - "Please check the status of the RAGFlow server and try the update again." 
- ) - - _, doc_attributes = DocumentService.get_by_id(document_id) - doc_attributes = doc_attributes.to_dict() - - # failed in stop parsing - if doc_attributes["status"] == TaskStatus.RUNNING.value: - return construct_json_result(message=f"Failed in parsing the document: {document_id}; ", code=RetCode.SUCCESS) - return construct_json_result(code=RetCode.SUCCESS, message="") - except Exception as e: - return construct_error_response(e) - - -# ----------------------------show the status of the file----------------------------------------------------- -@manager.route("//documents//status", methods=["GET"]) -@login_required -def show_parsing_status(dataset_id, document_id): - try: - # valid dataset - exist, _ = KnowledgebaseService.get_by_id(dataset_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This dataset: '{dataset_id}' cannot be found!") - # valid document - exist, _ = DocumentService.get_by_id(document_id) - if not exist: - return construct_json_result(code=RetCode.DATA_ERROR, - message=f"This document: '{document_id}' is not a valid document.") - - _, doc = DocumentService.get_by_id(document_id) # get doc object - doc_attributes = doc.to_dict() - - return construct_json_result( - data={"progress": doc_attributes["progress"], "status": TaskStatus(doc_attributes["status"]).name}, - code=RetCode.SUCCESS - ) - except Exception as e: - return construct_error_response(e) - -# ----------------------------list the chunks of the file----------------------------------------------------- - -# -- --------------------------delete the chunk----------------------------------------------------- - -# ----------------------------edit the status of the chunk----------------------------------------------------- - -# ----------------------------insert a new chunk----------------------------------------------------- - -# ----------------------------upload a file----------------------------------------------------- - -# ----------------------------get a specific chunk----------------------------------------------------- - -# ----------------------------retrieval test----------------------------------------------------- diff --git a/sdk/python/test/test_basic.py b/sdk/python/test/test_basic.py deleted file mode 100644 index 33cff8729..000000000 --- a/sdk/python/test/test_basic.py +++ /dev/null @@ -1,48 +0,0 @@ -from test_sdkbase import TestSdk -import ragflow -from ragflow.ragflow import RAGFLow -import pytest -from unittest.mock import MagicMock -from common import API_KEY, HOST_ADDRESS - - -class TestBasic(TestSdk): - - def test_version(self): - print(ragflow.__version__) - - # def test_create_dataset(self): - # res = RAGFLow(API_KEY, HOST_ADDRESS).create_dataset('abc') - # print(res) - # - # def test_delete_dataset(self): - # assert RAGFLow('123', 'url').delete_dataset('abc') == 'abc' - # - # def test_list_dataset_success(self, ragflow_instance, monkeypatch): - # # Mocking the response of requests.get method - # mock_response = MagicMock() - # mock_response.status_code = 200 - # mock_response.json.return_value = {'datasets': [{'id': 1, 'name': 'dataset1'}, {'id': 2, 'name': 'dataset2'}]} - # - # # Patching requests.get to return the mock_response - # monkeypatch.setattr("requests.get", MagicMock(return_value=mock_response)) - # - # # Call the method under test - # result = ragflow_instance.list_dataset() - # - # # Assertion - # assert result == [{'id': 1, 'name': 'dataset1'}, {'id': 2, 'name': 'dataset2'}] - # - # def test_list_dataset_failure(self, ragflow_instance, 
monkeypatch): - # # Mocking the response of requests.get method - # mock_response = MagicMock() - # mock_response.status_code = 404 # Simulating a failed request - # - # # Patching requests.get to return the mock_response - # monkeypatch.setattr("requests.get", MagicMock(return_value=mock_response)) - # - # # Call the method under test - # result = ragflow_instance.list_dataset() - # - # # Assertion - # assert result is None diff --git a/sdk/python/test/test_dataset.py b/sdk/python/test/test_dataset.py deleted file mode 100644 index 8c2084a90..000000000 --- a/sdk/python/test/test_dataset.py +++ /dev/null @@ -1,468 +0,0 @@ -from api.settings import RetCode -from test_sdkbase import TestSdk -from ragflow import RAGFlow -import pytest -from common import API_KEY, HOST_ADDRESS -from api.contants import NAME_LENGTH_LIMIT - - -class TestDataset(TestSdk): - """ - This class contains a suite of tests for the dataset management functionality within the RAGFlow system. - It ensures that the following functionalities as expected: - 1. create a kb - 2. list the kb - 3. get the detail info according to the kb id - 4. update the kb - 5. delete the kb - """ - - def setup_method(self): - """ - Delete all the datasets. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - listed_data = ragflow.list_dataset() - listed_data = listed_data['data'] - - listed_names = {d['name'] for d in listed_data} - for name in listed_names: - ragflow.delete_dataset(name) - - # -----------------------create_dataset--------------------------------- - def test_create_dataset_with_success(self): - """ - Test the creation of a new dataset with success. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - # create a kb - res = ragflow.create_dataset("kb1") - assert res['code'] == RetCode.SUCCESS and res['message'] == 'success' - - def test_create_dataset_with_empty_name(self): - """ - Test the creation of a new dataset with an empty name. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset("") - assert res['message'] == 'Empty dataset name' and res['code'] == RetCode.DATA_ERROR - - def test_create_dataset_with_name_exceeding_limit(self): - """ - Test the creation of a new dataset with the length of name exceeding the limit. - """ - name = "k" * NAME_LENGTH_LIMIT + "b" - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset(name) - assert (res['message'] == f"Dataset name: {name} with length {len(name)} exceeds {NAME_LENGTH_LIMIT}!" - and res['code'] == RetCode.DATA_ERROR) - - def test_create_dataset_name_with_space_in_the_middle(self): - """ - Test the creation of a new dataset whose name has space in the middle. - """ - name = "k b" - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset(name) - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_name_with_space_in_the_head(self): - """ - Test the creation of a new dataset whose name has space in the head. - """ - name = " kb" - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset(name) - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_name_with_space_in_the_tail(self): - """ - Test the creation of a new dataset whose name has space in the tail. 
- """ - name = "kb " - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset(name) - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_name_with_space_in_the_head_and_tail_and_length_exceed_limit(self): - """ - Test the creation of a new dataset whose name has space in the head and tail, - and the length of the name exceeds the limit. - """ - name = " " + "k" * NAME_LENGTH_LIMIT + " " - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset(name) - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_with_two_same_name(self): - """ - Test the creation of two new datasets with the same name. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset("kb") - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - res = ragflow.create_dataset("kb") - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_with_only_space_in_the_name(self): - """ - Test the creation of a dataset whose name only has space. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset(" ") - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_with_space_number_exceeding_limit(self): - """ - Test the creation of a dataset with a name that only has space exceeds the allowed limit. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - name = " " * NAME_LENGTH_LIMIT - res = ragflow.create_dataset(name) - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_with_name_having_return(self): - """ - Test the creation of a dataset with a name that has return symbol. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - name = "kb\n" - res = ragflow.create_dataset(name) - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - def test_create_dataset_with_name_having_the_null_character(self): - """ - Test the creation of a dataset with a name that has the null character. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - name = "kb\0" - res = ragflow.create_dataset(name) - assert (res['code'] == RetCode.SUCCESS and res['message'] == 'success') - - # -----------------------list_dataset--------------------------------- - def test_list_dataset_success(self): - """ - Test listing datasets with a successful outcome. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - # Call the list_datasets method - response = ragflow.list_dataset() - assert response['code'] == RetCode.SUCCESS - - def test_list_dataset_with_checking_size_and_name(self): - """ - Test listing datasets and verify the size and names of the datasets. - """ - datasets_to_create = ["dataset1", "dataset2", "dataset3"] - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_response = [ragflow.create_dataset(name) for name in datasets_to_create] - - real_name_to_create = set() - for response in created_response: - assert 'data' in response, "Response is missing 'data' key" - dataset_name = response['data']['dataset_name'] - real_name_to_create.add(dataset_name) - - response = ragflow.list_dataset(0, 3) - listed_data = response['data'] - - listed_names = {d['name'] for d in listed_data} - assert listed_names == real_name_to_create - assert response['code'] == RetCode.SUCCESS - assert len(listed_data) == len(datasets_to_create) - - def test_list_dataset_with_getting_empty_result(self): - """ - Test listing datasets that should be empty. 
- """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - datasets_to_create = [] - created_response = [ragflow.create_dataset(name) for name in datasets_to_create] - - real_name_to_create = set() - for response in created_response: - assert 'data' in response, "Response is missing 'data' key" - dataset_name = response['data']['dataset_name'] - real_name_to_create.add(dataset_name) - - response = ragflow.list_dataset(0, 0) - listed_data = response['data'] - - listed_names = {d['name'] for d in listed_data} - - assert listed_names == real_name_to_create - assert response['code'] == RetCode.SUCCESS - assert len(listed_data) == 0 - - def test_list_dataset_with_creating_100_knowledge_bases(self): - """ - Test listing 100 datasets and verify the size and names of these datasets. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - datasets_to_create = ["dataset1"] * 100 - created_response = [ragflow.create_dataset(name) for name in datasets_to_create] - - real_name_to_create = set() - for response in created_response: - assert 'data' in response, "Response is missing 'data' key" - dataset_name = response['data']['dataset_name'] - real_name_to_create.add(dataset_name) - - res = ragflow.list_dataset(0, 100) - listed_data = res['data'] - - listed_names = {d['name'] for d in listed_data} - assert listed_names == real_name_to_create - assert res['code'] == RetCode.SUCCESS - assert len(listed_data) == 100 - - def test_list_dataset_with_showing_one_dataset(self): - """ - Test listing one dataset and verify the size of the dataset. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - response = ragflow.list_dataset(0, 1) - datasets = response['data'] - assert len(datasets) == 1 and response['code'] == RetCode.SUCCESS - - def test_list_dataset_failure(self): - """ - Test listing datasets with IndexError. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - response = ragflow.list_dataset(-1, -1) - assert "IndexError" in response['message'] and response['code'] == RetCode.EXCEPTION_ERROR - - def test_list_dataset_for_empty_datasets(self): - """ - Test listing datasets when the datasets are empty. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - response = ragflow.list_dataset() - datasets = response['data'] - assert len(datasets) == 0 and response['code'] == RetCode.SUCCESS - - # TODO: have to set the limitation of the number of datasets - - # -----------------------delete_dataset--------------------------------- - def test_delete_one_dataset_with_success(self): - """ - Test deleting a dataset with success. - """ - # get the real name of the created dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset("kb0") - real_dataset_name = res['data']['dataset_name'] - # delete this dataset - res = ragflow.delete_dataset(real_dataset_name) - assert res['code'] == RetCode.SUCCESS and 'successfully' in res['message'] - - def test_delete_dataset_with_not_existing_dataset(self): - """ - Test deleting a dataset that does not exist with failure. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.delete_dataset("weird_dataset") - assert res['code'] == RetCode.OPERATING_ERROR and res['message'] == 'The dataset cannot be found for your current account.' - - def test_delete_dataset_with_creating_100_datasets_and_deleting_100_datasets(self): - """ - Test deleting a dataset when creating 100 datasets and deleting 100 datasets. 
- """ - # create 100 datasets - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - datasets_to_create = ["dataset1"] * 100 - created_response = [ragflow.create_dataset(name) for name in datasets_to_create] - - real_name_to_create = set() - for response in created_response: - assert 'data' in response, "Response is missing 'data' key" - dataset_name = response['data']['dataset_name'] - real_name_to_create.add(dataset_name) - - for name in real_name_to_create: - res = ragflow.delete_dataset(name) - assert res['code'] == RetCode.SUCCESS and 'successfully' in res['message'] - - def test_delete_dataset_with_space_in_the_middle_of_the_name(self): - """ - Test deleting a dataset when its name has space in the middle. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset("k b") - res = ragflow.delete_dataset("k b") - assert res['code'] == RetCode.SUCCESS and 'successfully' in res['message'] - - def test_delete_dataset_with_space_in_the_head_of_the_name(self): - """ - Test deleting a dataset when its name has space in the head. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset(" kb") - res = ragflow.delete_dataset(" kb") - assert (res['code'] == RetCode.OPERATING_ERROR - and res['message'] == 'The dataset cannot be found for your current account.') - - def test_delete_dataset_with_space_in_the_tail_of_the_name(self): - """ - Test deleting a dataset when its name has space in the tail. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset("kb ") - res = ragflow.delete_dataset("kb ") - assert (res['code'] == RetCode.OPERATING_ERROR - and res['message'] == 'The dataset cannot be found for your current account.') - - def test_delete_dataset_with_only_space_in_the_name(self): - """ - Test deleting a dataset when its name only has space. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset(" ") - res = ragflow.delete_dataset(" ") - assert (res['code'] == RetCode.OPERATING_ERROR - and res['message'] == 'The dataset cannot be found for your current account.') - - def test_delete_dataset_with_only_exceeding_limit_space_in_the_name(self): - """ - Test deleting a dataset when its name only has space and the number of it exceeds the limit. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - name = " " * (NAME_LENGTH_LIMIT + 1) - ragflow.create_dataset(name) - res = ragflow.delete_dataset(name) - assert (res['code'] == RetCode.OPERATING_ERROR - and res['message'] == 'The dataset cannot be found for your current account.') - - def test_delete_dataset_with_name_with_space_in_the_head_and_tail_and_length_exceed_limit(self): - """ - Test deleting a dataset whose name has space in the head and tail, - and the length of the name exceeds the limit. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - name = " " + "k" * NAME_LENGTH_LIMIT + " " - ragflow.create_dataset(name) - res = ragflow.delete_dataset(name) - assert (res['code'] == RetCode.OPERATING_ERROR - and res['message'] == 'The dataset cannot be found for your current account.') - -# ---------------------------------get_dataset----------------------------------------- - - def test_get_dataset_with_success(self): - """ - Test getting a dataset which exists. 
- """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - response = ragflow.create_dataset("test") - dataset_name = response['data']['dataset_name'] - res = ragflow.get_dataset(dataset_name) - assert res['code'] == RetCode.SUCCESS and res['data']['name'] == dataset_name - - def test_get_dataset_with_failure(self): - """ - Test getting a dataset which does not exist. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.get_dataset("weird_dataset") - assert res['code'] == RetCode.DATA_ERROR and res['message'] == "Can't find this dataset!" - -# ---------------------------------update a dataset----------------------------------- - - def test_update_dataset_without_existing_dataset(self): - """ - Test updating a dataset which does not exist. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - params = { - 'name': 'new_name3', - 'description': 'new_description', - "permission": 'me', - "parser_id": 'naive', - "language": 'English' - } - res = ragflow.update_dataset("weird_dataset", **params) - assert (res['code'] == RetCode.OPERATING_ERROR - and res['message'] == 'Only the owner of knowledgebase is authorized for this operation!') - - def test_update_dataset_with_updating_six_parameters(self): - """ - Test updating a dataset when updating six parameters. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset("new_name1") - params = { - 'name': 'new_name', - 'description': 'new_description1', - "permission": 'me', - "parser_id": 'naive', - "language": 'English' - } - res = ragflow.update_dataset("new_name1", **params) - assert res['code'] == RetCode.SUCCESS - assert (res['data']['description'] == 'new_description1' - and res['data']['name'] == 'new_name' and res['data']['permission'] == 'me' - and res['data']['language'] == 'English' and res['data']['parser_id'] == 'naive') - - def test_update_dataset_with_updating_two_parameters(self): - """ - Test updating a dataset when updating two parameters. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset("new_name2") - params = { - "name": "new_name3", - "language": 'English' - } - res = ragflow.update_dataset("new_name2", **params) - assert (res['code'] == RetCode.SUCCESS and res['data']['name'] == "new_name3" - and res['data']['language'] == 'English') - - def test_update_dataset_with_updating_layout_recognize(self): - """Test updating a dataset with only updating the layout_recognize""" - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset("test_update_dataset_with_updating_layout_recognize") - params = { - "layout_recognize": False - } - res = ragflow.update_dataset("test_update_dataset_with_updating_layout_recognize", **params) - assert res['code'] == RetCode.SUCCESS and res['data']['parser_config']['layout_recognize'] is False - - def test_update_dataset_with_empty_parameter(self): - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - ragflow.create_dataset("test_update_dataset_with_empty_parameter") - params = {} - res = ragflow.update_dataset("test_update_dataset_with_empty_parameter", **params) - assert (res['code'] == RetCode.DATA_ERROR - and res['message'] == 'Please input at least one parameter that you want to update!') - -# ---------------------------------mix the different methods-------------------------- - - def test_create_and_delete_dataset_together(self): - """ - Test creating 1 dataset, and then deleting 1 dataset. - Test creating 10 datasets, and then deleting 10 datasets. 
- """ - # create 1 dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - res = ragflow.create_dataset("ddd") - assert res['code'] == RetCode.SUCCESS and res['message'] == 'success' - - # delete 1 dataset - res = ragflow.delete_dataset("ddd") - assert res["code"] == RetCode.SUCCESS - - # create 10 datasets - datasets_to_create = ["dataset1"] * 10 - created_response = [ragflow.create_dataset(name) for name in datasets_to_create] - - real_name_to_create = set() - for response in created_response: - assert 'data' in response, "Response is missing 'data' key" - dataset_name = response['data']['dataset_name'] - real_name_to_create.add(dataset_name) - - # delete 10 datasets - for name in real_name_to_create: - res = ragflow.delete_dataset(name) - assert res["code"] == RetCode.SUCCESS - diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py deleted file mode 100644 index efc2430d4..000000000 --- a/sdk/python/test/test_document.py +++ /dev/null @@ -1,1130 +0,0 @@ -from api.settings import RetCode -from test_sdkbase import TestSdk -from ragflow import RAGFlow -import pytest -from common import API_KEY, HOST_ADDRESS - - -class TestFile(TestSdk): - """ - This class contains a suite of tests for the content management functionality within the dataset. - It ensures that the following functionalities as expected: - 1. upload local files - 2. upload remote files - 3. download a file - 4. delete a file - 5. enable rename - 6. list files - 7. start parsing - 8. end parsing - 9. check the status of the file - 10. list the chunks - 11. delete a chunk - 12. insert a new chunk - 13. edit the status of chunk - 14. get the specific chunk - 15. retrieval test - """ - -# ----------------------------upload local files----------------------------------------------------- - def test_upload_two_files(self): - """ - Test uploading two files with success. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_two_files") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert res["code"] == RetCode.SUCCESS and res["message"] == "success" - - def test_upload_one_file(self): - """ - Test uploading one file with success. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_one_file") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert res["code"] == RetCode.SUCCESS and res["message"] == "success" - - def test_upload_nonexistent_files(self): - """ - Test uploading a file which does not exist. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_nonexistent_files") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/imagination.txt"] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert res["code"] == RetCode.DATA_ERROR and "does not exist" in res["message"] - - def test_upload_file_if_dataset_does_not_exist(self): - """ - Test uploading files if the dataset id does not exist. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - file_paths = ["test_data/test.txt"] - res = ragflow.upload_local_file("111", file_paths) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Can't find this dataset" - - def test_upload_file_without_name(self): - """ - Test uploading files that do not have name. 
- """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_file_without_name") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/.txt"] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert res["code"] == RetCode.SUCCESS - - def test_upload_file_without_name1(self): - """ - Test uploading files that do not have name. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_file_without_name") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/.txt", "test_data/empty.txt"] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert res["code"] == RetCode.SUCCESS - - def test_upload_files_exceeding_the_number_limit(self): - """ - Test uploading files whose number exceeds the limit. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_files_exceeding_the_number_limit") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt", "test_data/test1.txt"] * 256 - res = ragflow.upload_local_file(dataset_id, file_paths) - assert (res["message"] == - "You try to upload 512 files, which exceeds the maximum number of uploading files: 256" - and res["code"] == RetCode.DATA_ERROR) - - def test_upload_files_without_files(self): - """ - Test uploading files without files. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_files_without_files") - dataset_id = created_res["data"]["dataset_id"] - file_paths = [None] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert (res["message"] == "None is not string." and res["code"] == RetCode.ARGUMENT_ERROR) - - def test_upload_files_with_two_files_with_same_name(self): - """ - Test uploading files with the same name. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_files_with_two_files_with_same_name") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] * 2 - res = ragflow.upload_local_file(dataset_id, file_paths) - assert (res["message"] == "success" and res["code"] == RetCode.SUCCESS) - - def test_upload_files_with_file_paths(self): - """ - Test uploading files with only specifying the file path's repo. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_files_with_file_paths") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/"] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert (res["message"] == "The file test_data/ does not exist" and res["code"] == RetCode.DATA_ERROR) - - def test_upload_files_with_remote_file_path(self): - """ - Test uploading files with remote files. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_upload_files_with_remote_file_path") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["https://github.com/genostack/ragflow"] - res = ragflow.upload_local_file(dataset_id, file_paths) - assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == "Remote files have not unsupported." - -# ----------------------------delete a file----------------------------------------------------- - def test_delete_one_file(self): - """ - Test deleting one file with success. 
- """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_delete_one_file") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] - res = ragflow.upload_local_file(dataset_id, file_paths) - # get the doc_id - data = res["data"][0] - doc_id = data["id"] - # delete the files - deleted_res = ragflow.delete_files(doc_id, dataset_id) - # assert value - assert deleted_res["code"] == RetCode.SUCCESS and deleted_res["data"] is True - - def test_delete_document_with_not_existing_document(self): - """ - Test deleting a document that does not exist with failure. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_delete_document_with_not_existing_document") - dataset_id = created_res["data"]["dataset_id"] - res = ragflow.delete_files("111", dataset_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Document 111 not found!" - - def test_delete_document_with_creating_100_documents_and_deleting_100_documents(self): - """ - Test deleting documents when uploading 100 docs and deleting 100 docs. - """ - # upload 100 docs - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_delete_one_file") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] * 100 - res = ragflow.upload_local_file(dataset_id, file_paths) - - # get the doc_id - data = res["data"] - for d in data: - doc_id = d["id"] - # delete the files - deleted_res = ragflow.delete_files(doc_id, dataset_id) - # assert value - assert deleted_res["code"] == RetCode.SUCCESS and deleted_res["data"] is True - - def test_delete_document_from_nonexistent_dataset(self): - """ - Test deleting documents from a non-existent dataset - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_delete_one_file") - dataset_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] - res = ragflow.upload_local_file(dataset_id, file_paths) - # get the doc_id - data = res["data"][0] - doc_id = data["id"] - # delete the files - deleted_res = ragflow.delete_files(doc_id, "000") - # assert value - assert (deleted_res["code"] == RetCode.ARGUMENT_ERROR and deleted_res["message"] == - f"The document {doc_id} is not in the dataset: 000, but in the dataset: {dataset_id}.") - - def test_delete_document_which_is_located_in_other_dataset(self): - """ - Test deleting a document which is located in other dataset. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - # upload a document - created_res = ragflow.create_dataset("test_delete_document_which_is_located_in_other_dataset") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] - res = ragflow.upload_local_file(created_res_id, file_paths) - # other dataset - other_res = ragflow.create_dataset("other_dataset") - other_dataset_id = other_res["data"]["dataset_id"] - # get the doc_id - data = res["data"][0] - doc_id = data["id"] - # delete the files from the other dataset - deleted_res = ragflow.delete_files(doc_id, other_dataset_id) - # assert value - assert (deleted_res["code"] == RetCode.ARGUMENT_ERROR and deleted_res["message"] == - f"The document {doc_id} is not in the dataset: {other_dataset_id}, but in the dataset: {created_res_id}.") - -# ----------------------------list files----------------------------------------------------- - def test_list_documents_with_success(self): - """ - Test listing documents with a successful outcome. 
- """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - # upload a document - created_res = ragflow.create_dataset("test_list_documents_with_success") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - # Call the list_document method - response = ragflow.list_files(created_res_id) - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 1 - - def test_list_documents_with_checking_size(self): - """ - Test listing documents and verify the size and names of the documents. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - # upload 10 documents - created_res = ragflow.create_dataset("test_list_documents_with_checking_size") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] * 10 - ragflow.upload_local_file(created_res_id, file_paths) - # Call the list_document method - response = ragflow.list_files(created_res_id) - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 10 - - def test_list_documents_with_getting_empty_result(self): - """ - Test listing documents that should be empty. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - # upload 0 documents - created_res = ragflow.create_dataset("test_list_documents_with_getting_empty_result") - created_res_id = created_res["data"]["dataset_id"] - # Call the list_document method - response = ragflow.list_files(created_res_id) - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 0 - - def test_list_documents_with_creating_100_documents(self): - """ - Test listing 100 documents and verify the size of these documents. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - # upload 100 documents - created_res = ragflow.create_dataset("test_list_documents_with_creating_100_documents") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt"] * 100 - ragflow.upload_local_file(created_res_id, file_paths) - # Call the list_document method - response = ragflow.list_files(created_res_id) - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 100 - - def test_list_document_with_failure(self): - """ - Test listing documents with IndexError. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_list_document_with_failure") - created_res_id = created_res["data"]["dataset_id"] - response = ragflow.list_files(created_res_id, offset=-1, count=-1) - assert "IndexError" in response["message"] and response["code"] == RetCode.EXCEPTION_ERROR - - def test_list_document_with_verifying_offset_and_count(self): - """ - Test listing documents with verifying the functionalities of offset and count. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_list_document_with_verifying_offset_and_count") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt", "test_data/empty.txt"] * 10 - ragflow.upload_local_file(created_res_id, file_paths) - # Call the list_document method - response = ragflow.list_files(created_res_id, offset=2, count=10) - - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 10 - - def test_list_document_with_verifying_keywords(self): - """ - Test listing documents with verifying the functionality of searching keywords. 
- """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_list_document_with_verifying_keywords") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt", "test_data/empty.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - # Call the list_document method - response = ragflow.list_files(created_res_id, keywords="empty") - - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 1 - - def test_list_document_with_verifying_order_by_and_descend(self): - """ - Test listing documents with verifying the functionality of order_by and descend. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_descend") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt", "test_data/empty.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - # Call the list_document method - response = ragflow.list_files(created_res_id) - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 2 - docs = response["data"]["docs"] - # reverse - i = 1 - for doc in docs: - assert doc["name"] in file_paths[i] - i -= 1 - - def test_list_document_with_verifying_order_by_and_ascend(self): - """ - Test listing documents with verifying the functionality of order_by and ascend. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_list_document_with_verifying_order_by_and_ascend") - created_res_id = created_res["data"]["dataset_id"] - file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - # Call the list_document method - response = ragflow.list_files(created_res_id, descend=False) - assert response["code"] == RetCode.SUCCESS and len(response["data"]["docs"]) == 3 - - docs = response["data"]["docs"] - - i = 0 - for doc in docs: - assert doc["name"] in file_paths[i] - i += 1 - -# ----------------------------update files: enable, rename, template_type------------------------------------------- - - def test_update_nonexistent_document(self): - """ - Test updating a document which does not exist. - """ - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - params = { - "name": "new_name" - } - res = ragflow.update_file(created_res_id, "weird_doc_id", **params) - assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document weird_doc_id cannot be found!" - - def test_update_document_without_parameters(self): - """ - Test updating a document without giving parameters. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_without_parameters") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.DATA_ERROR and - update_res["message"] == "Please input at least one parameter that you want to update!") - - def test_update_document_in_nonexistent_dataset(self): - """ - Test updating a document in the nonexistent dataset. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_in_nonexistent_dataset") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "name": "new_name" - } - update_res = ragflow.update_file("fake_dataset_id", doc_id, **params) - assert (update_res["code"] == RetCode.DATA_ERROR and - update_res["message"] == f"This dataset fake_dataset_id cannot be found!") - - def test_update_document_with_different_extension_name(self): - """ - Test the updating of a document with an extension name that differs from its original. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_different_extension_name") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "name": "new_name.doc" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.ARGUMENT_ERROR and - update_res["message"] == "The extension of file cannot be changed") - - def test_update_document_with_duplicate_name(self): - """ - Test the updating of a document with a duplicate name. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_different_extension_name") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "name": "test.txt" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.ARGUMENT_ERROR and - update_res["message"] == "Duplicated document name in the same dataset.") - - def test_update_document_with_updating_its_name_with_success(self): - """ - Test the updating of a document's name with success. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "name": "new_name.txt" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.SUCCESS and - update_res["message"] == "Success" and update_res["data"]["name"] == "new_name.txt") - - def test_update_document_with_updating_its_template_type_with_success(self): - """ - Test the updating of a document's template type with success. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_its_template_type_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "template_type": "laws" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.SUCCESS and - update_res["message"] == "Success" and update_res["data"]["parser_id"] == "laws") - - def test_update_document_with_updating_its_enable_value_with_success(self): - """ - Test the updating of a document's enable value with success. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_its_enable_value_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "enable": "0" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.SUCCESS and - update_res["message"] == "Success" and update_res["data"]["status"] == "0") - - def test_update_document_with_updating_illegal_parameter(self): - """ - Test the updating of a document's illegal parameter. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_illegal_parameter") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "illegal_parameter": "0" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.ARGUMENT_ERROR and - update_res["message"] == "illegal_parameter is an illegal parameter.") - - def test_update_document_with_giving_its_name_value(self): - """ - Test the updating of a document's name without its name value. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "name": "" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.DATA_ERROR and - update_res["message"] == "There is no new name.") - - def test_update_document_with_giving_illegal_value_for_enable(self): - """ - Test the updating of a document's with giving illegal enable's value. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "enable": "?" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.DATA_ERROR and - update_res["message"] == "Illegal value ? for 'enable' field.") - - def test_update_document_with_giving_illegal_value_for_type(self): - """ - Test the updating of a document's with giving illegal type's value. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_update_document_with_updating_its_name_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # update file - params = { - "template_type": "?" - } - update_res = ragflow.update_file(created_res_id, doc_id, **params) - assert (update_res["code"] == RetCode.DATA_ERROR and - update_res["message"] == "Illegal value ? for 'template_type' field.") - -# ----------------------------download a file----------------------------------------------------- - - def test_download_nonexistent_document(self): - """ - Test downloading a document which does not exist. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - res = ragflow.download_file(created_res_id, "imagination") - assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document 'imagination' cannot be found!" - - def test_download_document_in_nonexistent_dataset(self): - """ - Test downloading a document whose dataset is nonexistent. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # download file - res = ragflow.download_file("imagination", doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset 'imagination' cannot be found!" - - def test_download_document_with_success(self): - """ - Test the downloading of a document with success. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # download file - with open("test_data/test.txt", "rb") as file: - binary_data = file.read() - res = ragflow.download_file(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["data"] == binary_data - - def test_download_an_empty_document(self): - """ - Test the downloading of an empty document. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/empty.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # download file - res = ragflow.download_file(created_res_id, doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty." - -# ----------------------------start parsing----------------------------------------------------- - def test_start_parsing_document_with_success(self): - """ - Test the parsing of a document with success. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_start_parsing_document_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/lol.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # parse file - res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - - def test_start_parsing_nonexistent_document(self): - """ - Test the parsing a document which does not exist. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - res = ragflow.start_parsing_document(created_res_id, "imagination") - assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == "This document 'imagination' cannot be found!" - - def test_start_parsing_document_in_nonexistent_dataset(self): - """ - Test the parsing a document whose dataset is nonexistent. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # parse - res = ragflow.start_parsing_document("imagination", doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" - - def test_start_parsing_an_empty_document(self): - """ - Test the parsing of an empty document. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/empty.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " - - # ------------------------parsing multiple documents---------------------------- - def test_start_parsing_documents_in_nonexistent_dataset(self): - """ - Test the parsing documents whose dataset is nonexistent. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # parse - res = ragflow.start_parsing_documents("imagination") - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" - - def test_start_parsing_multiple_documents(self): - """ - Test the parsing documents with a success. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" - - def test_start_parsing_multiple_documents_with_one_empty_file(self): - """ - Test the parsing documents, one of which is empty. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " - - def test_start_parsing_multiple_specific_documents(self): - """ - Test the parsing documents whose document ids are specified. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"] - doc_ids = [] - for d in data: - doc_ids.append(d["id"]) - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - - def test_start_re_parsing_multiple_specific_documents(self): - """ - Test the re-parsing documents. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"] - doc_ids = [] - for d in data: - doc_ids.append(d["id"]) - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - # re-parse - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - - def test_start_re_parsing_multiple_specific_documents_with_changing_parser_id(self): - """ - Test the re-parsing documents after changing the parser id. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"] - doc_ids = [] - for d in data: - doc_ids.append(d["id"]) - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - # general -> laws - params = { - "template_type": "laws" - } - ragflow.update_file(created_res_id, doc_ids[0], **params) - # re-parse - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - - def test_start_re_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self): - """ - Test the re-parsing documents after changing an illegal parser id. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"] - doc_ids = [] - for d in data: - doc_ids.append(d["id"]) - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - # general -> illegal - params = { - "template_type": "illegal" - } - res = ragflow.update_file(created_res_id, doc_ids[0], **params) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field." - # re-parse - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - - def test_start_parsing_multiple_specific_documents_with_changing_illegal_parser_id(self): - """ - Test the parsing documents after changing an illegal parser id. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"] - doc_ids = [] - for d in data: - doc_ids.append(d["id"]) - # general -> illegal - params = { - "template_type": "illegal" - } - res = ragflow.update_file(created_res_id, doc_ids[0], **params) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'template_type' field." - # re-parse - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - - def test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal(self): - """ - Test the parsing documents whose dataset's parser id is illegal. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal") - created_res_id = created_res["data"]["dataset_id"] - # update the parser id - params = { - "chunk_method": "illegal" - } - res = ragflow.update_dataset("test_start_parsing_multiple_documents_in_the_dataset_whose_parser_id_is_illegal", **params) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "Illegal value illegal for 'chunk_method' field." - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"] - doc_ids = [] - for d in data: - doc_ids.append(d["id"]) - # parse - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - -# ----------------------------stop parsing----------------------------------------------------- - def test_stop_parsing_document_with_success(self): - """ - Test the stopping parsing of a document with success. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_start_parsing_document_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/lol.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # parse file - res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - res = ragflow.stop_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - - def test_stop_parsing_nonexistent_document(self): - """ - Test the stopping parsing a document which does not exist. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_start_parsing_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - res = ragflow.stop_parsing_document(created_res_id, "imagination.txt") - assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == "This document 'imagination.txt' cannot be found!" - - def test_stop_parsing_document_in_nonexistent_dataset(self): - """ - Test the stopping parsing a document whose dataset is nonexistent. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # parse - res = ragflow.stop_parsing_document("imagination", doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" - - # ------------------------stop parsing multiple documents---------------------------- - def test_stop_parsing_documents_in_nonexistent_dataset(self): - """ - Test the stopping parsing documents whose dataset is nonexistent. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_download_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # parse - res = ragflow.stop_parsing_documents("imagination") - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset 'imagination' cannot be found!" - - def test_stop_parsing_multiple_documents(self): - """ - Test the stopping parsing documents with a success. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" - - res = ragflow.stop_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" - - def test_stop_parsing_multiple_documents_with_one_empty_file(self): - """ - Test the stopping parsing documents, one of which is empty. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt", "test_data/empty.txt"] - ragflow.upload_local_file(created_res_id, file_paths) - res = ragflow.start_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "Empty data in the document: empty.txt; " - res = ragflow.stop_parsing_documents(created_res_id) - assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" - - def test_stop_parsing_multiple_specific_documents(self): - """ - Test the stopping parsing documents whose document ids are specified. 
- """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset(" test_start_parsing_multiple_documents") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt", "test_data/test1.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"] - doc_ids = [] - for d in data: - doc_ids.append(d["id"]) - res = ragflow.start_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - res = ragflow.stop_parsing_documents(created_res_id, doc_ids) - assert res["code"] == RetCode.SUCCESS and res["data"] is True and res["message"] == "" - -# ----------------------------show the status of the file----------------------------------------------------- - def test_show_status_with_success(self): - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_show_status_with_success") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/lol.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # parse file - res = ragflow.start_parsing_document(created_res_id, doc_id) - assert res["code"] == RetCode.SUCCESS and res["message"] == "" - # show status - status_res = ragflow.show_parsing_status(created_res_id, doc_id) - assert status_res["code"] == RetCode.SUCCESS and status_res["data"]["status"] == "RUNNING" - - def test_show_status_nonexistent_document(self): - """ - Test showing the status of a document which does not exist. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_show_status_nonexistent_document") - created_res_id = created_res["data"]["dataset_id"] - res = ragflow.show_parsing_status(created_res_id, "imagination") - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This document: 'imagination' is not a valid document." - - def test_show_status_document_in_nonexistent_dataset(self): - """ - Test showing the status of a document whose dataset is nonexistent. - """ - # create a dataset - ragflow = RAGFlow(API_KEY, HOST_ADDRESS) - created_res = ragflow.create_dataset("test_show_status_document_in_nonexistent_dataset") - created_res_id = created_res["data"]["dataset_id"] - # upload files - file_paths = ["test_data/test.txt"] - uploading_res = ragflow.upload_local_file(created_res_id, file_paths) - # get the doc_id - data = uploading_res["data"][0] - doc_id = data["id"] - # parse - res = ragflow.show_parsing_status("imagination", doc_id) - assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This dataset: 'imagination' cannot be found!" -# ----------------------------list the chunks of the file----------------------------------------------------- - -# ----------------------------delete the chunk----------------------------------------------------- - -# ----------------------------edit the status of the chunk----------------------------------------------------- - -# ----------------------------insert a new chunk----------------------------------------------------- - -# ----------------------------get a specific chunk----------------------------------------------------- - -# ----------------------------retrieval test-----------------------------------------------------