diff --git a/api/apps/dataset_api.py b/api/apps/dataset_api.py
index 0f7c8efe6..697f8fa99 100644
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@@ -14,36 +14,23 @@
 # limitations under the License.
 
-import json
-import os
-import re
-from datetime import datetime, timedelta
-from flask import request, Response
+from flask import request
 from flask_login import login_required, current_user
 from httpx import HTTPError
 
-from api.db import FileType, ParserType, FileSource, StatusEnum
-from api.db.db_models import APIToken, API4Conversation, Task, File
+from api.contants import NAME_LENGTH_LIMIT
+from api.db import FileSource, StatusEnum
+from api.db.db_models import File
 from api.db.services import duplicate_name
-from api.db.services.api_service import APITokenService, API4ConversationService
-from api.db.services.dialog_service import DialogService, chat
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.task_service import queue_tasks, TaskService
-from api.db.services.user_service import UserTenantService, TenantService
-from api.settings import RetCode, retrievaler
-from api.utils import get_uuid, current_timestamp, datetime_format
-# from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, validate_request
-from itsdangerous import URLSafeTimedSerializer
-
-from api.utils.file_utils import filename_type, thumbnail
-from rag.utils.minio_conn import MINIO
-
-# import library
+from api.db.services.user_service import TenantService
+from api.settings import RetCode
+from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
-from api.contants import NAME_LENGTH_LIMIT
+
 
 # ------------------------------ create a dataset ---------------------------------------
diff --git a/api/apps/documents_api.py b/api/apps/documents_api.py
new file mode 100644
index 000000000..7338eeb66
--- /dev/null
+++ b/api/apps/documents_api.py
@@ -0,0 +1,172 @@
+#
+# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+#
+
+import os
+import re
+import warnings
+
+from flask import request
+from flask_login import login_required, current_user
+
+from api.db import FileType, ParserType
+from api.db.services import duplicate_name
+from api.db.services.document_service import DocumentService
+from api.db.services.file_service import FileService
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.settings import RetCode
+from api.utils import get_uuid
+from api.utils.api_utils import construct_json_result
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils.minio_conn import MINIO
+
+
+MAXIMUM_OF_UPLOADING_FILES = 256
+
+
+# ----------------------------upload local files-----------------------------------------------------
+@manager.route('/<dataset_id>', methods=['POST'])
+@login_required
+def upload(dataset_id):
+    # no files
+    if not request.files:
+        return construct_json_result(
+            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
+
+    # the number of uploading files exceeds the limit
+    file_objs = request.files.getlist('file')
+    num_file_objs = len(file_objs)
+
+    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
+        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You are trying to upload {num_file_objs} files, "
+                                     f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
+
+    for file_obj in file_objs:
+        # the content of the file
+        file_content = file_obj.read()
+        file_name = file_obj.filename
+        # no name
+        if not file_name:
+            return construct_json_result(
+                message='There is a file without a name!', code=RetCode.ARGUMENT_ERROR)
+
+        # TODO: support remote files
+        if 'http' in file_name:
+            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files are not supported yet.")
+
+        # if the content is empty, raise a warning
+        if file_content == b'':
+            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
+
+    # no dataset
+    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+    if not exist:
+        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
+
+    # get the root_folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # get the id of the root_folder
+    parent_file_id = root_folder["id"]  # document id
+    # this is for the new user, create the '.knowledgebase' folder
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # go inside this folder, get the kb_root_folder
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    # link the file management to the kb_folder
+    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
+
+    # collect all the errors
+    err = []
+    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+    for file in file_objs:
+        try:
+            # TODO: get this value from the database as some tenants have this limit while others don't
+            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="Exceeded the maximum file number of a free user!")
+            # deal with the duplicate name
+            filename = duplicate_name(
+                DocumentService.query,
+                name=file.filename,
+                kb_id=dataset.id)
+
+            # deal with the unsupported type
+            filetype = filename_type(filename)
+            if filetype == FileType.OTHER.value:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="This type of file has not been supported yet!")
+
+            # upload to MinIO
+            location = filename
+            while MINIO.obj_exist(dataset_id, location):
+                location += "_"
+            blob = file.read()
+            MINIO.put(dataset_id, location, blob)
+            doc = {
+                "id": get_uuid(),
+                "kb_id": dataset.id,
+                "parser_id": dataset.parser_id,
+                "parser_config": dataset.parser_config,
+                "created_by": current_user.id,
+                "type": filetype,
+                "name": filename,
+                "location": location,
+                "size": len(blob),
+                "thumbnail": thumbnail(filename, blob)
+            }
+            if doc["type"] == FileType.VISUAL:
+                doc["parser_id"] = ParserType.PICTURE.value
+            if re.search(r"\.(ppt|pptx|pages)$", filename):
+                doc["parser_id"] = ParserType.PRESENTATION.value
+            DocumentService.insert(doc)
+
+            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
+        except Exception as e:
+            err.append(file.filename + ": " + str(e))
+
+    if err:
+        # return all the errors
+        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
+    # success
+    return construct_json_result(data=True, code=RetCode.SUCCESS)
+
+# ----------------------------upload online files------------------------------------------------
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------delete a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------list files-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
diff --git a/sdk/python/ragflow/ragflow.py b/sdk/python/ragflow/ragflow.py
index fe3477479..4277480c6 100644
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@@ -13,9 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import os
-import requests
 import json
+import os
+
+import requests
+
+from api.settings import RetCode
 
 
 class RAGFlow:
@@ -23,10 +26,12 @@ class RAGFlow:
         '''
         api_url: http://<host_address>/api/v1
         dataset_url: http://<host_address>/api/v1/dataset
+        document_url: http://<host_address>/api/v1/documents
         '''
         self.user_key = user_key
         self.api_url = f"{base_url}/api/{version}"
         self.dataset_url = f"{self.api_url}/dataset"
+        self.document_url = f"{self.api_url}/documents"
         self.authorization_header = {"Authorization": "{}".format(self.user_key)}
 
     def create_dataset(self, dataset_name):
@@ -73,3 +78,54 @@ class RAGFlow:
         endpoint = f"{self.dataset_url}/{dataset_id}"
         response = requests.put(endpoint, json=params, headers=self.authorization_header)
         return response.json()
+
+# -------------------- content management -----------------------------------------------------
+
+    # ----------------------------upload local files-----------------------------------------------------
+    def upload_local_file(self, dataset_id, file_paths):
+        files = []
+
+        for file_path in file_paths:
+            if not isinstance(file_path, str):
+                return {'code': RetCode.ARGUMENT_ERROR, 'message': f"{file_path} is not a string."}
+            if 'http' in file_path:
+                return {'code': RetCode.ARGUMENT_ERROR, 'message': "Remote files are not supported yet."}
+            if os.path.isfile(file_path):
+                files.append(('file', open(file_path, 'rb')))
+            else:
+                return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
+
+        res = requests.request('POST', url=f"{self.document_url}/{dataset_id}", files=files,
+                               headers=self.authorization_header)
+
+        result_dict = json.loads(res.text)
+        return result_dict
+
+    # ----------------------------upload remote files-----------------------------------------------------
+    # ----------------------------download a file-----------------------------------------------------
+
+    # ----------------------------delete a file-----------------------------------------------------
+
+    # ----------------------------enable rename-----------------------------------------------------
+
+    # ----------------------------list files-----------------------------------------------------
+
+    # ----------------------------start parsing-----------------------------------------------------
+
+    # ----------------------------stop parsing-----------------------------------------------------
+
+    # ----------------------------show the status of the file-----------------------------------------------------
+
+    # ----------------------------list the chunks of the file-----------------------------------------------------
+
+    # ----------------------------delete the chunk-----------------------------------------------------
+
+    # ----------------------------edit the status of the chunk-----------------------------------------------------
+
+    # ----------------------------insert a new chunk-----------------------------------------------------
+
+    # ----------------------------upload a file-----------------------------------------------------
+
+    # ----------------------------get a specific chunk-----------------------------------------------------
+
+    # ----------------------------retrieval test-----------------------------------------------------
diff --git a/sdk/python/test/test_data/.txt b/sdk/python/test/test_data/.txt
new file mode 100644
index 000000000..b0b611ed5
--- /dev/null
+++ b/sdk/python/test/test_data/.txt
@@ -0,0 +1,2 @@
+hhh
+hhh
\ No newline at end of file
diff --git a/sdk/python/test/test_data/empty.txt b/sdk/python/test/test_data/empty.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/sdk/python/test/test_data/test.txt b/sdk/python/test/test_data/test.txt
new file mode 100644
index 000000000..30fde28b9
--- /dev/null
+++ b/sdk/python/test/test_data/test.txt
@@ -0,0 +1,3 @@
+test
+test
+test
\ No newline at end of file
diff --git a/sdk/python/test/test_data/test1.txt b/sdk/python/test/test_data/test1.txt
new file mode 100644
index 000000000..90b65adde
--- /dev/null
+++ b/sdk/python/test/test_data/test1.txt
@@ -0,0 +1,2 @@
+test1
+test1
\ No newline at end of file
diff --git a/sdk/python/test/test_document.py b/sdk/python/test/test_document.py
new file mode 100644
index 000000000..f22ebca70
--- /dev/null
+++ b/sdk/python/test/test_document.py
@@ -0,0 +1,180 @@
+from api.settings import RetCode
+from test_sdkbase import TestSdk
+from ragflow import RAGFlow
+import pytest
+from common import API_KEY, HOST_ADDRESS
+from api.contants import NAME_LENGTH_LIMIT
+
+
+class TestFile(TestSdk):
+    """
+    This class contains a suite of tests for the content management functionality within the dataset.
+    It ensures that the following functionalities work as expected:
+    1. upload local files
+    2. upload remote files
+    3. download a file
+    4. delete a file
+    5. enable rename
+    6. list files
+    7. start parsing
+    8. stop parsing
+    9. check the status of the file
+    10. list the chunks
+    11. delete a chunk
+    12. insert a new chunk
+    13. edit the status of a chunk
+    14. get the specific chunk
+    15. retrieval test
+    """
+
+# ----------------------------upload local files-----------------------------------------------------
+    def test_upload_two_files(self):
+        """
+        Test uploading two files with success.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_two_files")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+
+    def test_upload_one_file(self):
+        """
+        Test uploading one file with success.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_one_file")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+
+    def test_upload_nonexistent_files(self):
+        """
+        Test uploading a file that does not exist.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_nonexistent_files")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/imagination.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.DATA_ERROR and "does not exist" in res['message']
+
+    def test_upload_file_if_dataset_does_not_exist(self):
+        """
+        Test uploading files when the dataset id does not exist.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        file_paths = ["test_data/test.txt"]
+        res = ragflow.upload_local_file("111", file_paths)
+        assert res['code'] == RetCode.DATA_ERROR and res['message'] == "Can't find this dataset"
+
+    def test_upload_file_without_name(self):
+        """
+        Test uploading a file that does not have a name.
+ """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_upload_file_without_name") + dataset_id = created_res['data']['dataset_id'] + file_paths = ["test_data/.txt"] + res = ragflow.upload_local_file(dataset_id, file_paths) + assert res['code'] == RetCode.SUCCESS + + def test_upload_file_without_name1(self): + """ + Test uploading files that do not have name. + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_upload_file_without_name") + dataset_id = created_res['data']['dataset_id'] + file_paths = ["test_data/.txt", "test_data/empty.txt"] + res = ragflow.upload_local_file(dataset_id, file_paths) + assert res['code'] == RetCode.SUCCESS + + def test_upload_files_exceeding_the_number_limit(self): + """ + Test uploading files whose number exceeds the limit. + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_upload_files_exceeding_the_number_limit") + dataset_id = created_res['data']['dataset_id'] + file_paths = ["test_data/test.txt", "test_data/test1.txt"] * 256 + res = ragflow.upload_local_file(dataset_id, file_paths) + assert (res['message'] == + 'You try to upload 512 files, which exceeds the maximum number of uploading files: 256' + and res['code'] == RetCode.DATA_ERROR) + + def test_upload_files_without_files(self): + """ + Test uploading files without files. + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_upload_files_without_files") + dataset_id = created_res['data']['dataset_id'] + file_paths = [None] + res = ragflow.upload_local_file(dataset_id, file_paths) + assert (res['message'] == 'None is not string.' and res['code'] == RetCode.ARGUMENT_ERROR) + + def test_upload_files_with_two_files_with_same_name(self): + """ + Test uploading files with the same name. + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_upload_files_with_two_files_with_same_name") + dataset_id = created_res['data']['dataset_id'] + file_paths = ['test_data/test.txt'] * 2 + res = ragflow.upload_local_file(dataset_id, file_paths) + assert (res['message'] == 'success' and res['code'] == RetCode.SUCCESS) + + def test_upload_files_with_file_paths(self): + """ + Test uploading files with only specifying the file path's repo. + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_upload_files_with_file_paths") + dataset_id = created_res['data']['dataset_id'] + file_paths = ['test_data/'] + res = ragflow.upload_local_file(dataset_id, file_paths) + assert (res['message'] == 'The file test_data/ does not exist' and res['code'] == RetCode.DATA_ERROR) + + def test_upload_files_with_remote_file_path(self): + """ + Test uploading files with remote files. + """ + ragflow = RAGFlow(API_KEY, HOST_ADDRESS) + created_res = ragflow.create_dataset("test_upload_files_with_remote_file_path") + dataset_id = created_res['data']['dataset_id'] + file_paths = ['https://github.com/genostack/ragflow'] + res = ragflow.upload_local_file(dataset_id, file_paths) + assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.' 
+
+# ----------------------------upload remote files-----------------------------------------------------
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------delete a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------list files-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
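
Usage note: a minimal sketch of how the endpoint and SDK method added above fit together, assuming a running RAGFlow server and a valid API key. The server address, API key, and dataset name below are placeholders; the response shape (data.dataset_id, RetCode.SUCCESS) follows the tests in test_document.py.

    from api.settings import RetCode
    from ragflow import RAGFlow

    # Placeholder server address and API key; substitute real values.
    ragflow = RAGFlow("YOUR_API_KEY", "http://127.0.0.1:9380")

    # Create a dataset, then upload local files into it.
    # upload_local_file() POSTs the files as multipart form data
    # (one 'file' field per path) to /api/v1/documents/<dataset_id>,
    # which is served by upload() in api/apps/documents_api.py.
    created_res = ragflow.create_dataset("demo_dataset")
    dataset_id = created_res['data']['dataset_id']

    res = ragflow.upload_local_file(dataset_id, ["test_data/test.txt", "test_data/test1.txt"])
    assert res['code'] == RetCode.SUCCESS and res['data'] is True

Note that upload_local_file validates its arguments client-side (non-string paths, remote URLs, nonexistent files) before any request is sent, so those error dicts come back without a round trip to the server.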