API: upload document api (#1264)

### What problem does this PR solve? API: Adds an endpoint and an SDK method for uploading local documents to a dataset. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2025-08-13 11:39:03 +08:00 · 2024-06-25 12:16:28 +08:00 · 2024-06-25 12:16:28 +08:00 · f6ae8fcb71
commit f6ae8fcb71
parent d1ea429bdd
8 changed files with 425 additions and 23 deletions
--- a/api/apps/dataset_api.py
+++ b/api/apps/dataset_api.py
@ -14,36 +14,23 @@
 #  limitations under the License.


-import json
-import os
-import re
-from datetime import datetime, timedelta
-from flask import request, Response
+from flask import request
 from flask_login import login_required, current_user
 from httpx import HTTPError

-from api.db import FileType, ParserType, FileSource, StatusEnum
-from api.db.db_models import APIToken, API4Conversation, Task, File
+from api.contants import NAME_LENGTH_LIMIT
+from api.db import FileSource, StatusEnum
+from api.db.db_models import File
 from api.db.services import duplicate_name
-from api.db.services.api_service import APITokenService, API4ConversationService
-from api.db.services.dialog_service import DialogService, chat
 from api.db.services.document_service import DocumentService
 from api.db.services.file2document_service import File2DocumentService
 from api.db.services.file_service import FileService
 from api.db.services.knowledgebase_service import KnowledgebaseService
-from api.db.services.task_service import queue_tasks, TaskService
-from api.db.services.user_service import UserTenantService, TenantService
-from api.settings import RetCode, retrievaler
-from api.utils import get_uuid, current_timestamp, datetime_format
-# from api.utils.api_utils import server_error_response, get_data_error_result, get_json_result, validate_request
-from itsdangerous import URLSafeTimedSerializer
-
-from api.utils.file_utils import filename_type, thumbnail
-from rag.utils.minio_conn import MINIO
-
-# import library
+from api.db.services.user_service import TenantService
+from api.settings import RetCode
+from api.utils import get_uuid
 from api.utils.api_utils import construct_json_result, construct_result, construct_error_response, validate_request
-from api.contants import NAME_LENGTH_LIMIT
+

 # ------------------------------ create a dataset ---------------------------------------

--- a/api/apps/documents_api.py
+++ b/api/apps/documents_api.py
@ -0,0 +1,172 @@
+#
+#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License
+#
+
+import os
+import re
+import warnings
+
+from flask import request
+from flask_login import login_required, current_user
+
+from api.db import FileType, ParserType
+from api.db.services import duplicate_name
+from api.db.services.document_service import DocumentService
+from api.db.services.file_service import FileService
+from api.db.services.knowledgebase_service import KnowledgebaseService
+from api.settings import RetCode
+from api.utils import get_uuid
+from api.utils.api_utils import construct_json_result
+from api.utils.file_utils import filename_type, thumbnail
+from rag.utils.minio_conn import MINIO
+
+
+MAXIMUM_OF_UPLOADING_FILES = 256
+
+
+# ----------------------------upload local files-----------------------------------------------------
+@manager.route('/<dataset_id>', methods=['POST'])
+@login_required
+def upload(dataset_id):
+    """
+    Upload one or more local files into the dataset identified by dataset_id.
+
+    The files are taken from the 'file' field of the multipart request. Each file
+    is stored in MINIO under the dataset id, inserted as a document of the dataset
+    and linked into the knowledge-base folder of the current user.
+
+    :param dataset_id: id of the target dataset, taken from the URL.
+    :return: the value built by construct_json_result: data=True with
+        RetCode.SUCCESS when every file was stored, otherwise a message with
+        RetCode.ARGUMENT_ERROR, RetCode.DATA_ERROR or RetCode.SERVER_ERROR.
+    """
+    # no files
+    if not request.files:
+        return construct_json_result(
+            message='There is no file!', code=RetCode.ARGUMENT_ERROR)
+
+    # the number of uploading files exceeds the limit
+    file_objs = request.files.getlist('file')
+    num_file_objs = len(file_objs)
+
+    if num_file_objs > MAXIMUM_OF_UPLOADING_FILES:
+        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
+                                                                      f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
+
+    for file_obj in file_objs:
+        file_name = file_obj.filename
+        # no name
+        if not file_name:
+            return construct_json_result(
+                message='There is a file without name!', code=RetCode.ARGUMENT_ERROR)
+
+        # TODO: support the remote files
+        if 'http' in file_name:
+            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
+
+        # the content is empty, raising a warning
+        # Only peek at one byte and then rewind: the storage loop below reads the
+        # very same stream again, and a consumed stream would make it store an
+        # empty blob for every file.
+        is_empty = file_obj.read(1) == b''
+        file_obj.seek(0)
+        if is_empty:
+            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
+
+    # no dataset
+    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
+    if not exist:
+        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
+
+    # get the root_folder
+    root_folder = FileService.get_root_folder(current_user.id)
+    # get the id of the root_folder
+    parent_file_id = root_folder["id"]  # document id
+    # this is for the new user, create '.knowledgebase' file
+    FileService.init_knowledgebase_docs(parent_file_id, current_user.id)
+    # go inside this folder, get the kb_root_folder
+    kb_root_folder = FileService.get_kb_folder(current_user.id)
+    # link the file management to the kb_folder
+    kb_folder = FileService.new_a_file_from_kb(dataset.tenant_id, dataset.name, kb_root_folder["id"])
+
+    # grab all the errs
+    err = []
+    # a value of 0 (the default) disables the per-user limit check below
+    MAX_FILE_NUM_PER_USER = int(os.environ.get('MAX_FILE_NUM_PER_USER', 0))
+    for file in file_objs:
+        try:
+            # TODO: get this value from the database as some tenants have this limit while others don't
+            if MAX_FILE_NUM_PER_USER > 0 and DocumentService.get_doc_count(dataset.tenant_id) >= MAX_FILE_NUM_PER_USER:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="Exceed the maximum file number of a free user!")
+            # deal with the duplicate name
+            filename = duplicate_name(
+                DocumentService.query,
+                name=file.filename,
+                kb_id=dataset.id)
+
+            # deal with the unsupported type
+            filetype = filename_type(filename)
+            if filetype == FileType.OTHER.value:
+                return construct_json_result(code=RetCode.DATA_ERROR,
+                                             message="This type of file has not been supported yet!")
+
+            # upload to the minio
+            # append '_' until the object name is not taken inside this dataset
+            location = filename
+            while MINIO.obj_exist(dataset_id, location):
+                location += "_"
+            blob = file.read()
+            MINIO.put(dataset_id, location, blob)
+            doc = {
+                "id": get_uuid(),
+                "kb_id": dataset.id,
+                "parser_id": dataset.parser_id,
+                "parser_config": dataset.parser_config,
+                "created_by": current_user.id,
+                "type": filetype,
+                "name": filename,
+                "location": location,
+                "size": len(blob),
+                "thumbnail": thumbnail(filename, blob)
+            }
+            # pictures and presentations override the parser configured on the dataset
+            if doc["type"] == FileType.VISUAL:
+                doc["parser_id"] = ParserType.PICTURE.value
+            if re.search(r"\.(ppt|pptx|pages)$", filename):
+                doc["parser_id"] = ParserType.PRESENTATION.value
+            DocumentService.insert(doc)
+
+            FileService.add_file_from_kb(doc, kb_folder["id"], dataset.tenant_id)
+        except Exception as e:
+            # keep going with the remaining files and report every failure at the end
+            err.append(file.filename + ": " + str(e))
+
+    if err:
+        # return all the errors
+        return construct_json_result(message="\n".join(err), code=RetCode.SERVER_ERROR)
+    # success
+    return construct_json_result(data=True, code=RetCode.SUCCESS)
+
+# ----------------------------upload online files------------------------------------------------
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------delete a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------list files-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@ -13,9 +13,12 @@
 #  See the License for the specific language governing permissions and
 #  limitations under the License.

-import os
-import requests
 import json
+import os
+
+import requests
+
+from api.settings import RetCode


 class RAGFlow:
@ -23,10 +26,12 @@ class RAGFlow:
        '''
        api_url: http://<host_address>/api/v1
        dataset_url: http://<host_address>/api/v1/dataset
+        document_url: http://<host_address>/api/v1/documents
        '''
        self.user_key = user_key
        self.api_url = f"{base_url}/api/{version}"
        self.dataset_url = f"{self.api_url}/dataset"
+        self.document_url = f"{self.api_url}/documents"
        self.authorization_header = {"Authorization": "{}".format(self.user_key)}

    def create_dataset(self, dataset_name):
@ -73,3 +78,54 @@ class RAGFlow:
        endpoint = f"{self.dataset_url}/{dataset_id}"
        response = requests.put(endpoint, json=params, headers=self.authorization_header)
        return response.json()
+
+# -------------------- content management -----------------------------------------------------
+
+    # ----------------------------upload local files-----------------------------------------------------
+    def upload_local_file(self, dataset_id, file_paths):
+        """
+        Upload local files to the dataset identified by dataset_id.
+
+        Every path is validated before any file is opened, and every opened file
+        is closed again once the request has been sent or has failed.
+
+        :param dataset_id: id of the target dataset, appended to the document url.
+        :param file_paths: iterable of paths of local files.
+        :return: a dict with 'code' and 'message' when a path is rejected locally,
+            otherwise the decoded JSON body of the server response.
+        """
+        # materialize once, because the paths are walked twice (validate, then open)
+        file_paths = list(file_paths)
+
+        for file_path in file_paths:
+            if not isinstance(file_path, str):
+                return {'code': RetCode.ARGUMENT_ERROR, 'message': f"{file_path} is not string."}
+            if 'http' in file_path:
+                return {'code': RetCode.ARGUMENT_ERROR, 'message': "Remote files have not unsupported."}
+            if not os.path.isfile(file_path):
+                return {'code': RetCode.DATA_ERROR, 'message': f"The file {file_path} does not exist"}
+
+        files = []
+        try:
+            for file_path in file_paths:
+                files.append(('file', open(file_path, 'rb')))
+
+            res = requests.request('POST', url=f"{self.document_url}/{dataset_id}", files=files,
+                                   headers=self.authorization_header)
+        finally:
+            # release the handles whether the request succeeded or raised
+            for _, file_handle in files:
+                file_handle.close()
+
+        result_dict = json.loads(res.text)
+        return result_dict
+
+    # ----------------------------upload remote files-----------------------------------------------------
+    # ----------------------------download a file-----------------------------------------------------
+
+    # ----------------------------delete a file-----------------------------------------------------
+
+    # ----------------------------enable rename-----------------------------------------------------
+
+    # ----------------------------list files-----------------------------------------------------
+
+    # ----------------------------start parsing-----------------------------------------------------
+
+    # ----------------------------stop parsing-----------------------------------------------------
+
+    # ----------------------------show the status of the file-----------------------------------------------------
+
+    # ----------------------------list the chunks of the file-----------------------------------------------------
+
+    # ----------------------------delete the chunk-----------------------------------------------------
+
+    # ----------------------------edit the status of the chunk-----------------------------------------------------
+
+    # ----------------------------insert a new chunk-----------------------------------------------------
+
+    # ----------------------------upload a file-----------------------------------------------------
+
+    # ----------------------------get a specific chunk-----------------------------------------------------
+
+    # ----------------------------retrieval test-----------------------------------------------------
--- a/sdk/python/test/test_data/.txt
+++ b/sdk/python/test/test_data/.txt
@ -0,0 +1,2 @@
+hhh
+hhh
--- a/sdk/python/test/test_data/empty.txt
+++ b/sdk/python/test/test_data/empty.txt
--- a/sdk/python/test/test_data/test.txt
+++ b/sdk/python/test/test_data/test.txt
@ -0,0 +1,3 @@
+test
+test
+test
--- a/sdk/python/test/test_data/test1.txt
+++ b/sdk/python/test/test_data/test1.txt
@ -0,0 +1,2 @@
+test1
+test1
--- a/sdk/python/test/test_document.py
+++ b/sdk/python/test/test_document.py
@ -0,0 +1,180 @@
+from api.settings import RetCode
+from test_sdkbase import TestSdk
+from ragflow import RAGFlow
+import pytest
+from common import API_KEY, HOST_ADDRESS
+from api.contants import NAME_LENGTH_LIMIT
+
+
+class TestFile(TestSdk):
+    """
+    This class contains a suite of tests for the content management functionality within the dataset.
+    Only the tests for uploading local files are implemented so far; the planned coverage is:
+        1. upload local files
+        2. upload remote files
+        3. download a file
+        4. delete a file
+        5. enable rename
+        6. list files
+        7. start parsing
+        8. end parsing
+        9. check the status of the file
+        10. list the chunks
+        11. delete a chunk
+        12. insert a new chunk
+        13. edit the status of chunk
+        14. get the specific chunk
+        15. retrieval test
+    """
+
+# ----------------------------upload local files-----------------------------------------------------
+    def test_upload_two_files(self):
+        """
+        Test uploading two files with success.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_two_files")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+
+    def test_upload_one_file(self):
+        """
+        Test uploading one file with success.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_one_file")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.SUCCESS and res['data'] is True and res['message'] == 'success'
+
+    def test_upload_nonexistent_files(self):
+        """
+        Test uploading a file which does not exist.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_nonexistent_files")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/imagination.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.DATA_ERROR and "does not exist" in res['message']
+
+    def test_upload_file_if_dataset_does_not_exist(self):
+        """
+        Test uploading files if the dataset id does not exist.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        file_paths = ["test_data/test.txt"]
+        res = ragflow.upload_local_file("111", file_paths)
+        assert res['code'] == RetCode.DATA_ERROR and res['message'] == "Can't find this dataset"
+
+    def test_upload_file_without_name(self):
+        """
+        Test uploading a file whose name consists of the extension only ('.txt').
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_file_without_name")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.SUCCESS
+
+    def test_upload_file_without_name1(self):
+        """
+        Test uploading a file whose name consists of the extension only together with an empty file.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        # use a dataset name of its own instead of sharing the one of test_upload_file_without_name
+        created_res = ragflow.create_dataset("test_upload_file_without_name1")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/.txt", "test_data/empty.txt"]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.SUCCESS
+
+    def test_upload_files_exceeding_the_number_limit(self):
+        """
+        Test uploading files whose number exceeds the limit.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_files_exceeding_the_number_limit")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ["test_data/test.txt", "test_data/test1.txt"] * 256
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert (res['message'] ==
+                'You try to upload 512 files, which exceeds the maximum number of uploading files: 256'
+                and res['code'] == RetCode.DATA_ERROR)
+
+    def test_upload_files_without_files(self):
+        """
+        Test uploading with a path that is not a string (None).
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_files_without_files")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = [None]
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert (res['message'] == 'None is not string.' and res['code'] == RetCode.ARGUMENT_ERROR)
+
+    def test_upload_files_with_two_files_with_same_name(self):
+        """
+        Test uploading files with the same name.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_files_with_two_files_with_same_name")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ['test_data/test.txt'] * 2
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert (res['message'] == 'success' and res['code'] == RetCode.SUCCESS)
+
+    def test_upload_files_with_file_paths(self):
+        """
+        Test uploading with the path of a directory instead of the path of a file.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_files_with_file_paths")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ['test_data/']
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert (res['message'] == 'The file test_data/ does not exist' and res['code'] == RetCode.DATA_ERROR)
+
+    def test_upload_files_with_remote_file_path(self):
+        """
+        Test uploading with a remote url instead of a local path, which is rejected by the SDK.
+        """
+        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
+        created_res = ragflow.create_dataset("test_upload_files_with_remote_file_path")
+        dataset_id = created_res['data']['dataset_id']
+        file_paths = ['https://github.com/genostack/ragflow']
+        res = ragflow.upload_local_file(dataset_id, file_paths)
+        assert res['code'] == RetCode.ARGUMENT_ERROR and res['message'] == 'Remote files have not unsupported.'
+
+# ----------------------------upload remote files-----------------------------------------------------
+
+# ----------------------------download a file-----------------------------------------------------
+
+# ----------------------------delete a file-----------------------------------------------------
+
+# ----------------------------enable rename-----------------------------------------------------
+
+# ----------------------------list files-----------------------------------------------------
+
+# ----------------------------start parsing-----------------------------------------------------
+
+# ----------------------------stop parsing-----------------------------------------------------
+
+# ----------------------------show the status of the file-----------------------------------------------------
+
+# ----------------------------list the chunks of the file-----------------------------------------------------
+
+# ----------------------------delete the chunk-----------------------------------------------------
+
+# ----------------------------edit the status of the chunk-----------------------------------------------------
+
+# ----------------------------insert a new chunk-----------------------------------------------------
+
+# ----------------------------upload a file-----------------------------------------------------
+
+# ----------------------------get a specific chunk-----------------------------------------------------
+
+# ----------------------------retrieval test-----------------------------------------------------