API: Download doc api (#1354)

### What problem does this PR solve?

Adds a `download_document` API endpoint for downloading a document from a dataset by id, along with a matching `download_file` method in the Python SDK and tests.
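For context, here is a minimal sketch of how the new endpoint could be called directly over HTTP. The URL prefix of the dataset blueprint and the exact authorization header format are not shown in this diff, so `BASE_URL` and the header below are placeholders; on success the server streams the file back via `send_file`, and on failure it returns a JSON result.

```python
import requests

# Placeholders -- the real prefix and auth scheme depend on the server configuration.
BASE_URL = "http://127.0.0.1:9380/api/v1/dataset"
API_KEY = "<your-api-key>"

dataset_id = "<dataset-id>"
document_id = "<document-id>"

res = requests.get(
    f"{BASE_URL}/{dataset_id}/documents/{document_id}",
    headers={"Authorization": API_KEY},
)

if res.headers.get("Content-Type", "").startswith("application/json"):
    # Error payload, e.g. the dataset or document was not found.
    print(res.json())
else:
    # Success: the raw bytes of the document.
    with open("downloaded_file", "wb") as f:
        f.write(res.content)
```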

### Type of change


- [x] New Feature (non-breaking change which adds functionality)
cecilia-uu 2024-07-04 16:33:55 +08:00 committed by GitHub
parent daaf6aed50
commit 306108fe0e
4 changed files with 137 additions and 17 deletions

View File

@@ -12,15 +12,16 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pathlib
import re
import warnings
from io import BytesIO

from flask import request, send_file
from flask_login import login_required, current_user
from httpx import HTTPError
from minio import S3Error

from api.contants import NAME_LENGTH_LIMIT
from api.db import FileType, ParserType, FileSource
@@ -283,9 +284,12 @@ def upload_documents(dataset_id):
        return construct_json_result(code=RetCode.DATA_ERROR, message=f"You try to upload {num_file_objs} files, "
                                                                      f"which exceeds the maximum number of uploading files: {MAXIMUM_OF_UPLOADING_FILES}")
    # no dataset
    exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
    if not exist:
        return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
    for file_obj in file_objs:
        # the content of the file
        file_content = file_obj.read()
        file_name = file_obj.filename
        # no name
        if not file_name:
@@ -296,15 +300,6 @@ def upload_documents(dataset_id):
        if 'http' in file_name:
            return construct_json_result(code=RetCode.ARGUMENT_ERROR, message="Remote files have not unsupported.")
        # the content is empty, raising a warning
        if file_content == b'':
            warnings.warn(f"[WARNING]: The file {file_name} is empty.")
        # no dataset
        exist, dataset = KnowledgebaseService.get_by_id(dataset_id)
        if not exist:
            return construct_json_result(message="Can't find this dataset", code=RetCode.DATA_ERROR)
    # get the root_folder
    root_folder = FileService.get_root_folder(current_user.id)
    # get the id of the root_folder
@@ -342,8 +337,14 @@ def upload_documents(dataset_id):
            location = filename
            while MINIO.obj_exist(dataset_id, location):
                location += "_"
            blob = file.read()
            # the content is empty, raising a warning
            if blob == b'':
                warnings.warn(f"[WARNING]: The file {filename} is empty.")
            MINIO.put(dataset_id, location, blob)
            doc = {
                "id": get_uuid(),
                "kb_id": dataset.id,
@@ -555,6 +556,40 @@ def is_illegal_value_for_enum(value, enum_class):
    return value not in enum_class.__members__.values()

# ----------------------------download a file-----------------------------------------------------
@manager.route("/<dataset_id>/documents/<document_id>", methods=["GET"])
@login_required
def download_document(dataset_id, document_id):
    try:
        # Check whether there is this dataset
        exist, _ = KnowledgebaseService.get_by_id(dataset_id)
        if not exist:
            return construct_json_result(code=RetCode.DATA_ERROR,
                                         message=f"This dataset '{dataset_id}' cannot be found!")

        # Check whether there is this document
        exist, document = DocumentService.get_by_id(document_id)
        if not exist:
            return construct_json_result(message=f"This document '{document_id}' cannot be found!",
                                         code=RetCode.ARGUMENT_ERROR)

        # The process of downloading
        doc_id, doc_location = File2DocumentService.get_minio_address(doc_id=document_id)  # minio address
        file_stream = MINIO.get(doc_id, doc_location)
        if not file_stream:
            return construct_json_result(message="This file is empty.", code=RetCode.DATA_ERROR)

        file = BytesIO(file_stream)

        # Use send_file with a proper filename and MIME type
        return send_file(
            file,
            as_attachment=True,
            download_name=document.name,
            mimetype='application/octet-stream'  # Set a default MIME type
        )

    # Error
    except Exception as e:
        return construct_error_response(e)
# ----------------------------start parsing-----------------------------------------------------
@@ -576,3 +611,5 @@ def is_illegal_value_for_enum(value, enum_class):
# ----------------------------retrieval test-----------------------------------------------------

View File

@@ -12,12 +12,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os

import requests

from api.db.services.document_service import DocumentService
from api.settings import RetCode
@@ -126,7 +126,22 @@ class RAGFlow:
        return response.json()

    # ----------------------------download a file-----------------------------------------------------
    def download_file(self, dataset_id, document_id):
        endpoint = f"{self.dataset_url}/{dataset_id}/documents/{document_id}"
        res = requests.get(endpoint, headers=self.authorization_header)

        content = res.content  # binary data
        # decode the binary data
        try:
            decoded_content = content.decode("utf-8")
            json_data = json.loads(decoded_content)
            return json_data  # message
        except json.JSONDecodeError:  # binary data
            _, document = DocumentService.get_by_id(document_id)
            file_path = os.path.join(os.getcwd(), document.name)
            with open(file_path, "wb") as file:
                file.write(content)
            return {"code": RetCode.SUCCESS, "data": content}
    # ----------------------------start parsing-----------------------------------------------------
    # ----------------------------stop parsing-----------------------------------------------------
@@ -144,3 +159,4 @@ class RAGFlow:
    # ----------------------------get a specific chunk-----------------------------------------------------
    # ----------------------------retrieval test-----------------------------------------------------
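A short usage sketch of the new SDK helper, assuming the client is configured the same way as in the tests below (API_KEY and HOST_ADDRESS from common.py); the dataset and document ids are placeholders for values returned by `create_dataset()` and `upload_local_file()`:

```python
from ragflow import RAGFlow
from common import API_KEY, HOST_ADDRESS

ragflow = RAGFlow(API_KEY, HOST_ADDRESS)

# Placeholder ids -- obtain real ones from create_dataset() / upload_local_file().
dataset_id = "<dataset-id>"
doc_id = "<doc-id>"

res = ragflow.download_file(dataset_id, doc_id)

if "data" in res:
    # Success: res["data"] holds the raw bytes; the file is also written to the
    # current working directory under the document's name.
    print(f"downloaded {len(res['data'])} bytes")
else:
    # Error payload from the server, e.g. dataset or document not found.
    print(res["message"])
```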

View File

@@ -1,4 +1,4 @@
API_KEY = 'IjJkOGQ4ZDE2MzkyMjExZWZhYTk0MzA0M2Q3ZWU1MzdlIg.ZoUfug.RmqcYyCrlAnLtkzk6bYXiXN3eEY'
HOST_ADDRESS = 'http://127.0.0.1:9380'

View File

@@ -3,7 +3,6 @@ from test_sdkbase import TestSdk
from ragflow import RAGFlow
import pytest
from common import API_KEY, HOST_ADDRESS
from api.contants import NAME_LENGTH_LIMIT


class TestFile(TestSdk):
@@ -625,8 +624,76 @@ class TestFile(TestSdk):
        update_res = ragflow.update_file(created_res_id, doc_id, **params)
        assert (update_res["code"] == RetCode.DATA_ERROR and
                update_res["message"] == "Illegal value ? for 'template_type' field.")

    # ----------------------------download a file-----------------------------------------------------
    def test_download_nonexistent_document(self):
        """
        Test downloading a document which does not exist.
        """
        # create a dataset
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_download_nonexistent_document")
        created_res_id = created_res["data"]["dataset_id"]
        res = ragflow.download_file(created_res_id, "imagination")
        assert res["code"] == RetCode.ARGUMENT_ERROR and res["message"] == f"This document 'imagination' cannot be found!"

    def test_download_document_in_nonexistent_dataset(self):
        """
        Test downloading a document whose dataset is nonexistent.
        """
        # create a dataset
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_download_nonexistent_document")
        created_res_id = created_res["data"]["dataset_id"]
        # upload files
        file_paths = ["test_data/test.txt"]
        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
        # get the doc_id
        data = uploading_res["data"][0]
        doc_id = data["id"]
        # download file
        res = ragflow.download_file("imagination", doc_id)
        assert res["code"] == RetCode.DATA_ERROR and res["message"] == f"This dataset 'imagination' cannot be found!"

    def test_download_document_with_success(self):
        """
        Test the downloading of a document with success.
        """
        # create a dataset
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_download_nonexistent_document")
        created_res_id = created_res["data"]["dataset_id"]
        # upload files
        file_paths = ["test_data/test.txt"]
        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
        # get the doc_id
        data = uploading_res["data"][0]
        doc_id = data["id"]
        # download file
        with open("test_data/test.txt", "rb") as file:
            binary_data = file.read()
        res = ragflow.download_file(created_res_id, doc_id)
        assert res["code"] == RetCode.SUCCESS and res["data"] == binary_data

    def test_download_an_empty_document(self):
        """
        Test the downloading of an empty document.
        """
        # create a dataset
        ragflow = RAGFlow(API_KEY, HOST_ADDRESS)
        created_res = ragflow.create_dataset("test_download_nonexistent_document")
        created_res_id = created_res["data"]["dataset_id"]
        # upload files
        file_paths = ["test_data/empty.txt"]
        uploading_res = ragflow.upload_local_file(created_res_id, file_paths)
        # get the doc_id
        data = uploading_res["data"][0]
        doc_id = data["id"]
        # download file
        res = ragflow.download_file(created_res_id, doc_id)
        assert res["code"] == RetCode.DATA_ERROR and res["message"] == "This file is empty."
    # ----------------------------start parsing-----------------------------------------------------
    # ----------------------------stop parsing-----------------------------------------------------