mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 22:19:01 +08:00
Test: Added test cases for Download Documents HTTP API (#6032)
### What problem does this PR solve? Cover the [download documents endpoints](https://ragflow.io/docs/dev/http_api_reference#download-document) ### Type of change - [x] add test cases
This commit is contained in:
parent
baf3b9be7c
commit
0a877941f4
@ -15,6 +15,7 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
import base64
|
import base64
|
||||||
|
import hashlib
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
@ -23,3 +24,14 @@ def encode_avatar(image_path):
|
|||||||
binary_data = file.read()
|
binary_data = file.read()
|
||||||
base64_encoded = base64.b64encode(binary_data).decode("utf-8")
|
base64_encoded = base64.b64encode(binary_data).decode("utf-8")
|
||||||
return base64_encoded
|
return base64_encoded
|
||||||
|
|
||||||
|
|
||||||
|
def compare_by_hash(file1, file2, algorithm="sha256"):
    """Return True when *file1* and *file2* have byte-identical content.

    Equality is decided by comparing hex digests computed with *algorithm*
    (any name accepted by ``hashlib.new``). Files are streamed in 8 KiB
    chunks so arbitrarily large files can be compared in constant memory.
    """

    def _digest(file_path):
        # Feed the file through the hash chunk-by-chunk instead of
        # loading it whole.
        hasher = hashlib.new(algorithm)
        with open(file_path, "rb") as fh:
            for chunk in iter(lambda: fh.read(8192), b""):
                hasher.update(chunk)
        return hasher.hexdigest()

    return _digest(file1) == _digest(file2)
@ -18,6 +18,7 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from libs.utils.file_utils import create_txt_file
|
||||||
from requests_toolbelt import MultipartEncoder
|
from requests_toolbelt import MultipartEncoder
|
||||||
|
|
||||||
HEADERS = {"Content-Type": "application/json"}
|
HEADERS = {"Content-Type": "application/json"}
|
||||||
@ -99,3 +100,46 @@ def upload_documnets(auth, dataset_id, files_path=None):
|
|||||||
data=m,
|
data=m,
|
||||||
)
|
)
|
||||||
return res.json()
|
return res.json()
|
||||||
|
|
||||||
|
|
||||||
|
def batch_upload_documents(auth, dataset_id, num, tmp_path):
    """Create *num* throw-away txt files under *tmp_path*, upload them to
    *dataset_id*, and return the document ids reported by the API."""
    file_paths = [
        create_txt_file(tmp_path / f"ragflow_test_upload_{i}.txt")
        for i in range(num)
    ]
    res = upload_documnets(auth, dataset_id, file_paths)
    return [document["id"] for document in res["data"]]
|
||||||
|
|
||||||
|
def download_document(auth, dataset_id, document_id, save_path):
    """Stream a document download and write the payload to *save_path*.

    Returns the (closed) ``requests.Response`` so callers can inspect the
    status code. Nothing is written unless the server answered 200.
    """
    url = f"{HOST_ADDRESS}{FILE_API_URL}/{document_id}".format(dataset_id=dataset_id)
    response = requests.get(url=url, auth=auth, stream=True)
    try:
        if response.status_code == 200:
            with open(save_path, "wb") as out_file:
                for block in response.iter_content(chunk_size=8192):
                    out_file.write(block)
    finally:
        # Always release the streaming connection, even on write failure.
        response.close()

    return response
|
|
||||||
|
|
||||||
|
def list_documnet(auth, dataset_id, params=None):
    """GET the document list of *dataset_id*; return the decoded JSON body."""
    url = f"{HOST_ADDRESS}{FILE_API_URL}".format(dataset_id=dataset_id)
    response = requests.get(url=url, headers=HEADERS, auth=auth, params=params)
    return response.json()
|
|
||||||
|
|
||||||
|
def update_documnet(auth, dataset_id, document_id, payload):
    """PUT *payload* against a single document; return the decoded JSON body."""
    url = f"{HOST_ADDRESS}{FILE_API_URL}/{document_id}".format(dataset_id=dataset_id)
    response = requests.put(url=url, headers=HEADERS, auth=auth, json=payload)
    return response.json()
|
@ -38,36 +38,23 @@ def clear_datasets(get_http_api_auth):
|
|||||||
|
|
||||||
|
|
||||||
@pytest.fixture
def generate_test_files(request, tmp_path):
    """Create sample files of assorted types under *tmp_path*.

    When used with indirect parametrization, ``request.param`` selects a
    single file type to create; an empty string creates every type.
    Returns a dict mapping file-type name -> created file path.
    """
    file_creators = {
        "docx": (tmp_path / "ragflow_test.docx", create_docx_file),
        "excel": (tmp_path / "ragflow_test.xlsx", create_excel_file),
        "ppt": (tmp_path / "ragflow_test.pptx", create_ppt_file),
        "image": (tmp_path / "ragflow_test.png", create_image_file),
        "pdf": (tmp_path / "ragflow_test.pdf", create_pdf_file),
        "txt": (tmp_path / "ragflow_test.txt", create_txt_file),
        "md": (tmp_path / "ragflow_test.md", create_md_file),
        "json": (tmp_path / "ragflow_test.json", create_json_file),
        "eml": (tmp_path / "ragflow_test.eml", create_eml_file),
        "html": (tmp_path / "ragflow_test.html", create_html_file),
    }

    # ``request.param`` only exists when the fixture is parametrized
    # indirectly; fall back to "" (create everything) so plain
    # ``generate_test_files`` usage no longer raises AttributeError.
    requested = getattr(request, "param", "")

    files = {}
    for file_type, (file_path, creator_func) in file_creators.items():
        if requested in ["", file_type]:
            creator_func(file_path)
            files[file_type] = file_path
    return files
|
@ -0,0 +1,193 @@
|
|||||||
|
#
|
||||||
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
import json
|
||||||
|
from concurrent.futures import ThreadPoolExecutor
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from common import (
|
||||||
|
INVALID_API_TOKEN,
|
||||||
|
batch_upload_documents,
|
||||||
|
create_datasets,
|
||||||
|
download_document,
|
||||||
|
upload_documnets,
|
||||||
|
)
|
||||||
|
from libs.auth import RAGFlowHttpApiAuth
|
||||||
|
from libs.utils import compare_by_hash
|
||||||
|
from requests import codes
|
||||||
|
|
||||||
|
|
||||||
|
class TestAuthorization:
    """Download endpoint must reject missing or invalid API credentials."""

    @pytest.mark.parametrize(
        "auth, expected_code, expected_message",
        [
            (None, 0, "`Authorization` can't be empty"),
            (
                RAGFlowHttpApiAuth(INVALID_API_TOKEN),
                109,
                "Authentication error: API key is invalid!",
            ),
        ],
    )
    def test_invalid_auth(
        self, get_http_api_auth, tmp_path, auth, expected_code, expected_message
    ):
        # Set up a dataset with one document using valid credentials,
        # then attempt the download with the (bad) auth under test.
        ids = create_datasets(get_http_api_auth, 1)
        document_ids = batch_upload_documents(get_http_api_auth, ids[0], 1, tmp_path)
        save_path = tmp_path / "ragflow_tes.txt"
        res = download_document(auth, ids[0], document_ids[0], save_path)
        assert res.status_code == codes.ok
        # The server answers 200 with a JSON error payload written to disk.
        response_json = json.loads(save_path.read_text())
        assert response_json["code"] == expected_code
        assert response_json["message"] == expected_message
|
||||||
|
|
||||||
|
class TestDownloadDocument:
    """Functional coverage for the download-document HTTP endpoint."""

    @pytest.mark.parametrize(
        "generate_test_files",
        [
            "docx",
            "excel",
            "ppt",
            "image",
            "pdf",
            "txt",
            "md",
            "json",
            "eml",
            "html",
        ],
        indirect=True,
    )
    def test_file_type_validation(
        self, get_http_api_auth, generate_test_files, request
    ):
        # Round-trip each supported file type: upload, download, compare hashes.
        ids = create_datasets(get_http_api_auth, 1)
        fp = generate_test_files[request.node.callspec.params["generate_test_files"]]
        res = upload_documnets(get_http_api_auth, ids[0], [fp])
        document_id = res["data"][0]["id"]

        save_path = fp.with_stem("ragflow_test_download")
        res = download_document(get_http_api_auth, ids[0], document_id, save_path)
        assert res.status_code == codes.ok
        assert compare_by_hash(fp, save_path)

    @pytest.mark.parametrize(
        "docment_id, expected_code, expected_message",
        [
            pytest.param("", 0, "", marks=pytest.mark.xfail(reason="issue#6031")),
            (
                "invalid_document_id",
                102,
                "The dataset not own the document invalid_document_id.",
            ),
        ],
    )
    def test_invalid_docment_id(
        self, get_http_api_auth, tmp_path, docment_id, expected_code, expected_message
    ):
        # A bad document id still returns 200, with the error in the JSON body.
        ids = create_datasets(get_http_api_auth, 1)
        save_path = tmp_path / "ragflow_test_download_1.txt"
        res = download_document(get_http_api_auth, ids[0], docment_id, save_path)
        assert res.status_code == codes.ok
        response_json = json.loads(save_path.read_text())
        assert response_json["code"] == expected_code
        assert response_json["message"] == expected_message

    @pytest.mark.parametrize(
        "dataset_id, expected_code, expected_message",
        [
            ("", 100, "<NotFound '404: Not Found'>"),
            (
                "invalid_dataset_id",
                102,
                "You do not own the dataset invalid_dataset_id.",
            ),
        ],
    )
    def test_invalid_dataset_id(
        self, get_http_api_auth, tmp_path, dataset_id, expected_code, expected_message
    ):
        # A bad dataset id still returns 200, with the error in the JSON body.
        ids = create_datasets(get_http_api_auth, 1)
        document_ids = batch_upload_documents(get_http_api_auth, ids[0], 1, tmp_path)
        save_path = tmp_path / "ragflow_test_download_1.txt"
        res = download_document(
            get_http_api_auth, dataset_id, document_ids[0], save_path
        )
        assert res.status_code == codes.ok
        response_json = json.loads(save_path.read_text())
        assert response_json["code"] == expected_code
        assert response_json["message"] == expected_message

    def test_same_file_repeat(self, get_http_api_auth, tmp_path):
        # The same document can be downloaded repeatedly with identical content.
        num = 5
        ids = create_datasets(get_http_api_auth, 1)
        document_ids = batch_upload_documents(get_http_api_auth, ids[0], 1, tmp_path)
        for i in range(num):
            target = tmp_path / f"ragflow_test_download_{i}.txt"
            res = download_document(get_http_api_auth, ids[0], document_ids[0], target)
            assert res.status_code == codes.ok
            assert compare_by_hash(tmp_path / "ragflow_test_upload_0.txt", target)

    def test_concurrent_download(self, get_http_api_auth, tmp_path):
        # Many documents downloaded in parallel must all arrive uncorrupted.
        document_count = 20
        ids = create_datasets(get_http_api_auth, 1)
        document_ids = batch_upload_documents(
            get_http_api_auth, ids[0], document_count, tmp_path
        )

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(
                    download_document,
                    get_http_api_auth,
                    ids[0],
                    document_ids[i],
                    tmp_path / f"ragflow_test_download_{i}.txt",
                )
                for i in range(document_count)
            ]
            responses = [f.result() for f in futures]
        assert all(r.status_code == codes.ok for r in responses)
        for i in range(document_count):
            assert compare_by_hash(
                tmp_path / f"ragflow_test_upload_{i}.txt",
                tmp_path / f"ragflow_test_download_{i}.txt",
            )
@ -64,7 +64,7 @@ class TestUploadDocuments:
|
|||||||
assert res["data"][0]["name"] == fp.name
|
assert res["data"][0]["name"] == fp.name
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"file_type",
|
"generate_test_files",
|
||||||
[
|
[
|
||||||
"docx",
|
"docx",
|
||||||
"excel",
|
"excel",
|
||||||
@ -77,12 +77,13 @@ class TestUploadDocuments:
|
|||||||
"eml",
|
"eml",
|
||||||
"html",
|
"html",
|
||||||
],
|
],
|
||||||
|
indirect=True,
|
||||||
)
|
)
|
||||||
def test_file_type_validation(
|
def test_file_type_validation(
|
||||||
self, get_http_api_auth, generate_test_files, file_type
|
self, get_http_api_auth, generate_test_files, request
|
||||||
):
|
):
|
||||||
ids = create_datasets(get_http_api_auth, 1)
|
ids = create_datasets(get_http_api_auth, 1)
|
||||||
fp = generate_test_files[file_type]
|
fp = generate_test_files[request.node.callspec.params["generate_test_files"]]
|
||||||
res = upload_documnets(get_http_api_auth, ids[0], [fp])
|
res = upload_documnets(get_http_api_auth, ids[0], [fp])
|
||||||
assert res["code"] == 0
|
assert res["code"] == 0
|
||||||
assert res["data"][0]["dataset_id"] == ids[0]
|
assert res["data"][0]["dataset_id"] == ids[0]
|
||||||
|
Loading…
x
Reference in New Issue
Block a user