Test: Added test cases for Delete Chunks HTTP API (#6612)

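### What problem does this PR solve?

Adds test coverage for the Delete Chunks HTTP API:

- `common.py`: adds a `delete_chunks` helper that sends a `DELETE` request with a JSON payload to the chunk endpoint, and collapses the multi-line `requests` calls onto single lines.
- `conftest.py`: the `add_chunks` fixture now uses `return` instead of `yield`; a new function-scoped `add_chunks_func` fixture adds four chunks and registers a finalizer that deletes them after each test.
- `test_delete_chunks.py` (new): covers invalid authorization, invalid dataset and document IDs, partially invalid chunk IDs, repeated and duplicate deletion, concurrent deletion, deleting 1,000 chunks, and basic payload scenarios.
- `test_add_chunk.py`: removes a leftover debug `print`.
- `test_update_chunk.py`: updates the expected error message of a skipped case.

### Type of change

- [x] add test cases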
2025-08-10 21:49:00 +08:00 · 2025-03-28 09:33:23 +08:00 · 2025-03-28 09:33:23 +08:00 · fd614a7aef
commit fd614a7aef
parent 0758c04941
5 changed files with 244 additions and 40 deletions
--- a/sdk/python/test/test_http_api/common.py
+++ b/sdk/python/test/test_http_api/common.py
@ -35,42 +35,22 @@ DOCUMENT_NAME_LIMIT = 128

 # DATASET MANAGEMENT
 def create_dataset(auth, payload=None):
-    res = requests.post(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
-        headers=HEADERS,
-        auth=auth,
-        json=payload,
-    )
+    res = requests.post(url=f"{HOST_ADDRESS}{DATASETS_API_URL}", headers=HEADERS, auth=auth, json=payload)
    return res.json()


 def list_dataset(auth, params=None):
-    res = requests.get(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
-        headers=HEADERS,
-        auth=auth,
-        params=params,
-    )
+    res = requests.get(url=f"{HOST_ADDRESS}{DATASETS_API_URL}", headers=HEADERS, auth=auth, params=params)
    return res.json()


 def update_dataset(auth, dataset_id, payload=None):
-    res = requests.put(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}",
-        headers=HEADERS,
-        auth=auth,
-        json=payload,
-    )
+    res = requests.put(url=f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}", headers=HEADERS, auth=auth, json=payload)
    return res.json()


 def delete_dataset(auth, payload=None):
-    res = requests.delete(
-        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
-        headers=HEADERS,
-        auth=auth,
-        json=payload,
-    )
+    res = requests.delete(url=f"{HOST_ADDRESS}{DATASETS_API_URL}", headers=HEADERS, auth=auth, json=payload)
    return res.json()


@ -127,12 +107,7 @@ def download_document(auth, dataset_id, document_id, save_path):

 def list_documnet(auth, dataset_id, params=None):
    url = f"{HOST_ADDRESS}{FILE_API_URL}".format(dataset_id=dataset_id)
-    res = requests.get(
-        url=url,
-        headers=HEADERS,
-        auth=auth,
-        params=params,
-    )
+    res = requests.get(url=url, headers=HEADERS, auth=auth, params=params)
    return res.json()


@ -181,12 +156,7 @@ def add_chunk(auth, dataset_id, document_id, payload=None):

 def list_chunks(auth, dataset_id, document_id, params=None):
    url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id)
-    res = requests.get(
-        url=url,
-        headers=HEADERS,
-        auth=auth,
-        params=params,
-    )
+    res = requests.get(url=url, headers=HEADERS, auth=auth, params=params)
    return res.json()


@ -196,6 +166,12 @@ def update_chunk(auth, dataset_id, document_id, chunk_id, payload=None):
    return res.json()


+def delete_chunks(auth, dataset_id, document_id, payload=None):
+    url = f"{HOST_ADDRESS}{CHUNK_API_URL}".format(dataset_id=dataset_id, document_id=document_id)
+    res = requests.delete(url=url, headers=HEADERS, auth=auth, json=payload)
+    return res.json()
+
+
 def batch_add_chunks(auth, dataset_id, document_id, num):
    chunk_ids = []
    for i in range(num):
--- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/conftest.py
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/conftest.py
@ -16,7 +16,7 @@


 import pytest
-from common import add_chunk, batch_create_datasets, bulk_upload_documents, delete_dataset, list_documnet, parse_documnet
+from common import add_chunk, batch_create_datasets, bulk_upload_documents, delete_chunks, delete_dataset, list_documnet, parse_documnet
 from libs.utils import wait_for


@ -62,4 +62,25 @@ def add_chunks(get_http_api_auth, get_dataset_id_and_document_id):
    from time import sleep

    sleep(1)
-    yield dataset_id, document_id, chunk_ids
+    return dataset_id, document_id, chunk_ids
+
+
+@pytest.fixture(scope="function")
+def add_chunks_func(get_http_api_auth, get_dataset_id_and_document_id, request):
+    dataset_id, document_id = get_dataset_id_and_document_id
+
+    chunk_ids = []
+    for i in range(4):
+        res = add_chunk(get_http_api_auth, dataset_id, document_id, {"content": f"chunk test {i}"})
+        chunk_ids.append(res["data"]["chunk"]["id"])
+
+    # issues/6487
+    from time import sleep
+
+    sleep(1)
+
+    def cleanup():
+        delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+
+    request.addfinalizer(cleanup)
+    return dataset_id, document_id, chunk_ids
--- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_add_chunk.py
@ -133,7 +133,6 @@ class TestAddChunk:
            assert False, res
        chunks_count = res["data"]["doc"]["chunk_count"]
        res = add_chunk(get_http_api_auth, dataset_id, document_id, payload)
-        print(res)
        assert res["code"] == expected_code
        if expected_code == 0:
            validate_chunk_details(dataset_id, document_id, payload, res)
--- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_delete_chunks.py
@ -0,0 +1,208 @@
+#
+#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
+#
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+#
+import os
+from concurrent.futures import ThreadPoolExecutor
+
+import pytest
+from common import INVALID_API_TOKEN, batch_add_chunks, delete_chunks, list_chunks
+from libs.auth import RAGFlowHttpApiAuth
+
+
+class TestAuthorization:
+    @pytest.mark.parametrize(
+        "auth, expected_code, expected_message",
+        [
+            (None, 0, "`Authorization` can't be empty"),
+            (
+                RAGFlowHttpApiAuth(INVALID_API_TOKEN),
+                109,
+                "Authentication error: API key is invalid!",
+            ),
+        ],
+    )
+    def test_invalid_auth(self, auth, expected_code, expected_message):
+        res = delete_chunks(auth, "dataset_id", "document_id")
+        assert res["code"] == expected_code
+        assert res["message"] == expected_message
+
+
+class TestChunkstDeletion:
+    @pytest.mark.parametrize(
+        "dataset_id, expected_code, expected_message",
+        [
+            ("", 100, "<NotFound '404: Not Found'>"),
+            (
+                "invalid_dataset_id",
+                102,
+                "You don't own the dataset invalid_dataset_id.",
+            ),
+        ],
+    )
+    def test_invalid_dataset_id(self, get_http_api_auth, add_chunks_func, dataset_id, expected_code, expected_message):
+        _, document_id, chunk_ids = add_chunks_func
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+        assert res["code"] == expected_code
+        assert res["message"] == expected_message
+
+    @pytest.mark.parametrize(
+        "document_id, expected_code, expected_message",
+        [
+            ("", 100, "<MethodNotAllowed '405: Method Not Allowed'>"),
+            pytest.param(
+                "invalid_document_id",
+                100,
+                "LookupError('Document not found which is supposed to be there')",
+                marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6611"),
+            ),
+            pytest.param(
+                "invalid_document_id",
+                102,
+                "rm_chunk deleted chunks 0, expect 4",
+                marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="issues/6611"),
+            ),
+        ],
+    )
+    def test_invalid_document_id(self, get_http_api_auth, add_chunks_func, document_id, expected_code, expected_message):
+        dataset_id, _, chunk_ids = add_chunks_func
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+        assert res["code"] == expected_code
+        assert res["message"] == expected_message
+
+    @pytest.mark.parametrize(
+        "payload",
+        [
+            lambda r: {"chunk_ids": ["invalid_id"] + r},
+            lambda r: {"chunk_ids": r[:1] + ["invalid_id"] + r[1:4]},
+            lambda r: {"chunk_ids": r + ["invalid_id"]},
+        ],
+    )
+    def test_delete_partial_invalid_id(self, get_http_api_auth, add_chunks_func, payload):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        if callable(payload):
+            payload = payload(chunk_ids)
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == 102
+        assert res["message"] == "rm_chunk deleted chunks 4, expect 5"
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == 1
+        assert res["data"]["total"] == 1
+
+    def test_repeated_deletion(self, get_http_api_auth, add_chunks_func):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        payload = {"chunk_ids": chunk_ids}
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == 0
+
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == 102
+        assert res["message"] == "rm_chunk deleted chunks 0, expect 4"
+
+    def test_duplicate_deletion(self, get_http_api_auth, add_chunks_func):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids * 2})
+        assert res["code"] == 0
+        assert "Duplicate chunk ids" in res["data"]["errors"][0]
+        assert res["data"]["success_count"] == 4
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == 1
+        assert res["data"]["total"] == 1
+
+    @pytest.mark.slow
+    def test_concurrent_deletion(self, get_http_api_auth, get_dataset_id_and_document_id):
+        chunks_num = 100
+        dataset_id, document_id = get_dataset_id_and_document_id
+        chunk_ids = batch_add_chunks(get_http_api_auth, dataset_id, document_id, chunks_num)
+
+        with ThreadPoolExecutor(max_workers=5) as executor:
+            futures = [
+                executor.submit(
+                    delete_chunks,
+                    get_http_api_auth,
+                    dataset_id,
+                    document_id,
+                    {"chunk_ids": chunk_ids[i : i + 1]},
+                )
+                for i in range(chunks_num)
+            ]
+        responses = [f.result() for f in futures]
+        assert all(r["code"] == 0 for r in responses)
+
+    @pytest.mark.slow
+    def test_delete_1k(self, get_http_api_auth, get_dataset_id_and_document_id):
+        chunks_num = 1_000
+        dataset_id, document_id = get_dataset_id_and_document_id
+        chunk_ids = batch_add_chunks(get_http_api_auth, dataset_id, document_id, chunks_num)
+
+        # issues/6487
+        from time import sleep
+
+        sleep(1)
+
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, {"chunk_ids": chunk_ids})
+        assert res["code"] == 0
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == 1
+        assert res["data"]["total"] == 1
+
+    @pytest.mark.parametrize(
+        "payload, expected_code, expected_message, remaining",
+        [
+            pytest.param(None, 100, """TypeError("argument of type \'NoneType\' is not iterable")""", 5, marks=pytest.mark.skip),
+            ({"chunk_ids": ["invalid_id"]}, 102, "rm_chunk deleted chunks 0, expect 1", 5),
+            pytest.param(
+                "not json",
+                100,
+                """UnboundLocalError("local variable \'duplicate_messages\' referenced before assignment")""",
+                5,
+                marks=pytest.mark.skip(reason="pull/6376"),
+            ),
+            (lambda r: {"chunk_ids": r[:1]}, 0, "", 4),
+            (lambda r: {"chunk_ids": r}, 0, "", 1),
+            pytest.param({"chunk_ids": []}, 0, "", 5, marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") == "infinity", reason="issues/6607")),
+            pytest.param({"chunk_ids": []}, 102, "rm_chunk deleted chunks 5, expect 0", 0, marks=pytest.mark.skipif(os.getenv("DOC_ENGINE") in [None, "elasticsearch"], reason="issues/6607")),
+        ],
+    )
+    def test_basic_scenarios(
+        self,
+        get_http_api_auth,
+        add_chunks_func,
+        payload,
+        expected_code,
+        expected_message,
+        remaining,
+    ):
+        dataset_id, document_id, chunk_ids = add_chunks_func
+        if callable(payload):
+            payload = payload(chunk_ids)
+        res = delete_chunks(get_http_api_auth, dataset_id, document_id, payload)
+        assert res["code"] == expected_code
+        if res["code"] != 0:
+            assert res["message"] == expected_message
+
+        res = list_chunks(get_http_api_auth, dataset_id, document_id)
+        if res["code"] != 0:
+            assert False, res
+        assert len(res["data"]["chunks"]) == remaining
+        assert res["data"]["total"] == remaining
--- a/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py
+++ b/sdk/python/test/test_http_api/test_chunk_management_within_dataset/test_update_chunk.py
@ -54,7 +54,7 @@ class TestUpdatedChunk:
            pytest.param(
                {"content": 1},
                100,
-                """TypeError("unsupported operand type(s) for +: \'int\' and \'str\'")""",
+                "TypeError('expected string or bytes-like object')",
                marks=pytest.mark.skip,
            ),
            ({"content": "update chunk"}, 0, ""),