From d42e78bce2efb188a5d2429504ed898dbf9f5875 Mon Sep 17 00:00:00 2001
From: liuhua <10215101452@stu.ecnu.edu.cn>
Date: Mon, 30 Dec 2024 19:01:44 +0800
Subject: [PATCH] Fix bugs in chunk api (#4293)

### What problem does this PR solve?

Fix bugs in the chunk API. #4149

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: liuhua <10215101452@stu.ecnu.edu.cn>
---
 api/apps/chunk_app.py                 |  2 +-
 api/apps/sdk/doc.py                   | 79 +++++++++++++--------------
 docs/references/http_api_reference.md | 18 +++---
 3 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index feeb52f6e..2edf69902 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -220,7 +220,7 @@ def create():
     e, doc = DocumentService.get_by_id(req["doc_id"])
     if not e:
         return get_data_error_result(message="Document not found!")
-    d["kb_id"] = [doc.kb_id]
+    d["kb_id"] = doc.kb_id
     d["docnm_kwd"] = doc.name
     d["title_tks"] = rag_tokenizer.tokenize(doc.name)
     d["doc_id"] = doc.id
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 423248da0..998e4abaa 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -847,59 +847,55 @@ def list_chunks(tenant_id, dataset_id, document_id):
         renamed_doc["run"] = run_mapping.get(str(value))
 
     res = {"total": 0, "chunks": [], "doc": renamed_doc}
-    origin_chunks = []
-    if settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
+    if req.get("id"):
+        chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
+        if not chunk:
+            return get_error_data_result(f"Chunk `{req.get('id')}` not found.")
+        k = []
+        for n in chunk.keys():
+            if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
+                k.append(n)
+        for n in k:
+            del chunk[n]
+        res["total"] = 1
+        final_chunk = {
+            "id": chunk.get("id", chunk.get("chunk_id")),
+            "content": chunk["content_with_weight"],
+            "document_id": chunk.get("doc_id", chunk.get("document_id")),
+            "docnm_kwd": chunk["docnm_kwd"],
+            "important_keywords": chunk.get("important_kwd", []),
+            "questions": chunk.get("question_kwd", []),
+            "dataset_id": chunk.get("kb_id", chunk.get("dataset_id")),
+            "image_id": chunk.get("img_id", ""),
+            "available": bool(chunk.get("available_int", 1)),
+            "positions": chunk.get("position_int", []),
+        }
+        res["chunks"].append(final_chunk)
+        _ = Chunk(**final_chunk)  # validate the chunk
+
+    elif settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
         sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None, highlight=True)
         res["total"] = sres.total
-        sign = 0
         for id in sres.ids:
             d = {
                 "id": id,
-                "content_with_weight": (
+                "content": (
                     rmSpace(sres.highlight[id])
                     if question and id in sres.highlight
                     else sres.field[id].get("content_with_weight", "")
                 ),
-                "doc_id": sres.field[id]["doc_id"],
+                "document_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
-                "important_kwd": sres.field[id].get("important_kwd", []),
-                "question_kwd": sres.field[id].get("question_kwd", []),
-                "img_id": sres.field[id].get("img_id", ""),
-                "available_int": sres.field[id].get("available_int", 1),
-                "positions": sres.field[id].get("position_int", []),
+                "important_keywords": sres.field[id].get("important_kwd", []),
+                "questions": sres.field[id].get("question_kwd", []),
+                "dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
+                "image_id": sres.field[id].get("img_id", ""),
+                "available": bool(sres.field[id].get("available_int", 1)),
+                "positions": sres.field[id].get("position_int", []),
             }
-            origin_chunks.append(d)
-            if req.get("id"):
-                if req.get("id") == id:
-                    origin_chunks.clear()
-                    origin_chunks.append(d)
-                    sign = 1
-                    break
-        if req.get("id"):
-            if sign == 0:
-                return get_error_data_result(f"Can't find this chunk {req.get('id')}")
-
-    for chunk in origin_chunks:
-        key_mapping = {
-            "id": "id",
-            "content_with_weight": "content",
-            "doc_id": "document_id",
-            "important_kwd": "important_keywords",
-            "question_kwd": "questions",
-            "img_id": "image_id",
-            "available_int": "available",
-        }
-        renamed_chunk = {}
-        for key, value in chunk.items():
-            new_key = key_mapping.get(key, key)
-            renamed_chunk[new_key] = value
-        if renamed_chunk["available"] == 0:
-            renamed_chunk["available"] = False
-        if renamed_chunk["available"] == 1:
-            renamed_chunk["available"] = True
-        res["chunks"].append(renamed_chunk)
-        _ = Chunk(**renamed_chunk)  # validate the chunk
+            res["chunks"].append(d)
+            _ = Chunk(**d)  # validate the chunk
     return get_result(data=res)
@@ -1377,6 +1373,7 @@ def retrieval_test(tenant_id):
                 "important_kwd": "important_keywords",
                 "question_kwd": "questions",
                 "docnm_kwd": "document_keyword",
+                "kb_id": "dataset_id",
             }
             rename_chunk = {}
             for key, value in chunk.items():
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index d1b1d14cd..09beb726c 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -927,7 +927,8 @@ curl --request POST \
   The text content of the chunk.
 - `"important_keywords"`(*Body parameter*), `list[string]`
   The key terms or phrases to tag with the chunk.
-
+- `"questions"`(*Body parameter*), `list[string]`
+  If given, the chunk's embedding will be based on these questions.
 
 #### Response
 
@@ -937,13 +938,14 @@ Success:
     "code": 0,
     "data": {
         "chunk": {
-            "content": "ragflow content",
-            "create_time": "2024-10-16 08:05:04",
-            "create_timestamp": 1729065904.581025,
-            "dataset_id": "c7ee74067a2c11efb21c0242ac120006",
-            "document_id": "5c5999ec7be811ef9cab0242ac120005",
-            "id": "d78435d142bd5cf6704da62c778795c5",
-            "important_keywords": []
+            "content": "who are you",
+            "create_time": "2024-12-30 16:59:55",
+            "create_timestamp": 1735549195.969164,
+            "dataset_id": "72f36e1ebdf411efb7250242ac120006",
+            "document_id": "61d68474be0111ef98dd0242ac120006",
+            "id": "12ccdc56e59837e5",
+            "important_keywords": [],
+            "questions": []
         }
     }
 }
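
A quick way to exercise the new single-chunk lookup path in `list_chunks` is to pass the `id` query parameter to the list-chunks endpoint documented in `http_api_reference.md`. A minimal sketch, assuming a server at `http://localhost:9380` and placeholder `<dataset_id>`, `<document_id>`, `<chunk_id>`, and `<api_key>` values:

```bash
# Fetch a single chunk by its id; with this fix the chunk is read directly
# from the document store, and the response uses the renamed public fields
# (content, document_id, dataset_id, important_keywords, questions,
# image_id, available, positions) instead of the internal *_kwd/*_int names.
curl --request GET \
     --url "http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>/chunks?id=<chunk_id>" \
     --header "Authorization: Bearer <api_key>"
```

If the id matches no chunk, the endpoint now returns the "Chunk `<id>` not found." error from the direct lookup, rather than scanning the current page of search results as the removed `sign`/`origin_chunks` logic did.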