From d42e78bce2efb188a5d2429504ed898dbf9f5875 Mon Sep 17 00:00:00 2001
From: liuhua <10215101452@stu.ecnu.edu.cn>
Date: Mon, 30 Dec 2024 19:01:44 +0800
Subject: [PATCH] Fix bugs in chunk api (#4293)

### What problem does this PR solve?

Fix bugs in the chunk API. #4149

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: liuhua <10215101452@stu.ecnu.edu.cn>
---
 api/apps/chunk_app.py                 |  2 +-
 api/apps/sdk/doc.py                   | 79 +++++++++++++--------------
 docs/references/http_api_reference.md | 18 +++---
 3 files changed, 49 insertions(+), 50 deletions(-)

diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index feeb52f6e..2edf69902 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -220,7 +220,7 @@ def create():
     e, doc = DocumentService.get_by_id(req["doc_id"])
     if not e:
         return get_data_error_result(message="Document not found!")
-    d["kb_id"] = [doc.kb_id]
+    d["kb_id"] = doc.kb_id
     d["docnm_kwd"] = doc.name
     d["title_tks"] = rag_tokenizer.tokenize(doc.name)
     d["doc_id"] = doc.id
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 423248da0..998e4abaa 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -847,59 +847,55 @@ def list_chunks(tenant_id, dataset_id, document_id):
         renamed_doc["run"] = run_mapping.get(str(value))
 
     res = {"total": 0, "chunks": [], "doc": renamed_doc}
-    origin_chunks = []
-    if settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
+    if req.get("id"):
+        chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
+        if not chunk:
+            return get_error_data_result(f"Chunk `{req.get('id')}` not found.")
+        k = []
+        for n in chunk.keys():
+            if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
+                k.append(n)
+        for n in k:
+            del chunk[n]
+        res["total"] = 1
+        final_chunk = {
+            "id": chunk.get("id", chunk.get("chunk_id")),
+            "content": chunk["content_with_weight"],
+            "document_id": chunk.get("doc_id", chunk.get("document_id")),
+            "docnm_kwd": chunk["docnm_kwd"],
+            "important_keywords": chunk.get("important_kwd", []),
+            "questions": chunk.get("question_kwd", []),
+            "dataset_id": chunk.get("kb_id", chunk.get("dataset_id")),
+            "image_id": chunk.get("img_id", ""),
+            "available": bool(chunk.get("available_int", 1)),
+            "positions": chunk.get("position_int", []),
+        }
+        res["chunks"].append(final_chunk)
+        _ = Chunk(**final_chunk)  # validate the chunk
+
+    elif settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
         sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None, highlight=True)
         res["total"] = sres.total
-        sign = 0
         for id in sres.ids:
             d = {
                 "id": id,
-                "content_with_weight": (
+                "content": (
                     rmSpace(sres.highlight[id])
                     if question and id in sres.highlight
                     else sres.field[id].get("content_with_weight", "")
                 ),
-                "doc_id": sres.field[id]["doc_id"],
+                "document_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
-                "important_kwd": sres.field[id].get("important_kwd", []),
-                "question_kwd": sres.field[id].get("question_kwd", []),
-                "img_id": sres.field[id].get("img_id", ""),
-                "available_int": sres.field[id].get("available_int", 1),
-                "positions": sres.field[id].get("position_int", []),
+                "important_keywords": sres.field[id].get("important_kwd", []),
+                "questions": sres.field[id].get("question_kwd", []),
+                "dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
+                "image_id": sres.field[id].get("img_id", ""),
+                "available": bool(sres.field[id].get("available_int", 1)),
+                "positions": sres.field[id].get("position_int", []),
             }
-            origin_chunks.append(d)
-            if req.get("id"):
-                if req.get("id") == id:
-                    origin_chunks.clear()
-                    origin_chunks.append(d)
-                    sign = 1
-                    break
-        if req.get("id"):
-            if sign == 0:
-                return get_error_data_result(f"Can't find this chunk {req.get('id')}")
-
-    for chunk in origin_chunks:
-        key_mapping = {
-            "id": "id",
-            "content_with_weight": "content",
-            "doc_id": "document_id",
-            "important_kwd": "important_keywords",
-            "question_kwd": "questions",
-            "img_id": "image_id",
-            "available_int": "available",
-        }
-        renamed_chunk = {}
-        for key, value in chunk.items():
-            new_key = key_mapping.get(key, key)
-            renamed_chunk[new_key] = value
-        if renamed_chunk["available"] == 0:
-            renamed_chunk["available"] = False
-        if renamed_chunk["available"] == 1:
-            renamed_chunk["available"] = True
-        res["chunks"].append(renamed_chunk)
-        _ = Chunk(**renamed_chunk)  # validate the chunk
+            res["chunks"].append(d)
+            _ = Chunk(**d)  # validate the chunk
     return get_result(data=res)
@@ -1377,6 +1373,7 @@ def retrieval_test(tenant_id):
                 "important_kwd": "important_keywords",
                 "question_kwd": "questions",
                 "docnm_kwd": "document_keyword",
+                "kb_id": "dataset_id",
             }
             rename_chunk = {}
             for key, value in chunk.items():
diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md
index d1b1d14cd..09beb726c 100644
--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@@ -927,7 +927,8 @@ curl --request POST \
   The text content of the chunk.
 - `"important_keywords"`(*Body parameter*), `list[string]`
   The key terms or phrases to tag with the chunk.
-
+- `"questions"`(*Body parameter*), `list[string]`
+  If given, the chunk's embedding will be based on these questions.
 
 #### Response
 
@@ -937,13 +938,14 @@ Success:
     "code": 0,
     "data": {
         "chunk": {
-            "content": "ragflow content",
-            "create_time": "2024-10-16 08:05:04",
-            "create_timestamp": 1729065904.581025,
-            "dataset_id": "c7ee74067a2c11efb21c0242ac120006",
-            "document_id": "5c5999ec7be811ef9cab0242ac120005",
-            "id": "d78435d142bd5cf6704da62c778795c5",
-            "important_keywords": []
+            "content": "who are you",
+            "create_time": "2024-12-30 16:59:55",
+            "create_timestamp": 1735549195.969164,
+            "dataset_id": "72f36e1ebdf411efb7250242ac120006",
+            "document_id": "61d68474be0111ef98dd0242ac120006",
+            "id": "12ccdc56e59837e5",
+            "important_keywords": [],
+            "questions": []
         }
     }
 }
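
A quick way to exercise the new single-chunk lookup path in `list_chunks` is to pass the `id` query parameter to the list-chunks endpoint documented in `http_api_reference.md`. A minimal sketch, assuming a server at `http://localhost:9380` and placeholder `<dataset_id>`, `<document_id>`, `<chunk_id>`, and `<api_key>` values:

```bash
# Fetch a single chunk by its id; with this fix the chunk is read directly
# from the document store, and the response uses the renamed public fields
# (content, document_id, dataset_id, important_keywords, questions,
# image_id, available, positions) instead of the internal *_kwd/*_int names.
curl --request GET \
     --url "http://localhost:9380/api/v1/datasets/<dataset_id>/documents/<document_id>/chunks?id=<chunk_id>" \
     --header "Authorization: Bearer <api_key>"
```

If the id matches no chunk, the endpoint now returns the "Chunk `<id>` not found." error from the direct lookup, rather than scanning the current page of search results as the removed `sign`/`origin_chunks` logic did.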