mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-13 00:58:58 +08:00
Fix bugs in chunk api (#4293)
### What problem does this PR solve?

Fix bugs in chunk api #4149

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)

---------

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
This commit is contained in:
parent
8fb18f37f6
commit
d42e78bce2
@ -220,7 +220,7 @@ def create():
|
|||||||
e, doc = DocumentService.get_by_id(req["doc_id"])
|
e, doc = DocumentService.get_by_id(req["doc_id"])
|
||||||
if not e:
|
if not e:
|
||||||
return get_data_error_result(message="Document not found!")
|
return get_data_error_result(message="Document not found!")
|
||||||
d["kb_id"] = [doc.kb_id]
|
d["kb_id"] = doc.kb_id
|
||||||
d["docnm_kwd"] = doc.name
|
d["docnm_kwd"] = doc.name
|
||||||
d["title_tks"] = rag_tokenizer.tokenize(doc.name)
|
d["title_tks"] = rag_tokenizer.tokenize(doc.name)
|
||||||
d["doc_id"] = doc.id
|
d["doc_id"] = doc.id
|
||||||
|
@ -847,59 +847,55 @@ def list_chunks(tenant_id, dataset_id, document_id):
|
|||||||
renamed_doc["run"] = run_mapping.get(str(value))
|
renamed_doc["run"] = run_mapping.get(str(value))
|
||||||
|
|
||||||
res = {"total": 0, "chunks": [], "doc": renamed_doc}
|
res = {"total": 0, "chunks": [], "doc": renamed_doc}
|
||||||
origin_chunks = []
|
if req.get("id"):
|
||||||
if settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
|
chunk = settings.docStoreConn.get(req.get("id"), search.index_name(tenant_id), [dataset_id])
|
||||||
|
k = []
|
||||||
|
for n in chunk.keys():
|
||||||
|
if re.search(r"(_vec$|_sm_|_tks|_ltks)", n):
|
||||||
|
k.append(n)
|
||||||
|
for n in k:
|
||||||
|
del chunk[n]
|
||||||
|
if not chunk:
|
||||||
|
return get_error_data_result(f"Chunk `{req.get('id')}` not found.")
|
||||||
|
res['total'] = 1
|
||||||
|
final_chunk = {
|
||||||
|
"id":chunk.get("id",chunk.get("chunk_id")),
|
||||||
|
"content":chunk["content_with_weight"],
|
||||||
|
"document_id":chunk.get("doc_id",chunk.get("document_id")),
|
||||||
|
"docnm_kwd":chunk["docnm_kwd"],
|
||||||
|
"important_keywords":chunk.get("important_kwd",[]),
|
||||||
|
"questions":chunk.get("question_kwd",[]),
|
||||||
|
"dataset_id":chunk.get("kb_id",chunk.get("dataset_id")),
|
||||||
|
"image_id":chunk["img_id"],
|
||||||
|
"available":bool(chunk.get("available_int",1)),
|
||||||
|
"positions":chunk.get("position_int",[]),
|
||||||
|
}
|
||||||
|
res["chunks"].append(final_chunk)
|
||||||
|
_ = Chunk(**final_chunk)
|
||||||
|
|
||||||
|
elif settings.docStoreConn.indexExist(search.index_name(tenant_id), dataset_id):
|
||||||
sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None,
|
sres = settings.retrievaler.search(query, search.index_name(tenant_id), [dataset_id], emb_mdl=None,
|
||||||
highlight=True)
|
highlight=True)
|
||||||
res["total"] = sres.total
|
res["total"] = sres.total
|
||||||
sign = 0
|
|
||||||
for id in sres.ids:
|
for id in sres.ids:
|
||||||
d = {
|
d = {
|
||||||
"id": id,
|
"id": id,
|
||||||
"content_with_weight": (
|
"content": (
|
||||||
rmSpace(sres.highlight[id])
|
rmSpace(sres.highlight[id])
|
||||||
if question and id in sres.highlight
|
if question and id in sres.highlight
|
||||||
else sres.field[id].get("content_with_weight", "")
|
else sres.field[id].get("content_with_weight", "")
|
||||||
),
|
),
|
||||||
"doc_id": sres.field[id]["doc_id"],
|
"document_id": sres.field[id]["doc_id"],
|
||||||
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
||||||
"important_kwd": sres.field[id].get("important_kwd", []),
|
"important_keywords": sres.field[id].get("important_kwd", []),
|
||||||
"question_kwd": sres.field[id].get("question_kwd", []),
|
"questions": sres.field[id].get("question_kwd", []),
|
||||||
"img_id": sres.field[id].get("img_id", ""),
|
"dataset_id": sres.field[id].get("kb_id", sres.field[id].get("dataset_id")),
|
||||||
"available_int": sres.field[id].get("available_int", 1),
|
"image_id": sres.field[id].get("img_id", ""),
|
||||||
"positions": sres.field[id].get("position_int", []),
|
"available": bool(sres.field[id].get("available_int", 1)),
|
||||||
|
"positions": sres.field[id].get("position_int",[]),
|
||||||
}
|
}
|
||||||
origin_chunks.append(d)
|
res["chunks"].append(d)
|
||||||
if req.get("id"):
|
_ = Chunk(**d) # validate the chunk
|
||||||
if req.get("id") == id:
|
|
||||||
origin_chunks.clear()
|
|
||||||
origin_chunks.append(d)
|
|
||||||
sign = 1
|
|
||||||
break
|
|
||||||
if req.get("id"):
|
|
||||||
if sign == 0:
|
|
||||||
return get_error_data_result(f"Can't find this chunk {req.get('id')}")
|
|
||||||
|
|
||||||
for chunk in origin_chunks:
|
|
||||||
key_mapping = {
|
|
||||||
"id": "id",
|
|
||||||
"content_with_weight": "content",
|
|
||||||
"doc_id": "document_id",
|
|
||||||
"important_kwd": "important_keywords",
|
|
||||||
"question_kwd": "questions",
|
|
||||||
"img_id": "image_id",
|
|
||||||
"available_int": "available",
|
|
||||||
}
|
|
||||||
renamed_chunk = {}
|
|
||||||
for key, value in chunk.items():
|
|
||||||
new_key = key_mapping.get(key, key)
|
|
||||||
renamed_chunk[new_key] = value
|
|
||||||
if renamed_chunk["available"] == 0:
|
|
||||||
renamed_chunk["available"] = False
|
|
||||||
if renamed_chunk["available"] == 1:
|
|
||||||
renamed_chunk["available"] = True
|
|
||||||
res["chunks"].append(renamed_chunk)
|
|
||||||
_ = Chunk(**renamed_chunk) # validate the chunk
|
|
||||||
return get_result(data=res)
|
return get_result(data=res)
|
||||||
|
|
||||||
|
|
||||||
@ -1377,6 +1373,7 @@ def retrieval_test(tenant_id):
|
|||||||
"important_kwd": "important_keywords",
|
"important_kwd": "important_keywords",
|
||||||
"question_kwd": "questions",
|
"question_kwd": "questions",
|
||||||
"docnm_kwd": "document_keyword",
|
"docnm_kwd": "document_keyword",
|
||||||
|
"kb_id":"dataset_id"
|
||||||
}
|
}
|
||||||
rename_chunk = {}
|
rename_chunk = {}
|
||||||
for key, value in chunk.items():
|
for key, value in chunk.items():
|
||||||
|
@ -927,7 +927,8 @@ curl --request POST \
|
|||||||
The text content of the chunk.
|
The text content of the chunk.
|
||||||
- `"important_keywords"`(*Body parameter*), `list[string]`
|
- `"important_keywords"`(*Body parameter*), `list[string]`
|
||||||
The key terms or phrases to tag with the chunk.
|
The key terms or phrases to tag with the chunk.
|
||||||
|
- `"questions"`(*Body parameter*), `list[string]`
|
||||||
|
If questions are given, the chunk's embedding will be based on them.
|
||||||
#### Response
|
#### Response
|
||||||
|
|
||||||
Success:
|
Success:
|
||||||
@ -937,13 +938,14 @@ Success:
|
|||||||
"code": 0,
|
"code": 0,
|
||||||
"data": {
|
"data": {
|
||||||
"chunk": {
|
"chunk": {
|
||||||
"content": "ragflow content",
|
"content": "who are you",
|
||||||
"create_time": "2024-10-16 08:05:04",
|
"create_time": "2024-12-30 16:59:55",
|
||||||
"create_timestamp": 1729065904.581025,
|
"create_timestamp": 1735549195.969164,
|
||||||
"dataset_id": "c7ee74067a2c11efb21c0242ac120006",
|
"dataset_id": "72f36e1ebdf411efb7250242ac120006",
|
||||||
"document_id": "5c5999ec7be811ef9cab0242ac120005",
|
"document_id": "61d68474be0111ef98dd0242ac120006",
|
||||||
"id": "d78435d142bd5cf6704da62c778795c5",
|
"id": "12ccdc56e59837e5",
|
||||||
"important_keywords": []
|
"important_keywords": [],
|
||||||
|
"questions": []
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user