mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-11 04:18:58 +08:00
Fix chunk number error after re-parsing. (#4043)
### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
parent
44ac87aef4
commit
7fb67c4f67
@ -356,12 +356,11 @@ def run():
|
|||||||
try:
|
try:
|
||||||
for id in req["doc_ids"]:
|
for id in req["doc_ids"]:
|
||||||
info = {"run": str(req["run"]), "progress": 0}
|
info = {"run": str(req["run"]), "progress": 0}
|
||||||
if str(req["run"]) == TaskStatus.RUNNING.value:
|
if str(req["run"]) == TaskStatus.RUNNING.value and req.get("delete", False):
|
||||||
info["progress_msg"] = ""
|
info["progress_msg"] = ""
|
||||||
info["chunk_num"] = 0
|
info["chunk_num"] = 0
|
||||||
info["token_num"] = 0
|
info["token_num"] = 0
|
||||||
DocumentService.update_by_id(id, info)
|
DocumentService.update_by_id(id, info)
|
||||||
# if str(req["run"]) == TaskStatus.CANCEL.value:
|
|
||||||
tenant_id = DocumentService.get_tenant_id(id)
|
tenant_id = DocumentService.get_tenant_id(id)
|
||||||
if not tenant_id:
|
if not tenant_id:
|
||||||
return get_data_error_result(message="Tenant not found!")
|
return get_data_error_result(message="Tenant not found!")
|
||||||
|
@ -248,8 +248,9 @@ def queue_tasks(doc: dict, bucket: str, name: str):
|
|||||||
|
|
||||||
prev_tasks = TaskService.get_tasks(doc["id"])
|
prev_tasks = TaskService.get_tasks(doc["id"])
|
||||||
if prev_tasks:
|
if prev_tasks:
|
||||||
|
ck_num = 0
|
||||||
for task in tsks:
|
for task in tsks:
|
||||||
reuse_prev_task_chunks(task, prev_tasks, chunking_config)
|
ck_num += reuse_prev_task_chunks(task, prev_tasks, chunking_config)
|
||||||
TaskService.filter_delete([Task.doc_id == doc["id"]])
|
TaskService.filter_delete([Task.doc_id == doc["id"]])
|
||||||
chunk_ids = []
|
chunk_ids = []
|
||||||
for task in prev_tasks:
|
for task in prev_tasks:
|
||||||
@ -257,6 +258,7 @@ def queue_tasks(doc: dict, bucket: str, name: str):
|
|||||||
chunk_ids.extend(task["chunk_ids"].split())
|
chunk_ids.extend(task["chunk_ids"].split())
|
||||||
if chunk_ids:
|
if chunk_ids:
|
||||||
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
|
settings.docStoreConn.delete({"id": chunk_ids}, search.index_name(chunking_config["tenant_id"]), chunking_config["kb_id"])
|
||||||
|
DocumentService.update_by_id(doc["id"], {"chunk_num": ck_num})
|
||||||
|
|
||||||
bulk_insert_into_db(Task, tsks, True)
|
bulk_insert_into_db(Task, tsks, True)
|
||||||
DocumentService.begin2parse(doc["id"])
|
DocumentService.begin2parse(doc["id"])
|
||||||
@ -267,14 +269,17 @@ def queue_tasks(doc: dict, bucket: str, name: str):
|
|||||||
SVR_QUEUE_NAME, message=t
|
SVR_QUEUE_NAME, message=t
|
||||||
), "Can't access Redis. Please check the Redis' status."
|
), "Can't access Redis. Please check the Redis' status."
|
||||||
|
|
||||||
|
|
||||||
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
|
def reuse_prev_task_chunks(task: dict, prev_tasks: list[dict], chunking_config: dict):
|
||||||
idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
|
idx = bisect.bisect_left(prev_tasks, task["from_page"], key=lambda x: x["from_page"])
|
||||||
if idx >= len(prev_tasks):
|
if idx >= len(prev_tasks):
|
||||||
return
|
return 0
|
||||||
prev_task = prev_tasks[idx]
|
prev_task = prev_tasks[idx]
|
||||||
if prev_task["progress"] < 1.0 or prev_task["digest"] != task["digest"] or not prev_task["chunk_ids"]:
|
if prev_task["progress"] < 1.0 or prev_task["digest"] != task["digest"] or not prev_task["chunk_ids"]:
|
||||||
return
|
return 0
|
||||||
task["chunk_ids"] = prev_task["chunk_ids"]
|
task["chunk_ids"] = prev_task["chunk_ids"]
|
||||||
task["progress"] = 1.0
|
task["progress"] = 1.0
|
||||||
task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"
|
task["progress_msg"] = f"Page({task['from_page']}~{task['to_page']}): reused previous task's chunks"
|
||||||
prev_task["chunk_ids"] = ""
|
prev_task["chunk_ids"] = ""
|
||||||
|
|
||||||
|
return len(task["chunk_ids"].split())
|
Loading…
x
Reference in New Issue
Block a user