mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-11 08:19:02 +08:00
Update file parsing progress info (#3780)
### What problem does this PR solve? Refine the file parsing progress info ### Type of change - [x] Refactoring Signed-off-by: jinhai <haijin.chn@gmail.com>
This commit is contained in:
parent
b5f643681f
commit
ea84cc2e33
@ -370,72 +370,86 @@ def run_raptor(row, chat_mdl, embd_mdl, callback=None):
|
|||||||
return res, tk_count, vector_size
|
return res, tk_count, vector_size
|
||||||
|
|
||||||
|
|
||||||
def do_handle_task(r):
|
def do_handle_task(task):
|
||||||
callback = partial(set_progress, r["id"], r["from_page"], r["to_page"])
|
task_id = task["id"]
|
||||||
|
task_from_page = task["from_page"]
|
||||||
|
task_to_page = task["to_page"]
|
||||||
|
task_tenant_id = task["tenant_id"]
|
||||||
|
task_embedding_id = task["embd_id"]
|
||||||
|
task_language = task["language"]
|
||||||
|
task_llm_id = task["llm_id"]
|
||||||
|
task_dataset_id = task["kb_id"]
|
||||||
|
task_doc_id = task["doc_id"]
|
||||||
|
task_document_name = task["name"]
|
||||||
|
task_parser_config = task["parser_config"]
|
||||||
|
|
||||||
|
# prepare the progress callback function
|
||||||
|
progress_callback = partial(set_progress, task_id, task_from_page, task_to_page)
|
||||||
try:
|
try:
|
||||||
embd_mdl = LLMBundle(r["tenant_id"], LLMType.EMBEDDING, llm_name=r["embd_id"], lang=r["language"])
|
# bind embedding model
|
||||||
|
embedding_model = LLMBundle(task_tenant_id, LLMType.EMBEDDING, llm_name=task_embedding_id, lang=task_language)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
callback(-1, msg=str(e))
|
progress_callback(-1, msg=f'Fail to bind embedding model: {str(e)}')
|
||||||
raise
|
raise
|
||||||
if r.get("task_type", "") == "raptor":
|
|
||||||
|
# Either using RAPTOR or Standard chunking methods
|
||||||
|
if task.get("task_type", "") == "raptor":
|
||||||
try:
|
try:
|
||||||
chat_mdl = LLMBundle(r["tenant_id"], LLMType.CHAT, llm_name=r["llm_id"], lang=r["language"])
|
# bind LLM for raptor
|
||||||
cks, tk_count, vector_size = run_raptor(r, chat_mdl, embd_mdl, callback)
|
chat_model = LLMBundle(task_tenant_id, LLMType.CHAT, llm_name=task_llm_id, lang=task_language)
|
||||||
|
|
||||||
|
# run RAPTOR
|
||||||
|
chunks, tk_count, vector_size = run_raptor(task, chat_model, embedding_model, progress_callback)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
callback(-1, msg=str(e))
|
progress_callback(-1, msg=f'Fail to bind LLM used by RAPTOR: {str(e)}')
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
st = timer()
|
# Standard chunking methods
|
||||||
cks = build(r)
|
start_ts = timer()
|
||||||
logging.info("Build chunks({}): {}".format(r["name"], timer() - st))
|
chunks = build(task)
|
||||||
if cks is None:
|
logging.info("Build document {}: {:.2f}s".format(task_document_name, timer() - start_ts))
|
||||||
|
if chunks is None:
|
||||||
return
|
return
|
||||||
if not cks:
|
if not chunks:
|
||||||
callback(1., "No chunk! Done!")
|
progress_callback(1., msg=f"No chunk built from {task_document_name}")
|
||||||
return
|
return
|
||||||
# TODO: exception handler
|
# TODO: exception handler
|
||||||
## set_progress(r["did"], -1, "ERROR: ")
|
## set_progress(task["did"], -1, "ERROR: ")
|
||||||
callback(
|
progress_callback(msg="Generate {} chunks".format(len(chunks)))
|
||||||
msg="Generate {} chunks ({:.2f}s). Embedding chunks.".format(len(cks), timer() - st)
|
start_ts = timer()
|
||||||
)
|
|
||||||
st = timer()
|
|
||||||
try:
|
try:
|
||||||
tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
|
tk_count, vector_size = embedding(chunks, embedding_model, task_parser_config, progress_callback)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
callback(-1, "Embedding error:{}".format(str(e)))
|
progress_callback(-1, "Generate embedding error:{}".format(str(e)))
|
||||||
logging.exception("run_rembedding got exception")
|
logging.exception("run_embedding got exception")
|
||||||
tk_count = 0
|
tk_count = 0
|
||||||
raise
|
raise
|
||||||
logging.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
logging.info("Embedding {} elapsed: {:.2f}".format(task_document_name, timer() - start_ts))
|
||||||
callback(msg="Finished embedding ({:.2f}s)!".format(timer() - st))
|
progress_callback(msg="Embedding chunks ({:.2f}s)".format(timer() - start_ts))
|
||||||
# logging.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
|
# logging.info(f"task_executor init_kb index {search.index_name(task_tenant_id)} embedding_model {embedding_model.llm_name} vector length {vector_size}")
|
||||||
init_kb(r, vector_size)
|
init_kb(task, vector_size)
|
||||||
chunk_count = len(set([c["id"] for c in cks]))
|
chunk_count = len(set([chunk["id"] for chunk in chunks]))
|
||||||
st = timer()
|
start_ts = timer()
|
||||||
es_r = ""
|
es_r = ""
|
||||||
es_bulk_size = 4
|
es_bulk_size = 4
|
||||||
for b in range(0, len(cks), es_bulk_size):
|
for b in range(0, len(chunks), es_bulk_size):
|
||||||
es_r = settings.docStoreConn.insert(cks[b:b + es_bulk_size], search.index_name(r["tenant_id"]), r["kb_id"])
|
es_r = settings.docStoreConn.insert(chunks[b:b + es_bulk_size], search.index_name(task_tenant_id), task_dataset_id)
|
||||||
if b % 128 == 0:
|
if b % 128 == 0:
|
||||||
callback(prog=0.8 + 0.1 * (b + 1) / len(cks), msg="")
|
progress_callback(prog=0.8 + 0.1 * (b + 1) / len(chunks), msg="")
|
||||||
logging.info("Indexing elapsed({}): {:.2f}".format(r["name"], timer() - st))
|
logging.info("Indexing {} elapsed: {:.2f}".format(task_document_name, timer() - start_ts))
|
||||||
if es_r:
|
if es_r:
|
||||||
callback(-1,
|
progress_callback(-1, "Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!")
|
||||||
"Insert chunk error, detail info please check log file. Please also check Elasticsearch/Infinity status!")
|
settings.docStoreConn.delete({"doc_id": task_doc_id}, search.index_name(task_tenant_id), task_dataset_id)
|
||||||
settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
|
||||||
logging.error('Insert chunk error: ' + str(es_r))
|
logging.error('Insert chunk error: ' + str(es_r))
|
||||||
raise Exception('Insert chunk error: ' + str(es_r))
|
raise Exception('Insert chunk error: ' + str(es_r))
|
||||||
|
|
||||||
if TaskService.do_cancel(r["id"]):
|
if TaskService.do_cancel(task_id):
|
||||||
settings.docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
|
settings.docStoreConn.delete({"doc_id": task_doc_id}, search.index_name(task_tenant_id), task_dataset_id)
|
||||||
return
|
return
|
||||||
|
|
||||||
callback(1., msg="Index cost {:.2f}s.".format(timer() - st))
|
progress_callback(1., msg="Finish Index ({:.2f}s)".format(timer() - start_ts))
|
||||||
DocumentService.increment_chunk_num(
|
DocumentService.increment_chunk_num(task_doc_id, task_dataset_id, tk_count, chunk_count, 0)
|
||||||
r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
|
logging.info("Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(task_id, tk_count, len(chunks), timer() - start_ts))
|
||||||
logging.info(
|
|
||||||
"Chunk doc({}), token({}), chunks({}), elapsed:{:.2f}".format(
|
|
||||||
r["id"], tk_count, len(cks), timer() - st))
|
|
||||||
|
|
||||||
|
|
||||||
def handle_task():
|
def handle_task():
|
||||||
|
Loading…
x
Reference in New Issue
Block a user