From ab4384e011752eb3326c7b2014c22e62dbe72e40 Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Thu, 14 Nov 2024 16:28:10 +0800
Subject: [PATCH] =?UTF-8?q?Updates=20on=20parsing=20progress,=20including?=
 =?UTF-8?q?=20more=20detailed=20time=20cost=20inform=E2=80=A6=20(#3402)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What problem does this PR solve?

#3401

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 api/validation.py        | 17 ++++++++++++++---
 rag/svr/task_executor.py | 11 ++++++++---
 2 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/api/validation.py b/api/validation.py
index 31aa0e543..f4d610c45 100644
--- a/api/validation.py
+++ b/api/validation.py
@@ -32,7 +32,18 @@ def python_version_validation():
 
 python_version_validation()
 
+
 # Download nltk data
-import nltk
-nltk.download('wordnet', halt_on_error=False, quiet=True)
-nltk.download('punkt_tab', halt_on_error=False, quiet=True)
\ No newline at end of file
+def download_nltk_data():
+    import nltk
+    nltk.download('wordnet', halt_on_error=False, quiet=True)
+    nltk.download('punkt_tab', halt_on_error=False, quiet=True)
+
+
+try:
+    from multiprocessing import Pool
+    pool = Pool(processes=1)
+    thr = pool.apply_async(download_nltk_data)
+    binary = thr.get(timeout=60)
+except Exception as e:
+    print('\x1b[6;37;41m WARNING \x1b[0m' + "Downloading NLTK data failure.", flush=True)
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index fb8207ae8..4c9b92b01 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -218,14 +218,17 @@ def build(row):
     logger.info("MINIO PUT({}):{}".format(row["name"], el))
 
     if row["parser_config"].get("auto_keywords", 0):
+        st = timer()
         callback(msg="Start to generate keywords for every chunk ...")
         chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
         for d in docs:
             d["important_kwd"] = keyword_extraction(chat_mdl, d["content_with_weight"],
                                                     row["parser_config"]["auto_keywords"]).split(",")
             d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
+        callback(msg="Keywords generation completed in {:.2f}s".format(timer()-st))
 
     if row["parser_config"].get("auto_questions", 0):
+        st = timer()
         callback(msg="Start to generate questions for every chunk ...")
         chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
         for d in docs:
@@ -236,6 +239,7 @@ def build(row):
                 d["content_ltks"] += " " + qst
             if "content_sm_ltks" in d:
                 d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+        callback(msg="Question generation completed in {:.2f}s".format(timer()-st))
 
     return docs
 
@@ -364,8 +368,8 @@ def main():
         # TODO: exception handler
         ## set_progress(r["did"], -1, "ERROR: ")
         callback(
-            msg="Finished slicing files(%d). Start to embedding the content." %
-            len(cks))
+            msg="Finished slicing files ({} chunks in {:.2f}s). Start to embedding the content.".format(len(cks), timer() - st)
+        )
         st = timer()
         try:
             tk_count, vector_size = embedding(cks, embd_mdl, r["parser_config"], callback)
@@ -374,7 +378,7 @@ def main():
             logger.exception("run_rembedding got exception")
             tk_count = 0
         logger.info("Embedding elapsed({}): {:.2f}".format(r["name"], timer() - st))
-        callback(msg="Finished embedding({:.2f})! Start to build index!".format(timer() - st))
+        callback(msg="Finished embedding (in {:.2f}s)! Start to build index!".format(timer() - st))
         # logger.info(f"task_executor init_kb index {search.index_name(r["tenant_id"])} embd_mdl {embd_mdl.llm_name} vector length {vector_size}")
         init_kb(r, vector_size)
 
@@ -396,6 +400,7 @@ def main():
             if TaskService.do_cancel(r["id"]):
                 docStoreConn.delete({"doc_id": r["doc_id"]}, search.index_name(r["tenant_id"]), r["kb_id"])
                 continue
+        callback(msg="Indexing elapsed in {:.2f}s.".format(timer() - st))
         callback(1., "Done!")
         DocumentService.increment_chunk_num(
             r["doc_id"], r["kb_id"], tk_count, chunk_count, 0)
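
Note: besides the progress-message changes, the `api/validation.py` hunk above runs the NLTK download in a one-worker multiprocessing pool bounded by `get(timeout=60)`, so a stalled download can only delay startup by a minute and any failure degrades to a printed warning. Below is a minimal standalone sketch of that pattern, not code from the patch; `fetch_resources()` is a hypothetical stand-in for `download_nltk_data()`.

```python
# Minimal sketch of the timeout-guarded download pattern used in api/validation.py.
# fetch_resources() is a hypothetical stand-in for download_nltk_data().
from multiprocessing import Pool


def fetch_resources():
    # Stand-in for nltk.download(...): any blocking network call goes here.
    import time
    time.sleep(1)
    return True


if __name__ == "__main__":
    try:
        with Pool(processes=1) as pool:
            # Run the download in a worker process and wait at most 60 seconds,
            # mirroring pool.apply_async(...).get(timeout=60) in the patch.
            ok = pool.apply_async(fetch_resources).get(timeout=60)
            print("resource download finished:", ok, flush=True)
    except Exception:
        # A timeout or download error only produces a warning; startup continues.
        print("WARNING: resource download failed or timed out.", flush=True)
```

If the worker hangs, `AsyncResult.get` raises `multiprocessing.TimeoutError`, which the broad `except` turns into the same warn-and-continue behaviour the patch applies at service startup.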