diff --git a/api/apps/document_app.py b/api/apps/document_app.py
index 43d5d0b03..4a6d3f71b 100644
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -316,8 +316,7 @@ def change_parser():
             return get_data_error_result(retmsg="Not supported yet!")
 
         e = DocumentService.update_by_id(doc.id,
-                                         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0",
-                                          "token_num": 0, "chunk_num": 0, "process_duation": 0})
+                                         {"parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": "0"})
         if not e:
             return get_data_error_result(retmsg="Document not found!")
         if doc.token_num > 0:
diff --git a/api/db/services/task_service.py b/api/db/services/task_service.py
index a083fac24..c8cdd70ec 100644
--- a/api/db/services/task_service.py
+++ b/api/db/services/task_service.py
@@ -73,8 +73,9 @@ class TaskService(CommonService):
     @classmethod
     @DB.connection_context()
     def update_progress(cls, id, info):
-        cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
-            cls.model.id == id).execute()
+        if info["progress_msg"]:
+            cls.model.update(progress_msg=cls.model.progress_msg + "\n" + info["progress_msg"]).where(
+                cls.model.id == id).execute()
         if "progress" in info:
             cls.model.update(progress=info["progress"]).where(
                 cls.model.id == id).execute()
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 61d2ab0c9..2c379fc1d 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -725,7 +725,7 @@ class HuParser:
                 (cropout(
                     bxs,
                     "figure", poss),
-                 [txt] if not return_html else [f"<div>{txt}</div>"]))
+                 [txt]))
             positions.append(poss)
 
         for k, bxs in tables.items():
diff --git a/docker/.env b/docker/.env
index e036ef0c8..4345b784f 100644
--- a/docker/.env
+++ b/docker/.env
@@ -16,7 +16,7 @@ MEM_LIMIT=4073741824
 MYSQL_PASSWORD=infini_rag_flow
 MYSQL_PORT=5455
 
-MINIO_USER=infiniflow
+MINIO_USER=rag_flow
 MINIO_PASSWORD=infini_rag_flow
 
 SVR_HTTP_PORT=9380
diff --git a/rag/app/book.py b/rag/app/book.py
index dd31f685f..24e3f3bf8 100644
--- a/rag/app/book.py
+++ b/rag/app/book.py
@@ -28,7 +28,7 @@ class Pdf(PdfParser):
             from_page,
             to_page,
             callback)
-        callback("OCR finished")
+        callback(msg="OCR finished")
 
         from timeit import default_timer as timer
         start = timer()
diff --git a/rag/app/laws.py b/rag/app/laws.py
index 297970a39..bbc99a925 100644
--- a/rag/app/laws.py
+++ b/rag/app/laws.py
@@ -57,7 +57,7 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback("OCR finished")
+        callback(msg="OCR finished")
 
         from timeit import default_timer as timer
         start = timer()
@@ -135,6 +135,6 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
 
 if __name__ == "__main__":
     import sys
-    def dummy(a, b):
+    def dummy(prog=None, msg=""):
         pass
     chunk(sys.argv[1], callback=dummy)
diff --git a/rag/app/manual.py b/rag/app/manual.py
index 75af024ce..6effc3de3 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -22,7 +22,7 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback("OCR finished.")
+        callback(msg="OCR finished.")
 
         from timeit import default_timer as timer
         start = timer()
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 3b83c53f7..4d5ec8cbd 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -29,7 +29,7 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback("OCR finished")
+        callback(msg="OCR finished")
 
         from timeit import default_timer as timer
         start = timer()
diff --git a/rag/app/paper.py b/rag/app/paper.py
index a9d1afcc0..19efa22cd 100644
--- a/rag/app/paper.py
+++ b/rag/app/paper.py
@@ -36,7 +36,7 @@ class Pdf(PdfParser):
             to_page,
             callback
         )
-        callback("OCR finished.")
+        callback(msg="OCR finished.")
 
         from timeit import default_timer as timer
         start = timer()
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index f229055dd..04d4588b3 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -305,8 +305,15 @@ class Dealer:
                 "similarity": sim[i],
                 "vector_similarity": vsim[i],
                 "term_similarity": tsim[i],
-                "vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim)))
+                "vector": self.trans2floats(sres.field[id].get("q_%d_vec" % dim, "\t".join(["0"] * dim))),
+                "positions": sres.field[id].get("position_int", "").split("\t")
             }
+            if len(d["positions"]) % 5 == 0:
+                poss = []
+                for i in range(0, len(d["positions"]), 5):
+                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                d["positions"] = poss
             ranks["chunks"].append(d)
             if dnm not in ranks["doc_aggs"]:
                 ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 457808c7d..43fa86474 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -25,6 +25,7 @@ import traceback
 from functools import partial
 from timeit import default_timer as timer
 
+import numpy as np
 from elasticsearch_dsl import Q
 
 from api.db.services.task_service import TaskService
@@ -177,10 +178,11 @@ def embedding(docs, mdl, parser_config={}, callback=None):
         tts, c = mdl.encode(tts)
         tk_count += c
 
-    cnts_ = []
+    cnts_ = np.array([])
     for i in range(0, len(cnts), 32):
         vts, c = mdl.encode(cnts[i: i+32])
-        cnts_.extend(vts)
+        if len(cnts_) == 0: cnts_ = vts
+        else: cnts_ = np.concatenate((cnts_, vts), axis=0)
         tk_count += c
         callback(msg="")
     cnts = cnts_
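For reference only, and not part of the patch itself: a minimal sketch of how a consumer might decode the tab-separated `position_int` values that the rag/nlp/search.py change now returns as `positions`. It mirrors the grouping-by-five logic added in the diff; the reading of the five values as page number plus left/right/top/bottom box coordinates, and the helper name `decode_positions`, are illustrative assumptions rather than anything this diff defines.

# Illustrative sketch only; not part of the patch above.
# Assumes each chunk's "position_int" field is a tab-separated string whose
# values come in groups of five (taken here to be page number plus the
# left/right/top/bottom coordinates of the matched box).
from typing import List


def decode_positions(position_int: str) -> List[List[float]]:
    """Split a flat tab-separated string into 5-value position records."""
    vals = position_int.split("\t") if position_int else []
    if not vals or len(vals) % 5 != 0:
        return []
    return [[float(v) for v in vals[i:i + 5]] for i in range(0, len(vals), 5)]


# Example: a single box on page 1 spanning x 100-300 and y 50-80.
print(decode_positions("1\t100\t300\t50\t80"))
# -> [[1.0, 100.0, 300.0, 50.0, 80.0]]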