From c852a6dfbf31dc16a52131046579b1b8e8df2991 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Wed, 15 Jan 2025 15:20:29 +0800 Subject: [PATCH] Accelerate titles' embeddings. (#4492) ### What problem does this PR solve? Title embedding previously encoded every chunk's title in batches. It now encodes only the first title and reuses that vector for every chunk (this assumes all chunks share the same title), and the per-batch progress callback for titles is removed. The layout recognizer's score threshold is also lowered from 0.8 to 0.4. ### Type of change - [x] Performance Improvement --- deepdoc/vision/layout_recognizer.py | 2 +- rag/svr/task_executor.py | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/deepdoc/vision/layout_recognizer.py b/deepdoc/vision/layout_recognizer.py index ea8f2a880..1cf3aa1a2 100644 --- a/deepdoc/vision/layout_recognizer.py +++ b/deepdoc/vision/layout_recognizer.py @@ -78,7 +78,7 @@ class LayoutRecognizer(Recognizer): "x0": b["bbox"][0] / scale_factor, "x1": b["bbox"][2] / scale_factor, "top": b["bbox"][1] / scale_factor, "bottom": b["bbox"][-1] / scale_factor, "page_number": pn, - } for b in lts if float(b["score"]) >= 0.8 or b["type"] not in self.garbage_layouts] + } for b in lts if float(b["score"]) >= 0.4 or b["type"] not in self.garbage_layouts] lts = self.sort_Y_firstly(lts, np.mean( [lt["bottom"] - lt["top"] for lt in lts]) / 2) lts = self.layouts_cleanup(bxs, lts) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index cd3db9ad2..0fe7b86a6 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -354,16 +354,9 @@ def embedding(docs, mdl, parser_config=None, callback=None): tk_count = 0 if len(tts) == len(cnts): - tts_ = np.array([]) - for i in range(0, len(tts), batch_size): - vts, c = mdl.encode(tts[i: i + batch_size]) - if len(tts_) == 0: - tts_ = vts - else: - tts_ = np.concatenate((tts_, vts), axis=0) - tk_count += c - callback(prog=0.6 + 0.1 * (i + 1) / len(tts), msg="") - tts = tts_ + vts, c = mdl.encode(tts[0: 1]) + tts = np.concatenate([vts for _ in range(len(tts))], axis=0) + tk_count += c cnts_ = np.array([]) for i in range(0, len(cnts), batch_size):