diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 361a88872..56ab04f93 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -309,7 +309,7 @@ class RAGFlowPdfParser: "bottom": b[-1][1] / ZM, "chars": [], "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]], - self.mean_height[-1] / 3 + self.mean_height[pagenum-1] / 3 ) # merge chars in the same rect @@ -355,8 +355,8 @@ class RAGFlowPdfParser: del boxes_to_reg[i]["box_image"] logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s") bxs = [b for b in bxs if b["text"]] - if self.mean_height[-1] == 0: - self.mean_height[-1] = np.median([b["bottom"] - b["top"] + if self.mean_height[pagenum-1] == 0: + self.mean_height[pagenum-1] = np.median([b["bottom"] - b["top"] for b in bxs]) self.boxes.append(bxs) diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 811b6bb3d..34333a350 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -77,6 +77,7 @@ class FulltextQueryer: " ", rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())), ).strip() + otxt = txt txt = FulltextQueryer.rmWWW(txt) if not self.isChinese(txt): @@ -196,6 +197,8 @@ class FulltextQueryer: if qs: query = " OR ".join([f"({t})" for t in qs if t]) + if not query: + query = otxt return MatchTextExpr( self.query_fields, query, 100, {"minimum_should_match": min_match} ), keywords