Fix: empty query issue. (#7551)

### What problem does this PR solve?

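Addresses #5214 (empty query issue): when `FulltextQueryer` ends up with an empty query string after processing the input, it now falls back to the normalized query text saved before `rmWWW` is applied (`otxt`). This PR also changes `RAGFlowPdfParser` to index `self.mean_height` by `pagenum-1` instead of `-1`.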

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Kevin Hu 2025-05-09 12:20:19 +08:00 committed by GitHub
parent d66c17ab5c
commit a14865e6bb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 6 additions and 3 deletions

View File

@ -309,7 +309,7 @@ class RAGFlowPdfParser:
"bottom": b[-1][1] / ZM, "bottom": b[-1][1] / ZM,
"chars": [], "chars": [],
"page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]], "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
self.mean_height[-1] / 3 self.mean_height[pagenum-1] / 3
) )
# merge chars in the same rect # merge chars in the same rect
@ -355,8 +355,8 @@ class RAGFlowPdfParser:
del boxes_to_reg[i]["box_image"] del boxes_to_reg[i]["box_image"]
logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s") logging.info(f"__ocr recognize {len(bxs)} boxes cost {timer() - start}s")
bxs = [b for b in bxs if b["text"]] bxs = [b for b in bxs if b["text"]]
if self.mean_height[-1] == 0: if self.mean_height[pagenum-1] == 0:
self.mean_height[-1] = np.median([b["bottom"] - b["top"] self.mean_height[pagenum-1] = np.median([b["bottom"] - b["top"]
for b in bxs]) for b in bxs])
self.boxes.append(bxs) self.boxes.append(bxs)

View File

@ -77,6 +77,7 @@ class FulltextQueryer:
" ", " ",
rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())), rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
).strip() ).strip()
otxt = txt
txt = FulltextQueryer.rmWWW(txt) txt = FulltextQueryer.rmWWW(txt)
if not self.isChinese(txt): if not self.isChinese(txt):
@ -196,6 +197,8 @@ class FulltextQueryer:
if qs: if qs:
query = " OR ".join([f"({t})" for t in qs if t]) query = " OR ".join([f"({t})" for t in qs if t])
if not query:
query = otxt
return MatchTextExpr( return MatchTextExpr(
self.query_fields, query, 100, {"minimum_should_match": min_match} self.query_fields, query, 100, {"minimum_should_match": min_match}
), keywords ), keywords