diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 3973ce03a..361a88872 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -307,13 +307,13 @@ class RAGFlowPdfParser: [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM, "top": b[0][1] / ZM, "text": "", "txt": t, "bottom": b[-1][1] / ZM, + "chars": [], "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]], self.mean_height[-1] / 3 ) # merge chars in the same rect - for c in Recognizer.sort_Y_firstly( - chars, self.mean_height[pagenum - 1] // 4): + for c in chars: ii = Recognizer.find_overlapped(c, bxs) if ii is None: self.lefted_chars.append(c) @@ -323,11 +323,20 @@ class RAGFlowPdfParser: if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ': self.lefted_chars.append(c) continue - if c["text"] == " " and bxs[ii]["text"]: - if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]): - bxs[ii]["text"] += " " - else: - bxs[ii]["text"] += c["text"] + bxs[ii]["chars"].append(c) + + for b in bxs: + if not b["chars"]: + del b["chars"] + continue + m_ht = np.mean([c["height"] for c in b["chars"]]) + for c in Recognizer.sort_Y_firstly(b["chars"], m_ht): + if c["text"] == " " and b["text"]: + if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]): + b["text"] += " " + else: + b["text"] += c["text"] + del b["chars"] logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s") start = timer()