mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 17:49:01 +08:00
Refa: make text ordering more robust. (#7525)
### What problem does this PR solve?

### Type of change

- [x] Refactoring
This commit is contained in:
parent
3827c47515
commit
9d3dd13fef
@ -307,13 +307,13 @@ class RAGFlowPdfParser:
|
|||||||
[{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
|
[{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
|
||||||
"top": b[0][1] / ZM, "text": "", "txt": t,
|
"top": b[0][1] / ZM, "text": "", "txt": t,
|
||||||
"bottom": b[-1][1] / ZM,
|
"bottom": b[-1][1] / ZM,
|
||||||
|
"chars": [],
|
||||||
"page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
|
"page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
|
||||||
self.mean_height[-1] / 3
|
self.mean_height[-1] / 3
|
||||||
)
|
)
|
||||||
|
|
||||||
# merge chars in the same rect
|
# merge chars in the same rect
|
||||||
for c in Recognizer.sort_Y_firstly(
|
for c in chars:
|
||||||
chars, self.mean_height[pagenum - 1] // 4):
|
|
||||||
ii = Recognizer.find_overlapped(c, bxs)
|
ii = Recognizer.find_overlapped(c, bxs)
|
||||||
if ii is None:
|
if ii is None:
|
||||||
self.lefted_chars.append(c)
|
self.lefted_chars.append(c)
|
||||||
@ -323,11 +323,20 @@ class RAGFlowPdfParser:
|
|||||||
if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
|
if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
|
||||||
self.lefted_chars.append(c)
|
self.lefted_chars.append(c)
|
||||||
continue
|
continue
|
||||||
if c["text"] == " " and bxs[ii]["text"]:
|
bxs[ii]["chars"].append(c)
|
||||||
if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", bxs[ii]["text"][-1]):
|
|
||||||
bxs[ii]["text"] += " "
|
for b in bxs:
|
||||||
|
if not b["chars"]:
|
||||||
|
del b["chars"]
|
||||||
|
continue
|
||||||
|
m_ht = np.mean([c["height"] for c in b["chars"]])
|
||||||
|
for c in Recognizer.sort_Y_firstly(b["chars"], m_ht):
|
||||||
|
if c["text"] == " " and b["text"]:
|
||||||
|
if re.match(r"[0-9a-zA-Zа-яА-Я,.?;:!%%]", b["text"][-1]):
|
||||||
|
b["text"] += " "
|
||||||
else:
|
else:
|
||||||
bxs[ii]["text"] += c["text"]
|
b["text"] += c["text"]
|
||||||
|
del b["chars"]
|
||||||
|
|
||||||
logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
|
logging.info(f"__ocr sorting {len(chars)} chars cost {timer() - start}s")
|
||||||
start = timer()
|
start = timer()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user