Fix pdf_parser char content confusion (#1462)

### What problem does this PR solve?

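Fixes #1407. When estimating the vertical offset between characters and their overlapping boxes, the old code added the box's `bottom` twice instead of `bottom + top`, and always divided the accumulated offset by 128 regardless of how many characters actually overlapped a box. This change uses `bottom + top` and divides by a running counter `ct` (initialised to 1 and incremented for each overlapping character).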

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
H 2024-07-11 14:37:55 +08:00 committed by GitHub
parent dbb8f7b77b
commit 2290c2a2f0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -287,14 +287,15 @@ class RAGFlowPdfParser:
) )
# solve char content confusion # solve char content confusion
record_error_length = 0 record_error_length, ct = 0, 1
for c in chars[0:128]: for c in chars[0:128]:
ii = Recognizer.find_overlapped(c, bxs) ii = Recognizer.find_overlapped(c, bxs)
if ii is None: if ii is None:
continue continue
record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["bottom"] - c["bottom"] - c["top"]) / 2) record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["top"] - c["bottom"] - c["top"]) / 2)
ct += 1
record_error_length = record_error_length / 128 record_error_length = record_error_length / ct
for char in chars: for char in chars:
char["top"] -= record_error_length char["top"] -= record_error_length
char["bottom"] -= record_error_length char["bottom"] -= record_error_length