From 2290c2a2f0fc691900674451e2d4410da85c8956 Mon Sep 17 00:00:00 2001 From: H <43509927+guoyuhao2330@users.noreply.github.com> Date: Thu, 11 Jul 2024 14:37:55 +0800 Subject: [PATCH] fix pdf_parser char content confusion (#1462) ### What problem does this PR solve? #1407 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index fce9e91de..c7aaa9d58 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -287,14 +287,15 @@ class RAGFlowPdfParser: ) # solve char content confusion - record_error_length = 0 + record_error_length, ct = 0, 1 for c in chars[0:128]: ii = Recognizer.find_overlapped(c, bxs) if ii is None: continue - record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["bottom"] - c["bottom"] - c["top"]) / 2) + record_error_length += abs((bxs[ii]["bottom"] + bxs[ii]["top"] - c["bottom"] - c["top"]) / 2) + ct += 1 - record_error_length = record_error_length / 128 + record_error_length = record_error_length / ct for char in chars: char["top"] -= record_error_length char["bottom"] -= record_error_length