From 3bbdf3b7704af74decfd13ed94cf643193cd42de Mon Sep 17 00:00:00 2001 From: xinzhuang Date: Thu, 23 May 2024 14:29:42 +0800 Subject: [PATCH] fixbug for computing 'not concating feature' (#896) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? When the PDF parser calls the `_naive_vertical_merge` method, one of the "not concating" features is meant to be computed by comparing the `layoutno` of `b` with the `layoutno` of `b_`, but the code actually compares `b.get("layoutno", 0)` with itself, so the check can never detect a layout change. I think this is a bug, so this PR fixes it. Please check again. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/pdf_parser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index b95fd2f74..5068eed62 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -396,7 +396,7 @@ class RAGFlowPdfParser: ] # features for not concating feats = [ - b.get("layoutno", 0) != b.get("layoutno", 0), + b.get("layoutno", 0) != b_.get("layoutno", 0), b["text"].strip()[-1] in "。?!?", self.is_english and b["text"].strip()[-1] in ".!?", b["page_number"] == b_["page_number"] and b_["top"] -