From ea5e8caa696faf5a12f25f6a4e93c9c3697a05f3 Mon Sep 17 00:00:00 2001
From: liuzhenghua <1090179900@qq.com>
Date: Mon, 12 May 2025 09:50:21 +0800
Subject: [PATCH] feat: Enable antialiasing for PDF image extraction to improve OCR accuracy (#7562)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

### What problem does this PR solve?

When the PDF uses vector fonts, the rendered text in the captured page image often has missing strokes, leading to numerous OCR errors and incorrect characters. Similar issues also occur in the extracted chart images.

**Before**
![0089e1f76205b5b3](https://github.com/user-attachments/assets/a84f8cd7-48ae-4da4-81ca-fc0bd93320f1)

**After**
![03053149e919773a](https://github.com/user-attachments/assets/45fa5ebb-a2de-42b1-9535-1ea087877eb2)

You can use the following document for testing.
[Casio说明书.pdf](https://github.com/user-attachments/files/20119690/Casio.pdf)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

Co-authored-by: liuzhenghua-jk
---
 deepdoc/parser/pdf_parser.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 56ab04f93..492c4dc54 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -1015,7 +1015,7 @@ class RAGFlowPdfParser:
             with sys.modules[LOCK_KEY_pdfplumber]:
                 with (pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))) as pdf:
                     self.pdf = pdf
-                    self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
+                    self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).annotated for i, p in
                                         enumerate(self.pdf.pages[page_from:page_to])]
 
                     try: