From cab274f56089ceab161d7e834f2f13e55802498d Mon Sep 17 00:00:00 2001 From: KevinHuSh Date: Tue, 30 Apr 2024 12:38:09 +0800 Subject: [PATCH] remove PyMuPDF (#618) ### What problem does this PR solve? #613 ### Type of change - [x] Other (please describe): --- api/utils/file_utils.py | 8 +++----- deepdoc/parser/pdf_parser.py | 23 ++--------------------- deepdoc/vision/__init__.py | 15 +++++++-------- rag/utils/minio_conn.py | 2 +- requirements.txt | 2 -- 5 files changed, 13 insertions(+), 37 deletions(-) diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index e4c24e599..1b34d23fc 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -19,7 +19,7 @@ import os import re from io import BytesIO -import fitz +import pdfplumber from PIL import Image from cachetools import LRUCache, cached from ruamel.yaml import YAML @@ -172,11 +172,9 @@ def filename_type(filename): def thumbnail(filename, blob): filename = filename.lower() if re.match(r".*\.pdf$", filename): - pdf = fitz.open(stream=blob, filetype="pdf") - pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03)) + pdf = pdfplumber.open(BytesIO(blob)) buffered = BytesIO() - Image.frombytes("RGB", [pix.width, pix.height], - pix.samples).save(buffered, format="png") + pdf.pages[0].to_image().annotated.save(buffered, format="png") return "data:image/png;base64," + \ base64.b64encode(buffered.getvalue()).decode("utf-8") diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py index 8c0ac0045..763631e32 100644 --- a/deepdoc/parser/pdf_parser.py +++ b/deepdoc/parser/pdf_parser.py @@ -2,7 +2,6 @@ import os import random -import fitz import xgboost as xgb from io import BytesIO import torch @@ -922,9 +921,7 @@ class RAGFlowPdfParser: fnm) if not binary else pdfplumber.open(BytesIO(binary)) return len(pdf.pages) except Exception as e: - pdf = fitz.open(fnm) if not binary else fitz.open( - stream=fnm, filetype="pdf") - return len(pdf) + logging.error(str(e)) def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): @@ -946,23 +943,7 @@ class RAGFlowPdfParser: self.pdf.pages[page_from:page_to]] self.total_page = len(self.pdf.pages) except Exception as e: - self.pdf = fitz.open(fnm) if isinstance( - fnm, str) else fitz.open( - stream=fnm, filetype="pdf") - self.page_images = [] - self.page_chars = [] - mat = fitz.Matrix(zoomin, zoomin) - self.total_page = len(self.pdf) - for i, page in enumerate(self.pdf): - if i < page_from: - continue - if i >= page_to: - break - pix = page.get_pixmap(matrix=mat) - img = Image.frombytes("RGB", [pix.width, pix.height], - pix.samples) - self.page_images.append(img) - self.page_chars.append([]) + logging.error(str(e)) self.outlines = [] try: diff --git a/deepdoc/vision/__init__.py b/deepdoc/vision/__init__.py index 01bacbbdd..a312a547f 100644 --- a/deepdoc/vision/__init__.py +++ b/deepdoc/vision/__init__.py @@ -1,12 +1,13 @@ +import pdfplumber from .ocr import OCR from .recognizer import Recognizer from .layout_recognizer import LayoutRecognizer from .table_structure_recognizer import TableStructureRecognizer + def init_in_out(args): from PIL import Image - import fitz import os import traceback from api.utils.file_utils import traversal_files @@ -18,13 +19,11 @@ def init_in_out(args): def pdf_pages(fnm, zoomin=3): nonlocal outputs, images - pdf = fitz.open(fnm) - mat = fitz.Matrix(zoomin, zoomin) - for i, page in enumerate(pdf): - pix = page.get_pixmap(matrix=mat) - img = Image.frombytes("RGB", [pix.width, pix.height], - pix.samples) - images.append(img) + pdf = pdfplumber.open(fnm) + images = [p.to_image(resolution=72 * zoomin).annotated for i, p in + enumerate(pdf.pages)] + + for i, page in enumerate(images): outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") def images_and_outputs(fnm): diff --git a/rag/utils/minio_conn.py b/rag/utils/minio_conn.py index fbef33b5f..fa87ed3b0 100644 --- a/rag/utils/minio_conn.py +++ b/rag/utils/minio_conn.py @@ -35,7 +35,7 @@ class RAGFlowMinio(object): self.conn = None def put(self, bucket, fnm, binary): - for _ in range(10): + for _ in range(3): try: if not self.conn.bucket_exists(bucket): self.conn.make_bucket(bucket) diff --git a/requirements.txt b/requirements.txt index 419e2f6c5..03bf1af6c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -91,8 +91,6 @@ pycryptodomex==3.20.0 pydantic==2.6.2 pydantic_core==2.16.3 PyJWT==2.8.0 -PyMuPDF==1.23.25 -PyMuPDFb==1.23.22 PyMySQL==1.1.0 PyPDF2==3.0.1 pypdfium2==4.27.0