remove PyMuPDF (#618)

### What problem does this PR solve?
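Addresses #613 by replacing PyMuPDF (`fitz`) with pdfplumber for PDF page rendering and removing the PyMuPDF/PyMuPDFb pins from the requirements.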

### Type of change


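- [x] Other (please describe): dependency removal — PyMuPDF (`fitz`) is replaced by pdfplumber; the MinIO `put` retry count is also lowered from 10 to 3.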
This commit is contained in:
KevinHuSh 2024-04-30 12:38:09 +08:00 committed by GitHub
parent 7059ec2298
commit cab274f560
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 13 additions and 37 deletions

View File

@ -19,7 +19,7 @@ import os
import re import re
from io import BytesIO from io import BytesIO
import fitz import pdfplumber
from PIL import Image from PIL import Image
from cachetools import LRUCache, cached from cachetools import LRUCache, cached
from ruamel.yaml import YAML from ruamel.yaml import YAML
@ -172,11 +172,9 @@ def filename_type(filename):
def thumbnail(filename, blob): def thumbnail(filename, blob):
filename = filename.lower() filename = filename.lower()
if re.match(r".*\.pdf$", filename): if re.match(r".*\.pdf$", filename):
pdf = fitz.open(stream=blob, filetype="pdf") pdf = pdfplumber.open(BytesIO(blob))
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
buffered = BytesIO() buffered = BytesIO()
Image.frombytes("RGB", [pix.width, pix.height], pdf.pages[0].to_image().annotated.save(buffered, format="png")
pix.samples).save(buffered, format="png")
return "data:image/png;base64," + \ return "data:image/png;base64," + \
base64.b64encode(buffered.getvalue()).decode("utf-8") base64.b64encode(buffered.getvalue()).decode("utf-8")

View File

@ -2,7 +2,6 @@
import os import os
import random import random
import fitz
import xgboost as xgb import xgboost as xgb
from io import BytesIO from io import BytesIO
import torch import torch
@ -922,9 +921,7 @@ class RAGFlowPdfParser:
fnm) if not binary else pdfplumber.open(BytesIO(binary)) fnm) if not binary else pdfplumber.open(BytesIO(binary))
return len(pdf.pages) return len(pdf.pages)
except Exception as e: except Exception as e:
pdf = fitz.open(fnm) if not binary else fitz.open( logging.error(str(e))
stream=fnm, filetype="pdf")
return len(pdf)
def __images__(self, fnm, zoomin=3, page_from=0, def __images__(self, fnm, zoomin=3, page_from=0,
page_to=299, callback=None): page_to=299, callback=None):
@ -946,23 +943,7 @@ class RAGFlowPdfParser:
self.pdf.pages[page_from:page_to]] self.pdf.pages[page_from:page_to]]
self.total_page = len(self.pdf.pages) self.total_page = len(self.pdf.pages)
except Exception as e: except Exception as e:
self.pdf = fitz.open(fnm) if isinstance( logging.error(str(e))
fnm, str) else fitz.open(
stream=fnm, filetype="pdf")
self.page_images = []
self.page_chars = []
mat = fitz.Matrix(zoomin, zoomin)
self.total_page = len(self.pdf)
for i, page in enumerate(self.pdf):
if i < page_from:
continue
if i >= page_to:
break
pix = page.get_pixmap(matrix=mat)
img = Image.frombytes("RGB", [pix.width, pix.height],
pix.samples)
self.page_images.append(img)
self.page_chars.append([])
self.outlines = [] self.outlines = []
try: try:

View File

@ -1,12 +1,13 @@
import pdfplumber
from .ocr import OCR from .ocr import OCR
from .recognizer import Recognizer from .recognizer import Recognizer
from .layout_recognizer import LayoutRecognizer from .layout_recognizer import LayoutRecognizer
from .table_structure_recognizer import TableStructureRecognizer from .table_structure_recognizer import TableStructureRecognizer
def init_in_out(args): def init_in_out(args):
from PIL import Image from PIL import Image
import fitz
import os import os
import traceback import traceback
from api.utils.file_utils import traversal_files from api.utils.file_utils import traversal_files
@ -18,13 +19,11 @@ def init_in_out(args):
def pdf_pages(fnm, zoomin=3): def pdf_pages(fnm, zoomin=3):
nonlocal outputs, images nonlocal outputs, images
pdf = fitz.open(fnm) pdf = pdfplumber.open(fnm)
mat = fitz.Matrix(zoomin, zoomin) images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
for i, page in enumerate(pdf): enumerate(pdf.pages)]
pix = page.get_pixmap(matrix=mat)
img = Image.frombytes("RGB", [pix.width, pix.height], for i, page in enumerate(images):
pix.samples)
images.append(img)
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg") outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
def images_and_outputs(fnm): def images_and_outputs(fnm):

View File

@ -35,7 +35,7 @@ class RAGFlowMinio(object):
self.conn = None self.conn = None
def put(self, bucket, fnm, binary): def put(self, bucket, fnm, binary):
for _ in range(10): for _ in range(3):
try: try:
if not self.conn.bucket_exists(bucket): if not self.conn.bucket_exists(bucket):
self.conn.make_bucket(bucket) self.conn.make_bucket(bucket)

View File

@ -91,8 +91,6 @@ pycryptodomex==3.20.0
pydantic==2.6.2 pydantic==2.6.2
pydantic_core==2.16.3 pydantic_core==2.16.3
PyJWT==2.8.0 PyJWT==2.8.0
PyMuPDF==1.23.25
PyMuPDFb==1.23.22
PyMySQL==1.1.0 PyMySQL==1.1.0
PyPDF2==3.0.1 PyPDF2==3.0.1
pypdfium2==4.27.0 pypdfium2==4.27.0