mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-13 00:28:59 +08:00
remove PyMuPDF (#618)
### What problem does this PR solve?

#613

### Type of change

- [x] Other (please describe):
This commit is contained in:
parent
7059ec2298
commit
cab274f560
@ -19,7 +19,7 @@ import os
|
||||
import re
|
||||
from io import BytesIO
|
||||
|
||||
import fitz
|
||||
import pdfplumber
|
||||
from PIL import Image
|
||||
from cachetools import LRUCache, cached
|
||||
from ruamel.yaml import YAML
|
||||
@ -172,11 +172,9 @@ def filename_type(filename):
|
||||
def thumbnail(filename, blob):
|
||||
filename = filename.lower()
|
||||
if re.match(r".*\.pdf$", filename):
|
||||
pdf = fitz.open(stream=blob, filetype="pdf")
|
||||
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
|
||||
pdf = pdfplumber.open(BytesIO(blob))
|
||||
buffered = BytesIO()
|
||||
Image.frombytes("RGB", [pix.width, pix.height],
|
||||
pix.samples).save(buffered, format="png")
|
||||
pdf.pages[0].to_image().annotated.save(buffered, format="png")
|
||||
return "data:image/png;base64," + \
|
||||
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||
|
||||
|
@ -2,7 +2,6 @@
|
||||
import os
|
||||
import random
|
||||
|
||||
import fitz
|
||||
import xgboost as xgb
|
||||
from io import BytesIO
|
||||
import torch
|
||||
@ -922,9 +921,7 @@ class RAGFlowPdfParser:
|
||||
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
||||
return len(pdf.pages)
|
||||
except Exception as e:
|
||||
pdf = fitz.open(fnm) if not binary else fitz.open(
|
||||
stream=fnm, filetype="pdf")
|
||||
return len(pdf)
|
||||
logging.error(str(e))
|
||||
|
||||
def __images__(self, fnm, zoomin=3, page_from=0,
|
||||
page_to=299, callback=None):
|
||||
@ -946,23 +943,7 @@ class RAGFlowPdfParser:
|
||||
self.pdf.pages[page_from:page_to]]
|
||||
self.total_page = len(self.pdf.pages)
|
||||
except Exception as e:
|
||||
self.pdf = fitz.open(fnm) if isinstance(
|
||||
fnm, str) else fitz.open(
|
||||
stream=fnm, filetype="pdf")
|
||||
self.page_images = []
|
||||
self.page_chars = []
|
||||
mat = fitz.Matrix(zoomin, zoomin)
|
||||
self.total_page = len(self.pdf)
|
||||
for i, page in enumerate(self.pdf):
|
||||
if i < page_from:
|
||||
continue
|
||||
if i >= page_to:
|
||||
break
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
||||
pix.samples)
|
||||
self.page_images.append(img)
|
||||
self.page_chars.append([])
|
||||
logging.error(str(e))
|
||||
|
||||
self.outlines = []
|
||||
try:
|
||||
|
@ -1,12 +1,13 @@
|
||||
import pdfplumber
|
||||
|
||||
from .ocr import OCR
|
||||
from .recognizer import Recognizer
|
||||
from .layout_recognizer import LayoutRecognizer
|
||||
from .table_structure_recognizer import TableStructureRecognizer
|
||||
|
||||
|
||||
def init_in_out(args):
|
||||
from PIL import Image
|
||||
import fitz
|
||||
import os
|
||||
import traceback
|
||||
from api.utils.file_utils import traversal_files
|
||||
@ -18,13 +19,11 @@ def init_in_out(args):
|
||||
|
||||
def pdf_pages(fnm, zoomin=3):
|
||||
nonlocal outputs, images
|
||||
pdf = fitz.open(fnm)
|
||||
mat = fitz.Matrix(zoomin, zoomin)
|
||||
for i, page in enumerate(pdf):
|
||||
pix = page.get_pixmap(matrix=mat)
|
||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
||||
pix.samples)
|
||||
images.append(img)
|
||||
pdf = pdfplumber.open(fnm)
|
||||
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||
enumerate(pdf.pages)]
|
||||
|
||||
for i, page in enumerate(images):
|
||||
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
||||
|
||||
def images_and_outputs(fnm):
|
||||
|
@ -35,7 +35,7 @@ class RAGFlowMinio(object):
|
||||
self.conn = None
|
||||
|
||||
def put(self, bucket, fnm, binary):
|
||||
for _ in range(10):
|
||||
for _ in range(3):
|
||||
try:
|
||||
if not self.conn.bucket_exists(bucket):
|
||||
self.conn.make_bucket(bucket)
|
||||
|
@ -91,8 +91,6 @@ pycryptodomex==3.20.0
|
||||
pydantic==2.6.2
|
||||
pydantic_core==2.16.3
|
||||
PyJWT==2.8.0
|
||||
PyMuPDF==1.23.25
|
||||
PyMuPDFb==1.23.22
|
||||
PyMySQL==1.1.0
|
||||
PyPDF2==3.0.1
|
||||
pypdfium2==4.27.0
|
||||
|
Loading…
x
Reference in New Issue
Block a user