mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 03:35:52 +08:00
remove PyMuPDF (#618)
### What problem does this PR solve?

#613

### Type of change

- [x] Other (please describe):
This commit is contained in:
parent
7059ec2298
commit
cab274f560
@ -19,7 +19,7 @@ import os
|
|||||||
import re
|
import re
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
import fitz
|
import pdfplumber
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from cachetools import LRUCache, cached
|
from cachetools import LRUCache, cached
|
||||||
from ruamel.yaml import YAML
|
from ruamel.yaml import YAML
|
||||||
@ -172,11 +172,9 @@ def filename_type(filename):
|
|||||||
def thumbnail(filename, blob):
|
def thumbnail(filename, blob):
|
||||||
filename = filename.lower()
|
filename = filename.lower()
|
||||||
if re.match(r".*\.pdf$", filename):
|
if re.match(r".*\.pdf$", filename):
|
||||||
pdf = fitz.open(stream=blob, filetype="pdf")
|
pdf = pdfplumber.open(BytesIO(blob))
|
||||||
pix = pdf[0].get_pixmap(matrix=fitz.Matrix(0.03, 0.03))
|
|
||||||
buffered = BytesIO()
|
buffered = BytesIO()
|
||||||
Image.frombytes("RGB", [pix.width, pix.height],
|
pdf.pages[0].to_image().annotated.save(buffered, format="png")
|
||||||
pix.samples).save(buffered, format="png")
|
|
||||||
return "data:image/png;base64," + \
|
return "data:image/png;base64," + \
|
||||||
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
base64.b64encode(buffered.getvalue()).decode("utf-8")
|
||||||
|
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
import fitz
|
|
||||||
import xgboost as xgb
|
import xgboost as xgb
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
import torch
|
import torch
|
||||||
@ -922,9 +921,7 @@ class RAGFlowPdfParser:
|
|||||||
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
fnm) if not binary else pdfplumber.open(BytesIO(binary))
|
||||||
return len(pdf.pages)
|
return len(pdf.pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
pdf = fitz.open(fnm) if not binary else fitz.open(
|
logging.error(str(e))
|
||||||
stream=fnm, filetype="pdf")
|
|
||||||
return len(pdf)
|
|
||||||
|
|
||||||
def __images__(self, fnm, zoomin=3, page_from=0,
|
def __images__(self, fnm, zoomin=3, page_from=0,
|
||||||
page_to=299, callback=None):
|
page_to=299, callback=None):
|
||||||
@ -946,23 +943,7 @@ class RAGFlowPdfParser:
|
|||||||
self.pdf.pages[page_from:page_to]]
|
self.pdf.pages[page_from:page_to]]
|
||||||
self.total_page = len(self.pdf.pages)
|
self.total_page = len(self.pdf.pages)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.pdf = fitz.open(fnm) if isinstance(
|
logging.error(str(e))
|
||||||
fnm, str) else fitz.open(
|
|
||||||
stream=fnm, filetype="pdf")
|
|
||||||
self.page_images = []
|
|
||||||
self.page_chars = []
|
|
||||||
mat = fitz.Matrix(zoomin, zoomin)
|
|
||||||
self.total_page = len(self.pdf)
|
|
||||||
for i, page in enumerate(self.pdf):
|
|
||||||
if i < page_from:
|
|
||||||
continue
|
|
||||||
if i >= page_to:
|
|
||||||
break
|
|
||||||
pix = page.get_pixmap(matrix=mat)
|
|
||||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
|
||||||
pix.samples)
|
|
||||||
self.page_images.append(img)
|
|
||||||
self.page_chars.append([])
|
|
||||||
|
|
||||||
self.outlines = []
|
self.outlines = []
|
||||||
try:
|
try:
|
||||||
|
@ -1,12 +1,13 @@
|
|||||||
|
import pdfplumber
|
||||||
|
|
||||||
from .ocr import OCR
|
from .ocr import OCR
|
||||||
from .recognizer import Recognizer
|
from .recognizer import Recognizer
|
||||||
from .layout_recognizer import LayoutRecognizer
|
from .layout_recognizer import LayoutRecognizer
|
||||||
from .table_structure_recognizer import TableStructureRecognizer
|
from .table_structure_recognizer import TableStructureRecognizer
|
||||||
|
|
||||||
|
|
||||||
def init_in_out(args):
|
def init_in_out(args):
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
import fitz
|
|
||||||
import os
|
import os
|
||||||
import traceback
|
import traceback
|
||||||
from api.utils.file_utils import traversal_files
|
from api.utils.file_utils import traversal_files
|
||||||
@ -18,13 +19,11 @@ def init_in_out(args):
|
|||||||
|
|
||||||
def pdf_pages(fnm, zoomin=3):
|
def pdf_pages(fnm, zoomin=3):
|
||||||
nonlocal outputs, images
|
nonlocal outputs, images
|
||||||
pdf = fitz.open(fnm)
|
pdf = pdfplumber.open(fnm)
|
||||||
mat = fitz.Matrix(zoomin, zoomin)
|
images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||||
for i, page in enumerate(pdf):
|
enumerate(pdf.pages)]
|
||||||
pix = page.get_pixmap(matrix=mat)
|
|
||||||
img = Image.frombytes("RGB", [pix.width, pix.height],
|
for i, page in enumerate(images):
|
||||||
pix.samples)
|
|
||||||
images.append(img)
|
|
||||||
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
outputs.append(os.path.split(fnm)[-1] + f"_{i}.jpg")
|
||||||
|
|
||||||
def images_and_outputs(fnm):
|
def images_and_outputs(fnm):
|
||||||
|
@ -35,7 +35,7 @@ class RAGFlowMinio(object):
|
|||||||
self.conn = None
|
self.conn = None
|
||||||
|
|
||||||
def put(self, bucket, fnm, binary):
|
def put(self, bucket, fnm, binary):
|
||||||
for _ in range(10):
|
for _ in range(3):
|
||||||
try:
|
try:
|
||||||
if not self.conn.bucket_exists(bucket):
|
if not self.conn.bucket_exists(bucket):
|
||||||
self.conn.make_bucket(bucket)
|
self.conn.make_bucket(bucket)
|
||||||
|
@ -91,8 +91,6 @@ pycryptodomex==3.20.0
|
|||||||
pydantic==2.6.2
|
pydantic==2.6.2
|
||||||
pydantic_core==2.16.3
|
pydantic_core==2.16.3
|
||||||
PyJWT==2.8.0
|
PyJWT==2.8.0
|
||||||
PyMuPDF==1.23.25
|
|
||||||
PyMuPDFb==1.23.22
|
|
||||||
PyMySQL==1.1.0
|
PyMySQL==1.1.0
|
||||||
PyPDF2==3.0.1
|
PyPDF2==3.0.1
|
||||||
pypdfium2==4.27.0
|
pypdfium2==4.27.0
|
||||||
|
Loading…
x
Reference in New Issue
Block a user