mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-07-10 00:31:48 +08:00
57 lines
1.9 KiB
Python
57 lines
1.9 KiB
Python
from pathlib import Path
|
|
from typing import Dict
|
|
|
|
from flask import current_app
|
|
from llama_index.readers.file.base_parser import BaseParser
|
|
from pypdf import PdfReader
|
|
|
|
from extensions.ext_storage import storage
|
|
from models.model import UploadFile
|
|
|
|
|
|
class PDFParser(BaseParser):
    """PDF parser.

    Extracts the text of a PDF with ``pypdf`` and, when the parser config
    carries an ``upload_file`` entry with a hash, caches the extracted text
    in ``storage`` so later parses of the same upload can skip extraction.
    """

    def _init_parser(self) -> Dict:
        """Init parser.

        Returns:
            An empty dict; this parser needs no configuration of its own.
        """
        return {}

    def _plaintext_cache_key(self) -> str:
        """Return the storage key for the cached plaintext, or '' if none applies.

        A key is only produced when the parser config holds a truthy
        ``upload_file`` entry whose ``hash`` attribute is truthy.
        """
        upload_file: UploadFile = (self._parser_config or {}).get('upload_file')
        if not upload_file or not upload_file.hash:
            return ''
        return f'upload_files/{upload_file.tenant_id}/{upload_file.hash}.plaintext'

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Parse file.

        Args:
            file: Path of the PDF to read; opened in binary mode.
            errors: Not used by this parser; kept so the signature stays
                compatible with existing callers.

        Returns:
            The text of every page joined with newlines. Returns '' without
            reading the file when the ``PDF_PREVIEW`` app config value is
            falsy (it is treated as True when unset).
        """
        if not current_app.config.get('PDF_PREVIEW', True):
            return ''

        plaintext_file_key = self._plaintext_cache_key()

        if plaintext_file_key:
            # Only the cache read is guarded: a missing cache entry simply
            # means the PDF has to be extracted below.
            try:
                return storage.load(plaintext_file_key).decode('utf-8')
            except FileNotFoundError:
                pass

        with open(file, "rb") as fp:
            pdf = PdfReader(fp)
            # Extract while the file handle is still open.
            text = "\n".join(page.extract_text() for page in pdf.pages)

        # Reaching this point with a key means the cache lookup missed, so
        # store the freshly extracted text for the next parse.
        if plaintext_file_key:
            storage.save(plaintext_file_key, text.encode('utf-8'))

        return text
|