import logging from typing import List, Optional from langchain.document_loaders import PyPDFium2Loader from langchain.document_loaders.base import BaseLoader from langchain.schema import Document from extensions.ext_storage import storage from models.model import UploadFile logger = logging.getLogger(__name__) class PdfLoader(BaseLoader): """Load pdf files. Args: file_path: Path to the file to load. """ def __init__( self, file_path: str, upload_file: Optional[UploadFile] = None ): """Initialize with file path.""" self._file_path = file_path self._upload_file = upload_file def load(self) -> List[Document]: plaintext_file_key = '' plaintext_file_exists = False if self._upload_file: if self._upload_file.hash: plaintext_file_key = 'upload_files/' + self._upload_file.tenant_id + '/' \ + self._upload_file.hash + '.0625.plaintext' try: text = storage.load(plaintext_file_key).decode('utf-8') plaintext_file_exists = True return [Document(page_content=text)] except FileNotFoundError: pass documents = PyPDFium2Loader(file_path=self._file_path).load() text_list = [] for document in documents: text_list.append(document.page_content) text = "\n\n".join(text_list) # save plaintext file for caching if not plaintext_file_exists and plaintext_file_key: storage.save(plaintext_file_key, text.encode('utf-8')) return documents