mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-14 10:15:57 +08:00
feat:api Add support for extracting EPUB files in ExtractProcessor (#3254)
Co-authored-by: crazywoola <427733928@qq.com>
This commit is contained in:
parent
44448ba68d
commit
b00466f025
@ -16,6 +16,7 @@ from core.rag.extractor.pdf_extractor import PdfExtractor
|
|||||||
from core.rag.extractor.text_extractor import TextExtractor
|
from core.rag.extractor.text_extractor import TextExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
|
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
|
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
|
||||||
|
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
|
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
|
||||||
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
|
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
|
||||||
@ -106,6 +107,8 @@ class ExtractProcessor:
|
|||||||
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
|
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
|
||||||
elif file_extension == '.xml':
|
elif file_extension == '.xml':
|
||||||
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
|
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
|
||||||
|
elif file_extension == 'epub':
|
||||||
|
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
|
||||||
else:
|
else:
|
||||||
# txt
|
# txt
|
||||||
extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
|
extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
|
||||||
@ -123,6 +126,8 @@ class ExtractProcessor:
|
|||||||
extractor = WordExtractor(file_path)
|
extractor = WordExtractor(file_path)
|
||||||
elif file_extension == '.csv':
|
elif file_extension == '.csv':
|
||||||
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
||||||
|
elif file_extension == 'epub':
|
||||||
|
extractor = UnstructuredEpubExtractor(file_path)
|
||||||
else:
|
else:
|
||||||
# txt
|
# txt
|
||||||
extractor = TextExtractor(file_path, autodetect_encoding=True)
|
extractor = TextExtractor(file_path, autodetect_encoding=True)
|
||||||
|
@ -0,0 +1,37 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
|
from core.rag.models.document import Document
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class UnstructuredEpubExtractor(BaseExtractor):
    """Extract the text of an EPUB file as a list of documents.

    Args:
        file_path: Path to the EPUB file to load.
        api_url: Stored on the instance as ``_api_url``; ``extract`` does not
            read it and always partitions the file in-process.
    """

    def __init__(
        self,
        file_path: str,
        api_url: str = None,
    ):
        """Remember the file location and the optional API URL."""
        self._api_url = api_url
        self._file_path = file_path

    def extract(self) -> list[Document]:
        """Partition the EPUB, chunk it by title and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with leading and trailing whitespace stripped.
        """
        # unstructured is imported at call time rather than at module import.
        from unstructured.partition.epub import partition_epub
        from unstructured.chunking.title import chunk_by_title

        epub_elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
        title_chunks = chunk_by_title(
            epub_elements,
            max_characters=2000,
            combine_text_under_n_chars=2000,
        )
        return [Document(page_content=piece.text.strip()) for piece in title_chunks]
|
@ -65,7 +65,7 @@ qdrant-client==1.7.3
|
|||||||
cohere~=5.2.4
|
cohere~=5.2.4
|
||||||
pyyaml~=6.0.1
|
pyyaml~=6.0.1
|
||||||
numpy~=1.25.2
|
numpy~=1.25.2
|
||||||
unstructured[docx,pptx,msg,md,ppt]~=0.10.27
|
unstructured[docx,pptx,msg,md,ppt,epub]~=0.10.27
|
||||||
bs4~=0.0.1
|
bs4~=0.0.1
|
||||||
markdown~=3.5.1
|
markdown~=3.5.1
|
||||||
httpx[socks]~=0.24.1
|
httpx[socks]~=0.24.1
|
||||||
|
@ -22,7 +22,7 @@ IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
|
|||||||
|
|
||||||
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
|
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
|
||||||
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
|
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
|
||||||
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
|
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']
|
||||||
PREVIEW_WORDS_LIMIT = 3000
|
PREVIEW_WORDS_LIMIT = 3000
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user