mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-13 23:45:54 +08:00
fix document extractor node incorrectly processing doc and ppt files (#12902)
This commit is contained in:
parent
6529240da6
commit
2b86465d4c
@ -15,7 +15,7 @@ AUDIO_EXTENSIONS.extend([ext.upper() for ext in AUDIO_EXTENSIONS])
|
|||||||
|
|
||||||
if dify_config.ETL_TYPE == "Unstructured":
|
if dify_config.ETL_TYPE == "Unstructured":
|
||||||
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls"]
|
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "mdx", "pdf", "html", "htm", "xlsx", "xls"]
|
||||||
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
|
DOCUMENT_EXTENSIONS.extend(("doc", "docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
|
||||||
if dify_config.UNSTRUCTURED_API_URL:
|
if dify_config.UNSTRUCTURED_API_URL:
|
||||||
DOCUMENT_EXTENSIONS.append("ppt")
|
DOCUMENT_EXTENSIONS.append("ppt")
|
||||||
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
|
||||||
|
@ -107,8 +107,10 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
|
|||||||
return _extract_text_from_plain_text(file_content)
|
return _extract_text_from_plain_text(file_content)
|
||||||
case "application/pdf":
|
case "application/pdf":
|
||||||
return _extract_text_from_pdf(file_content)
|
return _extract_text_from_pdf(file_content)
|
||||||
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword":
|
case "application/msword":
|
||||||
return _extract_text_from_doc(file_content)
|
return _extract_text_from_doc(file_content)
|
||||||
|
case "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
|
||||||
|
return _extract_text_from_docx(file_content)
|
||||||
case "text/csv":
|
case "text/csv":
|
||||||
return _extract_text_from_csv(file_content)
|
return _extract_text_from_csv(file_content)
|
||||||
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
|
case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
|
||||||
@ -142,8 +144,10 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|||||||
return _extract_text_from_yaml(file_content)
|
return _extract_text_from_yaml(file_content)
|
||||||
case ".pdf":
|
case ".pdf":
|
||||||
return _extract_text_from_pdf(file_content)
|
return _extract_text_from_pdf(file_content)
|
||||||
case ".doc" | ".docx":
|
case ".doc":
|
||||||
return _extract_text_from_doc(file_content)
|
return _extract_text_from_doc(file_content)
|
||||||
|
case ".docx":
|
||||||
|
return _extract_text_from_docx(file_content)
|
||||||
case ".csv":
|
case ".csv":
|
||||||
return _extract_text_from_csv(file_content)
|
return _extract_text_from_csv(file_content)
|
||||||
case ".xls" | ".xlsx":
|
case ".xls" | ".xlsx":
|
||||||
@ -203,7 +207,33 @@ def _extract_text_from_pdf(file_content: bytes) -> str:
|
|||||||
|
|
||||||
def _extract_text_from_doc(file_content: bytes) -> str:
|
def _extract_text_from_doc(file_content: bytes) -> str:
|
||||||
"""
|
"""
|
||||||
Extract text from a DOC/DOCX file.
|
Extract text from a DOC file.
|
||||||
|
"""
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
|
|
||||||
|
if not (dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY):
|
||||||
|
raise TextExtractionError("UNSTRUCTURED_API_URL and UNSTRUCTURED_API_KEY must be set")
|
||||||
|
|
||||||
|
try:
|
||||||
|
with tempfile.NamedTemporaryFile(suffix=".doc", delete=False) as temp_file:
|
||||||
|
temp_file.write(file_content)
|
||||||
|
temp_file.flush()
|
||||||
|
with open(temp_file.name, "rb") as file:
|
||||||
|
elements = partition_via_api(
|
||||||
|
file=file,
|
||||||
|
metadata_filename=temp_file.name,
|
||||||
|
api_url=dify_config.UNSTRUCTURED_API_URL,
|
||||||
|
api_key=dify_config.UNSTRUCTURED_API_KEY,
|
||||||
|
)
|
||||||
|
os.unlink(temp_file.name)
|
||||||
|
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||||
|
except Exception as e:
|
||||||
|
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_from_docx(file_content: bytes) -> str:
|
||||||
|
"""
|
||||||
|
Extract text from a DOCX file.
|
||||||
For now support only paragraph and table add more if needed
|
For now support only paragraph and table add more if needed
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
@ -255,13 +285,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
|
|||||||
|
|
||||||
text.append(markdown_table)
|
text.append(markdown_table)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
|
logger.warning(f"Failed to extract table from DOC: {e}")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return "\n".join(text)
|
return "\n".join(text)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e
|
raise TextExtractionError(f"Failed to extract text from DOCX: {str(e)}") from e
|
||||||
|
|
||||||
|
|
||||||
def _download_file_content(file: File) -> bytes:
|
def _download_file_content(file: File) -> bytes:
|
||||||
@ -329,14 +359,29 @@ def _extract_text_from_excel(file_content: bytes) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _extract_text_from_ppt(file_content: bytes) -> str:
|
def _extract_text_from_ppt(file_content: bytes) -> str:
|
||||||
|
from unstructured.partition.api import partition_via_api
|
||||||
from unstructured.partition.ppt import partition_ppt
|
from unstructured.partition.ppt import partition_ppt
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
|
||||||
|
with tempfile.NamedTemporaryFile(suffix=".ppt", delete=False) as temp_file:
|
||||||
|
temp_file.write(file_content)
|
||||||
|
temp_file.flush()
|
||||||
|
with open(temp_file.name, "rb") as file:
|
||||||
|
elements = partition_via_api(
|
||||||
|
file=file,
|
||||||
|
metadata_filename=temp_file.name,
|
||||||
|
api_url=dify_config.UNSTRUCTURED_API_URL,
|
||||||
|
api_key=dify_config.UNSTRUCTURED_API_KEY,
|
||||||
|
)
|
||||||
|
os.unlink(temp_file.name)
|
||||||
|
else:
|
||||||
with io.BytesIO(file_content) as file:
|
with io.BytesIO(file_content) as file:
|
||||||
elements = partition_ppt(file=file)
|
elements = partition_ppt(file=file)
|
||||||
return "\n".join([getattr(element, "text", "") for element in elements])
|
return "\n".join([getattr(element, "text", "") for element in elements])
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise TextExtractionError(f"Failed to extract text from PPT: {str(e)}") from e
|
raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e
|
||||||
|
|
||||||
|
|
||||||
def _extract_text_from_pptx(file_content: bytes) -> str:
|
def _extract_text_from_pptx(file_content: bytes) -> str:
|
||||||
|
@ -8,7 +8,7 @@ from core.variables.variables import StringVariable
|
|||||||
from core.workflow.entities.node_entities import NodeRunResult
|
from core.workflow.entities.node_entities import NodeRunResult
|
||||||
from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData
|
from core.workflow.nodes.document_extractor import DocumentExtractorNode, DocumentExtractorNodeData
|
||||||
from core.workflow.nodes.document_extractor.node import (
|
from core.workflow.nodes.document_extractor.node import (
|
||||||
_extract_text_from_doc,
|
_extract_text_from_docx,
|
||||||
_extract_text_from_pdf,
|
_extract_text_from_pdf,
|
||||||
_extract_text_from_plain_text,
|
_extract_text_from_plain_text,
|
||||||
)
|
)
|
||||||
@ -120,7 +120,7 @@ def test_run_extract_text(
|
|||||||
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract)
|
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_pdf", mock_pdf_extract)
|
||||||
elif mime_type.startswith("application/vnd.openxmlformats"):
|
elif mime_type.startswith("application/vnd.openxmlformats"):
|
||||||
mock_docx_extract = Mock(return_value=expected_text[0])
|
mock_docx_extract = Mock(return_value=expected_text[0])
|
||||||
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_doc", mock_docx_extract)
|
monkeypatch.setattr("core.workflow.nodes.document_extractor.node._extract_text_from_docx", mock_docx_extract)
|
||||||
|
|
||||||
result = document_extractor_node._run()
|
result = document_extractor_node._run()
|
||||||
|
|
||||||
@ -163,14 +163,14 @@ def test_extract_text_from_pdf(mock_pdf_document):
|
|||||||
|
|
||||||
|
|
||||||
@patch("docx.Document")
|
@patch("docx.Document")
|
||||||
def test_extract_text_from_doc(mock_document):
|
def test_extract_text_from_docx(mock_document):
|
||||||
mock_paragraph1 = Mock()
|
mock_paragraph1 = Mock()
|
||||||
mock_paragraph1.text = "Paragraph 1"
|
mock_paragraph1.text = "Paragraph 1"
|
||||||
mock_paragraph2 = Mock()
|
mock_paragraph2 = Mock()
|
||||||
mock_paragraph2.text = "Paragraph 2"
|
mock_paragraph2.text = "Paragraph 2"
|
||||||
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
|
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
|
||||||
|
|
||||||
text = _extract_text_from_doc(b"PK\x03\x04")
|
text = _extract_text_from_docx(b"PK\x03\x04")
|
||||||
assert text == "Paragraph 1\nParagraph 2"
|
assert text == "Paragraph 1\nParagraph 2"
|
||||||
|
|
||||||
|
|
||||||
|
@ -52,7 +52,7 @@ export const getInputVars = (text: string): ValueSelector[] => {
|
|||||||
|
|
||||||
export const FILE_EXTS: Record<string, string[]> = {
|
export const FILE_EXTS: Record<string, string[]> = {
|
||||||
[SupportUploadFileTypes.image]: ['JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG'],
|
[SupportUploadFileTypes.image]: ['JPG', 'JPEG', 'PNG', 'GIF', 'WEBP', 'SVG'],
|
||||||
[SupportUploadFileTypes.document]: ['TXT', 'MD', 'MDX', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB'],
|
[SupportUploadFileTypes.document]: ['TXT', 'MD', 'MDX', 'MARKDOWN', 'PDF', 'HTML', 'XLSX', 'XLS', 'DOC', 'DOCX', 'CSV', 'EML', 'MSG', 'PPTX', 'PPT', 'XML', 'EPUB'],
|
||||||
[SupportUploadFileTypes.audio]: ['MP3', 'M4A', 'WAV', 'WEBM', 'AMR', 'MPGA'],
|
[SupportUploadFileTypes.audio]: ['MP3', 'M4A', 'WAV', 'WEBM', 'AMR', 'MPGA'],
|
||||||
[SupportUploadFileTypes.video]: ['MP4', 'MOV', 'MPEG', 'MPGA'],
|
[SupportUploadFileTypes.video]: ['MP4', 'MOV', 'MPEG', 'MPGA'],
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user