mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-14 17:15:54 +08:00
feat: add YAML type in document extractor node (#9997)
This commit is contained in:
parent
c647e4307a
commit
0095896051
@ -5,6 +5,7 @@ import json
|
|||||||
import docx
|
import docx
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypdfium2
|
import pypdfium2
|
||||||
|
import yaml
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
from unstructured.partition.epub import partition_epub
|
from unstructured.partition.epub import partition_epub
|
||||||
from unstructured.partition.msg import partition_msg
|
from unstructured.partition.msg import partition_msg
|
||||||
@ -101,6 +102,8 @@ def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
|
|||||||
return _extract_text_from_msg(file_content)
|
return _extract_text_from_msg(file_content)
|
||||||
case "application/json":
|
case "application/json":
|
||||||
return _extract_text_from_json(file_content)
|
return _extract_text_from_json(file_content)
|
||||||
|
case "application/x-yaml" | "text/yaml":
|
||||||
|
return _extract_text_from_yaml(file_content)
|
||||||
case _:
|
case _:
|
||||||
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
|
||||||
|
|
||||||
@ -112,6 +115,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
|
|||||||
return _extract_text_from_plain_text(file_content)
|
return _extract_text_from_plain_text(file_content)
|
||||||
case ".json":
|
case ".json":
|
||||||
return _extract_text_from_json(file_content)
|
return _extract_text_from_json(file_content)
|
||||||
|
case ".yaml" | ".yml":
|
||||||
|
return _extract_text_from_yaml(file_content)
|
||||||
case ".pdf":
|
case ".pdf":
|
||||||
return _extract_text_from_pdf(file_content)
|
return _extract_text_from_pdf(file_content)
|
||||||
case ".doc" | ".docx":
|
case ".doc" | ".docx":
|
||||||
@ -149,6 +154,15 @@ def _extract_text_from_json(file_content: bytes) -> str:
|
|||||||
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_text_from_yaml(file_content: bytes) -> str:
    """Return the YAML payload of a file re-serialized as text.

    The raw bytes are decoded as UTF-8, every YAML document in the stream is
    parsed with the safe loader, and the documents are dumped back out as a
    single string. Key order is kept as written (``sort_keys=False``) and
    non-ASCII characters are emitted unescaped (``allow_unicode=True``).

    Raises:
        TextExtractionError: if the bytes are not valid UTF-8 or the content
            is not valid YAML.
    """
    try:
        decoded = file_content.decode("utf-8")
        documents = yaml.safe_load_all(decoded)
        # The loader yields documents lazily, so parsing actually happens
        # while dump_all consumes them; the dump must stay inside this try
        # for YAML syntax errors to be translated below.
        rendered = yaml.dump_all(documents, allow_unicode=True, sort_keys=False)
    except (UnicodeDecodeError, yaml.YAMLError) as e:
        raise TextExtractionError(f"Failed to decode or parse YAML file: {e}") from e
    return rendered
|
||||||
|
|
||||||
|
|
||||||
def _extract_text_from_pdf(file_content: bytes) -> str:
|
def _extract_text_from_pdf(file_content: bytes) -> str:
|
||||||
try:
|
try:
|
||||||
pdf_file = io.BytesIO(file_content)
|
pdf_file = io.BytesIO(file_content)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user