fix(document_extractor): pptx file type detection and missing metadata_filename for UnstructuredIO (#11364)

Co-authored-by: Julian Huynh <julian.huynh@immersio.io>
This commit is contained in:
Huỳnh Gia Bôi 2024-12-06 17:55:59 +07:00 committed by GitHub
parent 1490a19fa1
commit 9277156b6c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,6 +1,8 @@
import csv import csv
import io import io
import json import json
import os
import tempfile
import docx import docx
import pandas as pd import pandas as pd
@ -264,14 +266,20 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
def _extract_text_from_pptx(file_content: bytes) -> str: def _extract_text_from_pptx(file_content: bytes) -> str:
try: try:
with io.BytesIO(file_content) as file:
if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY: if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
temp_file.write(file_content)
temp_file.flush()
with open(temp_file.name, "rb") as file:
elements = partition_via_api( elements = partition_via_api(
file=file, file=file,
metadata_filename=temp_file.name,
api_url=dify_config.UNSTRUCTURED_API_URL, api_url=dify_config.UNSTRUCTURED_API_URL,
api_key=dify_config.UNSTRUCTURED_API_KEY, api_key=dify_config.UNSTRUCTURED_API_KEY,
) )
os.unlink(temp_file.name)
else: else:
with io.BytesIO(file_content) as file:
elements = partition_pptx(file=file) elements = partition_pptx(file=file)
return "\n".join([getattr(element, "text", "") for element in elements]) return "\n".join([getattr(element, "text", "") for element in elements])
except Exception as e: except Exception as e: