fix: excel in node only read one sheet, close #9661 (#11215)

Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
yihong 2024-11-30 11:11:08 +08:00 committed by GitHub
parent d96a28487a
commit 5a9b785773
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -4,8 +4,8 @@ import json
import docx import docx
import pandas as pd import pandas as pd
import pypdfium2 import pypdfium2 # type: ignore
import yaml import yaml # type: ignore
from unstructured.partition.api import partition_via_api from unstructured.partition.api import partition_via_api
from unstructured.partition.email import partition_email from unstructured.partition.email import partition_email
from unstructured.partition.epub import partition_epub from unstructured.partition.epub import partition_epub
@ -237,15 +237,17 @@ def _extract_text_from_csv(file_content: bytes) -> str:
def _extract_text_from_excel(file_content: bytes) -> str: def _extract_text_from_excel(file_content: bytes) -> str:
"""Extract text from an Excel file using pandas.""" """Extract text from an Excel file using pandas."""
try: try:
df = pd.read_excel(io.BytesIO(file_content)) excel_file = pd.ExcelFile(io.BytesIO(file_content))
markdown_table = ""
# Drop rows where all elements are NaN for sheet_name in excel_file.sheet_names:
df.dropna(how="all", inplace=True) try:
df = excel_file.parse(sheet_name=sheet_name)
# Convert DataFrame to Markdown table df.dropna(how="all", inplace=True)
markdown_table = df.to_markdown(index=False) # Create Markdown table two times to separate tables with a newline
markdown_table += df.to_markdown(index=False) + "\n\n"
except Exception as e:
continue
return markdown_table return markdown_table
except Exception as e: except Exception as e:
raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e