mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 15:59:00 +08:00
Signed-off-by: yihong0618 <zouzou0208@gmail.com>
This commit is contained in:
parent
d96a28487a
commit
5a9b785773
@ -4,8 +4,8 @@ import json
|
|||||||
|
|
||||||
import docx
|
import docx
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pypdfium2
|
import pypdfium2 # type: ignore
|
||||||
import yaml
|
import yaml # type: ignore
|
||||||
from unstructured.partition.api import partition_via_api
|
from unstructured.partition.api import partition_via_api
|
||||||
from unstructured.partition.email import partition_email
|
from unstructured.partition.email import partition_email
|
||||||
from unstructured.partition.epub import partition_epub
|
from unstructured.partition.epub import partition_epub
|
||||||
@ -237,15 +237,17 @@ def _extract_text_from_csv(file_content: bytes) -> str:
|
|||||||
|
|
||||||
def _extract_text_from_excel(file_content: bytes) -> str:
|
def _extract_text_from_excel(file_content: bytes) -> str:
|
||||||
"""Extract text from an Excel file using pandas."""
|
"""Extract text from an Excel file using pandas."""
|
||||||
|
|
||||||
try:
|
try:
|
||||||
df = pd.read_excel(io.BytesIO(file_content))
|
excel_file = pd.ExcelFile(io.BytesIO(file_content))
|
||||||
|
markdown_table = ""
|
||||||
# Drop rows where all elements are NaN
|
for sheet_name in excel_file.sheet_names:
|
||||||
df.dropna(how="all", inplace=True)
|
try:
|
||||||
|
df = excel_file.parse(sheet_name=sheet_name)
|
||||||
# Convert DataFrame to Markdown table
|
df.dropna(how="all", inplace=True)
|
||||||
markdown_table = df.to_markdown(index=False)
|
# Create Markdown table two times to separate tables with a newline
|
||||||
|
markdown_table += df.to_markdown(index=False) + "\n\n"
|
||||||
|
except Exception as e:
|
||||||
|
continue
|
||||||
return markdown_table
|
return markdown_table
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e
|
raise TextExtractionError(f"Failed to extract text from Excel file: {str(e)}") from e
|
||||||
|
Loading…
x
Reference in New Issue
Block a user