diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index b01d7e492..2e53e5c4a 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -12,35 +12,63 @@ # import logging -from openpyxl import load_workbook, Workbook import sys from io import BytesIO -from rag.nlp import find_codec - import pandas as pd +from openpyxl import Workbook, load_workbook + +from rag.nlp import find_codec class RAGFlowExcelParser: + @staticmethod def _load_excel_to_workbook(file_like_object): + if isinstance(file_like_object, bytes): + file_like_object = BytesIO(file_like_object) + + # Read first 4 bytes to determine file type + file_like_object.seek(0) + file_head = file_like_object.read(4) + file_like_object.seek(0) + + if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')): + logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook") + + try: + file_like_object.seek(0) + df = pd.read_csv(file_like_object) + return RAGFlowExcelParser._dataframe_to_workbook(df) + + except Exception as e_csv: + raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}") + try: return load_workbook(file_like_object) except Exception as e: logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead") try: + file_like_object.seek(0) df = pd.read_excel(file_like_object) - wb = Workbook() - ws = wb.active - ws.title = "Data" - for col_num, column_name in enumerate(df.columns, 1): - ws.cell(row=1, column=col_num, value=column_name) - for row_num, row in enumerate(df.values, 2): - for col_num, value in enumerate(row, 1): - ws.cell(row=row_num, column=col_num, value=value) - return wb + return RAGFlowExcelParser._dataframe_to_workbook(df) except Exception as e_pandas: - raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}") + raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}") + + @staticmethod + def _dataframe_to_workbook(df): + wb = Workbook() + ws = wb.active + ws.title = "Data" + + for col_num, column_name in enumerate(df.columns, 1): + ws.cell(row=1, column=col_num, value=column_name) + + for row_num, row in enumerate(df.values, 2): + for col_num, value in enumerate(row, 1): + ws.cell(row=row_num, column=col_num, value=value) + + return wb def html(self, fnm, chunk_rows=256): file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm @@ -62,7 +90,7 @@ class RAGFlowExcelParser: tb += f"