diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index bb9d65b00..84f731b41 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -1,6 +1,3 @@ -# -# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. -# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at @@ -14,19 +11,51 @@ # limitations under the License. # -from openpyxl import load_workbook +from openpyxl import load_workbook, Workbook import sys from io import BytesIO from rag.nlp import find_codec +import pandas as pd + class RAGFlowExcelParser: def html(self, fnm, chunk_rows=256): - if isinstance(fnm, str): - wb = load_workbook(fnm) + + # if isinstance(fnm, str): + # wb = load_workbook(fnm) + # else: + # wb = load_workbook(BytesIO(fnm))++ + + s_fnm = fnm + if not isinstance(fnm, str): + s_fnm = BytesIO(fnm) else: - wb = load_workbook(BytesIO(fnm)) + pass + + try: + wb = load_workbook(s_fnm) + except Exception as e: + print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files') + df = pd.read_excel(s_fnm) + wb = Workbook() + # if len(wb.worksheets) > 0: + # del wb.worksheets[0] + # else: pass + ws = wb.active + ws.title = "Data" + for col_num, column_name in enumerate(df.columns, 1): + ws.cell(row=1, column=col_num, value=column_name) + else: + pass + for row_num, row in enumerate(df.values, 2): + for col_num, value in enumerate(row, 1): + ws.cell(row=row_num, column=col_num, value=value) + else: + pass + else: + pass tb_chunks = [] for sheetname in wb.sheetnames: @@ -45,7 +74,7 @@ class RAGFlowExcelParser: tb += f"