From b0c21b00d9352681905d60bfaa23ec6ff9fbff46 Mon Sep 17 00:00:00 2001 From: hy89 <31279043+hy89@users.noreply.github.com> Date: Wed, 5 Mar 2025 11:55:27 +0800 Subject: [PATCH] =?UTF-8?q?Refactor:=20Optimize=20error=20handling=20and?= =?UTF-8?q?=20support=20parsing=20of=20XLS(EXCEL97=E2=80=942003)=20files.?= =?UTF-8?q?=20(#5633)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Optimize error handling and support parsing of XLS(EXCEL97—2003) files. --- deepdoc/parser/excel_parser.py | 92 +++++++++------------------------- rag/app/table.py | 6 +-- 2 files changed, 27 insertions(+), 71 deletions(-) diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py index 84f731b41..b01d7e492 100644 --- a/deepdoc/parser/excel_parser.py +++ b/deepdoc/parser/excel_parser.py @@ -11,6 +11,7 @@ # limitations under the License. # +import logging from openpyxl import load_workbook, Workbook import sys from io import BytesIO @@ -21,42 +22,29 @@ import pandas as pd class RAGFlowExcelParser: - def html(self, fnm, chunk_rows=256): - - # if isinstance(fnm, str): - # wb = load_workbook(fnm) - # else: - # wb = load_workbook(BytesIO(fnm))++ - - s_fnm = fnm - if not isinstance(fnm, str): - s_fnm = BytesIO(fnm) - else: - pass - + @staticmethod + def _load_excel_to_workbook(file_like_object): try: - wb = load_workbook(s_fnm) + return load_workbook(file_like_object) except Exception as e: - print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files') - df = pd.read_excel(s_fnm) - wb = Workbook() - # if len(wb.worksheets) > 0: - # del wb.worksheets[0] - # else: pass - ws = wb.active - ws.title = "Data" - for col_num, column_name in enumerate(df.columns, 1): - ws.cell(row=1, column=col_num, value=column_name) - else: - pass - for row_num, row in enumerate(df.values, 2): - for col_num, value in enumerate(row, 1): - ws.cell(row=row_num, column=col_num, value=value) - else: - pass - else: - pass + logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead") + try: + df = pd.read_excel(file_like_object) + wb = Workbook() + ws = wb.active + ws.title = "Data" + for col_num, column_name in enumerate(df.columns, 1): + ws.cell(row=1, column=col_num, value=column_name) + for row_num, row in enumerate(df.values, 2): + for col_num, value in enumerate(row, 1): + ws.cell(row=row_num, column=col_num, value=value) + return wb + except Exception as e_pandas: + raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}") + def html(self, fnm, chunk_rows=256): + file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm + wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object) tb_chunks = [] for sheetname in wb.sheetnames: ws = wb[sheetname] @@ -89,40 +77,8 @@ class RAGFlowExcelParser: return tb_chunks def __call__(self, fnm): - # if isinstance(fnm, str): - # wb = load_workbook(fnm) - # else: - # wb = load_workbook(BytesIO(fnm)) - - s_fnm = fnm - if not isinstance(fnm, str): - s_fnm = BytesIO(fnm) - else: - pass - - try: - wb = load_workbook(s_fnm) - except Exception as e: - print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files') - df = pd.read_excel(s_fnm) - wb = Workbook() - if len(wb.worksheets) > 0: - del wb.worksheets[0] - else: - pass - ws = wb.active - ws.title = "Data" - for col_num, column_name in enumerate(df.columns, 1): - ws.cell(row=1, column=col_num, value=column_name) - else: - pass - for row_num, row in enumerate(df.values, 2): - for col_num, value in enumerate(row, 1): - ws.cell(row=row_num, column=col_num, value=value) - else: - pass - else: - pass + file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm + wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object) res = [] for sheetname in wb.sheetnames: @@ -148,7 +104,7 @@ class RAGFlowExcelParser: @staticmethod def row_number(fnm, binary): if fnm.split(".")[-1].lower().find("xls") >= 0: - wb = load_workbook(BytesIO(binary)) + wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary)) total = 0 for sheetname in wb.sheetnames: ws = wb[sheetname] diff --git a/rag/app/table.py b/rag/app/table.py index 18620810e..6ecab0427 100644 --- a/rag/app/table.py +++ b/rag/app/table.py @@ -20,7 +20,7 @@ from io import BytesIO from xpinyin import Pinyin import numpy as np import pandas as pd -from openpyxl import load_workbook +# from openpyxl import load_workbook, Workbook from dateutil.parser import parse as datetime_parse from api.db.services.knowledgebase_service import KnowledgebaseService @@ -33,9 +33,9 @@ class Excel(ExcelParser): def __call__(self, fnm, binary=None, from_page=0, to_page=10000000000, callback=None): if not binary: - wb = load_workbook(fnm) + wb = Excel._load_excel_to_workbook(fnm) else: - wb = load_workbook(BytesIO(binary)) + wb = Excel._load_excel_to_workbook(BytesIO(binary)) total = 0 for sheetname in wb.sheetnames: total += len(list(wb[sheetname].rows))