Refactor: Optimize error handling and support parsing of XLS (Excel 97-2003) files. (#5633)

Optimize error handling and support parsing of XLS (Excel 97-2003) files.
This commit is contained in:
hy89 2025-03-05 11:55:27 +08:00 committed by GitHub
parent 47684fa17c
commit b0c21b00d9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 27 additions and 71 deletions

View File

@ -11,6 +11,7 @@
# limitations under the License. # limitations under the License.
# #
import logging
from openpyxl import load_workbook, Workbook from openpyxl import load_workbook, Workbook
import sys import sys
from io import BytesIO from io import BytesIO
@ -21,42 +22,29 @@ import pandas as pd
class RAGFlowExcelParser: class RAGFlowExcelParser:
def html(self, fnm, chunk_rows=256): @staticmethod
def _load_excel_to_workbook(file_like_object):
# if isinstance(fnm, str):
# wb = load_workbook(fnm)
# else:
# wb = load_workbook(BytesIO(fnm))++
s_fnm = fnm
if not isinstance(fnm, str):
s_fnm = BytesIO(fnm)
else:
pass
try: try:
wb = load_workbook(s_fnm) return load_workbook(file_like_object)
except Exception as e: except Exception as e:
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files') logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
df = pd.read_excel(s_fnm) try:
df = pd.read_excel(file_like_object)
wb = Workbook() wb = Workbook()
# if len(wb.worksheets) > 0:
# del wb.worksheets[0]
# else: pass
ws = wb.active ws = wb.active
ws.title = "Data" ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1): for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name) ws.cell(row=1, column=col_num, value=column_name)
else:
pass
for row_num, row in enumerate(df.values, 2): for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1): for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value) ws.cell(row=row_num, column=col_num, value=value)
else: return wb
pass except Exception as e_pandas:
else: raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}")
pass
def html(self, fnm, chunk_rows=256):
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
tb_chunks = [] tb_chunks = []
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
ws = wb[sheetname] ws = wb[sheetname]
@ -89,40 +77,8 @@ class RAGFlowExcelParser:
return tb_chunks return tb_chunks
def __call__(self, fnm): def __call__(self, fnm):
# if isinstance(fnm, str): file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
# wb = load_workbook(fnm) wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
# else:
# wb = load_workbook(BytesIO(fnm))
s_fnm = fnm
if not isinstance(fnm, str):
s_fnm = BytesIO(fnm)
else:
pass
try:
wb = load_workbook(s_fnm)
except Exception as e:
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files')
df = pd.read_excel(s_fnm)
wb = Workbook()
if len(wb.worksheets) > 0:
del wb.worksheets[0]
else:
pass
ws = wb.active
ws.title = "Data"
for col_num, column_name in enumerate(df.columns, 1):
ws.cell(row=1, column=col_num, value=column_name)
else:
pass
for row_num, row in enumerate(df.values, 2):
for col_num, value in enumerate(row, 1):
ws.cell(row=row_num, column=col_num, value=value)
else:
pass
else:
pass
res = [] res = []
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
@ -148,7 +104,7 @@ class RAGFlowExcelParser:
@staticmethod @staticmethod
def row_number(fnm, binary): def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0: if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = load_workbook(BytesIO(binary)) wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
total = 0 total = 0
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
ws = wb[sheetname] ws = wb[sheetname]

View File

@ -20,7 +20,7 @@ from io import BytesIO
from xpinyin import Pinyin from xpinyin import Pinyin
import numpy as np import numpy as np
import pandas as pd import pandas as pd
from openpyxl import load_workbook # from openpyxl import load_workbook, Workbook
from dateutil.parser import parse as datetime_parse from dateutil.parser import parse as datetime_parse
from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.knowledgebase_service import KnowledgebaseService
@ -33,9 +33,9 @@ class Excel(ExcelParser):
def __call__(self, fnm, binary=None, from_page=0, def __call__(self, fnm, binary=None, from_page=0,
to_page=10000000000, callback=None): to_page=10000000000, callback=None):
if not binary: if not binary:
wb = load_workbook(fnm) wb = Excel._load_excel_to_workbook(fnm)
else: else:
wb = load_workbook(BytesIO(binary)) wb = Excel._load_excel_to_workbook(BytesIO(binary))
total = 0 total = 0
for sheetname in wb.sheetnames: for sheetname in wb.sheetnames:
total += len(list(wb[sheetname].rows)) total += len(list(wb[sheetname].rows))