mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 10:25:58 +08:00
Refactor: Optimize error handling and support parsing of XLS (Excel 97-2003) files. (#5633)
Optimize error handling and support parsing of XLS (Excel 97-2003) files.
This commit is contained in:
parent
47684fa17c
commit
b0c21b00d9
@ -11,6 +11,7 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
|
||||||
|
import logging
|
||||||
from openpyxl import load_workbook, Workbook
|
from openpyxl import load_workbook, Workbook
|
||||||
import sys
|
import sys
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
@ -21,42 +22,29 @@ import pandas as pd
|
|||||||
|
|
||||||
|
|
||||||
class RAGFlowExcelParser:
|
class RAGFlowExcelParser:
|
||||||
def html(self, fnm, chunk_rows=256):
|
@staticmethod
|
||||||
|
def _load_excel_to_workbook(file_like_object):
|
||||||
# if isinstance(fnm, str):
|
|
||||||
# wb = load_workbook(fnm)
|
|
||||||
# else:
|
|
||||||
# wb = load_workbook(BytesIO(fnm))++
|
|
||||||
|
|
||||||
s_fnm = fnm
|
|
||||||
if not isinstance(fnm, str):
|
|
||||||
s_fnm = BytesIO(fnm)
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
wb = load_workbook(s_fnm)
|
return load_workbook(file_like_object)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files')
|
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
|
||||||
df = pd.read_excel(s_fnm)
|
try:
|
||||||
|
df = pd.read_excel(file_like_object)
|
||||||
wb = Workbook()
|
wb = Workbook()
|
||||||
# if len(wb.worksheets) > 0:
|
|
||||||
# del wb.worksheets[0]
|
|
||||||
# else: pass
|
|
||||||
ws = wb.active
|
ws = wb.active
|
||||||
ws.title = "Data"
|
ws.title = "Data"
|
||||||
for col_num, column_name in enumerate(df.columns, 1):
|
for col_num, column_name in enumerate(df.columns, 1):
|
||||||
ws.cell(row=1, column=col_num, value=column_name)
|
ws.cell(row=1, column=col_num, value=column_name)
|
||||||
else:
|
|
||||||
pass
|
|
||||||
for row_num, row in enumerate(df.values, 2):
|
for row_num, row in enumerate(df.values, 2):
|
||||||
for col_num, value in enumerate(row, 1):
|
for col_num, value in enumerate(row, 1):
|
||||||
ws.cell(row=row_num, column=col_num, value=value)
|
ws.cell(row=row_num, column=col_num, value=value)
|
||||||
else:
|
return wb
|
||||||
pass
|
except Exception as e_pandas:
|
||||||
else:
|
raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}")
|
||||||
pass
|
|
||||||
|
|
||||||
|
def html(self, fnm, chunk_rows=256):
|
||||||
|
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
||||||
|
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
|
||||||
tb_chunks = []
|
tb_chunks = []
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
@ -89,40 +77,8 @@ class RAGFlowExcelParser:
|
|||||||
return tb_chunks
|
return tb_chunks
|
||||||
|
|
||||||
def __call__(self, fnm):
|
def __call__(self, fnm):
|
||||||
# if isinstance(fnm, str):
|
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
||||||
# wb = load_workbook(fnm)
|
wb = RAGFlowExcelParser._load_excel_to_workbook(file_like_object)
|
||||||
# else:
|
|
||||||
# wb = load_workbook(BytesIO(fnm))
|
|
||||||
|
|
||||||
s_fnm = fnm
|
|
||||||
if not isinstance(fnm, str):
|
|
||||||
s_fnm = BytesIO(fnm)
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
|
|
||||||
try:
|
|
||||||
wb = load_workbook(s_fnm)
|
|
||||||
except Exception as e:
|
|
||||||
print(f'****wxy: file parser error: {e}, s_fnm={s_fnm}, trying convert files')
|
|
||||||
df = pd.read_excel(s_fnm)
|
|
||||||
wb = Workbook()
|
|
||||||
if len(wb.worksheets) > 0:
|
|
||||||
del wb.worksheets[0]
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
ws = wb.active
|
|
||||||
ws.title = "Data"
|
|
||||||
for col_num, column_name in enumerate(df.columns, 1):
|
|
||||||
ws.cell(row=1, column=col_num, value=column_name)
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
for row_num, row in enumerate(df.values, 2):
|
|
||||||
for col_num, value in enumerate(row, 1):
|
|
||||||
ws.cell(row=row_num, column=col_num, value=value)
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
pass
|
|
||||||
|
|
||||||
res = []
|
res = []
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
@ -148,7 +104,7 @@ class RAGFlowExcelParser:
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def row_number(fnm, binary):
|
def row_number(fnm, binary):
|
||||||
if fnm.split(".")[-1].lower().find("xls") >= 0:
|
if fnm.split(".")[-1].lower().find("xls") >= 0:
|
||||||
wb = load_workbook(BytesIO(binary))
|
wb = RAGFlowExcelParser._load_excel_to_workbook(BytesIO(binary))
|
||||||
total = 0
|
total = 0
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
|
@ -20,7 +20,7 @@ from io import BytesIO
|
|||||||
from xpinyin import Pinyin
|
from xpinyin import Pinyin
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
from openpyxl import load_workbook
|
# from openpyxl import load_workbook, Workbook
|
||||||
from dateutil.parser import parse as datetime_parse
|
from dateutil.parser import parse as datetime_parse
|
||||||
|
|
||||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||||
@ -33,9 +33,9 @@ class Excel(ExcelParser):
|
|||||||
def __call__(self, fnm, binary=None, from_page=0,
|
def __call__(self, fnm, binary=None, from_page=0,
|
||||||
to_page=10000000000, callback=None):
|
to_page=10000000000, callback=None):
|
||||||
if not binary:
|
if not binary:
|
||||||
wb = load_workbook(fnm)
|
wb = Excel._load_excel_to_workbook(fnm)
|
||||||
else:
|
else:
|
||||||
wb = load_workbook(BytesIO(binary))
|
wb = Excel._load_excel_to_workbook(BytesIO(binary))
|
||||||
total = 0
|
total = 0
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
total += len(list(wb[sheetname].rows))
|
total += len(list(wb[sheetname].rows))
|
||||||
|
Loading…
x
Reference in New Issue
Block a user