diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index 736ac32ef..b7d9539e4 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -7,30 +7,39 @@ from rag.nlp import find_codec
 
 
 class RAGFlowExcelParser:
-    def html(self, fnm):
+    def html(self, fnm,chunk_rows=256):
         if isinstance(fnm, str):
             wb = load_workbook(fnm)
         else:
             wb = load_workbook(BytesIO(fnm))
-        tb = ""
+
+        tb_chunks = []
         for sheetname in wb.sheetnames:
             ws = wb[sheetname]
             rows = list(ws.rows)
-            if not rows:continue
-            tb += f"<table><caption>{sheetname}</caption><tr>"
+            if not rows: continue
+
+            tb_rows_0 = "<tr>"
             for t in list(rows[0]):
-                tb += f"<th>{t.value}</th>"
-            tb += "</tr>"
-            for r in list(rows[1:]):
-                tb += "<tr>"
-                for i, c in enumerate(r):
-                    if c.value is None:
-                        tb += "<td></td>"
-                    else:
-                        tb += f"<td>{c.value}</td>"
-                tb += "</tr>"
-            tb += "</table>\n"
-        return tb
+                tb_rows_0 += f"<th>{t.value}</th>"
+            tb_rows_0 += "</tr>"
+
+            for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
+                tb = ""
+                tb += f"<table><caption>{sheetname}</caption>"
+                tb += tb_rows_0
+                for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
+                    tb += "<tr>"
+                    for i, c in enumerate(r):
+                        if c.value is None:
+                            tb += "<td></td>"
+                        else:
+                            tb += f"<td>{c.value}</td>"
+                    tb += "</tr>"
+                tb += "</table>\n"
+                tb_chunks.append(tb)
+
+        return tb_chunks
 
     def __call__(self, fnm):
         if isinstance(fnm, str):
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 01bb4de1d..f91734b27 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -134,7 +134,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
-        sections = [(excel_parser.html(binary), "")]
+        sections = [(l, "") for l in excel_parser.html(binary) if l]
 
     elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
diff --git a/rag/app/one.py b/rag/app/one.py
index 531fd0a70..ac67a64b2 100644
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -78,7 +78,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         excel_parser = ExcelParser()
-        sections = [excel_parser.html(binary)]
+        sections = excel_parser.html(binary , 10000000)
 
     elif re.search(r"\.txt$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")