From c27c02ea6771128fa04c56ae9daf348dd4100403 Mon Sep 17 00:00:00 2001
From: GYH <43509927+guoyuhao2330@users.noreply.github.com>
Date: Mon, 20 May 2024 18:35:15 +0800
Subject: [PATCH] Split Excel file into different chunks (#847)
### What problem does this PR solve?
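Split each Excel sheet into multiple chunks of rows (256 rows per chunk by default), repeating the sheet's header row in every chunk. `RAGFlowExcelParser.html()` now returns a list of chunks instead of a single string.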
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
---
deepdoc/parser/excel_parser.py | 41 +++++++++++++++++++++-------------
rag/app/naive.py | 2 +-
rag/app/one.py | 2 +-
3 files changed, 27 insertions(+), 18 deletions(-)
diff --git a/deepdoc/parser/excel_parser.py b/deepdoc/parser/excel_parser.py
index 736ac32ef..b7d9539e4 100644
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@@ -7,30 +7,39 @@ from rag.nlp import find_codec
class RAGFlowExcelParser:
- def html(self, fnm):
+ def html(self, fnm,chunk_rows=256):
if isinstance(fnm, str):
wb = load_workbook(fnm)
else:
wb = load_workbook(BytesIO(fnm))
- tb = ""
+
+ tb_chunks = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
- if not rows:continue
- tb += f"
{sheetname}"
+ if not rows: continue
+
+ tb_rows_0 = "
"
for t in list(rows[0]):
- tb += f"{t.value} | "
- tb += "
"
- for r in list(rows[1:]):
- tb += ""
- for i, c in enumerate(r):
- if c.value is None:
- tb += " | "
- else:
- tb += f"{c.value} | "
- tb += "
"
- tb += "
\n"
- return tb
+ tb_rows_0 += f"{t.value} | "
+ tb_rows_0 += ""
+
+ for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
+ tb = ""
+ tb += f"{sheetname}"
+ tb += tb_rows_0
+ for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
+ tb += ""
+ for i, c in enumerate(r):
+ if c.value is None:
+ tb += " | "
+ else:
+ tb += f"{c.value} | "
+ tb += "
"
+ tb += "
\n"
+ tb_chunks.append(tb)
+
+ return tb_chunks
def __call__(self, fnm):
if isinstance(fnm, str):
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 01bb4de1d..f91734b27 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -134,7 +134,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
- sections = [(excel_parser.html(binary), "")]
+ sections = [(l, "") for l in excel_parser.html(binary) if l]
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
diff --git a/rag/app/one.py b/rag/app/one.py
index 531fd0a70..ac67a64b2 100644
--- a/rag/app/one.py
+++ b/rag/app/one.py
@@ -78,7 +78,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
excel_parser = ExcelParser()
- sections = [excel_parser.html(binary)]
+ sections = excel_parser.html(binary , 10000000)
elif re.search(r"\.txt$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")