mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-15 00:36:12 +08:00
Split Excel file into different chunks (#847)
### What problem does this PR solve?

Split an Excel file into multiple chunks.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
71068895ae
commit
c27c02ea67
@ -7,21 +7,28 @@ from rag.nlp import find_codec
|
|||||||
|
|
||||||
|
|
||||||
class RAGFlowExcelParser:
|
class RAGFlowExcelParser:
|
||||||
def html(self, fnm):
|
def html(self, fnm,chunk_rows=256):
|
||||||
if isinstance(fnm, str):
|
if isinstance(fnm, str):
|
||||||
wb = load_workbook(fnm)
|
wb = load_workbook(fnm)
|
||||||
else:
|
else:
|
||||||
wb = load_workbook(BytesIO(fnm))
|
wb = load_workbook(BytesIO(fnm))
|
||||||
tb = ""
|
|
||||||
|
tb_chunks = []
|
||||||
for sheetname in wb.sheetnames:
|
for sheetname in wb.sheetnames:
|
||||||
ws = wb[sheetname]
|
ws = wb[sheetname]
|
||||||
rows = list(ws.rows)
|
rows = list(ws.rows)
|
||||||
if not rows: continue
|
if not rows: continue
|
||||||
tb += f"<table><caption>{sheetname}</caption><tr>"
|
|
||||||
|
tb_rows_0 = "<tr>"
|
||||||
for t in list(rows[0]):
|
for t in list(rows[0]):
|
||||||
tb += f"<th>{t.value}</th>"
|
tb_rows_0 += f"<th>{t.value}</th>"
|
||||||
tb += "</tr>"
|
tb_rows_0 += "</tr>"
|
||||||
for r in list(rows[1:]):
|
|
||||||
|
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
|
||||||
|
tb = ""
|
||||||
|
tb += f"<table><caption>{sheetname}</caption>"
|
||||||
|
tb += tb_rows_0
|
||||||
|
for r in list(rows[1 + chunk_i * chunk_rows:1 + (chunk_i + 1) * chunk_rows]):
|
||||||
tb += "<tr>"
|
tb += "<tr>"
|
||||||
for i, c in enumerate(r):
|
for i, c in enumerate(r):
|
||||||
if c.value is None:
|
if c.value is None:
|
||||||
@ -30,7 +37,9 @@ class RAGFlowExcelParser:
|
|||||||
tb += f"<td>{c.value}</td>"
|
tb += f"<td>{c.value}</td>"
|
||||||
tb += "</tr>"
|
tb += "</tr>"
|
||||||
tb += "</table>\n"
|
tb += "</table>\n"
|
||||||
return tb
|
tb_chunks.append(tb)
|
||||||
|
|
||||||
|
return tb_chunks
|
||||||
|
|
||||||
def __call__(self, fnm):
|
def __call__(self, fnm):
|
||||||
if isinstance(fnm, str):
|
if isinstance(fnm, str):
|
||||||
|
@ -134,7 +134,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
excel_parser = ExcelParser()
|
excel_parser = ExcelParser()
|
||||||
sections = [(excel_parser.html(binary), "")]
|
sections = [(l, "") for l in excel_parser.html(binary) if l]
|
||||||
|
|
||||||
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
|
elif re.search(r"\.(txt|md|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
|
@ -78,7 +78,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
excel_parser = ExcelParser()
|
excel_parser = ExcelParser()
|
||||||
sections = [excel_parser.html(binary)]
|
sections = excel_parser.html(binary , 10000000)
|
||||||
|
|
||||||
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
elif re.search(r"\.txt$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user