ragflow/deepdoc/parser/excel_parser.py
ly0303521 101b8ff813
fix chunk method "Table" losing content when the Excel file has multi… (#4123)
…ple sheets

### What problem does this PR solve?
discussed in https://github.com/infiniflow/ragflow/pull/4102
- In excel_parser.py, `total` means the total number of rows in Excel,
but it return in the first iterate, that lead to the wrong `to_page`
- In table.py, it when Excel file has multiple sheets, it will be
divided into multiple parts, every part size is 3000, `data` may be
empty, because it has recorded in the last iterate.
### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2024-12-19 17:30:26 +08:00

104 lines
3.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from openpyxl import load_workbook
import sys
from io import BytesIO
from rag.nlp import find_codec
class RAGFlowExcelParser:
def html(self, fnm, chunk_rows=256):
if isinstance(fnm, str):
wb = load_workbook(fnm)
else:
wb = load_workbook(BytesIO(fnm))
tb_chunks = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
tb_rows_0 = "<tr>"
for t in list(rows[0]):
tb_rows_0 += f"<th>{t.value}</th>"
tb_rows_0 += "</tr>"
for chunk_i in range((len(rows) - 1) // chunk_rows + 1):
tb = ""
tb += f"<table><caption>{sheetname}</caption>"
tb += tb_rows_0
for r in list(
rows[1 + chunk_i * chunk_rows : 1 + (chunk_i + 1) * chunk_rows]
):
tb += "<tr>"
for i, c in enumerate(r):
if c.value is None:
tb += "<td></td>"
else:
tb += f"<td>{c.value}</td>"
tb += "</tr>"
tb += "</table>\n"
tb_chunks.append(tb)
return tb_chunks
def __call__(self, fnm):
if isinstance(fnm, str):
wb = load_workbook(fnm)
else:
wb = load_workbook(BytesIO(fnm))
res = []
for sheetname in wb.sheetnames:
ws = wb[sheetname]
rows = list(ws.rows)
if not rows:
continue
ti = list(rows[0])
for r in list(rows[1:]):
fields = []
for i, c in enumerate(r):
if not c.value:
continue
t = str(ti[i].value) if i < len(ti) else ""
t += ("" if t else "") + str(c.value)
fields.append(t)
line = "; ".join(fields)
if sheetname.lower().find("sheet") < 0:
line += " ——" + sheetname
res.append(line)
return res
@staticmethod
def row_number(fnm, binary):
if fnm.split(".")[-1].lower().find("xls") >= 0:
wb = load_workbook(BytesIO(binary))
total = 0
for sheetname in wb.sheetnames:
ws = wb[sheetname]
total += len(list(ws.rows))
return total
if fnm.split(".")[-1].lower() in ["csv", "txt"]:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
return len(txt.split("\n"))
if __name__ == "__main__":
psr = RAGFlowExcelParser()
psr(sys.argv[1])