mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-01 04:12:00 +08:00
Feat: add CSV file parsing support (#5989)
### What problem does this PR solve?

Add CSV file parsing support (#4552, #5849, #5870).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent d660f6b9a5
commit 7cd37c37cd
@@ -12,35 +12,63 @@
|
|||||||
#
|
#
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
from openpyxl import load_workbook, Workbook
|
|
||||||
import sys
|
import sys
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
|
|
||||||
from rag.nlp import find_codec
|
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from openpyxl import Workbook, load_workbook
|
||||||
|
|
||||||
|
from rag.nlp import find_codec
|
||||||
|
|
||||||
|
|
||||||
class RAGFlowExcelParser:
|
class RAGFlowExcelParser:
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _load_excel_to_workbook(file_like_object):
|
def _load_excel_to_workbook(file_like_object):
|
||||||
|
if isinstance(file_like_object, bytes):
|
||||||
|
file_like_object = BytesIO(file_like_object)
|
||||||
|
|
||||||
|
# Read first 4 bytes to determine file type
|
||||||
|
file_like_object.seek(0)
|
||||||
|
file_head = file_like_object.read(4)
|
||||||
|
file_like_object.seek(0)
|
||||||
|
|
||||||
|
if not (file_head.startswith(b'PK\x03\x04') or file_head.startswith(b'\xD0\xCF\x11\xE0')):
|
||||||
|
logging.info("****wxy: Not an Excel file, converting CSV to Excel Workbook")
|
||||||
|
|
||||||
|
try:
|
||||||
|
file_like_object.seek(0)
|
||||||
|
df = pd.read_csv(file_like_object)
|
||||||
|
return RAGFlowExcelParser._dataframe_to_workbook(df)
|
||||||
|
|
||||||
|
except Exception as e_csv:
|
||||||
|
raise Exception(f"****wxy: Failed to parse CSV and convert to Excel Workbook: {e_csv}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return load_workbook(file_like_object)
|
return load_workbook(file_like_object)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
|
logging.info(f"****wxy: openpyxl load error: {e}, try pandas instead")
|
||||||
try:
|
try:
|
||||||
|
file_like_object.seek(0)
|
||||||
df = pd.read_excel(file_like_object)
|
df = pd.read_excel(file_like_object)
|
||||||
wb = Workbook()
|
return RAGFlowExcelParser._dataframe_to_workbook(df)
|
||||||
ws = wb.active
|
|
||||||
ws.title = "Data"
|
|
||||||
for col_num, column_name in enumerate(df.columns, 1):
|
|
||||||
ws.cell(row=1, column=col_num, value=column_name)
|
|
||||||
for row_num, row in enumerate(df.values, 2):
|
|
||||||
for col_num, value in enumerate(row, 1):
|
|
||||||
ws.cell(row=row_num, column=col_num, value=value)
|
|
||||||
return wb
|
|
||||||
except Exception as e_pandas:
|
except Exception as e_pandas:
|
||||||
raise Exception(f"****wxy: pandas read error: {e_pandas}, original openpyxl error: {e}")
|
raise Exception(f"****wxy: pandas.read_excel error: {e_pandas}, original openpyxl error: {e}")
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _dataframe_to_workbook(df):
|
||||||
|
wb = Workbook()
|
||||||
|
ws = wb.active
|
||||||
|
ws.title = "Data"
|
||||||
|
|
||||||
|
for col_num, column_name in enumerate(df.columns, 1):
|
||||||
|
ws.cell(row=1, column=col_num, value=column_name)
|
||||||
|
|
||||||
|
for row_num, row in enumerate(df.values, 2):
|
||||||
|
for col_num, value in enumerate(row, 1):
|
||||||
|
ws.cell(row=row_num, column=col_num, value=value)
|
||||||
|
|
||||||
|
return wb
|
||||||
|
|
||||||
def html(self, fnm, chunk_rows=256):
|
def html(self, fnm, chunk_rows=256):
|
||||||
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
file_like_object = BytesIO(fnm) if not isinstance(fnm, str) else fnm
|
||||||
@@ -62,7 +90,7 @@ class RAGFlowExcelParser:
|
|||||||
tb += f"<table><caption>{sheetname}</caption>"
|
tb += f"<table><caption>{sheetname}</caption>"
|
||||||
tb += tb_rows_0
|
tb += tb_rows_0
|
||||||
for r in list(
|
for r in list(
|
||||||
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
|
rows[1 + chunk_i * chunk_rows: 1 + (chunk_i + 1) * chunk_rows]
|
||||||
):
|
):
|
||||||
tb += "<tr>"
|
tb += "<tr>"
|
||||||
for i, c in enumerate(r):
|
for i, c in enumerate(r):
|
||||||
@@ -120,4 +148,3 @@ class RAGFlowExcelParser:
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
psr = RAGFlowExcelParser()
|
psr = RAGFlowExcelParser()
|
||||||
psr(sys.argv[1])
|
psr(sys.argv[1])
|
||||||
|
|
||||||
|
@@ -240,7 +240,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
callback=callback)
|
callback=callback)
|
||||||
res = tokenize_table(tables, doc, is_english)
|
res = tokenize_table(tables, doc, is_english)
|
||||||
|
|
||||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
|
||||||
callback(0.1, "Start to parse.")
|
callback(0.1, "Start to parse.")
|
||||||
excel_parser = ExcelParser()
|
excel_parser = ExcelParser()
|
||||||
if parser_config.get("html4excel"):
|
if parser_config.get("html4excel"):
|
||||||
@@ -307,9 +307,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
|
|
||||||
def dummy(prog=None, msg=""):
|
def dummy(prog=None, msg=""):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||||
|
Loading…
x
Reference in New Issue
Block a user