mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 18:49:02 +08:00
Optimize csv and excel extract (#3155)
Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
parent
762657eeef
commit
9eba6ffdd4
@ -2,6 +2,8 @@
|
|||||||
import csv
|
import csv
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.extractor.helpers import detect_file_encodings
|
from core.rag.extractor.helpers import detect_file_encodings
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -52,21 +54,23 @@ class CSVExtractor(BaseExtractor):
|
|||||||
|
|
||||||
def _read_from_file(self, csvfile) -> list[Document]:
    """Parse an already-open CSV file object into one ``Document`` per row.

    Each row is rendered as ``col: value`` pairs joined by ``;`` and stored
    as the document's ``page_content``. Row metadata carries the row index
    and, when ``self.source_column`` is set, that column's value as the
    document source.

    :param csvfile: an open text-mode file object positioned at the CSV header.
    :return: list of Document objects, one per parsed row.
    :raises ValueError: if ``self.source_column`` is set but absent from the CSV.
    :raises csv.Error: on low-level CSV parsing failures.
    """
    docs = []
    try:
        # Load the CSV into a DataFrame. Malformed rows are skipped rather
        # than aborting the whole import: 'on_bad_lines' is the supported
        # spelling — the old 'error_bad_lines' keyword was removed in
        # pandas 2.0 and would raise a TypeError there.
        df = pd.read_csv(csvfile, on_bad_lines='skip', **self.csv_args)

        # Fail fast if the configured source column does not exist.
        if self.source_column and self.source_column not in df.columns:
            raise ValueError(f"Source column '{self.source_column}' not found in CSV file.")

        # Create one document object per row.
        for i, row in df.iterrows():
            content = ";".join(f"{col.strip()}: {str(row[col]).strip()}" for col in df.columns)
            source = row[self.source_column] if self.source_column else ''
            metadata = {"source": source, "row": i}
            doc = Document(page_content=content, metadata=metadata)
            docs.append(doc)
    except csv.Error:
        # Bare re-raise preserves the original traceback (unlike 'raise e').
        raise
    return docs
|
@ -1,7 +1,7 @@
|
|||||||
"""Abstract interface for document loader implementations."""
|
"""Abstract interface for document loader implementations."""
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
from openpyxl.reader.excel import load_workbook
|
import pandas as pd
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -27,23 +27,20 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
self._autodetect_encoding = autodetect_encoding
|
self._autodetect_encoding = autodetect_encoding
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
    """Load from file path using pandas.

    Reads every worksheet of the Excel file at ``self._file_path``, drops
    rows that are entirely NaN, and turns each remaining row into a
    ``Document`` whose content is ``col:value`` pairs (NaN cells omitted)
    joined by ``;``.

    :return: list of Document objects, one per non-empty row across all sheets.
    """
    data = []
    # Read each worksheet with pandas. Using ExcelFile as a context manager
    # guarantees the underlying file handle is closed even on parse errors
    # (a plain pd.ExcelFile(...) here would leak the handle).
    with pd.ExcelFile(self._file_path) as xls:
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)

            # Filter out rows with all NaN values.
            df.dropna(how='all', inplace=True)

            # Transform each row into a Document, skipping NaN cells.
            for _, row in df.iterrows():
                item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
                document = Document(page_content=item, metadata={'source': self._file_path})
                data.append(document)
    return data
|
Loading…
x
Reference in New Issue
Block a user