Optimize csv and excel extract (#3155)

Co-authored-by: jyong <jyong@dify.ai>
This commit is contained in:
Jyong 2024-04-08 16:34:43 +08:00 committed by GitHub
parent 762657eeef
commit 9eba6ffdd4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 36 additions and 35 deletions

View File

@ -2,6 +2,8 @@
import csv import csv
from typing import Optional from typing import Optional
import pandas as pd
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.extractor.helpers import detect_file_encodings from core.rag.extractor.helpers import detect_file_encodings
from core.rag.models.document import Document from core.rag.models.document import Document
@ -52,21 +54,23 @@ class CSVExtractor(BaseExtractor):
def _read_from_file(self, csvfile) -> list[Document]:
    """Turn the rows of a CSV source into ``Document`` objects.

    Args:
        csvfile: The CSV source, passed straight to ``pandas.read_csv``.

    Returns:
        One ``Document`` per data row. ``page_content`` is the row rendered
        as ``"column: value"`` pairs joined with ``";"``. ``metadata`` holds
        ``source`` (the row's value in ``self.source_column``, or ``''``
        when no source column is configured) and ``row`` (the DataFrame
        index label of the row). An empty source yields an empty list.

    Raises:
        ValueError: If ``self.source_column`` is set but is not a column of
            the parsed file.
    """
    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
    # ``on_bad_lines="skip"`` is its replacement. Merging through a dict lets
    # a caller-supplied ``on_bad_lines`` in csv_args override the default
    # instead of colliding with it as a duplicate keyword.
    read_kwargs = {"on_bad_lines": "skip", **self.csv_args}
    try:
        df = pd.read_csv(csvfile, **read_kwargs)
    except pd.errors.EmptyDataError:
        # A source with nothing to parse simply has no rows to extract.
        return []

    # Fail early with a clear message rather than a KeyError inside the loop.
    if self.source_column and self.source_column not in df.columns:
        raise ValueError(f"Source column '{self.source_column}' not found in CSV file.")

    columns = list(df.columns)
    docs = []
    for i, row in df.iterrows():
        # Column labels may be non-strings (e.g. ints when header=None),
        # so stringify before stripping.
        content = ";".join(f"{str(col).strip()}: {str(row[col]).strip()}" for col in columns)
        source = row[self.source_column] if self.source_column else ''
        metadata = {"source": source, "row": i}
        docs.append(Document(page_content=content, metadata=metadata))
    return docs

View File

@ -1,7 +1,7 @@
"""Abstract interface for document loader implementations.""" """Abstract interface for document loader implementations."""
from typing import Optional from typing import Optional
from openpyxl.reader.excel import load_workbook import pandas as pd
from core.rag.extractor.extractor_base import BaseExtractor from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document from core.rag.models.document import Document
@ -27,23 +27,20 @@ class ExcelExtractor(BaseExtractor):
self._autodetect_encoding = autodetect_encoding self._autodetect_encoding = autodetect_encoding
def extract(self) -> list[Document]: def extract(self) -> list[Document]:
"""Load from file path.""" """Load from file path using Pandas."""
data = [] data = []
wb = load_workbook(filename=self._file_path, read_only=True)
# loop over all sheets # 使用 Pandas 读取 Excel 文件的每个工作表
for sheet in wb: xls = pd.ExcelFile(self._file_path)
keys = [] for sheet_name in xls.sheet_names:
if 'A1:A1' == sheet.calculate_dimension(): df = pd.read_excel(xls, sheet_name=sheet_name)
sheet.reset_dimensions()
for row in sheet.iter_rows(values_only=True): # filter out rows with all NaN values
if all(v is None for v in row): df.dropna(how='all', inplace=True)
continue
if keys == []: # transform each row into a Document
keys = list(map(str, row)) for _, row in df.iterrows():
else: item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
row_dict = dict(zip(keys, list(map(str, row))))
row_dict = {k: v for k, v in row_dict.items() if v}
item = ''.join(f'{k}:{v};' for k, v in row_dict.items())
document = Document(page_content=item, metadata={'source': self._file_path}) document = Document(page_content=item, metadata={'source': self._file_path})
data.append(document) data.append(document)