def _read_from_file(self, csvfile) -> list[Document]:
    """Parse a CSV file into one ``Document`` per data row.

    The file is loaded with ``pandas.read_csv``; malformed lines are skipped
    instead of aborting the whole extraction.  Every row is rendered as
    ``"<column>: <value>"`` pairs joined with ``";"``.

    Args:
        csvfile: A path or open file-like object accepted by
            ``pandas.read_csv``.

    Returns:
        One ``Document`` per row.  Its metadata holds ``source`` (the row's
        value in ``self.source_column``, or ``''`` when no source column is
        configured) and ``row`` (the row's index label in the DataFrame).

    Raises:
        ValueError: If ``self.source_column`` is set but is not a column of
            the parsed file.
    """
    # ``error_bad_lines`` was deprecated in pandas 1.3 and removed in 2.0;
    # ``on_bad_lines="skip"`` is its replacement.  Merging into a dict lets a
    # caller-supplied ``csv_args`` entry override the default instead of
    # failing with a duplicate-keyword TypeError.
    read_kwargs = {"on_bad_lines": "skip", **self.csv_args}
    df = pd.read_csv(csvfile, **read_kwargs)

    # Fail early, before building any documents, if the source column is absent.
    if self.source_column and self.source_column not in df.columns:
        raise ValueError(f"Source column '{self.source_column}' not found in CSV file.")

    docs = []
    for row_label, row in df.iterrows():
        fields = []
        for column, value in row.items():
            # Column labels are not guaranteed to be strings (e.g. integer
            # labels when ``header=None`` is passed), so coerce before strip.
            # Missing cells become an empty string rather than the text
            # "nan", as the previous csv.DictReader implementation did.
            text = "" if pd.isna(value) else str(value).strip()
            fields.append(f"{str(column).strip()}: {text}")
        content = ";".join(fields)

        source = row[self.source_column] if self.source_column else ''
        metadata = {"source": source, "row": row_label}
        docs.append(Document(page_content=content, metadata=metadata))

    return docs
def extract(self) -> list[Document]:
    """Load every worksheet of the Excel file into ``Document`` objects.

    Each worksheet is read with pandas.  Rows whose cells are all empty are
    dropped; every remaining row becomes one ``Document`` whose content is
    the row's non-empty cells rendered as ``"<column>:<value>"`` pairs joined
    with ``";"``.

    Returns:
        One ``Document`` per non-empty row across all sheets, in sheet order,
        each with ``metadata={'source': self._file_path}``.
    """
    data = []

    # ExcelFile keeps the workbook open; the context manager guarantees the
    # handle is released even if reading one of the sheets raises.
    with pd.ExcelFile(self._file_path) as xls:
        for sheet_name in xls.sheet_names:
            df = pd.read_excel(xls, sheet_name=sheet_name)

            # Filter out rows in which every value is NaN.
            df = df.dropna(how='all')

            # Transform each remaining row into a Document, skipping empty cells.
            for _, row in df.iterrows():
                item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
                data.append(Document(page_content=item, metadata={'source': self._file_path}))

    return data