From 9f8ca75a8161fe35a4ae5da70253f3014c21c3be Mon Sep 17 00:00:00 2001 From: YC Date: Wed, 5 Jun 2024 15:28:43 +0800 Subject: [PATCH] fixing a bug of handling header row when parsing xls file, and tune xls/xlsx parsing result to be more structured (#3600) --- api/core/rag/extractor/excel_extractor.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/api/core/rag/extractor/excel_extractor.py b/api/core/rag/extractor/excel_extractor.py index 2b0066448e..4d2f61139a 100644 --- a/api/core/rag/extractor/excel_extractor.py +++ b/api/core/rag/extractor/excel_extractor.py @@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor): documents = [] # loop over all sheets for sheet in wb.sheets(): - for row_index, row in enumerate(sheet.get_rows(), start=1): - row_header = None + row_header = None + for row_index, row in enumerate(sheet.get_rows(), start=1): if self.is_blank_row(row): continue if row_header is None: @@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor): item_arr = [] for index, cell in enumerate(row): txt_value = str(cell.value) - item_arr.append(f'{row_header[index].value}:{txt_value}') - item_str = "\n".join(item_arr) + item_arr.append(f'"{row_header[index].value}":"{txt_value}"') + item_str = ",".join(item_arr) document = Document(page_content=item_str, metadata={'source': self._file_path}) documents.append(document) return documents @@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor): # transform each row into a Document for _, row in df.iterrows(): - item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v)) + item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)) document = Document(page_content=item, metadata={'source': self._file_path}) data.append(document) return data