Fix a bug in header-row handling when parsing xls files, and make the xls/xlsx parsing output more structured (#3600)

This commit is contained in:
YC 2024-06-05 15:28:43 +08:00 committed by GitHub
parent 80a87f36ea
commit 9f8ca75a81
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor):
documents = [] documents = []
# loop over all sheets # loop over all sheets
for sheet in wb.sheets(): for sheet in wb.sheets():
for row_index, row in enumerate(sheet.get_rows(), start=1):
row_header = None row_header = None
for row_index, row in enumerate(sheet.get_rows(), start=1):
if self.is_blank_row(row): if self.is_blank_row(row):
continue continue
if row_header is None: if row_header is None:
@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor):
item_arr = [] item_arr = []
for index, cell in enumerate(row): for index, cell in enumerate(row):
txt_value = str(cell.value) txt_value = str(cell.value)
item_arr.append(f'{row_header[index].value}:{txt_value}') item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
item_str = "\n".join(item_arr) item_str = ",".join(item_arr)
document = Document(page_content=item_str, metadata={'source': self._file_path}) document = Document(page_content=item_str, metadata={'source': self._file_path})
documents.append(document) documents.append(document)
return documents return documents
@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor):
# transform each row into a Document # transform each row into a Document
for _, row in df.iterrows(): for _, row in df.iterrows():
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v)) item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
document = Document(page_content=item, metadata={'source': self._file_path}) document = Document(page_content=item, metadata={'source': self._file_path})
data.append(document) data.append(document)
return data return data