mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-15 06:35:53 +08:00
Fix a bug in header-row handling when parsing xls files, and tune the xls/xlsx parsing output to be more structured (#3600)
This commit is contained in:
parent
80a87f36ea
commit
9f8ca75a81
@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
documents = []
|
documents = []
|
||||||
# loop over all sheets
|
# loop over all sheets
|
||||||
for sheet in wb.sheets():
|
for sheet in wb.sheets():
|
||||||
|
row_header = None
|
||||||
for row_index, row in enumerate(sheet.get_rows(), start=1):
|
for row_index, row in enumerate(sheet.get_rows(), start=1):
|
||||||
row_header = None
|
|
||||||
if self.is_blank_row(row):
|
if self.is_blank_row(row):
|
||||||
continue
|
continue
|
||||||
if row_header is None:
|
if row_header is None:
|
||||||
@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
item_arr = []
|
item_arr = []
|
||||||
for index, cell in enumerate(row):
|
for index, cell in enumerate(row):
|
||||||
txt_value = str(cell.value)
|
txt_value = str(cell.value)
|
||||||
item_arr.append(f'{row_header[index].value}:{txt_value}')
|
item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
|
||||||
item_str = "\n".join(item_arr)
|
item_str = ",".join(item_arr)
|
||||||
document = Document(page_content=item_str, metadata={'source': self._file_path})
|
document = Document(page_content=item_str, metadata={'source': self._file_path})
|
||||||
documents.append(document)
|
documents.append(document)
|
||||||
return documents
|
return documents
|
||||||
@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
|
|
||||||
# transform each row into a Document
|
# transform each row into a Document
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
|
item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
|
||||||
document = Document(page_content=item, metadata={'source': self._file_path})
|
document = Document(page_content=item, metadata={'source': self._file_path})
|
||||||
data.append(document)
|
data.append(document)
|
||||||
return data
|
return data
|
||||||
|
Loading…
x
Reference in New Issue
Block a user