mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-06-04 11:14:10 +08:00
add xlsx support hyperlink extract (#6722)
This commit is contained in:
parent
5d77dc4f58
commit
cf258b7a67
@ -3,6 +3,7 @@ import os
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
from openpyxl import load_workbook
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -28,26 +29,48 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
self._autodetect_encoding = autodetect_encoding
|
self._autodetect_encoding = autodetect_encoding
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
    """Load an Excel file (.xls or .xlsx) and turn each non-empty row into a Document.

    Every row is rendered as ``"column":"value"`` pairs joined by ``;``;
    cells for which ``pd.notna`` is false are skipped. ``.xlsx`` files are
    read with openpyxl so that a cell carrying a hyperlink is emitted as a
    Markdown link ``[value](target)``. ``.xls`` files are read through
    pandas with the ``xlrd`` engine and carry no hyperlink information.

    Returns:
        One Document per row that is not entirely empty, with
        ``metadata['source']`` set to the file path.

    Raises:
        ValueError: If the file extension is neither ``.xlsx`` nor ``.xls``.
    """
    documents = []
    file_extension = os.path.splitext(self._file_path)[-1].lower()

    if file_extension == '.xlsx':
        wb = load_workbook(self._file_path, data_only=True)
        # Iterate worksheets only: chartsheets are listed in ``sheetnames``
        # but expose no cell values.
        for sheet in wb.worksheets:
            data = sheet.values
            # The first row is the header. An empty sheet yields nothing,
            # so skip it instead of letting StopIteration escape.
            cols = next(data, None)
            if cols is None:
                continue
            df = pd.DataFrame(data, columns=cols)
            # dropna keeps the original integer index, so ``index`` below
            # still identifies the row's position in the worksheet.
            df.dropna(how='all', inplace=True)

            for index, row in df.iterrows():
                page_content = []
                for col_index, (k, v) in enumerate(row.items()):
                    if pd.isna(v):
                        continue
                    # +2: one for the header row, one for 1-based rows;
                    # +1: worksheet columns are 1-based.
                    cell = sheet.cell(row=index + 2, column=col_index + 1)
                    if cell.hyperlink:
                        v = f"[{v}]({cell.hyperlink.target})"
                    page_content.append(f'"{k}":"{v}"')
                documents.append(Document(page_content=';'.join(page_content),
                                          metadata={'source': self._file_path}))

    elif file_extension == '.xls':
        # The context manager closes the underlying file handle when done.
        with pd.ExcelFile(self._file_path, engine='xlrd') as excel_file:
            for sheet_name in excel_file.sheet_names:
                df = excel_file.parse(sheet_name=sheet_name)
                df.dropna(how='all', inplace=True)

                for _, row in df.iterrows():
                    page_content = [f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v)]
                    documents.append(Document(page_content=';'.join(page_content),
                                              metadata={'source': self._file_path}))
    else:
        raise ValueError(f"Unsupported file extension: {file_extension}")

    return documents
|
||||||
|
2
api/poetry.lock
generated
2
api/poetry.lock
generated
@ -9543,4 +9543,4 @@ cffi = ["cffi (>=1.11)"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "6b7d8b1333ae9c71ba2e1c5800eecf1535ed3945cd55ebb1e253b7a29ba09559"
|
content-hash = "9619ddabdd67710981c13dcfa3ddae0a48497c9f694afc81b820e882440c1265"
|
||||||
|
@ -177,6 +177,7 @@ xinference-client = "0.9.4"
|
|||||||
yarl = "~1.9.4"
|
yarl = "~1.9.4"
|
||||||
zhipuai = "1.0.7"
|
zhipuai = "1.0.7"
|
||||||
rank-bm25 = "~0.2.2"
|
rank-bm25 = "~0.2.2"
|
||||||
|
openpyxl = "^3.1.5"
|
||||||
############################################################
|
############################################################
|
||||||
# Tool dependencies required by tool implementations
|
# Tool dependencies required by tool implementations
|
||||||
############################################################
|
############################################################
|
||||||
|
Loading…
x
Reference in New Issue
Block a user