mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-15 01:56:01 +08:00
add xls file support (#3321)
This commit is contained in:
parent
42936fc917
commit
ad65c891e7
@ -4,12 +4,15 @@ from werkzeug.exceptions import Unauthorized
|
|||||||
|
|
||||||
if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
|
if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
|
||||||
from gevent import monkey
|
from gevent import monkey
|
||||||
|
|
||||||
monkey.patch_all()
|
monkey.patch_all()
|
||||||
# if os.environ.get("VECTOR_STORE") == 'milvus':
|
# if os.environ.get("VECTOR_STORE") == 'milvus':
|
||||||
import grpc.experimental.gevent
|
import grpc.experimental.gevent
|
||||||
|
|
||||||
grpc.experimental.gevent.init_gevent()
|
grpc.experimental.gevent.init_gevent()
|
||||||
|
|
||||||
import langchain
|
import langchain
|
||||||
|
|
||||||
langchain.verbose = True
|
langchain.verbose = True
|
||||||
|
|
||||||
import json
|
import json
|
||||||
@ -44,6 +47,7 @@ from services.account_service import AccountService
|
|||||||
# DO NOT REMOVE BELOW
|
# DO NOT REMOVE BELOW
|
||||||
from events import event_handlers
|
from events import event_handlers
|
||||||
from models import account, dataset, model, source, task, tool, tools, web
|
from models import account, dataset, model, source, task, tool, tools, web
|
||||||
|
|
||||||
# DO NOT REMOVE ABOVE
|
# DO NOT REMOVE ABOVE
|
||||||
|
|
||||||
|
|
||||||
@ -51,7 +55,7 @@ warnings.simplefilter("ignore", ResourceWarning)
|
|||||||
|
|
||||||
# fix windows platform
|
# fix windows platform
|
||||||
if os.name == "nt":
|
if os.name == "nt":
|
||||||
os.system('tzutil /s "UTC"')
|
os.system('tzutil /s "UTC"')
|
||||||
else:
|
else:
|
||||||
os.environ['TZ'] = 'UTC'
|
os.environ['TZ'] = 'UTC'
|
||||||
time.tzset()
|
time.tzset()
|
||||||
@ -60,6 +64,7 @@ else:
|
|||||||
class DifyApp(Flask):
|
class DifyApp(Flask):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
# -------------
|
# -------------
|
||||||
# Configuration
|
# Configuration
|
||||||
# -------------
|
# -------------
|
||||||
@ -67,6 +72,7 @@ class DifyApp(Flask):
|
|||||||
|
|
||||||
config_type = os.getenv('EDITION', default='SELF_HOSTED') # ce edition first
|
config_type = os.getenv('EDITION', default='SELF_HOSTED') # ce edition first
|
||||||
|
|
||||||
|
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
# Application Factory Function
|
# Application Factory Function
|
||||||
# ----------------------------
|
# ----------------------------
|
||||||
@ -192,7 +198,6 @@ def register_blueprints(app):
|
|||||||
app = create_app()
|
app = create_app()
|
||||||
celery = app.extensions["celery"]
|
celery = app.extensions["celery"]
|
||||||
|
|
||||||
|
|
||||||
if app.config['TESTING']:
|
if app.config['TESTING']:
|
||||||
print("App is running in TESTING mode")
|
print("App is running in TESTING mode")
|
||||||
|
|
||||||
|
@ -2,6 +2,7 @@
|
|||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
import xlrd
|
||||||
|
|
||||||
from core.rag.extractor.extractor_base import BaseExtractor
|
from core.rag.extractor.extractor_base import BaseExtractor
|
||||||
from core.rag.models.document import Document
|
from core.rag.models.document import Document
|
||||||
@ -27,10 +28,37 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
self._autodetect_encoding = autodetect_encoding
|
self._autodetect_encoding = autodetect_encoding
|
||||||
|
|
||||||
def extract(self) -> list[Document]:
|
def extract(self) -> list[Document]:
    """Parse the Excel file at ``self._file_path`` into documents.

    The file extension selects the format-specific extractor. It is compared
    case-insensitively, so ``REPORT.XLS`` is handled like ``report.xls``.

    :return: the documents produced by ``_extract4xls`` or ``_extract4xlsx``.
    :raises ValueError: if the path ends in neither ``.xls`` nor ``.xlsx``.
    """
    # Lower-case once so the suffix checks below ignore the extension's case.
    lowered_path = self._file_path.lower()
    if lowered_path.endswith('.xls'):
        return self._extract4xls()
    if lowered_path.endswith('.xlsx'):
        return self._extract4xlsx()
    # Falling off the end would silently return None, which contradicts the
    # declared list return type; fail with an explicit message instead.
    raise ValueError(f'Unsupported Excel file extension: {self._file_path}')
|
||||||
|
|
||||||
|
def _extract4xls(self) -> list[Document]:
    """Extract one document per data row from the ``.xls`` workbook.

    Within each sheet the first non-blank row is used as the header row.
    Every later non-blank row becomes a document whose text is one
    ``header:value`` line per cell, joined with newlines.

    :return: the documents collected from all sheets, in sheet and row order.
    """
    wb = xlrd.open_workbook(filename=self._file_path)
    documents = []
    # loop over all sheets
    for sheet in wb.sheets():
        # Reset the header once per sheet. Resetting it inside the row loop
        # made every non-blank row look like a header, so no document was
        # ever produced.
        row_header = None
        for row in sheet.get_rows():
            if self.is_blank_row(row):
                continue
            if row_header is None:
                # First non-blank row of the sheet: remember it as the header.
                row_header = row
                continue
            # Pair each cell with its header cell; zip stops at the shorter
            # row, so an over-long data row cannot index past the header.
            item_str = "\n".join(
                f'{header_cell.value}:{cell.value}'
                for header_cell, cell in zip(row_header, row)
            )
            document = Document(page_content=item_str, metadata={'source': self._file_path})
            documents.append(document)
    return documents
|
||||||
|
|
||||||
|
def _extract4xlsx(self) -> list[Document]:
|
||||||
"""Load from file path using Pandas."""
|
"""Load from file path using Pandas."""
|
||||||
data = []
|
data = []
|
||||||
|
# Read each worksheet of an Excel file using Pandas
|
||||||
# 使用 Pandas 读取 Excel 文件的每个工作表
|
|
||||||
xls = pd.ExcelFile(self._file_path)
|
xls = pd.ExcelFile(self._file_path)
|
||||||
for sheet_name in xls.sheet_names:
|
for sheet_name in xls.sheet_names:
|
||||||
df = pd.read_excel(xls, sheet_name=sheet_name)
|
df = pd.read_excel(xls, sheet_name=sheet_name)
|
||||||
@ -43,5 +71,18 @@ class ExcelExtractor(BaseExtractor):
|
|||||||
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
|
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
|
||||||
document = Document(page_content=item, metadata={'source': self._file_path})
|
document = Document(page_content=item, metadata={'source': self._file_path})
|
||||||
data.append(document)
|
data.append(document)
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@staticmethod
def is_blank_row(row):
    """
    Determine whether the specified row is blank.

    A cell counts as empty when its value is None, an empty string, or a
    string containing only whitespace. Any other value (including 0) makes
    the row non-blank.

    :param row: iterable of cell objects exposing a ``value`` attribute.
    :return: Returns True if the row is blank, False otherwise.
    """
    # Whitespace-only text is treated as empty so that a visually blank row
    # is not mistaken for a header or data row.
    return all(
        cell.value is None
        or (isinstance(cell.value, str) and not cell.value.strip())
        for cell in row
    )
|
||||||
|
@ -84,7 +84,7 @@ class ExtractProcessor:
|
|||||||
etl_type = current_app.config['ETL_TYPE']
|
etl_type = current_app.config['ETL_TYPE']
|
||||||
unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
|
unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
|
||||||
if etl_type == 'Unstructured':
|
if etl_type == 'Unstructured':
|
||||||
if file_extension == '.xlsx':
|
if file_extension == '.xlsx' or file_extension == '.xls':
|
||||||
extractor = ExcelExtractor(file_path)
|
extractor = ExcelExtractor(file_path)
|
||||||
elif file_extension == '.pdf':
|
elif file_extension == '.pdf':
|
||||||
extractor = PdfExtractor(file_path)
|
extractor = PdfExtractor(file_path)
|
||||||
@ -114,7 +114,7 @@ class ExtractProcessor:
|
|||||||
extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
|
extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
|
||||||
else TextExtractor(file_path, autodetect_encoding=True)
|
else TextExtractor(file_path, autodetect_encoding=True)
|
||||||
else:
|
else:
|
||||||
if file_extension == '.xlsx':
|
if file_extension == '.xlsx' or file_extension == '.xls':
|
||||||
extractor = ExcelExtractor(file_path)
|
extractor = ExcelExtractor(file_path)
|
||||||
elif file_extension == '.pdf':
|
elif file_extension == '.pdf':
|
||||||
extractor = PdfExtractor(file_path)
|
extractor = PdfExtractor(file_path)
|
||||||
|
@ -82,3 +82,4 @@ qrcode~=7.4.2
|
|||||||
azure-storage-blob==12.9.0
|
azure-storage-blob==12.9.0
|
||||||
azure-identity==1.15.0
|
azure-identity==1.15.0
|
||||||
lxml==5.1.0
|
lxml==5.1.0
|
||||||
|
xlrd~=2.0.1
|
||||||
|
@ -20,9 +20,10 @@ from services.errors.file import FileTooLargeError, UnsupportedFileTypeError
|
|||||||
IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
|
IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
|
||||||
IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
|
IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
|
||||||
|
|
||||||
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
|
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', 'docx', 'csv']
|
||||||
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
|
UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls',
|
||||||
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']
|
'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']
|
||||||
|
|
||||||
PREVIEW_WORDS_LIMIT = 3000
|
PREVIEW_WORDS_LIMIT = 3000
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user