# Methods of ExcelExtractor introduced by this change (xls support).

def extract(self) -> list[Document]:
    """Parse the Excel file at ``self._file_path`` into documents.

    The parser is selected from the file-name suffix, compared
    case-insensitively so that ``REPORT.XLS`` is handled like ``report.xls``:
    ``.xls`` is delegated to ``_extract4xls`` and ``.xlsx`` to
    ``_extract4xlsx``.

    :return: the list produced by the selected parser.
    :raises ValueError: if the path ends with neither ``.xls`` nor ``.xlsx``
        (previously this case silently returned ``None``).
    """
    lowered_path = self._file_path.lower()
    if lowered_path.endswith('.xls'):
        return self._extract4xls()
    if lowered_path.endswith('.xlsx'):
        return self._extract4xlsx()
    raise ValueError(f'Unsupported Excel file extension: {self._file_path}')


def _extract4xls(self) -> list[Document]:
    """Load a legacy ``.xls`` workbook with xlrd, one document per data row.

    For every sheet, the first non-blank row is taken as the header row.
    Each later non-blank row becomes one ``Document`` whose content is the
    ``header:value`` pairs of that row joined with newlines. Blank rows are
    skipped.

    :return: documents for all data rows of all sheets, in sheet/row order.
    """
    wb = xlrd.open_workbook(filename=self._file_path)
    documents = []
    # loop over all sheets
    for sheet in wb.sheets():
        # The header is tracked per sheet. It must be reset here and NOT
        # inside the row loop, otherwise every row is consumed as a header
        # and no document is ever emitted.
        row_header = None
        for row in sheet.get_rows():
            if self.is_blank_row(row):
                continue
            if row_header is None:
                row_header = row
                continue
            item_str = "\n".join(
                f'{header.value}:{str(cell.value)}'
                for header, cell in zip(row_header, row)
            )
            documents.append(
                Document(page_content=item_str, metadata={'source': self._file_path})
            )
    return documents


@staticmethod
def is_blank_row(row):
    """Determine whether the given row is blank.

    :param row: iterable of cell objects exposing a ``value`` attribute.
    :return: True if every cell's value is ``None`` or ``''`` (an empty
        row counts as blank), False as soon as one cell holds anything else.
    """
    return all(cell.value is None or cell.value == '' for cell in row)
# Upload extensions accepted by default (lower-case, no leading dot).
ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', 'docx', 'csv']
# Superset of ALLOWED_EXTENSIONS. It is derived from that list so a new base
# extension only has to be added in one place.
# NOTE(review): presumably used when the Unstructured ETL backend is active - confirm against callers.
UNSTRUSTURED_ALLOWED_EXTENSIONS = [*ALLOWED_EXTENSIONS, 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']

PREVIEW_WORDS_LIMIT = 3000