From b5204111da0c5b81120374d30fa5e9a5b35221ae Mon Sep 17 00:00:00 2001 From: majian <46617237@qq.com> Date: Mon, 20 May 2024 13:14:17 +0800 Subject: [PATCH] Add UNSTRUCTURED_API_KEY env support (#4369) --- api/.env.example | 1 + api/config.py | 1 + api/core/rag/extractor/extract_processor.py | 3 ++- .../extractor/unstructured/unstructured_ppt_extractor.py | 6 ++++-- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/api/.env.example b/api/.env.example index e0f87d471a..05d4f4a530 100644 --- a/api/.env.example +++ b/api/.env.example @@ -144,6 +144,7 @@ NOTION_INTERNAL_SECRET=you-internal-secret ETL_TYPE=dify UNSTRUCTURED_API_URL= +UNSTRUCTURED_API_KEY= SSRF_PROXY_HTTP_URL= SSRF_PROXY_HTTPS_URL= diff --git a/api/config.py b/api/config.py index 10c0c7e878..4d1f905440 100644 --- a/api/config.py +++ b/api/config.py @@ -365,6 +365,7 @@ class Config: self.ETL_TYPE = get_env('ETL_TYPE') self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL') + self.UNSTRUCTURED_API_KEY = get_env('UNSTRUCTURED_API_KEY') self.BILLING_ENABLED = get_bool_env('BILLING_ENABLED') self.CAN_REPLACE_LOGO = get_bool_env('CAN_REPLACE_LOGO') diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py index a7adea8a05..093965e10e 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -96,6 +96,7 @@ class ExtractProcessor: file_extension = input_file.suffix.lower() etl_type = current_app.config['ETL_TYPE'] unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL'] + unstructured_api_key = current_app.config['UNSTRUCTURED_API_KEY'] if etl_type == 'Unstructured': if file_extension == '.xlsx' or file_extension == '.xls': extractor = ExcelExtractor(file_path) @@ -115,7 +116,7 @@ class ExtractProcessor: elif file_extension == '.eml': extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url) elif file_extension == '.ppt': - extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url) + extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key) elif file_extension == '.pptx': extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url) elif file_extension == '.xml': diff --git a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py index 6d3ffe6589..d354b593ed 100644 --- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py +++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py @@ -17,16 +17,18 @@ class UnstructuredPPTExtractor(BaseExtractor): def __init__( self, file_path: str, - api_url: str + api_url: str, + api_key: str ): """Initialize with file path.""" self._file_path = file_path self._api_url = api_url + self._api_key = api_key def extract(self) -> list[Document]: from unstructured.partition.api import partition_via_api - elements = partition_via_api(filename=self._file_path, api_url=self._api_url) + elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key) text_by_page = {} for element in elements: page = element.metadata.page_number