Add UNSTRUCTURED_API_KEY env support (#4369)

This commit is contained in:
majian 2024-05-20 13:14:17 +08:00 committed by GitHub
parent 3a51f2a778
commit b5204111da
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 8 additions and 3 deletions

View File

@@ -144,6 +144,7 @@ NOTION_INTERNAL_SECRET=you-internal-secret
ETL_TYPE=dify
UNSTRUCTURED_API_URL=
UNSTRUCTURED_API_KEY=
SSRF_PROXY_HTTP_URL=
SSRF_PROXY_HTTPS_URL=

View File

@@ -365,6 +365,7 @@ class Config:
self.ETL_TYPE = get_env('ETL_TYPE')
self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')
self.UNSTRUCTURED_API_KEY = get_env('UNSTRUCTURED_API_KEY')
self.BILLING_ENABLED = get_bool_env('BILLING_ENABLED')
self.CAN_REPLACE_LOGO = get_bool_env('CAN_REPLACE_LOGO')

View File

@@ -96,6 +96,7 @@ class ExtractProcessor:
file_extension = input_file.suffix.lower()
etl_type = current_app.config['ETL_TYPE']
unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
unstructured_api_key = current_app.config['UNSTRUCTURED_API_KEY']
if etl_type == 'Unstructured':
if file_extension == '.xlsx' or file_extension == '.xls':
extractor = ExcelExtractor(file_path)
@@ -115,7 +116,7 @@ class ExtractProcessor:
elif file_extension == '.eml':
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url)
elif file_extension == '.ppt':
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url)
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == '.pptx':
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
elif file_extension == '.xml':

View File

@@ -17,16 +17,18 @@ class UnstructuredPPTExtractor(BaseExtractor):
def __init__(
self,
file_path: str,
api_url: str
api_url: str,
api_key: str
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key
def extract(self) -> list[Document]:
from unstructured.partition.api import partition_via_api
elements = partition_via_api(filename=self._file_path, api_url=self._api_url)
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
text_by_page = {}
for element in elements:
page = element.metadata.page_number