import time

import requests


class FirecrawlApp:
    """Thin REST client for the Firecrawl API (scrape and crawl endpoints).

    Replaces the third-party ``firecrawl`` SDK so a custom (self-hosted)
    server can be targeted via ``base_url``.
    """

    def __init__(self, api_key=None, base_url=None):
        """
        :param api_key: Firecrawl API key. Required when using the hosted
            service (the default base URL); a self-hosted server may not
            need one.
        :param base_url: base URL of the Firecrawl server; defaults to the
            hosted service.
        :raises ValueError: if no API key is provided for the hosted service.
        """
        self.api_key = api_key
        self.base_url = base_url or 'https://api.firecrawl.dev'
        # Only the hosted service strictly requires a key; a self-hosted
        # deployment (custom base_url) may run unauthenticated.
        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
            raise ValueError('No API key provided')

    def scrape_url(self, url, params=None) -> dict:
        """Scrape a single URL synchronously.

        :param url: the URL to scrape.
        :param params: extra options merged into the request payload.
        :returns: the ``data`` object from the API response.
        :raises Exception: on any API-level or HTTP-level failure.
        """
        json_data = {'url': url}
        if params:
            json_data.update(params)
        response = requests.post(
            f'{self.base_url}/v0/scrape',
            # Reuse the shared header builder instead of duplicating it here.
            headers=self._prepare_headers(),
            json=json_data,
        )
        if response.status_code == 200:
            body = response.json()
            # The API can report failure in the JSON body even on HTTP 200;
            # .get() avoids a KeyError when 'success'/'error' are absent.
            if body.get('success'):
                return body['data']
            raise Exception(f'Failed to scrape URL. Error: {body.get("error", "Unknown error occurred")}')
        # Non-200: delegate to the shared error handler (always raises).
        self._handle_error(response, 'scrape URL')

    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
        """Start a crawl job and optionally wait for it to finish.

        :param url: the root URL to crawl.
        :param params: extra options merged into the request payload.
        :param wait_until_done: when True, poll until the job completes and
            return its data; otherwise return ``{'jobId': ...}`` immediately.
        :param timeout: poll interval in seconds (clamped to >= 2); this is
            NOT an overall deadline.
        :returns: the crawl result data, or ``{'jobId': ...}`` when not
            waiting. (The original ``-> str`` annotation was incorrect.)
        :raises Exception: if the job cannot be started or fails.
        """
        headers = self._prepare_headers()
        json_data = {'url': url}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
        if response.status_code != 200:
            self._handle_error(response, 'start crawl job')  # always raises
        job_id = response.json().get('jobId')
        if not wait_until_done:
            return {'jobId': job_id}
        return self._monitor_job_status(job_id, headers, timeout)

    def check_crawl_status(self, job_id) -> dict:
        """Return the raw status payload for a crawl job.

        :raises Exception: on a non-200 response.
        """
        headers = self._prepare_headers()
        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
            return response.json()
        self._handle_error(response, 'check crawl status')  # always raises

    def _prepare_headers(self):
        """Common JSON + bearer-auth headers for every API call."""
        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}',
        }

    def _post_request(self, url, data, headers):
        # NOTE(review): no network timeout is set; a hung server blocks
        # forever. Left unchanged to preserve long-running crawl behavior.
        return requests.post(url, headers=headers, json=data)

    def _get_request(self, url, headers):
        return requests.get(url, headers=headers)

    def _monitor_job_status(self, job_id, headers, timeout):
        """Poll the status endpoint until the job completes or fails.

        :param timeout: poll interval in seconds (clamped to >= 2).
        :returns: the job's ``data`` payload once completed.
        :raises Exception: if the job fails, is stopped, or completes with
            no data.
        """
        # Clamp once, outside the loop: never poll faster than every 2s.
        poll_interval = max(timeout, 2)
        while True:
            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code != 200:
                self._handle_error(status_response, 'check crawl status')  # always raises
            status_data = status_response.json()
            status = status_data['status']
            if status == 'completed':
                if 'data' in status_data:
                    return status_data['data']
                raise Exception('Crawl job completed but no data was returned')
            if status in ('active', 'paused', 'pending', 'queued'):
                time.sleep(poll_interval)  # wait before checking again
            else:
                raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')

    def _handle_error(self, response, action):
        """Raise a descriptive exception for a failed API response.

        402/409/500 carry a JSON ``error`` field; other codes get a generic
        message. This method never returns.
        """
        if response.status_code in (402, 409, 500):
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
        raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')