mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-11 13:59:04 +08:00
The firecrawl tool now supports self-hosting (#5528)
Co-authored-by: takatost <takatost@gmail.com>
This commit is contained in:
parent
023dba9475
commit
cdc2a6f637
@ -25,3 +25,12 @@ credentials_for_provider:
|
||||
en_US: Get your Firecrawl API key from your Firecrawl account settings.
|
||||
zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
|
||||
url: https://www.firecrawl.dev/account
|
||||
base_url:
|
||||
type: text-input
|
||||
required: false
|
||||
label:
|
||||
en_US: Firecrawl server's Base URL
|
||||
pt_BR: Firecrawl server's Base URL
|
||||
placeholder:
|
||||
en_US: https://www.firecrawl.dev
|
||||
pt_BR: https://www.firecrawl.dev
|
||||
|
98
api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
Normal file
98
api/core/tools/provider/builtin/firecrawl/firecrawl_appx.py
Normal file
@ -0,0 +1,98 @@
|
||||
import time
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class FirecrawlApp:
    """Small HTTP client for the Firecrawl ``/v0`` scrape and crawl endpoints.

    The client talks either to the hosted service (``DEFAULT_BASE_URL``) or to
    a self-hosted server given via ``base_url``. An API key is mandatory only
    for the hosted service.
    """

    DEFAULT_BASE_URL = 'https://api.firecrawl.dev'

    # Status codes whose response body is expected to carry an 'error' field.
    _DETAILED_ERROR_CODES = (402, 409, 500)
    # Crawl job states that mean "not finished yet, poll again".
    _IN_PROGRESS_STATUSES = ('active', 'paused', 'pending', 'queued')
    # Lower bound (seconds) for the pause between two status polls.
    _MIN_POLL_INTERVAL = 2
    _UNKNOWN_ERROR = 'Unknown error occurred'

    def __init__(self, api_key=None, base_url=None):
        """Store the credentials and validate them.

        Args:
            api_key: Firecrawl API key. May be omitted for a self-hosted server.
            base_url: Root URL of the Firecrawl server. Empty/None selects the
                hosted service. Surrounding whitespace and trailing slashes
                are dropped so that endpoint URLs never contain ``//v0``.

        Raises:
            ValueError: No API key was given and the hosted service is used.
        """
        # Treat an empty string like "not provided": credentials coming from
        # an optional form field may be '' rather than None.
        self.api_key = api_key or None
        normalized = (base_url or '').strip().rstrip('/')
        self.base_url = normalized or self.DEFAULT_BASE_URL
        if self.api_key is None and self.base_url == self.DEFAULT_BASE_URL:
            raise ValueError('No API key provided')

    def scrape_url(self, url, params=None) -> dict:
        """Scrape a single page and return the ``data`` part of the response.

        Args:
            url: Address of the page to scrape.
            params: Optional extra fields merged into the JSON request body.

        Returns:
            The value stored under ``data`` in the server response.

        Raises:
            Exception: The server answered with a non-200 status, or with a
                200 response that does not report success.
        """
        json_data = {'url': url}
        if params:
            json_data.update(params)
        response = self._post_request(
            f'{self.base_url}/v0/scrape',
            json_data,
            self._prepare_headers(),
        )
        status_code = response.status_code
        if status_code == 200:
            payload = response.json()
            if payload.get('success'):
                return payload['data']
            error_message = payload.get('error', self._UNKNOWN_ERROR)
            raise Exception(f'Failed to scrape URL. Error: {error_message}')
        if status_code in self._DETAILED_ERROR_CODES:
            error_message = self._error_message(response)
            raise Exception(f'Failed to scrape URL. Status code: {status_code}. Error: {error_message}')
        raise Exception(f'Failed to scrape URL. Status code: {status_code}')

    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2):
        """Start a crawl job and optionally wait for its result.

        Args:
            url: Start address of the crawl.
            params: Optional extra fields merged into the JSON request body.
            wait_until_done: When true, poll the job until it finishes.
            timeout: Seconds to sleep between two status polls (values below
                2 are raised to 2). Despite its name this is a poll interval,
                not an overall deadline.

        Returns:
            The crawl ``data`` when ``wait_until_done`` is true, otherwise
            ``{'jobId': <id>}``.

        Raises:
            Exception: The job could not be started, no job id was returned,
                or the job ended in a state other than ``completed``.
        """
        headers = self._prepare_headers()
        json_data = {'url': url}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
        if response.status_code != 200:
            self._handle_error(response, 'start crawl job')  # always raises
        job_id = response.json().get('jobId')
        if not job_id:
            # Without an id every later status request would target ".../status/None".
            raise Exception('Failed to start crawl job. No job ID was returned')
        if wait_until_done:
            return self._monitor_job_status(job_id, headers, timeout)
        return {'jobId': job_id}

    def check_crawl_status(self, job_id) -> dict:
        """Return the decoded JSON status document of a crawl job.

        Raises:
            Exception: The server answered with a non-200 status.
        """
        response = self._get_request(
            f'{self.base_url}/v0/crawl/status/{job_id}',
            self._prepare_headers(),
        )
        if response.status_code != 200:
            self._handle_error(response, 'check crawl status')  # always raises
        return response.json()

    def _prepare_headers(self):
        """Build the request headers; Authorization is sent only with a key."""
        headers = {'Content-Type': 'application/json'}
        if self.api_key:
            # A key-less self-hosted setup must not send the literal "Bearer None".
            headers['Authorization'] = f'Bearer {self.api_key}'
        return headers

    def _post_request(self, url, data, headers):
        """POST ``data`` as JSON and return the raw response object."""
        return requests.post(url, headers=headers, json=data)

    def _get_request(self, url, headers):
        """GET ``url`` and return the raw response object."""
        return requests.get(url, headers=headers)

    def _monitor_job_status(self, job_id, headers, timeout):
        """Poll a crawl job until it completes and return its ``data``.

        Loops until the job reports ``completed`` or leaves the in-progress
        states; there is no overall deadline.

        Raises:
            Exception: A status request failed, the job finished without
                data, or the job reported a state that is neither completed
                nor in progress.
        """
        poll_interval = max(timeout, self._MIN_POLL_INTERVAL)
        status_url = f'{self.base_url}/v0/crawl/status/{job_id}'
        while True:
            status_response = self._get_request(status_url, headers)
            if status_response.status_code != 200:
                self._handle_error(status_response, 'check crawl status')  # always raises
            status_data = status_response.json()
            status = status_data.get('status')
            if status == 'completed':
                if 'data' in status_data:
                    return status_data['data']
                raise Exception('Crawl job completed but no data was returned')
            if status not in self._IN_PROGRESS_STATUSES:
                raise Exception(f'Crawl job failed or was stopped. Status: {status}')
            time.sleep(poll_interval)  # wait before checking again

    def _error_message(self, response):
        """Extract the ``error`` field of a response without ever raising.

        Falls back to a generic text when the body is not JSON or is not a
        JSON object, so the caller can still report the HTTP status.
        """
        try:
            body = response.json()
        except ValueError:
            return self._UNKNOWN_ERROR
        if isinstance(body, dict):
            return body.get('error', self._UNKNOWN_ERROR)
        return self._UNKNOWN_ERROR

    def _handle_error(self, response, action):
        """Raise an ``Exception`` describing a failed request; never returns.

        Args:
            response: The non-200 response object.
            action: Short description of the attempted operation, used in the
                error text.
        """
        status_code = response.status_code
        if status_code in self._DETAILED_ERROR_CODES:
            error_message = self._error_message(response)
            raise Exception(f'Failed to {action}. Status code: {status_code}. Error: {error_message}')
        raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {status_code}')
|
@ -1,15 +1,14 @@
|
||||
from typing import Any, Union
|
||||
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
from core.tools.entities.tool_entities import ToolInvokeMessage
|
||||
from core.tools.provider.builtin.firecrawl.firecrawl_appx import FirecrawlApp
|
||||
from core.tools.tool.builtin_tool import BuiltinTool
|
||||
|
||||
|
||||
class CrawlTool(BuiltinTool):
|
||||
def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
|
||||
# initialize the app object with the api key
|
||||
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'])
|
||||
app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])
|
||||
|
||||
options = {
|
||||
'crawlerOptions': {
|
||||
|
Loading…
x
Reference in New Issue
Block a user