Removed firecrawl-py, fixed and improved firecrawl tool (#5896)

Co-authored-by: -LAN- <laipz8200@outlook.com>
parent bf2268b0af, commit cc63af8e72
@@ -6,7 +6,7 @@ identity:
     zh_CN: Firecrawl
   description:
     en_US: Firecrawl API integration for web crawling and scraping.
-    zh_CN: Firecrawl API 集成,用于网页爬取和数据抓取。
+    zh_Hans: Firecrawl API 集成,用于网页爬取和数据抓取。
   icon: icon.svg
   tags:
     - search
@@ -17,20 +17,22 @@ credentials_for_provider:
     required: true
     label:
       en_US: Firecrawl API Key
-      zh_CN: Firecrawl API 密钥
+      zh_Hans: Firecrawl API 密钥
     placeholder:
       en_US: Please input your Firecrawl API key
-      zh_CN: 请输入您的 Firecrawl API 密钥
+      zh_Hans: 请输入您的 Firecrawl API 密钥,如果是自托管版本,可以随意填写密钥
     help:
-      en_US: Get your Firecrawl API key from your Firecrawl account settings.
-      zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
+      en_US: Get your Firecrawl API key from your Firecrawl account settings. If you are using a self-hosted version, you may enter any key at your convenience.
+      zh_Hans: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。如果是自托管版本,可以随意填写密钥。
     url: https://www.firecrawl.dev/account
   base_url:
     type: text-input
     required: false
     label:
       en_US: Firecrawl server's Base URL
+      zh_Hans: Firecrawl服务器的API URL
       pt_BR: Firecrawl server's Base URL
     placeholder:
       en_US: https://www.firecrawl.dev
+      zh_Hans: https://www.firecrawl.dev
       pt_BR: https://www.firecrawl.dev
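These two credentials map directly onto the rewritten client below: firecrawl_api_key becomes the Bearer token and the optional base_url redirects requests to a self-hosted server. A minimal sketch of how a self-hosted setup would be wired up; the key and URL values here are illustrative placeholders, not part of this commit:

    # Illustrative values only; per the help text above, a self-hosted
    # Firecrawl accepts any non-empty key.
    credentials = {
        'firecrawl_api_key': 'placeholder-key',
        'base_url': 'http://localhost:3002',  # assumed self-hosted address
    }

    app = FirecrawlApp(
        api_key=credentials['firecrawl_api_key'],
        # Falls back to https://api.firecrawl.dev when base_url is empty.
        base_url=credentials.get('base_url') or None,
    )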
@@ -1,98 +1,93 @@
 import time
+from collections.abc import Mapping
+from typing import Any

 import requests
+from requests.exceptions import HTTPError


 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
         self.base_url = base_url or 'https://api.firecrawl.dev'
-        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
-            raise ValueError('No API key provided')
+        if not self.api_key:
+            raise ValueError("API key is required")

-    def scrape_url(self, url, params=None) -> dict:
+    def _prepare_headers(self, idempotency_key: str | None = None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.base_url}/v0/scrape',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] == True:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-
-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers

-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
-        headers = self._prepare_headers()
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                if i < retries - 1:
+                    time.sleep(backoff_factor * (2 ** i))
+                else:
+                    raise
+        return None

-    def check_crawl_status(self, job_id) -> dict:
-        headers = self._prepare_headers()
-        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
+    def scrape_url(self, url: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/scrape'
+        headers = self._prepare_headers()
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to scrape URL after multiple retries")
+        return response

-    def _prepare_headers(self):
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+    def search(self, query: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/search'
+        headers = self._prepare_headers()
+        data = {'query': query, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to perform search after multiple retries")
+        return response

-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def crawl_url(
+        self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+    ):
+        endpoint = f'{self.base_url}/v0/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to initiate crawl after multiple retries")
+        job_id: str = response['jobId']
+        if wait:
+            return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
+        return job_id

-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def check_crawl_status(self, job_id: str):
+        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
+        return response

-    def _monitor_job_status(self, job_id, headers, timeout):
+    def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
-            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            status = self.check_crawl_status(job_id)
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise HTTPError(f'Job {job_id} failed: {status["error"]}')
+            time.sleep(poll_interval)
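The net effect of the rewrite: every endpoint now goes through a single retrying _request helper (up to three attempts with exponential backoff, sleeping 0.3 s then 0.6 s before re-raising on the last failure), and crawl_url no longer blocks by default. A rough usage sketch, assuming a reachable server and a valid key; all values are placeholders:

    # Placeholder credentials; substitute real ones.
    app = FirecrawlApp(api_key='fc-placeholder', base_url=None)

    # Non-blocking (the new default): returns the job id from POST /v0/crawl.
    # Extra kwargs are merged into the JSON body alongside 'url'.
    job_id = app.crawl_url('https://example.com')
    print(app.check_crawl_status(job_id))

    # Blocking: polls /v0/crawl/status/<job_id> every 5 s until the job is
    # 'completed' (returns the status payload) or 'failed' (raises HTTPError).
    result = app.crawl_url('https://example.com', wait=True, poll_interval=5)

Note that the default flipped relative to the old client: wait_until_done=True used to block implicitly, while the new client returns the job id unless wait=True is passed, so callers such as the crawl tool below must opt in explicitly.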
@@ -1,3 +1,4 @@
+import json
 from typing import Any, Union

 from core.tools.entities.tool_entities import ToolInvokeMessage
@@ -7,7 +8,6 @@ from core.tools.tool.builtin_tool import BuiltinTool

 class CrawlTool(BuiltinTool):
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
-        # initialize the app object with the api key
         app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'], base_url=self.runtime.credentials['base_url'])

         options = {
@@ -21,29 +21,16 @@ class CrawlTool(BuiltinTool):
             }
         }

-        # crawl the url
         crawl_result = app.crawl_url(
             url=tool_parameters['url'],
             params=options,
-            wait_until_done=True,
+            wait=True
         )

-        # reformat crawl result
-        crawl_output = "**Crawl Result**\n\n"
-        try:
-            for result in crawl_result:
-                crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
-                crawl_output += f"**- Description:** {result.get('metadata', {}).get('description', '')}\n"
-                crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
-                crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
-                crawl_output += "---\n\n"
-        except Exception as e:
-            crawl_output += f"An error occurred: {str(e)}\n"
-            crawl_output += f"**- Title:** {result.get('metadata', {}).get('title', '')}\n"
-            crawl_output += f"**- Description:** {result.get('metadata', {}).get('description','')}\n"
-            crawl_output += f"**- URL:** {result.get('metadata', {}).get('ogUrl', '')}\n\n"
-            crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
-            crawl_output += "---\n\n"
+        if not isinstance(crawl_result, str):
+            crawl_result = json.dumps(crawl_result, ensure_ascii=False, indent=4)

-        return self.create_text_message(crawl_output)
+        if not crawl_result:
+            return self.create_text_message("Crawl request failed.")
+
+        return self.create_text_message(crawl_result)
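With the hand-rolled markdown loop gone, the tool simply serializes whatever crawl_url(wait=True) returns. A small illustration of that serialization step; the payload shape here is an invented assumption, not taken from this commit:

    import json

    # Invented example of a completed crawl status payload.
    crawl_result = {
        'status': 'completed',
        'data': [
            {
                'markdown': '# Example Domain',
                'metadata': {'title': 'Example Domain', 'ogUrl': 'https://example.com'},
            }
        ],
    }

    # Same call the tool now makes before wrapping the result in a text message.
    print(json.dumps(crawl_result, ensure_ascii=False, indent=4))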
api/poetry.lock (generated, 16 lines changed)

@@ -2083,20 +2083,6 @@ files = [
     {file = "filetype-1.2.0.tar.gz", hash = "sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb"},
 ]

-[[package]]
-name = "firecrawl-py"
-version = "0.0.5"
-description = "Python SDK for Firecrawl API"
-optional = false
-python-versions = "*"
-files = [
-    {file = "firecrawl-py-0.0.5.tar.gz", hash = "sha256:3d1cc30b7d86c12aa06e6434ebb526072cd70ab9a0c8b145008efe044a1cd09c"},
-    {file = "firecrawl_py-0.0.5-py3-none-any.whl", hash = "sha256:476694345141c0145a1bee9c01a8ad0103f75892c12a122dc511a3adad0785e7"},
-]
-
-[package.dependencies]
-requests = "*"
-
 [[package]]
 name = "flask"
 version = "3.0.3"
@@ -9095,4 +9081,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "90f0e77567fbe5100d15bf2bc9472007aafc53c2fd594b6a90dd8455dea58582"
+content-hash = "420c866aaff914d48c00c443a59f181c778690c24f81a955b1f970729bb441b7"
@@ -115,7 +115,6 @@ chardet = "~5.1.0"
 cohere = "~5.2.4"
 cos-python-sdk-v5 = "1.9.30"
 dashscope = { version = "~1.17.0", extras = ["tokenizer"] }
-firecrawl-py = "0.0.5"
 flask = "~3.0.1"
 flask-compress = "~1.14"
 flask-cors = "~4.0.0"