Add new tool: Firecrawl (#3819)

Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: Yeuoly <admin@srmxy.cn>
This commit is contained in:
Richards Tu 2024-04-29 14:20:36 +08:00 committed by GitHub
parent 8f2ae51fe5
commit f26ad16af7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 180 additions and 1 deletions

View File

@ -0,0 +1,3 @@
<svg xmlns="http://www.w3.org/2000/svg" width="111" height="111" viewBox="0 0 111 111" fill="none">
<text x="0" y="90" font-family="Verdana" font-size="85" fill="black">🔥</text>
</svg>

After

Width:  |  Height:  |  Size: 193 B

View File

@ -0,0 +1,23 @@
from core.tools.errors import ToolProviderCredentialValidationError
from core.tools.provider.builtin.firecrawl.tools.crawl import CrawlTool
from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
class FirecrawlProvider(BuiltinToolProviderController):
    def _validate_credentials(self, credentials: dict) -> None:
        """Validate the Firecrawl API key by running a minimal one-page probe crawl.

        Raises ToolProviderCredentialValidationError if the probe fails for any reason.
        """
        # Cheapest possible request: a single page, main content only.
        probe_parameters = {
            "url": "https://example.com",
            "includes": '',
            "excludes": '',
            "limit": 1,
            "onlyMainContent": True,
        }
        try:
            probe_tool = CrawlTool().fork_tool_runtime(meta={"credentials": credentials})
            probe_tool.invoke(user_id='', tool_parameters=probe_parameters)
        except Exception as e:
            # Any failure (auth, network, SDK) is surfaced as a credential validation error.
            raise ToolProviderCredentialValidationError(str(e))

View File

@ -0,0 +1,24 @@
identity:
author: Richards Tu
name: firecrawl
label:
en_US: Firecrawl
zh_CN: Firecrawl
description:
en_US: Firecrawl API integration for web crawling and scraping.
zh_CN: Firecrawl API 集成,用于网页爬取和数据抓取。
icon: icon.svg
credentials_for_provider:
firecrawl_api_key:
type: secret-input
required: true
label:
en_US: Firecrawl API Key
zh_CN: Firecrawl API 密钥
placeholder:
en_US: Please input your Firecrawl API key
zh_CN: 请输入您的 Firecrawl API 密钥
help:
en_US: Get your Firecrawl API key from your Firecrawl account settings.
zh_CN: 从您的 Firecrawl 账户设置中获取 Firecrawl API 密钥。
url: https://www.firecrawl.dev/account

View File

@ -0,0 +1,50 @@
from typing import Any, Union
from firecrawl import FirecrawlApp
from core.tools.entities.tool_entities import ToolInvokeMessage
from core.tools.tool.builtin_tool import BuiltinTool
class CrawlTool(BuiltinTool):
    def _invoke(self, user_id: str,
                tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Crawl a website via the Firecrawl API and return the pages as one markdown text message.

        Expected tool_parameters (see crawl.yaml): url (required), includes/excludes
        (comma-separated pattern strings), limit (int, default 5), onlyMainContent (bool).
        """
        # Initialize the SDK client with the configured API key.
        app = FirecrawlApp(api_key=self.runtime.credentials['firecrawl_api_key'])

        # Comma-separated pattern strings become lists; empty or missing values become [].
        options = {
            'crawlerOptions': {
                'excludes': tool_parameters.get('excludes', '').split(',') if tool_parameters.get('excludes') else [],
                'includes': tool_parameters.get('includes', '').split(',') if tool_parameters.get('includes') else [],
                'limit': tool_parameters.get('limit', 5)
            },
            'pageOptions': {
                'onlyMainContent': tool_parameters.get('onlyMainContent', False)
            }
        }

        # Synchronous crawl: block until all pages are fetched.
        crawl_result = app.crawl_url(
            url=tool_parameters['url'],
            params=options,
            wait_until_done=True,
        )

        # Reformat the crawl result into a single markdown document.
        # BUGFIX: the previous version's `except` handler referenced `result`
        # (unbound when crawl_result is None/empty) and duplicated the exact
        # statements that had just raised, so it could never recover. Guard
        # against a None result instead and let real errors propagate.
        crawl_output = "**Crawl Result**\n\n"
        for result in crawl_result or []:
            metadata = result.get('metadata', {})
            crawl_output += f"**- Title:** {metadata.get('title', '')}\n"
            crawl_output += f"**- Description:** {metadata.get('description', '')}\n"
            crawl_output += f"**- URL:** {metadata.get('ogUrl', '')}\n\n"
            crawl_output += f"**- Web Content:**\n{result.get('markdown', '')}\n\n"
            crawl_output += "---\n\n"
        return self.create_text_message(crawl_output)

View File

@ -0,0 +1,78 @@
identity:
name: crawl
author: Richards Tu
label:
en_US: Crawl
zh_Hans: 爬取
description:
human:
en_US: Extract data from a website by crawling through a URL.
zh_Hans: 通过URL从网站中提取数据。
llm: This tool initiates a web crawl to extract data from a specified URL. It allows configuring crawler options such as including or excluding URL patterns, generating alt text for images using LLMs (paid plan required), limiting the maximum number of pages to crawl, and returning only the main content of the page. The tool can return either a list of crawled documents or a list of URLs based on the provided options.
parameters:
- name: url
type: string
required: true
label:
en_US: URL to crawl
zh_Hans: 要爬取的URL
human_description:
en_US: The URL of the website to crawl and extract data from.
zh_Hans: 要爬取并提取数据的网站URL。
llm_description: The URL of the website that needs to be crawled. This is a required parameter.
form: llm
- name: includes
type: string
required: false
label:
en_US: URL patterns to include
zh_Hans: 要包含的URL模式
human_description:
en_US: Specify URL patterns to include during the crawl. Only pages matching these patterns will be crawled, you can use ',' to separate multiple patterns.
      zh_Hans: 指定爬取过程中要包含的URL模式。只有与这些模式匹配的页面才会被爬取,多个模式可用','分隔。
form: form
default: ''
- name: excludes
type: string
required: false
label:
en_US: URL patterns to exclude
zh_Hans: 要排除的URL模式
human_description:
en_US: Specify URL patterns to exclude during the crawl. Pages matching these patterns will be skipped, you can use ',' to separate multiple patterns.
zh_Hans: 指定爬取过程中要排除的URL模式。匹配这些模式的页面将被跳过。
form: form
default: 'blog/*'
- name: limit
type: number
required: false
label:
en_US: Maximum number of pages to crawl
zh_Hans: 最大爬取页面数
human_description:
en_US: Specify the maximum number of pages to crawl. The crawler will stop after reaching this limit.
zh_Hans: 指定要爬取的最大页面数。爬虫将在达到此限制后停止。
form: form
min: 1
max: 20
default: 5
- name: onlyMainContent
type: boolean
required: false
label:
en_US: Only return the main content of the page
zh_Hans: 仅返回页面的主要内容
human_description:
en_US: If enabled, the crawler will only return the main content of the page, excluding headers, navigation, footers, etc.
zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
form: form
options:
- value: true
label:
en_US: Yes
        zh_Hans: 是
- value: false
label:
en_US: No
        zh_Hans: 否
default: false

View File

@ -81,4 +81,5 @@ lxml==5.1.0
xlrd~=2.0.1
pydantic~=1.10.0
pgvecto-rs==0.1.4
firecrawl-py==0.0.5
oss2==2.15.0