From 588615b20e01d4c0e1fc1ec132eb0fcef5181e9d Mon Sep 17 00:00:00 2001
From: William Espegren <131612909+WilliamEspegren@users.noreply.github.com>
Date: Thu, 18 Jul 2024 08:29:33 +0200
Subject: [PATCH] feat: Spider web scraper & crawler tool (#5725)

---
 .../provider/builtin/spider/_assets/icon.svg  |   1 +
 .../tools/provider/builtin/spider/spider.py   |  14 ++
 .../tools/provider/builtin/spider/spider.yaml |  27 ++
 .../provider/builtin/spider/spiderApp.py      | 237 ++++++++++++++++++
 .../builtin/spider/tools/scraper_crawler.py   |  47 ++++
 .../builtin/spider/tools/scraper_crawler.yaml | 100 ++++++++
 6 files changed, 426 insertions(+)
 create mode 100644 api/core/tools/provider/builtin/spider/_assets/icon.svg
 create mode 100644 api/core/tools/provider/builtin/spider/spider.py
 create mode 100644 api/core/tools/provider/builtin/spider/spider.yaml
 create mode 100644 api/core/tools/provider/builtin/spider/spiderApp.py
 create mode 100644 api/core/tools/provider/builtin/spider/tools/scraper_crawler.py
 create mode 100644 api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml

diff --git a/api/core/tools/provider/builtin/spider/_assets/icon.svg b/api/core/tools/provider/builtin/spider/_assets/icon.svg
new file mode 100644
index 0000000000..604a09d01d
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/_assets/icon.svg
@@ -0,0 +1 @@
+Spider v1 Logo
diff --git a/api/core/tools/provider/builtin/spider/spider.py b/api/core/tools/provider/builtin/spider/spider.py
new file mode 100644
index 0000000000..6fa431b6bb
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/spider.py
@@ -0,0 +1,14 @@
+from typing import Any
+
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.spider.spiderApp import Spider
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class SpiderProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        try:
+            app = Spider(api_key=credentials["spider_api_key"])
+            app.scrape_url(url="https://spider.cloud")
+        except Exception as e:
+            raise ToolProviderCredentialValidationError(str(e))
diff --git a/api/core/tools/provider/builtin/spider/spider.yaml b/api/core/tools/provider/builtin/spider/spider.yaml
new file mode 100644
index 0000000000..45702c85dd
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/spider.yaml
@@ -0,0 +1,27 @@
+identity:
+  author: William Espegren
+  name: spider
+  label:
+    en_US: Spider
+    zh_CN: Spider
+  description:
+    en_US: Spider API integration, returning LLM-ready data by scraping & crawling websites.
+    zh_CN: Spider API 集成,通过爬取和抓取网站返回 LLM-ready 数据。
+  icon: icon.svg
+  tags:
+    - search
+    - utilities
+credentials_for_provider:
+  spider_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: Spider API Key
+      zh_CN: Spider API 密钥
+    placeholder:
+      en_US: Please input your Spider API key
+      zh_CN: 请输入您的 Spider API 密钥
+    help:
+      en_US: Get your Spider API key from your Spider dashboard
+      zh_CN: 从您的 Spider 仪表板中获取 Spider API 密钥。
+    url: https://spider.cloud/
diff --git a/api/core/tools/provider/builtin/spider/spiderApp.py b/api/core/tools/provider/builtin/spider/spiderApp.py
new file mode 100644
index 0000000000..82c0df19ca
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/spiderApp.py
@@ -0,0 +1,237 @@
+import os
+from typing import Literal, Optional, TypedDict
+
+import requests
+
+
+class RequestParamsDict(TypedDict, total=False):
+    url: Optional[str]
+    request: Optional[Literal["http", "chrome", "smart"]]
+    limit: Optional[int]
+    return_format: Optional[Literal["raw", "markdown", "html2text", "text", "bytes"]]
+    tld: Optional[bool]
+    depth: Optional[int]
+    cache: Optional[bool]
+    budget: Optional[dict[str, int]]
+    locale: Optional[str]
+    cookies: Optional[str]
+    stealth: Optional[bool]
+    headers: Optional[dict[str, str]]
+    anti_bot: Optional[bool]
+    metadata: Optional[bool]
+    viewport: Optional[dict[str, int]]
+    encoding: Optional[str]
+    subdomains: Optional[bool]
+    user_agent: Optional[str]
+    store_data: Optional[bool]
+    gpt_config: Optional[list[str]]
+    fingerprint: Optional[bool]
+    storageless: Optional[bool]
+    readability: Optional[bool]
+    proxy_enabled: Optional[bool]
+    respect_robots: Optional[bool]
+    query_selector: Optional[str]
+    full_resources: Optional[bool]
+    request_timeout: Optional[int]
+    run_in_background: Optional[bool]
+    skip_config_checks: Optional[bool]
+
+
+class Spider:
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Initialize the Spider with an API key.
+
+        :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
+        :raises ValueError: If no API key is provided.
+        """
+        self.api_key = api_key or os.getenv("SPIDER_API_KEY")
+        if self.api_key is None:
+            raise ValueError("No API key provided")
+
+    def api_post(
+        self,
+        endpoint: str,
+        data: dict,
+        stream: bool,
+        content_type: str = "application/json",
+    ):
+        """
+        Send a POST request to the specified API endpoint.
+
+        :param endpoint: The API endpoint to which the POST request is sent.
+        :param data: The data (dictionary) to be sent in the POST request.
+        :param stream: Boolean indicating if the response should be streamed.
+        :return: The JSON response or the raw response stream if stream is True.
+        """
+        headers = self._prepare_headers(content_type)
+        response = self._post_request(
+            f"https://api.spider.cloud/v1/{endpoint}", data, headers, stream
+        )
+
+        if stream:
+            return response
+        elif response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, f"post to {endpoint}")
+
+    def api_get(
+        self, endpoint: str, stream: bool, content_type: str = "application/json"
+    ):
+        """
+        Send a GET request to the specified endpoint.
+
+        :param endpoint: The API endpoint from which to retrieve data.
+        :return: The JSON decoded response.
+ """ + headers = self._prepare_headers(content_type) + response = self._get_request( + f"https://api.spider.cloud/v1/{endpoint}", headers, stream + ) + if response.status_code == 200: + return response.json() + else: + self._handle_error(response, f"get from {endpoint}") + + def get_credits(self): + """ + Retrieve the account's remaining credits. + + :return: JSON response containing the number of credits left. + """ + return self.api_get("credits", stream=False) + + def scrape_url( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Scrape data from the specified URL. + + :param url: The URL from which to scrape data. + :param params: Optional dictionary of additional parameters for the scrape request. + :return: JSON response containing the scraping results. + """ + + # Add { "return_format": "markdown" } to the params if not already present + if "return_format" not in params: + params["return_format"] = "markdown" + + # Set limit to 1 + params["limit"] = 1 + + return self.api_post( + "crawl", {"url": url, **(params or {})}, stream, content_type + ) + + def crawl_url( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Start crawling at the specified URL. + + :param url: The URL to begin crawling. + :param params: Optional dictionary with additional parameters to customize the crawl. + :param stream: Boolean indicating if the response should be streamed. Defaults to False. + :return: JSON response or the raw response stream if streaming enabled. + """ + + # Add { "return_format": "markdown" } to the params if not already present + if "return_format" not in params: + params["return_format"] = "markdown" + + return self.api_post( + "crawl", {"url": url, **(params or {})}, stream, content_type + ) + + def links( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Retrieve links from the specified URL. + + :param url: The URL from which to extract links. + :param params: Optional parameters for the link retrieval request. + :return: JSON response containing the links. + """ + return self.api_post( + "links", {"url": url, **(params or {})}, stream, content_type + ) + + def extract_contacts( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Extract contact information from the specified URL. + + :param url: The URL from which to extract contact information. + :param params: Optional parameters for the contact extraction. + :return: JSON response containing extracted contact details. + """ + return self.api_post( + "pipeline/extract-contacts", + {"url": url, **(params or {})}, + stream, + content_type, + ) + + def label( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Apply labeling to data extracted from the specified URL. + + :param url: The URL to label data from. + :param params: Optional parameters to guide the labeling process. + :return: JSON response with labeled data. 
+ """ + return self.api_post( + "pipeline/label", {"url": url, **(params or {})}, stream, content_type + ) + + def _prepare_headers(self, content_type: str = "application/json"): + return { + "Content-Type": content_type, + "Authorization": f"Bearer {self.api_key}", + "User-Agent": "Spider-Client/0.0.27", + } + + def _post_request(self, url: str, data, headers, stream=False): + return requests.post(url, headers=headers, json=data, stream=stream) + + def _get_request(self, url: str, headers, stream=False): + return requests.get(url, headers=headers, stream=stream) + + def _delete_request(self, url: str, headers, stream=False): + return requests.delete(url, headers=headers, stream=stream) + + def _handle_error(self, response, action): + if response.status_code in [402, 409, 500]: + error_message = response.json().get("error", "Unknown error occurred") + raise Exception( + f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}" + ) + else: + raise Exception( + f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}" + ) diff --git a/api/core/tools/provider/builtin/spider/tools/scraper_crawler.py b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.py new file mode 100644 index 0000000000..64bbcc10cc --- /dev/null +++ b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.py @@ -0,0 +1,47 @@ +from typing import Any, Union + +from core.tools.entities.tool_entities import ToolInvokeMessage +from core.tools.provider.builtin.spider.spiderApp import Spider +from core.tools.tool.builtin_tool import BuiltinTool + + +class ScrapeTool(BuiltinTool): + def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]: + # initialize the app object with the api key + app = Spider(api_key=self.runtime.credentials['spider_api_key']) + + url = tool_parameters['url'] + mode = tool_parameters['mode'] + + options = { + 'limit': tool_parameters.get('limit', 0), + 'depth': tool_parameters.get('depth', 0), + 'blacklist': tool_parameters.get('blacklist', '').split(',') if tool_parameters.get('blacklist') else [], + 'whitelist': tool_parameters.get('whitelist', '').split(',') if tool_parameters.get('whitelist') else [], + 'readability': tool_parameters.get('readability', False), + } + + result = "" + + try: + if mode == 'scrape': + scrape_result = app.scrape_url( + url=url, + params=options, + ) + + for i in scrape_result: + result += "URL: " + i.get('url', '') + "\n" + result += "CONTENT: " + i.get('content', '') + "\n\n" + elif mode == 'crawl': + crawl_result = app.crawl_url( + url=tool_parameters['url'], + params=options, + ) + for i in crawl_result: + result += "URL: " + i.get('url', '') + "\n" + result += "CONTENT: " + i.get('content', '') + "\n\n" + except Exception as e: + return self.create_text_message("An error occured", str(e)) + + return self.create_text_message(result) diff --git a/api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml new file mode 100644 index 0000000000..4ebdce61ff --- /dev/null +++ b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml @@ -0,0 +1,100 @@ +identity: + name: scraper_crawler + author: William Espegren + label: + en_US: Web Scraper & Crawler + zh_Hans: 网页抓取与爬虫 +description: + human: + en_US: A tool for scraping & crawling webpages. Input should be a url. + zh_Hans: 用于抓取和爬取网页的工具。输入应该是一个网址。 + llm: A tool for scraping & crawling webpages. 
+parameters:
+  - name: url
+    type: string
+    required: true
+    label:
+      en_US: URL
+      zh_Hans: 网址
+    human_description:
+      en_US: url to be scraped or crawled
+      zh_Hans: 要抓取或爬取的网址
+    llm_description: url to either be scraped or crawled
+    form: llm
+  - name: mode
+    type: select
+    required: true
+    options:
+      - value: scrape
        label:
+          en_US: scrape
+          zh_Hans: 抓取
+      - value: crawl
+        label:
+          en_US: crawl
+          zh_Hans: 爬取
+    default: crawl
+    label:
+      en_US: Mode
+      zh_Hans: 模式
+    human_description:
+      en_US: used for selecting to either scrape the website or crawl the entire website following subpages
+      zh_Hans: 用于选择抓取网站或爬取整个网站及其子页面
+    form: form
+  - name: limit
+    type: number
+    required: false
+    label:
+      en_US: Maximum number of pages to crawl
+      zh_Hans: 最大爬取页面数
+    human_description:
+      en_US: specify the maximum number of pages to crawl per website. the crawler will stop after reaching this limit.
+      zh_Hans: 指定每个网站要爬取的最大页面数。爬虫将在达到此限制后停止。
+    form: form
+    min: 0
+    default: 0
+  - name: depth
+    type: number
+    required: false
+    label:
+      en_US: Maximum depth of pages to crawl
+      zh_Hans: 最大爬取深度
+    human_description:
+      en_US: the crawl limit for maximum depth.
+      zh_Hans: 最大爬取深度的限制。
+    form: form
+    min: 0
+    default: 0
+  - name: blacklist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to exclude
+      zh_Hans: 要排除的URL模式
+    human_description:
+      en_US: blacklist a set of paths that you do not want to crawl. you can use regex patterns to help with the list.
+      zh_Hans: 指定一组不想爬取的路径。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: whitelist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to include
+      zh_Hans: 要包含的URL模式
+    human_description:
+      en_US: Whitelist a set of paths that you want to crawl, ignoring all other routes that do not match the patterns. You can use regex patterns to help with the list.
+      zh_Hans: 指定一组要爬取的路径,忽略所有不匹配模式的其他路由。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: readability
+    type: boolean
+    required: false
+    label:
+      en_US: Pre-process the content for LLM usage
+      zh_Hans: 仅返回页面的主要内容
+    human_description:
+      en_US: Use Mozilla's readability to pre-process the content for reading. This may drastically improve the content for LLM usage.
+      zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
+    form: form
+    default: false
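
A minimal usage sketch of the Spider client added by this patch, assuming a valid Spider API key; the key value and URLs below are placeholders:

    from core.tools.provider.builtin.spider.spiderApp import Spider

    # Instantiate the client; the key can also come from the SPIDER_API_KEY environment variable.
    app = Spider(api_key="your-spider-api-key")

    # Scrape a single page (scrape_url forces limit=1 and defaults to markdown output).
    scraped = app.scrape_url(url="https://spider.cloud", params={"return_format": "markdown"})

    # Crawl a site, following subpages up to the given page limit.
    crawled = app.crawl_url(url="https://spider.cloud", params={"limit": 5, "return_format": "markdown"})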