From 588615b20e01d4c0e1fc1ec132eb0fcef5181e9d Mon Sep 17 00:00:00 2001
From: William Espegren <131612909+WilliamEspegren@users.noreply.github.com>
Date: Thu, 18 Jul 2024 08:29:33 +0200
Subject: [PATCH] feat: Spider web scraper & crawler tool (#5725)

---
 .../provider/builtin/spider/_assets/icon.svg  |   1 +
 .../tools/provider/builtin/spider/spider.py   |  14 ++
 .../tools/provider/builtin/spider/spider.yaml |  27 ++
 .../provider/builtin/spider/spiderApp.py      | 237 ++++++++++++++++++
 .../builtin/spider/tools/scraper_crawler.py   |  47 ++++
 .../builtin/spider/tools/scraper_crawler.yaml | 100 ++++++++
 6 files changed, 426 insertions(+)
 create mode 100644 api/core/tools/provider/builtin/spider/_assets/icon.svg
 create mode 100644 api/core/tools/provider/builtin/spider/spider.py
 create mode 100644 api/core/tools/provider/builtin/spider/spider.yaml
 create mode 100644 api/core/tools/provider/builtin/spider/spiderApp.py
 create mode 100644 api/core/tools/provider/builtin/spider/tools/scraper_crawler.py
 create mode 100644 api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml

diff --git a/api/core/tools/provider/builtin/spider/_assets/icon.svg b/api/core/tools/provider/builtin/spider/_assets/icon.svg
new file mode 100644
index 0000000000..604a09d01d
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/_assets/icon.svg
@@ -0,0 +1 @@
+Spider v1 Logo
diff --git a/api/core/tools/provider/builtin/spider/spider.py b/api/core/tools/provider/builtin/spider/spider.py
new file mode 100644
index 0000000000..6fa431b6bb
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/spider.py
@@ -0,0 +1,14 @@
+from typing import Any
+
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.spider.spiderApp import Spider
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
+class SpiderProvider(BuiltinToolProviderController):
+    def _validate_credentials(self, credentials: dict[str, Any]) -> None:
+        try:
+            app = Spider(api_key=credentials["spider_api_key"])
+            app.scrape_url(url="https://spider.cloud")
+        except Exception as e:
+            raise ToolProviderCredentialValidationError(str(e))
diff --git a/api/core/tools/provider/builtin/spider/spider.yaml b/api/core/tools/provider/builtin/spider/spider.yaml
new file mode 100644
index 0000000000..45702c85dd
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/spider.yaml
@@ -0,0 +1,27 @@
+identity:
+  author: William Espegren
+  name: spider
+  label:
+    en_US: Spider
+    zh_CN: Spider
+  description:
+    en_US: Spider API integration, returning LLM-ready data by scraping & crawling websites.
+    zh_CN: Spider API 集成,通过爬取和抓取网站返回 LLM-ready 数据。
+  icon: icon.svg
+  tags:
+    - search
+    - utilities
+credentials_for_provider:
+  spider_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: Spider API Key
+      zh_CN: Spider API 密钥
+    placeholder:
+      en_US: Please input your Spider API key
+      zh_CN: 请输入您的 Spider API 密钥
+    help:
+      en_US: Get your Spider API key from your Spider dashboard
+      zh_CN: 从您的 Spider 仪表板中获取 Spider API 密钥。
+    url: https://spider.cloud/
diff --git a/api/core/tools/provider/builtin/spider/spiderApp.py b/api/core/tools/provider/builtin/spider/spiderApp.py
new file mode 100644
index 0000000000..82c0df19ca
--- /dev/null
+++ b/api/core/tools/provider/builtin/spider/spiderApp.py
@@ -0,0 +1,237 @@
+import os
+from typing import Literal, Optional, TypedDict
+
+import requests
+
+
+class RequestParamsDict(TypedDict, total=False):
+    url: Optional[str]
+    request: Optional[Literal["http", "chrome", "smart"]]
+    limit: Optional[int]
+    return_format: Optional[Literal["raw", "markdown", "html2text", "text", "bytes"]]
+    tld: Optional[bool]
+    depth: Optional[int]
+    cache: Optional[bool]
+    budget: Optional[dict[str, int]]
+    locale: Optional[str]
+    cookies: Optional[str]
+    stealth: Optional[bool]
+    headers: Optional[dict[str, str]]
+    anti_bot: Optional[bool]
+    metadata: Optional[bool]
+    viewport: Optional[dict[str, int]]
+    encoding: Optional[str]
+    subdomains: Optional[bool]
+    user_agent: Optional[str]
+    store_data: Optional[bool]
+    gpt_config: Optional[list[str]]
+    fingerprint: Optional[bool]
+    storageless: Optional[bool]
+    readability: Optional[bool]
+    proxy_enabled: Optional[bool]
+    respect_robots: Optional[bool]
+    query_selector: Optional[str]
+    full_resources: Optional[bool]
+    request_timeout: Optional[int]
+    run_in_background: Optional[bool]
+    skip_config_checks: Optional[bool]
+
+
+class Spider:
+    def __init__(self, api_key: Optional[str] = None):
+        """
+        Initialize the Spider with an API key.
+
+        :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
+        :raises ValueError: If no API key is provided.
+        """
+        self.api_key = api_key or os.getenv("SPIDER_API_KEY")
+        if self.api_key is None:
+            raise ValueError("No API key provided")
+
+    def api_post(
+        self,
+        endpoint: str,
+        data: dict,
+        stream: bool,
+        content_type: str = "application/json",
+    ):
+        """
+        Send a POST request to the specified API endpoint.
+
+        :param endpoint: The API endpoint to which the POST request is sent.
+        :param data: The data (dictionary) to be sent in the POST request.
+        :param stream: Boolean indicating if the response should be streamed.
+        :return: The JSON response or the raw response stream if stream is True.
+        """
+        headers = self._prepare_headers(content_type)
+        response = self._post_request(
+            f"https://api.spider.cloud/v1/{endpoint}", data, headers, stream
+        )
+
+        if stream:
+            return response
+        elif response.status_code == 200:
+            return response.json()
+        else:
+            self._handle_error(response, f"post to {endpoint}")
+
+    def api_get(
+        self, endpoint: str, stream: bool, content_type: str = "application/json"
+    ):
+        """
+        Send a GET request to the specified endpoint.
+
+        :param endpoint: The API endpoint from which to retrieve data.
+        :return: The JSON decoded response.
+ """ + headers = self._prepare_headers(content_type) + response = self._get_request( + f"https://api.spider.cloud/v1/{endpoint}", headers, stream + ) + if response.status_code == 200: + return response.json() + else: + self._handle_error(response, f"get from {endpoint}") + + def get_credits(self): + """ + Retrieve the account's remaining credits. + + :return: JSON response containing the number of credits left. + """ + return self.api_get("credits", stream=False) + + def scrape_url( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Scrape data from the specified URL. + + :param url: The URL from which to scrape data. + :param params: Optional dictionary of additional parameters for the scrape request. + :return: JSON response containing the scraping results. + """ + + # Add { "return_format": "markdown" } to the params if not already present + if "return_format" not in params: + params["return_format"] = "markdown" + + # Set limit to 1 + params["limit"] = 1 + + return self.api_post( + "crawl", {"url": url, **(params or {})}, stream, content_type + ) + + def crawl_url( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Start crawling at the specified URL. + + :param url: The URL to begin crawling. + :param params: Optional dictionary with additional parameters to customize the crawl. + :param stream: Boolean indicating if the response should be streamed. Defaults to False. + :return: JSON response or the raw response stream if streaming enabled. + """ + + # Add { "return_format": "markdown" } to the params if not already present + if "return_format" not in params: + params["return_format"] = "markdown" + + return self.api_post( + "crawl", {"url": url, **(params or {})}, stream, content_type + ) + + def links( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Retrieve links from the specified URL. + + :param url: The URL from which to extract links. + :param params: Optional parameters for the link retrieval request. + :return: JSON response containing the links. + """ + return self.api_post( + "links", {"url": url, **(params or {})}, stream, content_type + ) + + def extract_contacts( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Extract contact information from the specified URL. + + :param url: The URL from which to extract contact information. + :param params: Optional parameters for the contact extraction. + :return: JSON response containing extracted contact details. + """ + return self.api_post( + "pipeline/extract-contacts", + {"url": url, **(params or {})}, + stream, + content_type, + ) + + def label( + self, + url: str, + params: Optional[RequestParamsDict] = None, + stream: bool = False, + content_type: str = "application/json", + ): + """ + Apply labeling to data extracted from the specified URL. + + :param url: The URL to label data from. + :param params: Optional parameters to guide the labeling process. + :return: JSON response with labeled data. 
+ """ + return self.api_post( + "pipeline/label", {"url": url, **(params or {})}, stream, content_type + ) + + def _prepare_headers(self, content_type: str = "application/json"): + return { + "Content-Type": content_type, + "Authorization": f"Bearer {self.api_key}", + "User-Agent": "Spider-Client/0.0.27", + } + + def _post_request(self, url: str, data, headers, stream=False): + return requests.post(url, headers=headers, json=data, stream=stream) + + def _get_request(self, url: str, headers, stream=False): + return requests.get(url, headers=headers, stream=stream) + + def _delete_request(self, url: str, headers, stream=False): + return requests.delete(url, headers=headers, stream=stream) + + def _handle_error(self, response, action): + if response.status_code in [402, 409, 500]: + error_message = response.json().get("error", "Unknown error occurred") + raise Exception( + f"Failed to {action}. Status code: {response.status_code}. Error: {error_message}" + ) + else: + raise Exception( + f"Unexpected error occurred while trying to {action}. Status code: {response.status_code}" + ) diff --git a/api/core/tools/provider/builtin/spider/tools/scraper_crawler.py b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.py new file mode 100644 index 0000000000..64bbcc10cc --- /dev/null +++ b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.py @@ -0,0 +1,47 @@ +from typing import Any, Union + +from core.tools.entities.tool_entities import ToolInvokeMessage +from core.tools.provider.builtin.spider.spiderApp import Spider +from core.tools.tool.builtin_tool import BuiltinTool + + +class ScrapeTool(BuiltinTool): + def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]: + # initialize the app object with the api key + app = Spider(api_key=self.runtime.credentials['spider_api_key']) + + url = tool_parameters['url'] + mode = tool_parameters['mode'] + + options = { + 'limit': tool_parameters.get('limit', 0), + 'depth': tool_parameters.get('depth', 0), + 'blacklist': tool_parameters.get('blacklist', '').split(',') if tool_parameters.get('blacklist') else [], + 'whitelist': tool_parameters.get('whitelist', '').split(',') if tool_parameters.get('whitelist') else [], + 'readability': tool_parameters.get('readability', False), + } + + result = "" + + try: + if mode == 'scrape': + scrape_result = app.scrape_url( + url=url, + params=options, + ) + + for i in scrape_result: + result += "URL: " + i.get('url', '') + "\n" + result += "CONTENT: " + i.get('content', '') + "\n\n" + elif mode == 'crawl': + crawl_result = app.crawl_url( + url=tool_parameters['url'], + params=options, + ) + for i in crawl_result: + result += "URL: " + i.get('url', '') + "\n" + result += "CONTENT: " + i.get('content', '') + "\n\n" + except Exception as e: + return self.create_text_message("An error occured", str(e)) + + return self.create_text_message(result) diff --git a/api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml new file mode 100644 index 0000000000..4ebdce61ff --- /dev/null +++ b/api/core/tools/provider/builtin/spider/tools/scraper_crawler.yaml @@ -0,0 +1,100 @@ +identity: + name: scraper_crawler + author: William Espegren + label: + en_US: Web Scraper & Crawler + zh_Hans: 网页抓取与爬虫 +description: + human: + en_US: A tool for scraping & crawling webpages. Input should be a url. + zh_Hans: 用于抓取和爬取网页的工具。输入应该是一个网址。 + llm: A tool for scraping & crawling webpages. 
+parameters:
+  - name: url
+    type: string
+    required: true
+    label:
+      en_US: URL
+      zh_Hans: 网址
+    human_description:
+      en_US: url to be scraped or crawled
+      zh_Hans: 要抓取或爬取的网址
+    llm_description: url to either be scraped or crawled
+    form: llm
+  - name: mode
+    type: select
+    required: true
+    options:
+      - value: scrape
        label:
+          en_US: scrape
+          zh_Hans: 抓取
+      - value: crawl
+        label:
+          en_US: crawl
+          zh_Hans: 爬取
+    default: crawl
+    label:
+      en_US: Mode
+      zh_Hans: 模式
+    human_description:
+      en_US: used for selecting to either scrape the website or crawl the entire website following subpages
+      zh_Hans: 用于选择抓取网站或爬取整个网站及其子页面
+    form: form
+  - name: limit
+    type: number
+    required: false
+    label:
+      en_US: Maximum number of pages to crawl
+      zh_Hans: 最大爬取页面数
+    human_description:
+      en_US: specify the maximum number of pages to crawl per website. the crawler will stop after reaching this limit.
+      zh_Hans: 指定每个网站要爬取的最大页面数。爬虫将在达到此限制后停止。
+    form: form
+    min: 0
+    default: 0
+  - name: depth
+    type: number
+    required: false
+    label:
+      en_US: Maximum depth of pages to crawl
+      zh_Hans: 最大爬取深度
+    human_description:
+      en_US: the crawl limit for maximum depth.
+      zh_Hans: 最大爬取深度的限制。
+    form: form
+    min: 0
+    default: 0
+  - name: blacklist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to exclude
+      zh_Hans: 要排除的URL模式
+    human_description:
+      en_US: blacklist a set of paths that you do not want to crawl. you can use regex patterns to help with the list.
+      zh_Hans: 指定一组不想爬取的路径。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: whitelist
+    type: string
+    required: false
+    label:
+      en_US: URL patterns to include
+      zh_Hans: 要包含的URL模式
+    human_description:
+      en_US: Whitelist a set of paths that you want to crawl, ignoring all other routes that do not match the patterns. You can use regex patterns to help with the list.
+      zh_Hans: 指定一组要爬取的路径,忽略所有不匹配模式的其他路由。您可以使用正则表达式模式来帮助定义列表。
+    placeholder: /blog/*, /about
+    form: form
+  - name: readability
+    type: boolean
+    required: false
+    label:
+      en_US: Pre-process the content for LLM usage
+      zh_Hans: 仅返回页面的主要内容
+    human_description:
+      en_US: Use Mozilla's readability to pre-process the content for reading. This may drastically improve the content for LLM usage.
+      zh_Hans: 如果启用,爬虫将仅返回页面的主要内容,不包括标题、导航、页脚等。
+    form: form
+    default: false
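
A minimal usage sketch of the Spider client added by this patch, assuming a valid Spider API key; the key value and URLs below are placeholders:

    from core.tools.provider.builtin.spider.spiderApp import Spider

    # Instantiate the client; the key can also come from the SPIDER_API_KEY environment variable.
    app = Spider(api_key="your-spider-api-key")

    # Scrape a single page (scrape_url forces limit=1 and defaults to markdown output).
    scraped = app.scrape_url(url="https://spider.cloud", params={"return_format": "markdown"})

    # Crawl a site, following subpages up to the given page limit.
    crawled = app.crawl_url(url="https://spider.cloud", params={"limit": 5, "return_format": "markdown"})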