From f54905e685f62ee4856a676688d5de4143433250 Mon Sep 17 00:00:00 2001 From: Amir Mohsen Asaran Date: Mon, 7 Apr 2025 06:43:23 +0200 Subject: [PATCH] feat: Integrate WaterCrawl.dev as a new knowledge base provider (#16396) Co-authored-by: crazywoola <427733928@qq.com> --- api/controllers/console/datasets/website.py | 11 +- api/core/rag/extractor/extract_processor.py | 10 + api/core/rag/extractor/watercrawl/client.py | 161 +++++++++++++ .../rag/extractor/watercrawl/extractor.py | 57 +++++ api/core/rag/extractor/watercrawl/provider.py | 117 ++++++++++ api/services/auth/api_key_auth_factory.py | 4 + api/services/auth/auth_type.py | 1 + api/services/auth/watercrawl/__init__.py | 0 api/services/auth/watercrawl/watercrawl.py | 44 ++++ api/services/website_service.py | 24 ++ .../datasets/create/assets/watercrawl.svg | 20 ++ .../datasets/create/website/index.module.css | 7 + .../datasets/create/website/index.tsx | 88 ++++--- .../datasets/create/website/no-data.tsx | 5 + .../create/website/watercrawl/header.tsx | 43 ++++ .../create/website/watercrawl/index.tsx | 217 ++++++++++++++++++ .../create/website/watercrawl/options.tsx | 85 +++++++ .../config-watercrawl-modal.tsx | 161 +++++++++++++ .../data-source-website/index.tsx | 57 +++-- .../data-source-page/index.tsx | 1 + .../data-source-page/panel/index.tsx | 8 +- web/i18n/en-US/dataset-creation.ts | 11 + web/models/common.ts | 6 + web/service/datasets.ts | 19 ++ 24 files changed, 1102 insertions(+), 55 deletions(-) create mode 100644 api/core/rag/extractor/watercrawl/client.py create mode 100644 api/core/rag/extractor/watercrawl/extractor.py create mode 100644 api/core/rag/extractor/watercrawl/provider.py create mode 100644 api/services/auth/watercrawl/__init__.py create mode 100644 api/services/auth/watercrawl/watercrawl.py create mode 100644 web/app/components/datasets/create/assets/watercrawl.svg create mode 100644 web/app/components/datasets/create/website/watercrawl/header.tsx create mode 100644 web/app/components/datasets/create/website/watercrawl/index.tsx create mode 100644 web/app/components/datasets/create/website/watercrawl/options.tsx create mode 100644 web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx diff --git a/api/controllers/console/datasets/website.py b/api/controllers/console/datasets/website.py index da995537e7..33c926b4c9 100644 --- a/api/controllers/console/datasets/website.py +++ b/api/controllers/console/datasets/website.py @@ -14,7 +14,12 @@ class WebsiteCrawlApi(Resource): def post(self): parser = reqparse.RequestParser() parser.add_argument( - "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json" + "provider", + type=str, + choices=["firecrawl", "watercrawl", "jinareader"], + required=True, + nullable=True, + location="json", ) parser.add_argument("url", type=str, required=True, nullable=True, location="json") parser.add_argument("options", type=dict, required=True, nullable=True, location="json") @@ -34,7 +39,9 @@ class WebsiteCrawlStatusApi(Resource): @account_initialization_required def get(self, job_id: str): parser = reqparse.RequestParser() - parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args") + parser.add_argument( + "provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"], required=True, location="args" + ) args = parser.parse_args() # get crawl status try: diff --git a/api/core/rag/extractor/extract_processor.py 
b/api/core/rag/extractor/extract_processor.py index e80d0f0bbe..bc19899ea5 100644 --- a/api/core/rag/extractor/extract_processor.py +++ b/api/core/rag/extractor/extract_processor.py @@ -26,6 +26,7 @@ from core.rag.extractor.unstructured.unstructured_msg_extractor import Unstructu from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor +from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor from core.rag.extractor.word_extractor import WordExtractor from core.rag.models.document import Document from extensions.ext_storage import storage @@ -183,6 +184,15 @@ class ExtractProcessor: only_main_content=extract_setting.website_info.only_main_content, ) return extractor.extract() + elif extract_setting.website_info.provider == "watercrawl": + extractor = WaterCrawlWebExtractor( + url=extract_setting.website_info.url, + job_id=extract_setting.website_info.job_id, + tenant_id=extract_setting.website_info.tenant_id, + mode=extract_setting.website_info.mode, + only_main_content=extract_setting.website_info.only_main_content, + ) + return extractor.extract() elif extract_setting.website_info.provider == "jinareader": extractor = JinaReaderWebExtractor( url=extract_setting.website_info.url, diff --git a/api/core/rag/extractor/watercrawl/client.py b/api/core/rag/extractor/watercrawl/client.py new file mode 100644 index 0000000000..6eaede7dbc --- /dev/null +++ b/api/core/rag/extractor/watercrawl/client.py @@ -0,0 +1,161 @@ +import json +from collections.abc import Generator +from typing import Union +from urllib.parse import urljoin + +import requests +from requests import Response + + +class BaseAPIClient: + def __init__(self, api_key, base_url): + self.api_key = api_key + self.base_url = base_url + self.session = self.init_session() + + def init_session(self): + session = requests.Session() + session.headers.update({"X-API-Key": self.api_key}) + session.headers.update({"Content-Type": "application/json"}) + session.headers.update({"Accept": "application/json"}) + session.headers.update({"User-Agent": "WaterCrawl-Plugin"}) + session.headers.update({"Accept-Language": "en-US"}) + return session + + def _get(self, endpoint: str, query_params: dict | None = None, **kwargs): + return self.session.get(urljoin(self.base_url, endpoint), params=query_params, **kwargs) + + def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): + return self.session.post(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs) + + def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): + return self.session.put(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs) + + def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs): + return self.session.delete(urljoin(self.base_url, endpoint), params=query_params, **kwargs) + + def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs): + return self.session.patch(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs) + + +class WaterCrawlAPIClient(BaseAPIClient): + def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"): + super().__init__(api_key, base_url) + + def process_eventstream(self, 
response: Response, download: bool = False) -> Generator: + for line in response.iter_lines(): + line = line.decode("utf-8") + if line.startswith("data:"): + line = line[5:].strip() + data = json.loads(line) + if data["type"] == "result" and download: + data["data"] = self.download_result(data["data"]) + yield data + + def process_response(self, response: Response) -> dict | bytes | list | None | Generator: + response.raise_for_status() + if response.status_code == 204: + return None + if response.headers.get("Content-Type") == "application/json": + return response.json() or {} + + if response.headers.get("Content-Type") == "application/octet-stream": + return response.content + + if response.headers.get("Content-Type") == "text/event-stream": + return self.process_eventstream(response) + + raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}") + + def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None): + query_params = {"page": page or 1, "page_size": page_size or 10} + return self.process_response( + self._get( + "/api/v1/core/crawl-requests/", + query_params=query_params, + ) + ) + + def get_crawl_request(self, item_id: str): + return self.process_response( + self._get( + f"/api/v1/core/crawl-requests/{item_id}/", + ) + ) + + def create_crawl_request( + self, + url: Union[list, str] | None = None, + spider_options: dict | None = None, + page_options: dict | None = None, + plugin_options: dict | None = None, + ): + data = { + # 'urls': url if isinstance(url, list) else [url], + "url": url, + "options": { + "spider_options": spider_options or {}, + "page_options": page_options or {}, + "plugin_options": plugin_options or {}, + }, + } + return self.process_response( + self._post( + "/api/v1/core/crawl-requests/", + data=data, + ) + ) + + def stop_crawl_request(self, item_id: str): + return self.process_response( + self._delete( + f"/api/v1/core/crawl-requests/{item_id}/", + ) + ) + + def download_crawl_request(self, item_id: str): + return self.process_response( + self._get( + f"/api/v1/core/crawl-requests/{item_id}/download/", + ) + ) + + def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator: + query_params = {"prefetched": str(prefetched).lower()} + generator = self.process_response( + self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params), + ) + if not isinstance(generator, Generator): + raise ValueError("Generator expected") + yield from generator + + def get_crawl_request_results( + self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None + ): + query_params = query_params or {} + query_params.update({"page": page or 1, "page_size": page_size or 25}) + return self.process_response( + self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params) + ) + + def scrape_url( + self, + url: str, + page_options: dict | None = None, + plugin_options: dict | None = None, + sync: bool = True, + prefetched: bool = True, + ): + response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options) + if not sync: + return response_result + + for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched): + if event_data["type"] == "result": + return event_data["data"] + + def download_result(self, result_object: dict): + response = requests.get(result_object["result"]) + response.raise_for_status() + result_object["result"] = response.json() + return 
result_object diff --git a/api/core/rag/extractor/watercrawl/extractor.py b/api/core/rag/extractor/watercrawl/extractor.py new file mode 100644 index 0000000000..40d1740962 --- /dev/null +++ b/api/core/rag/extractor/watercrawl/extractor.py @@ -0,0 +1,57 @@ +from core.rag.extractor.extractor_base import BaseExtractor +from core.rag.models.document import Document +from services.website_service import WebsiteService + + +class WaterCrawlWebExtractor(BaseExtractor): + """ + Crawl and scrape websites and return content in clean llm-ready markdown. + + + Args: + url: The URL to scrape. + api_key: The API key for WaterCrawl. + base_url: The base URL for the Firecrawl API. Defaults to 'https://app.firecrawl.dev'. + mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'. + only_main_content: Only return the main content of the page excluding headers, navs, footers, etc. + """ + + def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True): + """Initialize with url, api_key, base_url and mode.""" + self._url = url + self.job_id = job_id + self.tenant_id = tenant_id + self.mode = mode + self.only_main_content = only_main_content + + def extract(self) -> list[Document]: + """Extract content from the URL.""" + documents = [] + if self.mode == "crawl": + crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id) + if crawl_data is None: + return [] + document = Document( + page_content=crawl_data.get("markdown", ""), + metadata={ + "source_url": crawl_data.get("source_url"), + "description": crawl_data.get("description"), + "title": crawl_data.get("title"), + }, + ) + documents.append(document) + elif self.mode == "scrape": + scrape_data = WebsiteService.get_scrape_url_data( + "watercrawl", self._url, self.tenant_id, self.only_main_content + ) + + document = Document( + page_content=scrape_data.get("markdown", ""), + metadata={ + "source_url": scrape_data.get("source_url"), + "description": scrape_data.get("description"), + "title": scrape_data.get("title"), + }, + ) + documents.append(document) + return documents diff --git a/api/core/rag/extractor/watercrawl/provider.py b/api/core/rag/extractor/watercrawl/provider.py new file mode 100644 index 0000000000..b8003b386b --- /dev/null +++ b/api/core/rag/extractor/watercrawl/provider.py @@ -0,0 +1,117 @@ +from collections.abc import Generator +from datetime import datetime +from typing import Any + +from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient + + +class WaterCrawlProvider: + def __init__(self, api_key, base_url: str | None = None): + self.client = WaterCrawlAPIClient(api_key, base_url) + + def crawl_url(self, url, options: dict | Any = None) -> dict: + options = options or {} + spider_options = { + "max_depth": 1, + "page_limit": 1, + "allowed_domains": [], + "exclude_paths": [], + "include_paths": [], + } + if options.get("crawl_sub_pages", True): + spider_options["page_limit"] = options.get("limit", 1) + spider_options["max_depth"] = options.get("depth", 1) + spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else [] + spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else [] + + wait_time = options.get("wait_time", 1000) + page_options = { + "exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [], + "include_tags": options.get("include_tags", 
"").split(",") if options.get("include_tags") else [], + "wait_time": max(1000, wait_time), # minimum wait time is 1 second + "include_html": False, + "only_main_content": options.get("only_main_content", True), + "include_links": False, + "timeout": 15000, + "accept_cookies_selector": "#cookies-accept", + "locale": "en-US", + "actions": [], + } + result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options) + + return {"status": "active", "job_id": result.get("uuid")} + + def get_crawl_status(self, crawl_request_id) -> dict: + response = self.client.get_crawl_request(crawl_request_id) + data = [] + if response["status"] in ["new", "running"]: + status = "active" + else: + status = "completed" + data = list(self._get_results(crawl_request_id)) + + time_str = response.get("duration") + time_consuming: float = 0 + if time_str: + time_obj = datetime.strptime(time_str, "%H:%M:%S.%f") + time_consuming = ( + time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000 + ) + + return { + "status": status, + "job_id": response.get("uuid"), + "total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1), + "current": response.get("number_of_documents", 0), + "data": data, + "time_consuming": time_consuming, + } + + def get_crawl_url_data(self, job_id, url) -> dict | None: + if not job_id: + return self.scrape_url(url) + + for result in self._get_results( + job_id, + { + # filter by url + "url": url + }, + ): + return result + + return None + + def scrape_url(self, url: str) -> dict: + response = self.client.scrape_url(url=url, sync=True, prefetched=True) + return self._structure_data(response) + + def _structure_data(self, result_object: dict) -> dict: + if isinstance(result_object.get("result", {}), str): + raise ValueError("Invalid result object. 
Expected a dictionary.") + + metadata = result_object.get("result", {}).get("metadata", {}) + return { + "title": metadata.get("og:title") or metadata.get("title"), + "description": metadata.get("description"), + "source_url": result_object.get("url"), + "markdown": result_object.get("result", {}).get("markdown"), + } + + def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]: + page = 0 + page_size = 100 + + query_params = query_params or {} + query_params.update({"prefetched": "true"}) + while True: + page += 1 + response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params) + if not response["results"]: + break + + for result in response["results"]: + yield self._structure_data(result) + + if response["next"] is None: + break diff --git a/api/services/auth/api_key_auth_factory.py b/api/services/auth/api_key_auth_factory.py index f91c448fb9..7ae31b0768 100644 --- a/api/services/auth/api_key_auth_factory.py +++ b/api/services/auth/api_key_auth_factory.py @@ -17,6 +17,10 @@ class ApiKeyAuthFactory: from services.auth.firecrawl.firecrawl import FirecrawlAuth return FirecrawlAuth + case AuthType.WATERCRAWL: + from services.auth.watercrawl.watercrawl import WatercrawlAuth + + return WatercrawlAuth case AuthType.JINA: from services.auth.jina.jina import JinaAuth diff --git a/api/services/auth/auth_type.py b/api/services/auth/auth_type.py index 2e1946841f..ec7118df27 100644 --- a/api/services/auth/auth_type.py +++ b/api/services/auth/auth_type.py @@ -3,4 +3,5 @@ from enum import StrEnum class AuthType(StrEnum): FIRECRAWL = "firecrawl" + WATERCRAWL = "watercrawl" JINA = "jinareader" diff --git a/api/services/auth/watercrawl/__init__.py b/api/services/auth/watercrawl/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/api/services/auth/watercrawl/watercrawl.py b/api/services/auth/watercrawl/watercrawl.py new file mode 100644 index 0000000000..153ab5ba75 --- /dev/null +++ b/api/services/auth/watercrawl/watercrawl.py @@ -0,0 +1,44 @@ +import json +from urllib.parse import urljoin + +import requests + +from services.auth.api_key_auth_base import ApiKeyAuthBase + + +class WatercrawlAuth(ApiKeyAuthBase): + def __init__(self, credentials: dict): + super().__init__(credentials) + auth_type = credentials.get("auth_type") + if auth_type != "x-api-key": + raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key") + self.api_key = credentials.get("config", {}).get("api_key", None) + self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev") + + if not self.api_key: + raise ValueError("No API key provided") + + def validate_credentials(self): + headers = self._prepare_headers() + url = urljoin(self.base_url, "/api/v1/core/crawl-requests/") + response = self._get_request(url, headers) + if response.status_code == 200: + return True + else: + self._handle_error(response) + + def _prepare_headers(self): + return {"Content-Type": "application/json", "X-API-KEY": self.api_key} + + def _get_request(self, url, headers): + return requests.get(url, headers=headers) + + def _handle_error(self, response): + if response.status_code in {402, 409, 500}: + error_message = response.json().get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. 
Error: {error_message}") + else: + if response.text: + error_message = json.loads(response.text).get("error", "Unknown error occurred") + raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}") + raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}") diff --git a/api/services/website_service.py b/api/services/website_service.py index 85d32c9e8a..460a637a43 100644 --- a/api/services/website_service.py +++ b/api/services/website_service.py @@ -7,6 +7,7 @@ from flask_login import current_user # type: ignore from core.helper import encrypter from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp +from core.rag.extractor.watercrawl.provider import WaterCrawlProvider from extensions.ext_redis import redis_client from extensions.ext_storage import storage from services.auth.api_key_auth_service import ApiKeyAuthService @@ -59,6 +60,13 @@ class WebsiteService: time = str(datetime.datetime.now().timestamp()) redis_client.setex(website_crawl_time_cache_key, 3600, time) return {"status": "active", "job_id": job_id} + elif provider == "watercrawl": + # decrypt api_key + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).crawl_url(url, options) + elif provider == "jinareader": api_key = encrypter.decrypt_token( tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") @@ -116,6 +124,14 @@ class WebsiteService: time_consuming = abs(end_time - float(start_time)) crawl_status_data["time_consuming"] = f"{time_consuming:.2f}" redis_client.delete(website_crawl_time_cache_key) + elif provider == "watercrawl": + # decrypt api_key + api_key = encrypter.decrypt_token( + tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") + ) + crawl_status_data = WaterCrawlProvider( + api_key, credentials.get("config").get("base_url", None) + ).get_crawl_status(job_id) elif provider == "jinareader": api_key = encrypter.decrypt_token( tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key") @@ -180,6 +196,11 @@ class WebsiteService: if item.get("source_url") == url: return dict(item) return None + elif provider == "watercrawl": + api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).get_crawl_url_data( + job_id, url + ) elif provider == "jinareader": if not job_id: response = requests.get( @@ -223,5 +244,8 @@ class WebsiteService: params = {"onlyMainContent": only_main_content} result = firecrawl_app.scrape_url(url, params) return result + elif provider == "watercrawl": + api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key")) + return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).scrape_url(url) else: raise ValueError("Invalid provider") diff --git a/web/app/components/datasets/create/assets/watercrawl.svg b/web/app/components/datasets/create/assets/watercrawl.svg new file mode 100644 index 0000000000..bd4e6bac6f --- /dev/null +++ b/web/app/components/datasets/create/assets/watercrawl.svg @@ -0,0 +1,20 @@ + + + + + + + + + + + + + + + + + + + + diff --git a/web/app/components/datasets/create/website/index.module.css 
b/web/app/components/datasets/create/website/index.module.css index abaab4bea4..cf6c36468d 100644 --- a/web/app/components/datasets/create/website/index.module.css +++ b/web/app/components/datasets/create/website/index.module.css @@ -4,3 +4,10 @@ background-image: url(../assets/jina.png); background-size: 16px; } + +.watercrawlLogo { + @apply w-5 h-5 bg-center bg-no-repeat inline-block; + /*background-color: #F5FAFF;*/ + background-image: url(../assets/watercrawl.svg); + background-size: 16px; +} diff --git a/web/app/components/datasets/create/website/index.tsx b/web/app/components/datasets/create/website/index.tsx index 1cd4562a58..5122ef6ed2 100644 --- a/web/app/components/datasets/create/website/index.tsx +++ b/web/app/components/datasets/create/website/index.tsx @@ -5,6 +5,7 @@ import { useTranslation } from 'react-i18next' import s from './index.module.css' import NoData from './no-data' import Firecrawl from './firecrawl' +import Watercrawl from './watercrawl' import JinaReader from './jina-reader' import cn from '@/utils/classnames' import { useModalContext } from '@/context/modal-context' @@ -47,7 +48,11 @@ const Website: FC = ({ // If users have configured one of the providers, select it. const availableProviders = res.sources.filter((item: DataSourceItem) => - [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider), + [ + DataSourceProvider.jinaReader, + DataSourceProvider.fireCrawl, + DataSourceProvider.waterCrawl, + ].includes(item.provider), ) if (availableProviders.length > 0) @@ -70,6 +75,8 @@ const Website: FC = ({ if (!isLoaded) return null + const source = sources.find(source => source.provider === selectedProvider) + return (
@@ -86,7 +93,7 @@ const Website: FC = ({ )} onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)} > - + Jina Reader +
- - { - selectedProvider === DataSourceProvider.fireCrawl - ? sources.find(source => source.provider === DataSourceProvider.fireCrawl) - ? ( - - ) - : ( - - ) - : sources.find(source => source.provider === DataSourceProvider.jinaReader) - ? ( - - ) - : ( - - ) - } + {source && selectedProvider === DataSourceProvider.fireCrawl && ( + + )} + {source && selectedProvider === DataSourceProvider.waterCrawl && ( + + )} + {source && selectedProvider === DataSourceProvider.jinaReader && ( + + )} + {!source && ( + + )} ) } diff --git a/web/app/components/datasets/create/website/no-data.tsx b/web/app/components/datasets/create/website/no-data.tsx index dca063b56a..14be2e29f6 100644 --- a/web/app/components/datasets/create/website/no-data.tsx +++ b/web/app/components/datasets/create/website/no-data.tsx @@ -31,6 +31,11 @@ const NoData: FC = ({ title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`), description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`), }, + [DataSourceProvider.waterCrawl]: { + emoji: , + title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`), + description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`), + }, } const currentProvider = providerConfig[provider] diff --git a/web/app/components/datasets/create/website/watercrawl/header.tsx b/web/app/components/datasets/create/website/watercrawl/header.tsx new file mode 100644 index 0000000000..f2131fcd1a --- /dev/null +++ b/web/app/components/datasets/create/website/watercrawl/header.tsx @@ -0,0 +1,43 @@ +'use client' +import type { FC } from 'react' +import React from 'react' +import { useTranslation } from 'react-i18next' +import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react' +import Button from '@/app/components/base/button' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onSetting: () => void +} + +const Header: FC = ({ + onSetting, +}) => { + const { t } = useTranslation() + + return ( +
+
+
{t(`${I18N_PREFIX}.watercrawlTitle`)}
+
+ +
+ + + {t(`${I18N_PREFIX}.watercrawlDoc`)} + +
+ ) +} +export default React.memo(Header) diff --git a/web/app/components/datasets/create/website/watercrawl/index.tsx b/web/app/components/datasets/create/website/watercrawl/index.tsx new file mode 100644 index 0000000000..bd4faa1383 --- /dev/null +++ b/web/app/components/datasets/create/website/watercrawl/index.tsx @@ -0,0 +1,217 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useEffect, useState } from 'react' +import { useTranslation } from 'react-i18next' +import UrlInput from '../base/url-input' +import OptionsWrap from '../base/options-wrap' +import CrawledResult from '../base/crawled-result' +import Crawling from '../base/crawling' +import ErrorMessage from '../base/error-message' +import Header from './header' +import Options from './options' +import { useModalContext } from '@/context/modal-context' +import type { CrawlOptions, CrawlResultItem } from '@/models/datasets' +import Toast from '@/app/components/base/toast' +import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets' +import { sleep } from '@/utils' + +const ERROR_I18N_PREFIX = 'common.errorMsg' +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + onPreview: (payload: CrawlResultItem) => void + checkedCrawlResult: CrawlResultItem[] + onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void + onJobIdChange: (jobId: string) => void + crawlOptions: CrawlOptions + onCrawlOptionsChange: (payload: CrawlOptions) => void +} + +enum Step { + init = 'init', + running = 'running', + finished = 'finished', +} + +const WaterCrawl: FC = ({ + onPreview, + checkedCrawlResult, + onCheckedCrawlResultChange, + onJobIdChange, + crawlOptions, + onCrawlOptionsChange, +}) => { + const { t } = useTranslation() + const [step, setStep] = useState(Step.init) + const [controlFoldOptions, setControlFoldOptions] = useState(0) + useEffect(() => { + if (step !== Step.init) + setControlFoldOptions(Date.now()) + }, [step]) + const { setShowAccountSettingModal } = useModalContext() + const handleSetting = useCallback(() => { + setShowAccountSettingModal({ + payload: 'data-source', + }) + }, [setShowAccountSettingModal]) + + const checkValid = useCallback((url: string) => { + let errorMsg = '' + if (!url) { + errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: 'url', + }) + } + + if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://')))) + errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`) + + if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) { + errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, { + field: t(`${I18N_PREFIX}.limit`), + }) + } + + return { + isValid: !errorMsg, + errorMsg, + } + }, [crawlOptions, t]) + + const isInit = step === Step.init + const isCrawlFinished = step === Step.finished + const isRunning = step === Step.running + const [crawlResult, setCrawlResult] = useState<{ + current: number + total: number + data: CrawlResultItem[] + time_consuming: number | string + } | undefined>(undefined) + const [crawlErrorMessage, setCrawlErrorMessage] = useState('') + const showError = isCrawlFinished && crawlErrorMessage + + const waitForCrawlFinished = useCallback(async (jobId: string): Promise => { + try { + const res = await checkWatercrawlTaskStatus(jobId) as any + if (res.status === 'completed') { + return { + isError: false, + data: { + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }, + } + } + if (res.status === 'error' || 
!res.status) { + // can't get the error message from the watercrawl api + return { + isError: true, + errorMessage: res.message, + data: { + data: [], + }, + } + } + // update the progress + setCrawlResult({ + ...res, + total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)), + }) + onCheckedCrawlResultChange(res.data || []) // default select the crawl result + await sleep(2500) + return await waitForCrawlFinished(jobId) + } + catch (e: any) { + const errorBody = await e.json() + return { + isError: true, + errorMessage: errorBody.message, + data: { + data: [], + }, + } + } + }, [crawlOptions.limit]) + + const handleRun = useCallback(async (url: string) => { + const { isValid, errorMsg } = checkValid(url) + if (!isValid) { + Toast.notify({ + message: errorMsg!, + type: 'error', + }) + return + } + setStep(Step.running) + try { + const passToServerCrawlOptions: any = { + ...crawlOptions, + } + if (crawlOptions.max_depth === '') + delete passToServerCrawlOptions.max_depth + + const res = await createWatercrawlTask({ + url, + options: passToServerCrawlOptions, + }) as any + const jobId = res.job_id + onJobIdChange(jobId) + const { isError, data, errorMessage } = await waitForCrawlFinished(jobId) + if (isError) { + setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`)) + } + else { + setCrawlResult(data) + onCheckedCrawlResultChange(data.data || []) // default select the crawl result + setCrawlErrorMessage('') + } + } + catch (e) { + setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!) + console.log(e) + } + finally { + setStep(Step.finished) + } + }, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished]) + + return ( +
+
+
+ + + + + + {!isInit && ( +
+ {isRunning + && } + {showError && ( + + )} + {isCrawlFinished && !showError + && + } +
+ )} +
+
+ ) +} +export default React.memo(WaterCrawl) diff --git a/web/app/components/datasets/create/website/watercrawl/options.tsx b/web/app/components/datasets/create/website/watercrawl/options.tsx new file mode 100644 index 0000000000..dea6c0e5b9 --- /dev/null +++ b/web/app/components/datasets/create/website/watercrawl/options.tsx @@ -0,0 +1,85 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback } from 'react' +import { useTranslation } from 'react-i18next' +import CheckboxWithLabel from '../base/checkbox-with-label' +import Field from '../base/field' +import cn from '@/utils/classnames' +import type { CrawlOptions } from '@/models/datasets' + +const I18N_PREFIX = 'datasetCreation.stepOne.website' + +type Props = { + className?: string + payload: CrawlOptions + onChange: (payload: CrawlOptions) => void +} + +const Options: FC = ({ + className = '', + payload, + onChange, +}) => { + const { t } = useTranslation() + + const handleChange = useCallback((key: keyof CrawlOptions) => { + return (value: any) => { + onChange({ + ...payload, + [key]: value, + }) + } + }, [payload, onChange]) + return ( +
+ +
+ + +
+ +
+ + +
+ +
+ ) +} +export default React.memo(Options) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx new file mode 100644 index 0000000000..6033a562ac --- /dev/null +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/config-watercrawl-modal.tsx @@ -0,0 +1,161 @@ +'use client' +import type { FC } from 'react' +import React, { useCallback, useState } from 'react' +import { useTranslation } from 'react-i18next' +import { + PortalToFollowElem, + PortalToFollowElemContent, +} from '@/app/components/base/portal-to-follow-elem' +import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security' +import Button from '@/app/components/base/button' +import type { WatercrawlConfig } from '@/models/common' +import Field from '@/app/components/datasets/create/website/base/field' +import Toast from '@/app/components/base/toast' +import { createDataSourceApiKeyBinding } from '@/service/datasets' +import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general' +type Props = { + onCancel: () => void + onSaved: () => void +} + +const I18N_PREFIX = 'datasetCreation.watercrawl' + +const DEFAULT_BASE_URL = 'https://app.watercrawl.dev' + +const ConfigWatercrawlModal: FC = ({ + onCancel, + onSaved, +}) => { + const { t } = useTranslation() + const [isSaving, setIsSaving] = useState(false) + const [config, setConfig] = useState({ + api_key: '', + base_url: '', + }) + + const handleConfigChange = useCallback((key: string) => { + return (value: string | number) => { + setConfig(prev => ({ ...prev, [key]: value as string })) + } + }, []) + + const handleSave = useCallback(async () => { + if (isSaving) + return + let errorMsg = '' + if (config.base_url && !((config.base_url.startsWith('http://') || config.base_url.startsWith('https://')))) + errorMsg = t('common.errorMsg.urlError') + if (!errorMsg) { + if (!config.api_key) { + errorMsg = t('common.errorMsg.fieldRequired', { + field: 'API Key', + }) + } + } + + if (errorMsg) { + Toast.notify({ + type: 'error', + message: errorMsg, + }) + return + } + const postData = { + category: 'website', + provider: 'watercrawl', + credentials: { + auth_type: 'x-api-key', + config: { + api_key: config.api_key, + base_url: config.base_url || DEFAULT_BASE_URL, + }, + }, + } + try { + setIsSaving(true) + await createDataSourceApiKeyBinding(postData) + Toast.notify({ + type: 'success', + message: t('common.api.success'), + }) + } + finally { + setIsSaving(false) + } + + onSaved() + }, [config.api_key, config.base_url, onSaved, t, isSaving]) + + return ( + + +
+
+
+
+
{t(`${I18N_PREFIX}.configWatercrawl`)}
+
+ +
+ + +
+
+ + {t(`${I18N_PREFIX}.getApiKeyLinkText`)} + + +
+ + +
+ +
+
+
+
+ + {t('common.modelProvider.encrypted.front')} + + PKCS1_OAEP + + {t('common.modelProvider.encrypted.back')} +
+
+
+
+
+
+ ) +} +export default React.memo(ConfigWatercrawlModal) diff --git a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx index 50bf9938a8..911bf01c09 100644 --- a/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/data-source-website/index.tsx @@ -5,19 +5,15 @@ import { useTranslation } from 'react-i18next' import Panel from '../panel' import { DataSourceType } from '../panel/types' import ConfigFirecrawlModal from './config-firecrawl-modal' +import ConfigWatercrawlModal from './config-watercrawl-modal' import ConfigJinaReaderModal from './config-jina-reader-modal' import cn from '@/utils/classnames' import s from '@/app/components/datasets/create/website/index.module.css' import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets' -import type { - DataSourceItem, -} from '@/models/common' +import type { DataSourceItem } from '@/models/common' +import { DataSourceProvider } from '@/models/common' import { useAppContext } from '@/context/app-context' - -import { - DataSourceProvider, -} from '@/models/common' import Toast from '@/app/components/base/toast' type Props = { @@ -58,6 +54,16 @@ const DataSourceWebsite: FC = ({ provider }) => { return source?.id } + const getProviderName = (provider: DataSourceProvider): string => { + if (provider === DataSourceProvider.fireCrawl) + return 'Firecrawl' + + if (provider === DataSourceProvider.waterCrawl) + return 'WaterCrawl' + + return 'Jina Reader' + } + const handleRemove = useCallback((provider: DataSourceProvider) => { return async () => { const dataSourceId = getIdByProvider(provider) @@ -82,27 +88,42 @@ const DataSourceWebsite: FC = ({ provider }) => { readOnly={!isCurrentWorkspaceManager} configuredList={sources.filter(item => item.provider === provider).map(item => ({ id: item.id, - logo: ({ className }: { className: string }) => ( - item.provider === DataSourceProvider.fireCrawl - ? ( -
🔥
+ logo: ({ className }: { className: string }) => { + if (item.provider === DataSourceProvider.fireCrawl) { + return ( +
🔥
) - : ( -
- + } + + if (item.provider === DataSourceProvider.waterCrawl) { + return ( +
+
) - ), - name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader', + } + return ( +
+ +
+ ) + }, + name: getProviderName(item.provider), isActive: true, }))} onRemove={handleRemove(provider)} /> {configTarget === DataSourceProvider.fireCrawl && ( - + + )} + {configTarget === DataSourceProvider.waterCrawl && ( + )} {configTarget === DataSourceProvider.jinaReader && ( - + )} diff --git a/web/app/components/header/account-setting/data-source-page/index.tsx b/web/app/components/header/account-setting/data-source-page/index.tsx index 93dc2db854..78eeeeac5b 100644 --- a/web/app/components/header/account-setting/data-source-page/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/index.tsx @@ -15,6 +15,7 @@ export default function DataSourcePage() { +
) } diff --git a/web/app/components/header/account-setting/data-source-page/panel/index.tsx b/web/app/components/header/account-setting/data-source-page/panel/index.tsx index 8d64695e89..95ab9a6982 100644 --- a/web/app/components/header/account-setting/data-source-page/panel/index.tsx +++ b/web/app/components/header/account-setting/data-source-page/panel/index.tsx @@ -41,6 +41,12 @@ const Panel: FC = ({ const isNotion = type === DataSourceType.notion const isWebsite = type === DataSourceType.website + const getProviderName = (): string => { + if (provider === DataSourceProvider.fireCrawl) return '🔥 Firecrawl' + if (provider === DataSourceProvider.waterCrawl) return 'WaterCrawl' + return 'Jina Reader' + } + return (
@@ -50,7 +56,7 @@ const Panel: FC = ({
{t(`common.dataSource.${type}.title`)}
{isWebsite && (
- {t('common.dataSource.website.with')} { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'} + {t('common.dataSource.website.with')} {getProviderName()}
)}
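
As a point of reference, the credentials saved through the new ConfigWatercrawlModal above are bound with category "website" and provider "watercrawl", and validated server-side by the new WatercrawlAuth class. A minimal illustrative sketch of that round trip (not part of the patch; the API key is a placeholder):

from services.auth.watercrawl.watercrawl import WatercrawlAuth

credentials = {
    "auth_type": "x-api-key",  # the only auth type WatercrawlAuth accepts
    "config": {
        "api_key": "wc-xxxxxxxx",  # placeholder value
        "base_url": "https://app.watercrawl.dev",  # DEFAULT_BASE_URL used by the modal
    },
}

auth = WatercrawlAuth(credentials)
# validate_credentials() issues GET {base_url}/api/v1/core/crawl-requests/ with an
# X-API-KEY header and returns True on HTTP 200; other statuses raise via _handle_error().
assert auth.validate_credentials() is True
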
diff --git a/web/i18n/en-US/dataset-creation.ts b/web/i18n/en-US/dataset-creation.ts index 72eb44c3de..c55a939ea3 100644 --- a/web/i18n/en-US/dataset-creation.ts +++ b/web/i18n/en-US/dataset-creation.ts @@ -15,6 +15,11 @@ const translation = { apiKeyPlaceholder: 'API key from firecrawl.dev', getApiKeyLinkText: 'Get your API key from firecrawl.dev', }, + watercrawl: { + configWatercrawl: 'Configure Watercrawl', + apiKeyPlaceholder: 'API key from watercrawl.dev', + getApiKeyLinkText: 'Get your API key from watercrawl.dev', + }, jinaReader: { configJinaReader: 'Configure Jina Reader', apiKeyPlaceholder: 'API key from jina.ai', @@ -64,15 +69,21 @@ const translation = { chooseProvider: 'Select a provider', fireCrawlNotConfigured: 'Firecrawl is not configured', fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.', + watercrawlNotConfigured: 'Watercrawl is not configured', + watercrawlNotConfiguredDescription: 'Configure Watercrawl with API key to use it.', jinaReaderNotConfigured: 'Jina Reader is not configured', jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.', configure: 'Configure', configureFirecrawl: 'Configure Firecrawl', + configureWatercrawl: 'Configure Watercrawl', configureJinaReader: 'Configure Jina Reader', run: 'Run', firecrawlTitle: 'Extract web content with 🔥Firecrawl', firecrawlDoc: 'Firecrawl docs', firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website', + watercrawlTitle: 'Extract web content with Watercrawl', + watercrawlDoc: 'Watercrawl docs', + watercrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website', jinaReaderTitle: 'Convert the entire site to Markdown', jinaReaderDoc: 'Learn more about Jina Reader', jinaReaderDocLink: 'https://jina.ai/reader', diff --git a/web/models/common.ts b/web/models/common.ts index 4086220e2e..0ee164aad8 100644 --- a/web/models/common.ts +++ b/web/models/common.ts @@ -178,6 +178,7 @@ export enum DataSourceCategory { export enum DataSourceProvider { fireCrawl = 'firecrawl', jinaReader = 'jinareader', + waterCrawl = 'watercrawl', } export type FirecrawlConfig = { @@ -185,6 +186,11 @@ export type FirecrawlConfig = { base_url: string } +export type WatercrawlConfig = { + api_key: string + base_url: string +} + export type DataSourceItem = { id: string category: DataSourceCategory diff --git a/web/service/datasets.ts b/web/service/datasets.ts index 53b55b375b..f9edb2eeaf 100644 --- a/web/service/datasets.ts +++ b/web/service/datasets.ts @@ -253,6 +253,25 @@ export const checkJinaReaderTaskStatus: Fetcher = (jobId }) } +export const createWatercrawlTask: Fetcher> = (body) => { + return post('website/crawl', { + body: { + ...body, + provider: DataSourceProvider.waterCrawl, + }, + }) +} + +export const checkWatercrawlTaskStatus: Fetcher = (jobId: string) => { + return get(`website/crawl/status/${jobId}`, { + params: { + provider: DataSourceProvider.waterCrawl, + }, + }, { + silent: true, + }) +} + type FileTypesRes = { allowed_extensions: string[] }
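
For orientation, here is a minimal end-to-end sketch (not part of the patch) of how the backend pieces added above fit together: WebsiteService drives the new WaterCrawlProvider, which wraps WaterCrawlAPIClient. The API key, target URL, and option values below are placeholders.

import time

from core.rag.extractor.watercrawl.provider import WaterCrawlProvider

# Placeholder credentials; in Dify these come from the decrypted data-source binding.
provider = WaterCrawlProvider(api_key="wc-xxxxxxxx", base_url="https://app.watercrawl.dev")

# 1. Start a crawl. "limit" and "depth" are mapped by crawl_url() onto
#    spider_options["page_limit"] and spider_options["max_depth"].
job = provider.crawl_url(
    "https://example.com",
    options={
        "crawl_sub_pages": True,
        "limit": 10,
        "depth": 2,
        "only_main_content": True,
    },
)

# 2. Poll until the crawl request leaves the "new"/"running" states
#    (surfaced to the UI as status "active").
status = provider.get_crawl_status(job["job_id"])
while status["status"] == "active":
    time.sleep(2)
    status = provider.get_crawl_status(job["job_id"])

# 3. Each entry in "data" has been normalised by _structure_data() into
#    {"title", "description", "source_url", "markdown"}.
for page in status["data"]:
    print(page["source_url"], page["title"])
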