feat: Integrate WaterCrawl.dev as a new knowledge base provider (#16396)

Co-authored-by: crazywoola <427733928@qq.com>

parent 0afad94378, commit f54905e685
@@ -14,7 +14,12 @@ class WebsiteCrawlApi(Resource):
     def post(self):
         parser = reqparse.RequestParser()
-        parser.add_argument(
-            "provider", type=str, choices=["firecrawl", "jinareader"], required=True, nullable=True, location="json"
-        )
+        parser.add_argument(
+            "provider",
+            type=str,
+            choices=["firecrawl", "watercrawl", "jinareader"],
+            required=True,
+            nullable=True,
+            location="json",
+        )
         parser.add_argument("url", type=str, required=True, nullable=True, location="json")
         parser.add_argument("options", type=dict, required=True, nullable=True, location="json")
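With this change a crawl can be started with provider "watercrawl". A minimal sketch of the request the extended parser now accepts; the console route shown is an assumption for illustration, since the hunk does not include the URL mapping:

import requests

# Hypothetical call; the route and option field names are assumptions, not part of this hunk.
payload = {
    "provider": "watercrawl",  # newly accepted alongside "firecrawl" and "jinareader"
    "url": "https://example.com",
    "options": {"crawl_sub_pages": True, "limit": 10},
}
resp = requests.post("http://localhost:5001/console/api/website/crawl", json=payload)
print(resp.json())  # e.g. {"status": "active", "job_id": "<uuid>"}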
@@ -34,7 +39,9 @@ class WebsiteCrawlStatusApi(Resource):
     @account_initialization_required
     def get(self, job_id: str):
         parser = reqparse.RequestParser()
-        parser.add_argument("provider", type=str, choices=["firecrawl", "jinareader"], required=True, location="args")
+        parser.add_argument(
+            "provider", type=str, choices=["firecrawl", "watercrawl", "jinareader"], required=True, location="args"
+        )
         args = parser.parse_args()
         # get crawl status
         try:
@@ -26,6 +26,7 @@ from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
 from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
 from core.rag.extractor.unstructured.unstructured_xml_extractor import UnstructuredXmlExtractor
+from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor
 from core.rag.extractor.word_extractor import WordExtractor
 from core.rag.models.document import Document
 from extensions.ext_storage import storage

@@ -183,6 +184,15 @@ class ExtractProcessor:
                     only_main_content=extract_setting.website_info.only_main_content,
                 )
                 return extractor.extract()
+            elif extract_setting.website_info.provider == "watercrawl":
+                extractor = WaterCrawlWebExtractor(
+                    url=extract_setting.website_info.url,
+                    job_id=extract_setting.website_info.job_id,
+                    tenant_id=extract_setting.website_info.tenant_id,
+                    mode=extract_setting.website_info.mode,
+                    only_main_content=extract_setting.website_info.only_main_content,
+                )
+                return extractor.extract()
             elif extract_setting.website_info.provider == "jinareader":
                 extractor = JinaReaderWebExtractor(
                     url=extract_setting.website_info.url,
api/core/rag/extractor/watercrawl/client.py (new file, +161 lines)

import json
from collections.abc import Generator
from typing import Union
from urllib.parse import urljoin

import requests
from requests import Response


class BaseAPIClient:
    def __init__(self, api_key, base_url):
        self.api_key = api_key
        self.base_url = base_url
        self.session = self.init_session()

    def init_session(self):
        session = requests.Session()
        session.headers.update({"X-API-Key": self.api_key})
        session.headers.update({"Content-Type": "application/json"})
        session.headers.update({"Accept": "application/json"})
        session.headers.update({"User-Agent": "WaterCrawl-Plugin"})
        session.headers.update({"Accept-Language": "en-US"})
        return session

    def _get(self, endpoint: str, query_params: dict | None = None, **kwargs):
        return self.session.get(urljoin(self.base_url, endpoint), params=query_params, **kwargs)

    def _post(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self.session.post(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)

    def _put(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self.session.put(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)

    def _delete(self, endpoint: str, query_params: dict | None = None, **kwargs):
        return self.session.delete(urljoin(self.base_url, endpoint), params=query_params, **kwargs)

    def _patch(self, endpoint: str, query_params: dict | None = None, data: dict | None = None, **kwargs):
        return self.session.patch(urljoin(self.base_url, endpoint), params=query_params, json=data, **kwargs)


class WaterCrawlAPIClient(BaseAPIClient):
    def __init__(self, api_key, base_url: str | None = "https://app.watercrawl.dev/"):
        super().__init__(api_key, base_url)

    def process_eventstream(self, response: Response, download: bool = False) -> Generator:
        for line in response.iter_lines():
            line = line.decode("utf-8")
            if line.startswith("data:"):
                line = line[5:].strip()
                data = json.loads(line)
                if data["type"] == "result" and download:
                    data["data"] = self.download_result(data["data"])
                yield data

    def process_response(self, response: Response) -> dict | bytes | list | None | Generator:
        response.raise_for_status()
        if response.status_code == 204:
            return None
        if response.headers.get("Content-Type") == "application/json":
            return response.json() or {}

        if response.headers.get("Content-Type") == "application/octet-stream":
            return response.content

        if response.headers.get("Content-Type") == "text/event-stream":
            return self.process_eventstream(response)

        raise Exception(f"Unknown response type: {response.headers.get('Content-Type')}")

    def get_crawl_requests_list(self, page: int | None = None, page_size: int | None = None):
        query_params = {"page": page or 1, "page_size": page_size or 10}
        return self.process_response(
            self._get(
                "/api/v1/core/crawl-requests/",
                query_params=query_params,
            )
        )

    def get_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f"/api/v1/core/crawl-requests/{item_id}/",
            )
        )

    def create_crawl_request(
        self,
        url: Union[list, str] | None = None,
        spider_options: dict | None = None,
        page_options: dict | None = None,
        plugin_options: dict | None = None,
    ):
        data = {
            # 'urls': url if isinstance(url, list) else [url],
            "url": url,
            "options": {
                "spider_options": spider_options or {},
                "page_options": page_options or {},
                "plugin_options": plugin_options or {},
            },
        }
        return self.process_response(
            self._post(
                "/api/v1/core/crawl-requests/",
                data=data,
            )
        )

    def stop_crawl_request(self, item_id: str):
        return self.process_response(
            self._delete(
                f"/api/v1/core/crawl-requests/{item_id}/",
            )
        )

    def download_crawl_request(self, item_id: str):
        return self.process_response(
            self._get(
                f"/api/v1/core/crawl-requests/{item_id}/download/",
            )
        )

    def monitor_crawl_request(self, item_id: str, prefetched=False) -> Generator:
        query_params = {"prefetched": str(prefetched).lower()}
        generator = self.process_response(
            self._get(f"/api/v1/core/crawl-requests/{item_id}/status/", stream=True, query_params=query_params),
        )
        if not isinstance(generator, Generator):
            raise ValueError("Generator expected")
        yield from generator

    def get_crawl_request_results(
        self, item_id: str, page: int = 1, page_size: int = 25, query_params: dict | None = None
    ):
        query_params = query_params or {}
        query_params.update({"page": page or 1, "page_size": page_size or 25})
        return self.process_response(
            self._get(f"/api/v1/core/crawl-requests/{item_id}/results/", query_params=query_params)
        )

    def scrape_url(
        self,
        url: str,
        page_options: dict | None = None,
        plugin_options: dict | None = None,
        sync: bool = True,
        prefetched: bool = True,
    ):
        response_result = self.create_crawl_request(url=url, page_options=page_options, plugin_options=plugin_options)
        if not sync:
            return response_result

        for event_data in self.monitor_crawl_request(response_result["uuid"], prefetched):
            if event_data["type"] == "result":
                return event_data["data"]

    def download_result(self, result_object: dict):
        response = requests.get(result_object["result"])
        response.raise_for_status()
        result_object["result"] = response.json()
        return result_object
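For orientation, a minimal usage sketch of the client above (placeholder API key; not part of the diff):

from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient

client = WaterCrawlAPIClient(api_key="wc-...")  # placeholder key; base_url defaults to https://app.watercrawl.dev/

# Asynchronous path: create a crawl request now, poll or stream its status later.
request = client.create_crawl_request(
    url="https://example.com",
    spider_options={"max_depth": 1, "page_limit": 5},
    page_options={"only_main_content": True},
)
job_id = request["uuid"]

# Synchronous path: scrape_url blocks on the text/event-stream until a "result" event arrives.
result = client.scrape_url("https://example.com", sync=True, prefetched=True)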
api/core/rag/extractor/watercrawl/extractor.py (new file, +57 lines)

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document
from services.website_service import WebsiteService


class WaterCrawlWebExtractor(BaseExtractor):
    """
    Crawl and scrape websites and return content in clean, LLM-ready markdown.

    Args:
        url: The URL to scrape.
        api_key: The API key for WaterCrawl.
        base_url: The base URL for the WaterCrawl API. Defaults to 'https://app.watercrawl.dev'.
        mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
        only_main_content: Only return the main content of the page, excluding headers, navs, footers, etc.
    """

    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
        """Initialize with url, job_id, tenant_id and mode."""
        self._url = url
        self.job_id = job_id
        self.tenant_id = tenant_id
        self.mode = mode
        self.only_main_content = only_main_content

    def extract(self) -> list[Document]:
        """Extract content from the URL."""
        documents = []
        if self.mode == "crawl":
            crawl_data = WebsiteService.get_crawl_url_data(self.job_id, "watercrawl", self._url, self.tenant_id)
            if crawl_data is None:
                return []
            document = Document(
                page_content=crawl_data.get("markdown", ""),
                metadata={
                    "source_url": crawl_data.get("source_url"),
                    "description": crawl_data.get("description"),
                    "title": crawl_data.get("title"),
                },
            )
            documents.append(document)
        elif self.mode == "scrape":
            scrape_data = WebsiteService.get_scrape_url_data(
                "watercrawl", self._url, self.tenant_id, self.only_main_content
            )

            document = Document(
                page_content=scrape_data.get("markdown", ""),
                metadata={
                    "source_url": scrape_data.get("source_url"),
                    "description": scrape_data.get("description"),
                    "title": scrape_data.get("title"),
                },
            )
            documents.append(document)
        return documents
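A sketch of how this extractor is driven (placeholder ids; it must run inside the Flask app context because WebsiteService looks up the tenant's stored WaterCrawl credentials):

from core.rag.extractor.watercrawl.extractor import WaterCrawlWebExtractor

extractor = WaterCrawlWebExtractor(
    url="https://example.com",
    job_id="<crawl-request-uuid>",  # placeholder; in "crawl" mode this selects the finished job's result for the URL
    tenant_id="<tenant-id>",        # placeholder
    mode="crawl",
    only_main_content=True,
)
docs = extractor.extract()  # list[Document]: markdown in page_content, title/description/source_url in metadata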
api/core/rag/extractor/watercrawl/provider.py (new file, +117 lines)

from collections.abc import Generator
from datetime import datetime
from typing import Any

from core.rag.extractor.watercrawl.client import WaterCrawlAPIClient


class WaterCrawlProvider:
    def __init__(self, api_key, base_url: str | None = None):
        self.client = WaterCrawlAPIClient(api_key, base_url)

    def crawl_url(self, url, options: dict | Any = None) -> dict:
        options = options or {}
        spider_options = {
            "max_depth": 1,
            "page_limit": 1,
            "allowed_domains": [],
            "exclude_paths": [],
            "include_paths": [],
        }
        if options.get("crawl_sub_pages", True):
            spider_options["page_limit"] = options.get("limit", 1)
            spider_options["max_depth"] = options.get("depth", 1)
            spider_options["include_paths"] = options.get("includes", "").split(",") if options.get("includes") else []
            spider_options["exclude_paths"] = options.get("excludes", "").split(",") if options.get("excludes") else []

        wait_time = options.get("wait_time", 1000)
        page_options = {
            "exclude_tags": options.get("exclude_tags", "").split(",") if options.get("exclude_tags") else [],
            "include_tags": options.get("include_tags", "").split(",") if options.get("include_tags") else [],
            "wait_time": max(1000, wait_time),  # minimum wait time is 1 second
            "include_html": False,
            "only_main_content": options.get("only_main_content", True),
            "include_links": False,
            "timeout": 15000,
            "accept_cookies_selector": "#cookies-accept",
            "locale": "en-US",
            "actions": [],
        }
        result = self.client.create_crawl_request(url=url, spider_options=spider_options, page_options=page_options)

        return {"status": "active", "job_id": result.get("uuid")}

    def get_crawl_status(self, crawl_request_id) -> dict:
        response = self.client.get_crawl_request(crawl_request_id)
        data = []
        if response["status"] in ["new", "running"]:
            status = "active"
        else:
            status = "completed"
            data = list(self._get_results(crawl_request_id))

        time_str = response.get("duration")
        time_consuming: float = 0
        if time_str:
            time_obj = datetime.strptime(time_str, "%H:%M:%S.%f")
            time_consuming = (
                time_obj.hour * 3600 + time_obj.minute * 60 + time_obj.second + time_obj.microsecond / 1_000_000
            )

        return {
            "status": status,
            "job_id": response.get("uuid"),
            "total": response.get("options", {}).get("spider_options", {}).get("page_limit", 1),
            "current": response.get("number_of_documents", 0),
            "data": data,
            "time_consuming": time_consuming,
        }

    def get_crawl_url_data(self, job_id, url) -> dict | None:
        if not job_id:
            return self.scrape_url(url)

        for result in self._get_results(
            job_id,
            {
                # filter by url
                "url": url
            },
        ):
            return result

        return None

    def scrape_url(self, url: str) -> dict:
        response = self.client.scrape_url(url=url, sync=True, prefetched=True)
        return self._structure_data(response)

    def _structure_data(self, result_object: dict) -> dict:
        if isinstance(result_object.get("result", {}), str):
            raise ValueError("Invalid result object. Expected a dictionary.")

        metadata = result_object.get("result", {}).get("metadata", {})
        return {
            "title": metadata.get("og:title") or metadata.get("title"),
            "description": metadata.get("description"),
            "source_url": result_object.get("url"),
            "markdown": result_object.get("result", {}).get("markdown"),
        }

    def _get_results(self, crawl_request_id: str, query_params: dict | None = None) -> Generator[dict, None, None]:
        page = 0
        page_size = 100

        query_params = query_params or {}
        query_params.update({"prefetched": "true"})
        while True:
            page += 1
            response = self.client.get_crawl_request_results(crawl_request_id, page, page_size, query_params)
            if not response["results"]:
                break

            for result in response["results"]:
                yield self._structure_data(result)

            if response["next"] is None:
                break
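A sketch of the provider's full round trip (placeholder key, simplified polling; not part of the diff):

import time

from core.rag.extractor.watercrawl.provider import WaterCrawlProvider

provider = WaterCrawlProvider(api_key="wc-...")  # placeholder key

# Option names mirror the console crawl options handled in crawl_url above.
job = provider.crawl_url("https://example.com", {"crawl_sub_pages": True, "limit": 5, "depth": 2})

status = provider.get_crawl_status(job["job_id"])
while status["status"] == "active":
    time.sleep(2)
    status = provider.get_crawl_status(job["job_id"])

for page in status["data"]:  # shaped by _structure_data: title, description, source_url, markdown
    print(page["source_url"], page["title"])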
@@ -17,6 +17,10 @@ class ApiKeyAuthFactory:
                 from services.auth.firecrawl.firecrawl import FirecrawlAuth

                 return FirecrawlAuth
+            case AuthType.WATERCRAWL:
+                from services.auth.watercrawl.watercrawl import WatercrawlAuth
+
+                return WatercrawlAuth
             case AuthType.JINA:
                 from services.auth.jina.jina import JinaAuth

@@ -3,4 +3,5 @@ from enum import StrEnum

 class AuthType(StrEnum):
     FIRECRAWL = "firecrawl"
+    WATERCRAWL = "watercrawl"
     JINA = "jinareader"
api/services/auth/watercrawl/__init__.py (new empty file)

api/services/auth/watercrawl/watercrawl.py (new file, +44 lines)

import json
from urllib.parse import urljoin

import requests

from services.auth.api_key_auth_base import ApiKeyAuthBase


class WatercrawlAuth(ApiKeyAuthBase):
    def __init__(self, credentials: dict):
        super().__init__(credentials)
        auth_type = credentials.get("auth_type")
        if auth_type != "x-api-key":
            raise ValueError("Invalid auth type, WaterCrawl auth type must be x-api-key")
        self.api_key = credentials.get("config", {}).get("api_key", None)
        self.base_url = credentials.get("config", {}).get("base_url", "https://app.watercrawl.dev")

        if not self.api_key:
            raise ValueError("No API key provided")

    def validate_credentials(self):
        headers = self._prepare_headers()
        url = urljoin(self.base_url, "/api/v1/core/crawl-requests/")
        response = self._get_request(url, headers)
        if response.status_code == 200:
            return True
        else:
            self._handle_error(response)

    def _prepare_headers(self):
        return {"Content-Type": "application/json", "X-API-KEY": self.api_key}

    def _get_request(self, url, headers):
        return requests.get(url, headers=headers)

    def _handle_error(self, response):
        if response.status_code in {402, 409, 500}:
            error_message = response.json().get("error", "Unknown error occurred")
            raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
        else:
            if response.text:
                error_message = json.loads(response.text).get("error", "Unknown error occurred")
                raise Exception(f"Failed to authorize. Status code: {response.status_code}. Error: {error_message}")
            raise Exception(f"Unexpected error occurred while trying to authorize. Status code: {response.status_code}")
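A sketch of how the factory-dispatched auth class is exercised (placeholder credentials; the dict shape matches what the console stores for the data-source binding):

from services.auth.watercrawl.watercrawl import WatercrawlAuth

auth = WatercrawlAuth({
    "auth_type": "x-api-key",
    "config": {"api_key": "wc-...", "base_url": "https://app.watercrawl.dev"},  # placeholder key
})
auth.validate_credentials()  # True on HTTP 200; raises with the API's error message otherwise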
@@ -7,6 +7,7 @@ from flask_login import current_user  # type: ignore

 from core.helper import encrypter
 from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
+from core.rag.extractor.watercrawl.provider import WaterCrawlProvider
 from extensions.ext_redis import redis_client
 from extensions.ext_storage import storage
 from services.auth.api_key_auth_service import ApiKeyAuthService

@@ -59,6 +60,13 @@ class WebsiteService:
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
+        elif provider == "watercrawl":
+            # decrypt api_key
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).crawl_url(url, options)
+
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")

@@ -116,6 +124,14 @@ class WebsiteService:
            time_consuming = abs(end_time - float(start_time))
            crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
            redis_client.delete(website_crawl_time_cache_key)
+        elif provider == "watercrawl":
+            # decrypt api_key
+            api_key = encrypter.decrypt_token(
+                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
+            )
+            crawl_status_data = WaterCrawlProvider(
+                api_key, credentials.get("config").get("base_url", None)
+            ).get_crawl_status(job_id)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")

@@ -180,6 +196,11 @@ class WebsiteService:
                if item.get("source_url") == url:
                    return dict(item)
            return None
+        elif provider == "watercrawl":
+            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).get_crawl_url_data(
+                job_id, url
+            )
        elif provider == "jinareader":
            if not job_id:
                response = requests.get(

@@ -223,5 +244,8 @@ class WebsiteService:
            params = {"onlyMainContent": only_main_content}
            result = firecrawl_app.scrape_url(url, params)
            return result
+        elif provider == "watercrawl":
+            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
+            return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None)).scrape_url(url)
        else:
            raise ValueError("Invalid provider")
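All four service-layer branches above repeat the same decrypt-then-delegate step; a condensed sketch of that shared pattern (the helper name is illustrative, the diff inlines it in each branch):

from core.helper import encrypter
from core.rag.extractor.watercrawl.provider import WaterCrawlProvider


def watercrawl_provider_for(tenant_id: str, credentials: dict) -> WaterCrawlProvider:
    # Illustrative helper: decrypt the stored API key, then hand it to the provider.
    api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
    return WaterCrawlProvider(api_key, credentials.get("config").get("base_url", None))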
web/app/components/datasets/create/assets/watercrawl.svg (new file, +20 lines, 2.6 KiB)

<?xml version="1.0" encoding="utf-8"?>
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 500 500">
  <path style="fill: rgb(0, 23, 87); stroke: rgb(13, 14, 52);" d="M 247.794 213.903 L 246.81 76.976 L 254.345 76.963 L 254.592 213.989 L 247.794 213.903 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.025" cy="43.859" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 282.472 260.389 L 414.181 330.253 L 410.563 336.234 L 279.38 265.739 L 282.472 260.389 Z"/>
  <path style="fill: rgb(15, 17, 57); stroke: rgb(13, 14, 52);" d="M 255.105 281.394 L 254.485 417.656 L 246.156 417.691 L 246.688 280.51 L 255.105 281.394 Z"/>
  <path style="paint-order: fill; fill: rgb(30, 141, 166); stroke: rgb(30, 141, 166);" d="M 279.486 229.517 L 410.351 160.07 L 413.923 167.04 L 283.727 235.998 L 279.486 229.517 Z"/>
  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 88.545 164.884 L 219.797 236.07 L 222.867 229.568 L 90.887 159.47 L 88.545 164.884 Z"/>
  <path style="fill: rgb(15, 164, 161); stroke: rgb(15, 164, 161);" d="M 224.76 266.9 L 95.55 334.829 L 92.878 328.37 L 219.955 261.275 L 224.76 266.9 Z"/>
  <ellipse style="paint-order: fill; fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="251.242" cy="247.466" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 279.502 433.617 L 408.666 359.443 C 408.666 359.443 412.398 366.965 412.398 366.916 C 412.398 366.867 281.544 440.217 281.544 440.217 L 279.502 433.617 Z"/>
  <path style="fill: rgb(13, 14, 52); stroke: rgb(13, 14, 52);" d="M 223.119 431.408 L 96.643 361.068 L 93.265 368.047 L 218.895 438.099 L 223.119 431.408 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="250.504" cy="451.168" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(90, 191, 187); stroke: rgb(90, 191, 187);" d="M 435.665 180.895 L 435.859 316.869 L 443.103 315.579 L 442.56 180.697 L 435.665 180.895 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="441.06" cy="349.665" rx="33.966" ry="33.906"/>
  <ellipse style="fill: rgb(2, 181, 225); stroke: rgb(2, 181, 225);" cx="441.512" cy="147.767" rx="33.966" ry="33.906"/>
  <path style="fill: rgb(84, 187, 181); stroke: rgb(84, 187, 181);" d="M 64.755 314.523 L 57.928 315.006 L 58.307 182.961 L 65.169 182.865 L 64.755 314.523 Z"/>
  <ellipse style="fill: rgb(0, 23, 87); stroke: rgb(0, 23, 87);" cx="58.177" cy="149.757" rx="33.966" ry="33.906"/>
  <ellipse style="fill: rgb(61, 224, 203); stroke: rgb(61, 224, 203);" cx="65.909" cy="348.17" rx="33.966" ry="33.906"/>
</svg>
@@ -4,3 +4,10 @@
   background-image: url(../assets/jina.png);
   background-size: 16px;
 }
+
+.watercrawlLogo {
+  @apply w-5 h-5 bg-center bg-no-repeat inline-block;
+  /*background-color: #F5FAFF;*/
+  background-image: url(../assets/watercrawl.svg);
+  background-size: 16px;
+}
@@ -5,6 +5,7 @@ import { useTranslation } from 'react-i18next'
 import s from './index.module.css'
 import NoData from './no-data'
 import Firecrawl from './firecrawl'
+import Watercrawl from './watercrawl'
 import JinaReader from './jina-reader'
 import cn from '@/utils/classnames'
 import { useModalContext } from '@/context/modal-context'

@@ -47,7 +48,11 @@ const Website: FC<Props> = ({
     // If users have configured one of the providers, select it.
     const availableProviders = res.sources.filter((item: DataSourceItem) =>
-      [DataSourceProvider.jinaReader, DataSourceProvider.fireCrawl].includes(item.provider),
+      [
+        DataSourceProvider.jinaReader,
+        DataSourceProvider.fireCrawl,
+        DataSourceProvider.waterCrawl,
+      ].includes(item.provider),
     )

     if (availableProviders.length > 0)

@@ -70,6 +75,8 @@ const Website: FC<Props> = ({
   if (!isLoaded)
     return null

+  const source = sources.find(source => source.provider === selectedProvider)
+
   return (
     <div>
       <div className="mb-4">

@@ -86,7 +93,7 @@ const Website: FC<Props> = ({
           )}
           onClick={() => setSelectedProvider(DataSourceProvider.jinaReader)}
         >
-          <span className={cn(s.jinaLogo, 'mr-2')} />
+          <span className={cn(s.jinaLogo, 'mr-2')}/>
           <span>Jina Reader</span>
         </button>
         <button

@@ -100,40 +107,53 @@ const Website: FC<Props> = ({
         >
           🔥 Firecrawl
         </button>
+        <button
+          className={cn('flex items-center justify-center rounded-lg px-4 py-2',
+            selectedProvider === DataSourceProvider.waterCrawl
+              ? 'system-sm-medium border-[1.5px] border-components-option-card-option-selected-border bg-components-option-card-option-selected-bg text-text-primary'
+              : `system-sm-regular border border-components-option-card-option-border bg-components-option-card-option-bg text-text-secondary
+              hover:border-components-option-card-option-border-hover hover:bg-components-option-card-option-bg-hover hover:shadow-xs hover:shadow-shadow-shadow-3`,
+          )}
+          onClick={() => setSelectedProvider(DataSourceProvider.waterCrawl)}
+        >
+          <span className={cn(s.watercrawlLogo, 'mr-2')}/>
+          <span>WaterCrawl</span>
+        </button>
       </div>
     </div>
-    {
-      selectedProvider === DataSourceProvider.fireCrawl
-        ? sources.find(source => source.provider === DataSourceProvider.fireCrawl)
-          ? (
-            <Firecrawl
-              onPreview={onPreview}
-              checkedCrawlResult={checkedCrawlResult}
-              onCheckedCrawlResultChange={onCheckedCrawlResultChange}
-              onJobIdChange={onJobIdChange}
-              crawlOptions={crawlOptions}
-              onCrawlOptionsChange={onCrawlOptionsChange}
-            />
-          )
-          : (
-            <NoData onConfig={handleOnConfig} provider={selectedProvider} />
-          )
-        : sources.find(source => source.provider === DataSourceProvider.jinaReader)
-          ? (
-            <JinaReader
-              onPreview={onPreview}
-              checkedCrawlResult={checkedCrawlResult}
-              onCheckedCrawlResultChange={onCheckedCrawlResultChange}
-              onJobIdChange={onJobIdChange}
-              crawlOptions={crawlOptions}
-              onCrawlOptionsChange={onCrawlOptionsChange}
-            />
-          )
-          : (
-            <NoData onConfig={handleOnConfig} provider={selectedProvider} />
-          )
-    }
+    {source && selectedProvider === DataSourceProvider.fireCrawl && (
+      <Firecrawl
+        onPreview={onPreview}
+        checkedCrawlResult={checkedCrawlResult}
+        onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+        onJobIdChange={onJobIdChange}
+        crawlOptions={crawlOptions}
+        onCrawlOptionsChange={onCrawlOptionsChange}
+      />
+    )}
+    {source && selectedProvider === DataSourceProvider.waterCrawl && (
+      <Watercrawl
+        onPreview={onPreview}
+        checkedCrawlResult={checkedCrawlResult}
+        onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+        onJobIdChange={onJobIdChange}
+        crawlOptions={crawlOptions}
+        onCrawlOptionsChange={onCrawlOptionsChange}
+      />
+    )}
+    {source && selectedProvider === DataSourceProvider.jinaReader && (
+      <JinaReader
+        onPreview={onPreview}
+        checkedCrawlResult={checkedCrawlResult}
+        onCheckedCrawlResultChange={onCheckedCrawlResultChange}
+        onJobIdChange={onJobIdChange}
+        crawlOptions={crawlOptions}
+        onCrawlOptionsChange={onCrawlOptionsChange}
+      />
+    )}
+    {!source && (
+      <NoData onConfig={handleOnConfig} provider={selectedProvider}/>
+    )}
     </div>
   )
 }
@@ -31,6 +31,11 @@ const NoData: FC<Props> = ({
       title: t(`${I18N_PREFIX}.fireCrawlNotConfigured`),
       description: t(`${I18N_PREFIX}.fireCrawlNotConfiguredDescription`),
     },
+    [DataSourceProvider.waterCrawl]: {
+      emoji: <span className={s.watercrawlLogo} />,
+      title: t(`${I18N_PREFIX}.waterCrawlNotConfigured`),
+      description: t(`${I18N_PREFIX}.waterCrawlNotConfiguredDescription`),
+    },
   }

   const currentProvider = providerConfig[provider]
web/app/components/datasets/create/website/watercrawl/header.tsx (new file, +43 lines)

'use client'
import type { FC } from 'react'
import React from 'react'
import { useTranslation } from 'react-i18next'
import { RiBookOpenLine, RiEqualizer2Line } from '@remixicon/react'
import Button from '@/app/components/base/button'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onSetting: () => void
}

const Header: FC<Props> = ({
  onSetting,
}) => {
  const { t } = useTranslation()

  return (
    <div className='flex h-6 items-center justify-between'>
      <div className='flex items-center'>
        <div className='text-base font-medium text-text-secondary'>{t(`${I18N_PREFIX}.watercrawlTitle`)}</div>
        <div className='ml-2 mr-2 w-px h-3.5 bg-divider-regular' />
        <Button className='flex items-center gap-x-[1px] h-6 px-1.5' onClick={onSetting}>
          <RiEqualizer2Line className='w-3.5 h-3.5 text-components-button-secondary-text' />
          <span className='text-components-button-secondary-text text-xs font-medium px-[3px]'>
            {t(`${I18N_PREFIX}.configureWatercrawl`)}
          </span>
        </Button>
      </div>
      <a
        href='https://docs.watercrawl.dev/'
        target='_blank'
        rel='noopener noreferrer'
        className='inline-flex items-center gap-x-1 text-xs font-medium text-text-accent'
      >
        <RiBookOpenLine className='w-3.5 h-3.5 text-text-accent' />
        <span>{t(`${I18N_PREFIX}.watercrawlDoc`)}</span>
      </a>
    </div>
  )
}
export default React.memo(Header)
web/app/components/datasets/create/website/watercrawl/index.tsx (new file, +217 lines)

'use client'
import type { FC } from 'react'
import React, { useCallback, useEffect, useState } from 'react'
import { useTranslation } from 'react-i18next'
import UrlInput from '../base/url-input'
import OptionsWrap from '../base/options-wrap'
import CrawledResult from '../base/crawled-result'
import Crawling from '../base/crawling'
import ErrorMessage from '../base/error-message'
import Header from './header'
import Options from './options'
import { useModalContext } from '@/context/modal-context'
import type { CrawlOptions, CrawlResultItem } from '@/models/datasets'
import Toast from '@/app/components/base/toast'
import { checkWatercrawlTaskStatus, createWatercrawlTask } from '@/service/datasets'
import { sleep } from '@/utils'

const ERROR_I18N_PREFIX = 'common.errorMsg'
const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  onPreview: (payload: CrawlResultItem) => void
  checkedCrawlResult: CrawlResultItem[]
  onCheckedCrawlResultChange: (payload: CrawlResultItem[]) => void
  onJobIdChange: (jobId: string) => void
  crawlOptions: CrawlOptions
  onCrawlOptionsChange: (payload: CrawlOptions) => void
}

enum Step {
  init = 'init',
  running = 'running',
  finished = 'finished',
}

const WaterCrawl: FC<Props> = ({
  onPreview,
  checkedCrawlResult,
  onCheckedCrawlResultChange,
  onJobIdChange,
  crawlOptions,
  onCrawlOptionsChange,
}) => {
  const { t } = useTranslation()
  const [step, setStep] = useState<Step>(Step.init)
  const [controlFoldOptions, setControlFoldOptions] = useState<number>(0)
  useEffect(() => {
    if (step !== Step.init)
      setControlFoldOptions(Date.now())
  }, [step])
  const { setShowAccountSettingModal } = useModalContext()
  const handleSetting = useCallback(() => {
    setShowAccountSettingModal({
      payload: 'data-source',
    })
  }, [setShowAccountSettingModal])

  const checkValid = useCallback((url: string) => {
    let errorMsg = ''
    if (!url) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: 'url',
      })
    }

    if (!errorMsg && !((url.startsWith('http://') || url.startsWith('https://'))))
      errorMsg = t(`${ERROR_I18N_PREFIX}.urlError`)

    if (!errorMsg && (crawlOptions.limit === null || crawlOptions.limit === undefined || crawlOptions.limit === '')) {
      errorMsg = t(`${ERROR_I18N_PREFIX}.fieldRequired`, {
        field: t(`${I18N_PREFIX}.limit`),
      })
    }

    return {
      isValid: !errorMsg,
      errorMsg,
    }
  }, [crawlOptions, t])

  const isInit = step === Step.init
  const isCrawlFinished = step === Step.finished
  const isRunning = step === Step.running
  const [crawlResult, setCrawlResult] = useState<{
    current: number
    total: number
    data: CrawlResultItem[]
    time_consuming: number | string
  } | undefined>(undefined)
  const [crawlErrorMessage, setCrawlErrorMessage] = useState('')
  const showError = isCrawlFinished && crawlErrorMessage

  const waitForCrawlFinished = useCallback(async (jobId: string): Promise<any> => {
    try {
      const res = await checkWatercrawlTaskStatus(jobId) as any
      if (res.status === 'completed') {
        return {
          isError: false,
          data: {
            ...res,
            total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
          },
        }
      }
      if (res.status === 'error' || !res.status) {
        // can't get the error message from the watercrawl api
        return {
          isError: true,
          errorMessage: res.message,
          data: {
            data: [],
          },
        }
      }
      // update the progress
      setCrawlResult({
        ...res,
        total: Math.min(res.total, Number.parseFloat(crawlOptions.limit as string)),
      })
      onCheckedCrawlResultChange(res.data || []) // default select the crawl result
      await sleep(2500)
      return await waitForCrawlFinished(jobId)
    }
    catch (e: any) {
      const errorBody = await e.json()
      return {
        isError: true,
        errorMessage: errorBody.message,
        data: {
          data: [],
        },
      }
    }
  }, [crawlOptions.limit])

  const handleRun = useCallback(async (url: string) => {
    const { isValid, errorMsg } = checkValid(url)
    if (!isValid) {
      Toast.notify({
        message: errorMsg!,
        type: 'error',
      })
      return
    }
    setStep(Step.running)
    try {
      const passToServerCrawlOptions: any = {
        ...crawlOptions,
      }
      if (crawlOptions.max_depth === '')
        delete passToServerCrawlOptions.max_depth

      const res = await createWatercrawlTask({
        url,
        options: passToServerCrawlOptions,
      }) as any
      const jobId = res.job_id
      onJobIdChange(jobId)
      const { isError, data, errorMessage } = await waitForCrawlFinished(jobId)
      if (isError) {
        setCrawlErrorMessage(errorMessage || t(`${I18N_PREFIX}.unknownError`))
      }
      else {
        setCrawlResult(data)
        onCheckedCrawlResultChange(data.data || []) // default select the crawl result
        setCrawlErrorMessage('')
      }
    }
    catch (e) {
      setCrawlErrorMessage(t(`${I18N_PREFIX}.unknownError`)!)
      console.log(e)
    }
    finally {
      setStep(Step.finished)
    }
  }, [checkValid, crawlOptions, onJobIdChange, t, waitForCrawlFinished])

  return (
    <div>
      <Header onSetting={handleSetting} />
      <div className='mt-2 rounded-xl border border-components-panel-border bg-background-default-subtle p-4 pb-0'>
        <UrlInput onRun={handleRun} isRunning={isRunning} />
        <OptionsWrap
          className='mt-4'
          controlFoldOptions={controlFoldOptions}
        >
          <Options className='mt-2' payload={crawlOptions} onChange={onCrawlOptionsChange} />
        </OptionsWrap>

        {!isInit && (
          <div className='relative left-[-16px] mt-3 w-[calc(100%_+_32px)] rounded-b-xl'>
            {isRunning
              && <Crawling
                className='mt-2'
                crawledNum={crawlResult?.current || 0}
                totalNum={crawlResult?.total || Number.parseFloat(crawlOptions.limit as string) || 0}
              />}
            {showError && (
              <ErrorMessage className='rounded-b-xl' title={t(`${I18N_PREFIX}.exceptionErrorTitle`)} errorMsg={crawlErrorMessage} />
            )}
            {isCrawlFinished && !showError
              && <CrawledResult
                className='mb-2'
                list={crawlResult?.data || []}
                checkedList={checkedCrawlResult}
                onSelectedChange={onCheckedCrawlResultChange}
                onPreview={onPreview}
                usedTime={Number.parseFloat(crawlResult?.time_consuming as string) || 0}
              />
            }
          </div>
        )}
      </div>
    </div>
  )
}
export default React.memo(WaterCrawl)
web/app/components/datasets/create/website/watercrawl/options.tsx (new file, +85 lines)

'use client'
import type { FC } from 'react'
import React, { useCallback } from 'react'
import { useTranslation } from 'react-i18next'
import CheckboxWithLabel from '../base/checkbox-with-label'
import Field from '../base/field'
import cn from '@/utils/classnames'
import type { CrawlOptions } from '@/models/datasets'

const I18N_PREFIX = 'datasetCreation.stepOne.website'

type Props = {
  className?: string
  payload: CrawlOptions
  onChange: (payload: CrawlOptions) => void
}

const Options: FC<Props> = ({
  className = '',
  payload,
  onChange,
}) => {
  const { t } = useTranslation()

  const handleChange = useCallback((key: keyof CrawlOptions) => {
    return (value: any) => {
      onChange({
        ...payload,
        [key]: value,
      })
    }
  }, [payload, onChange])
  return (
    <div className={cn(className, ' space-y-2')}>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.crawlSubPage`)}
        isChecked={payload.crawl_sub_pages}
        onChange={handleChange('crawl_sub_pages')}
        labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
      />
      <div className='flex justify-between space-x-4'>
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.limit`)}
          value={payload.limit}
          onChange={handleChange('limit')}
          isNumber
          isRequired
        />
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.maxDepth`)}
          value={payload.max_depth}
          onChange={handleChange('max_depth')}
          isNumber
          tooltip={t(`${I18N_PREFIX}.maxDepthTooltip`)!}
        />
      </div>

      <div className='flex justify-between space-x-4'>
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.excludePaths`)}
          value={payload.excludes}
          onChange={handleChange('excludes')}
          placeholder='blog/*, /about/*'
        />
        <Field
          className='shrink-0 grow'
          label={t(`${I18N_PREFIX}.includeOnlyPaths`)}
          value={payload.includes}
          onChange={handleChange('includes')}
          placeholder='articles/*'
        />
      </div>
      <CheckboxWithLabel
        label={t(`${I18N_PREFIX}.extractOnlyMainContent`)}
        isChecked={payload.only_main_content}
        onChange={handleChange('only_main_content')}
        labelClassName='text-[13px] leading-[16px] font-medium text-text-secondary'
      />
    </div>
  )
}
export default React.memo(Options)
config-watercrawl-modal.tsx (new file, +161 lines)

'use client'
import type { FC } from 'react'
import React, { useCallback, useState } from 'react'
import { useTranslation } from 'react-i18next'
import {
  PortalToFollowElem,
  PortalToFollowElemContent,
} from '@/app/components/base/portal-to-follow-elem'
import { Lock01 } from '@/app/components/base/icons/src/vender/solid/security'
import Button from '@/app/components/base/button'
import type { WatercrawlConfig } from '@/models/common'
import Field from '@/app/components/datasets/create/website/base/field'
import Toast from '@/app/components/base/toast'
import { createDataSourceApiKeyBinding } from '@/service/datasets'
import { LinkExternal02 } from '@/app/components/base/icons/src/vender/line/general'
type Props = {
  onCancel: () => void
  onSaved: () => void
}

const I18N_PREFIX = 'datasetCreation.watercrawl'

const DEFAULT_BASE_URL = 'https://app.watercrawl.dev'

const ConfigWatercrawlModal: FC<Props> = ({
  onCancel,
  onSaved,
}) => {
  const { t } = useTranslation()
  const [isSaving, setIsSaving] = useState(false)
  const [config, setConfig] = useState<WatercrawlConfig>({
    api_key: '',
    base_url: '',
  })

  const handleConfigChange = useCallback((key: string) => {
    return (value: string | number) => {
      setConfig(prev => ({ ...prev, [key]: value as string }))
    }
  }, [])

  const handleSave = useCallback(async () => {
    if (isSaving)
      return
    let errorMsg = ''
    if (config.base_url && !((config.base_url.startsWith('http://') || config.base_url.startsWith('https://'))))
      errorMsg = t('common.errorMsg.urlError')
    if (!errorMsg) {
      if (!config.api_key) {
        errorMsg = t('common.errorMsg.fieldRequired', {
          field: 'API Key',
        })
      }
    }

    if (errorMsg) {
      Toast.notify({
        type: 'error',
        message: errorMsg,
      })
      return
    }
    const postData = {
      category: 'website',
      provider: 'watercrawl',
      credentials: {
        auth_type: 'x-api-key',
        config: {
          api_key: config.api_key,
          base_url: config.base_url || DEFAULT_BASE_URL,
        },
      },
    }
    try {
      setIsSaving(true)
      await createDataSourceApiKeyBinding(postData)
      Toast.notify({
        type: 'success',
        message: t('common.api.success'),
      })
    }
    finally {
      setIsSaving(false)
    }

    onSaved()
  }, [config.api_key, config.base_url, onSaved, t, isSaving])

  return (
    <PortalToFollowElem open>
      <PortalToFollowElemContent className='w-full h-full z-[60]'>
        <div className='fixed inset-0 flex items-center justify-center bg-background-overlay'>
          <div className='mx-2 w-[640px] max-h-[calc(100vh-120px)] bg-components-panel-bg shadow-xl rounded-2xl overflow-y-auto'>
            <div className='px-8 pt-8'>
              <div className='flex justify-between items-center mb-4'>
                <div className='system-xl-semibold text-text-primary'>{t(`${I18N_PREFIX}.configWatercrawl`)}</div>
              </div>

              <div className='space-y-4'>
                <Field
                  label='API Key'
                  labelClassName='!text-sm'
                  isRequired
                  value={config.api_key}
                  onChange={handleConfigChange('api_key')}
                  placeholder={t(`${I18N_PREFIX}.apiKeyPlaceholder`)!}
                />
                <Field
                  label='Base URL'
                  labelClassName='!text-sm'
                  value={config.base_url}
                  onChange={handleConfigChange('base_url')}
                  placeholder={DEFAULT_BASE_URL}
                />
              </div>
              <div className='my-8 flex justify-between items-center h-8'>
                <a className='flex items-center space-x-1 leading-[18px] text-xs font-normal text-text-accent' target='_blank' href='https://app.watercrawl.dev/'>
                  <span>{t(`${I18N_PREFIX}.getApiKeyLinkText`)}</span>
                  <LinkExternal02 className='w-3 h-3' />
                </a>
                <div className='flex'>
                  <Button
                    size='large'
                    className='mr-2'
                    onClick={onCancel}
                  >
                    {t('common.operation.cancel')}
                  </Button>
                  <Button
                    variant='primary'
                    size='large'
                    onClick={handleSave}
                    loading={isSaving}
                  >
                    {t('common.operation.save')}
                  </Button>
                </div>

              </div>
            </div>
            <div className='border-t-[0.5px] border-t-divider-regular'>
              <div className='flex justify-center items-center py-3 bg-background-section-burn text-xs text-text-tertiary'>
                <Lock01 className='mr-1 w-3 h-3 text-text-tertiary' />
                {t('common.modelProvider.encrypted.front')}
                <a
                  className='text-text-accent mx-1'
                  target='_blank' rel='noopener noreferrer'
                  href='https://pycryptodome.readthedocs.io/en/latest/src/cipher/oaep.html'
                >
                  PKCS1_OAEP
                </a>
                {t('common.modelProvider.encrypted.back')}
              </div>
            </div>
          </div>
        </div>
      </PortalToFollowElemContent>
    </PortalToFollowElem>
  )
}
export default React.memo(ConfigWatercrawlModal)
@ -5,19 +5,15 @@ import { useTranslation } from 'react-i18next'
|
|||||||
import Panel from '../panel'
|
import Panel from '../panel'
|
||||||
import { DataSourceType } from '../panel/types'
|
import { DataSourceType } from '../panel/types'
|
||||||
import ConfigFirecrawlModal from './config-firecrawl-modal'
|
import ConfigFirecrawlModal from './config-firecrawl-modal'
|
||||||
|
import ConfigWatercrawlModal from './config-watercrawl-modal'
|
||||||
import ConfigJinaReaderModal from './config-jina-reader-modal'
|
import ConfigJinaReaderModal from './config-jina-reader-modal'
|
||||||
import cn from '@/utils/classnames'
|
import cn from '@/utils/classnames'
|
||||||
import s from '@/app/components/datasets/create/website/index.module.css'
|
import s from '@/app/components/datasets/create/website/index.module.css'
|
||||||
import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
|
import { fetchDataSources, removeDataSourceApiKeyBinding } from '@/service/datasets'
|
||||||
|
|
||||||
import type {
|
import type { DataSourceItem } from '@/models/common'
|
||||||
DataSourceItem,
|
import { DataSourceProvider } from '@/models/common'
|
||||||
} from '@/models/common'
|
|
||||||
import { useAppContext } from '@/context/app-context'
|
import { useAppContext } from '@/context/app-context'
|
||||||
|
|
||||||
import {
|
|
||||||
DataSourceProvider,
|
|
||||||
} from '@/models/common'
|
|
||||||
import Toast from '@/app/components/base/toast'
|
import Toast from '@/app/components/base/toast'
|
||||||
|
|
||||||
type Props = {
|
type Props = {
|
||||||
@@ -58,6 +54,16 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
     return source?.id
   }
 
+  const getProviderName = (provider: DataSourceProvider): string => {
+    if (provider === DataSourceProvider.fireCrawl)
+      return 'Firecrawl'
+
+    if (provider === DataSourceProvider.waterCrawl)
+      return 'WaterCrawl'
+
+    return 'Jina Reader'
+  }
+
   const handleRemove = useCallback((provider: DataSourceProvider) => {
     return async () => {
       const dataSourceId = getIdByProvider(provider)
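The helper centralizes the enum-to-label mapping that was previously inlined in the `name` field. Illustratively:

getProviderName(DataSourceProvider.fireCrawl)  // => 'Firecrawl'
getProviderName(DataSourceProvider.waterCrawl) // => 'WaterCrawl'
getProviderName(DataSourceProvider.jinaReader) // => 'Jina Reader' (fallthrough branch)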
@@ -82,27 +88,42 @@ const DataSourceWebsite: FC<Props> = ({ provider }) => {
         readOnly={!isCurrentWorkspaceManager}
         configuredList={sources.filter(item => item.provider === provider).map(item => ({
           id: item.id,
-          logo: ({ className }: { className: string }) => (
-            item.provider === DataSourceProvider.fireCrawl
-              ? (
-                <div className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>🔥</div>
-              )
-              : (
-                <div className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
-                  <span className={s.jinaLogo} />
-                </div>
-              )
-          ),
-          name: item.provider === DataSourceProvider.fireCrawl ? 'Firecrawl' : 'Jina Reader',
+          logo: ({ className }: { className: string }) => {
+            if (item.provider === DataSourceProvider.fireCrawl) {
+              return (
+                <div
+                  className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>🔥</div>
+              )
+            }
+
+            if (item.provider === DataSourceProvider.waterCrawl) {
+              return (
+                <div
+                  className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
+                  <span className={s.watercrawlLogo}/>
+                </div>
+              )
+            }
+            return (
+              <div
+                className={cn(className, 'ml-3 flex h-5 w-5 items-center justify-center rounded border border-divider-subtle !bg-background-default text-xs font-medium text-text-tertiary')}>
+                <span className={s.jinaLogo}/>
+              </div>
+            )
+          },
+          name: getProviderName(item.provider),
           isActive: true,
         }))}
         onRemove={handleRemove(provider)}
       />
       {configTarget === DataSourceProvider.fireCrawl && (
-        <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig} />
+        <ConfigFirecrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
+      )}
+      {configTarget === DataSourceProvider.waterCrawl && (
+        <ConfigWatercrawlModal onSaved={handleAdded} onCancel={hideConfig}/>
       )}
       {configTarget === DataSourceProvider.jinaReader && (
-        <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig} />
+        <ConfigJinaReaderModal onSaved={handleAdded} onCancel={hideConfig}/>
       )}
     </>
 
@@ -15,6 +15,7 @@ export default function DataSourcePage() {
       <DataSourceNotion workspaces={notionWorkspaces} />
       <DataSourceWebsite provider={DataSourceProvider.jinaReader} />
       <DataSourceWebsite provider={DataSourceProvider.fireCrawl} />
+      <DataSourceWebsite provider={DataSourceProvider.waterCrawl} />
     </div>
   )
 }
@@ -41,6 +41,12 @@ const Panel: FC<Props> = ({
   const isNotion = type === DataSourceType.notion
   const isWebsite = type === DataSourceType.website
 
+  const getProviderName = (): string => {
+    if (provider === DataSourceProvider.fireCrawl) return '🔥 Firecrawl'
+    if (provider === DataSourceProvider.waterCrawl) return 'WaterCrawl'
+    return 'Jina Reader'
+  }
+
   return (
     <div className='mb-2 rounded-xl bg-background-section-burn'>
       <div className='flex items-center px-3 py-[9px]'>
@@ -50,7 +56,7 @@ const Panel: FC<Props> = ({
           <div className='text-sm font-medium text-text-primary'>{t(`common.dataSource.${type}.title`)}</div>
           {isWebsite && (
             <div className='ml-1 rounded-md bg-components-badge-white-to-dark px-1.5 text-xs font-medium leading-[18px] text-text-secondary'>
-              <span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> { provider === DataSourceProvider.fireCrawl ? '🔥 Firecrawl' : 'Jina Reader'}
+              <span className='text-text-tertiary'>{t('common.dataSource.website.with')}</span> {getProviderName()}
             </div>
           )}
         </div>
@@ -15,6 +15,11 @@ const translation = {
     apiKeyPlaceholder: 'API key from firecrawl.dev',
     getApiKeyLinkText: 'Get your API key from firecrawl.dev',
   },
+  watercrawl: {
+    configWatercrawl: 'Configure Watercrawl',
+    apiKeyPlaceholder: 'API key from watercrawl.dev',
+    getApiKeyLinkText: 'Get your API key from watercrawl.dev',
+  },
   jinaReader: {
     configJinaReader: 'Configure Jina Reader',
     apiKeyPlaceholder: 'API key from jina.ai',
@@ -64,15 +69,21 @@ const translation = {
     chooseProvider: 'Select a provider',
     fireCrawlNotConfigured: 'Firecrawl is not configured',
     fireCrawlNotConfiguredDescription: 'Configure Firecrawl with API key to use it.',
+    watercrawlNotConfigured: 'Watercrawl is not configured',
+    watercrawlNotConfiguredDescription: 'Configure Watercrawl with API key to use it.',
     jinaReaderNotConfigured: 'Jina Reader is not configured',
     jinaReaderNotConfiguredDescription: 'Set up Jina Reader by entering your free API key for access.',
     configure: 'Configure',
     configureFirecrawl: 'Configure Firecrawl',
+    configureWatercrawl: 'Configure Watercrawl',
     configureJinaReader: 'Configure Jina Reader',
     run: 'Run',
     firecrawlTitle: 'Extract web content with 🔥Firecrawl',
     firecrawlDoc: 'Firecrawl docs',
     firecrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
+    watercrawlTitle: 'Extract web content with Watercrawl',
+    watercrawlDoc: 'Watercrawl docs',
+    watercrawlDocLink: 'https://docs.dify.ai/guides/knowledge-base/sync-from-website',
     jinaReaderTitle: 'Convert the entire site to Markdown',
     jinaReaderDoc: 'Learn more about Jina Reader',
     jinaReaderDocLink: 'https://jina.ai/reader',
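These strings are resolved through the i18n helper. Illustratively, and assuming the modal's I18N_PREFIX points at the watercrawl block above (the exact namespace depends on where this translation file is mounted):

t(`${I18N_PREFIX}.apiKeyPlaceholder`)  // => 'API key from watercrawl.dev'
t(`${I18N_PREFIX}.getApiKeyLinkText`)  // => 'Get your API key from watercrawl.dev'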
@@ -178,6 +178,7 @@ export enum DataSourceCategory {
 export enum DataSourceProvider {
   fireCrawl = 'firecrawl',
   jinaReader = 'jinareader',
+  waterCrawl = 'watercrawl',
 }
 
 export type FirecrawlConfig = {
@@ -185,6 +186,11 @@ export type FirecrawlConfig = {
   base_url: string
 }
 
+export type WatercrawlConfig = {
+  api_key: string
+  base_url: string
+}
+
 export type DataSourceItem = {
   id: string
   category: DataSourceCategory
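The new WatercrawlConfig mirrors the existing FirecrawlConfig shape. A sketch of a conforming value; the key and URL are placeholders, not defaults defined by this commit:

const config: WatercrawlConfig = {
  api_key: 'wc-xxxxxxxxxxxx',             // placeholder key
  base_url: 'https://app.watercrawl.dev', // placeholder endpoint
}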
@@ -253,6 +253,25 @@ export const checkJinaReaderTaskStatus: Fetcher<CommonResponse, string> = (jobId
   })
 }
 
+export const createWatercrawlTask: Fetcher<CommonResponse, Record<string, any>> = (body) => {
+  return post<CommonResponse>('website/crawl', {
+    body: {
+      ...body,
+      provider: DataSourceProvider.waterCrawl,
+    },
+  })
+}
+
+export const checkWatercrawlTaskStatus: Fetcher<CommonResponse, string> = (jobId: string) => {
+  return get<CommonResponse>(`website/crawl/status/${jobId}`, {
+    params: {
+      provider: DataSourceProvider.waterCrawl,
+    },
+  }, {
+    silent: true,
+  })
+}
+
 type FileTypesRes = {
   allowed_extensions: string[]
 }
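Together the two fetchers support a create-then-poll flow against the website/crawl endpoints. A minimal sketch of a caller, assuming the create response exposes a job_id and that the status payload carries status/data fields (both field names are assumptions, not guaranteed by this diff):

async function crawlWithWatercrawl(url: string, options: Record<string, any>) {
  // Kick off the crawl; the provider field is injected by createWatercrawlTask.
  const created: any = await createWatercrawlTask({ url, options })
  const jobId: string = created.job_id // assumed field name

  // Poll the status endpoint until the provider reports completion.
  for (;;) {
    const res: any = await checkWatercrawlTaskStatus(jobId)
    if (res.status === 'completed') // assumed status value
      return res.data
    await new Promise(resolve => setTimeout(resolve, 2000))
  }
}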