mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-10 04:49:03 +08:00
refactor: Update Firecrawl API parameters and default settings (#13082)
This commit is contained in:
parent
d44882c1b5
commit
d0a21086bd
@ -13,9 +13,10 @@ class FirecrawlWebExtractor(BaseExtractor):
|
||||
api_key: The API key for Firecrawl.
|
||||
base_url: The base URL for the Firecrawl API. Defaults to 'https://api.firecrawl.dev'.
|
||||
mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
|
||||
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
|
||||
"""
|
||||
|
||||
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
|
||||
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
|
||||
"""Initialize with url, api_key, base_url and mode."""
|
||||
self._url = url
|
||||
self.job_id = job_id
|
||||
|
@ -21,8 +21,8 @@ class FirecrawlAuth(ApiKeyAuthBase):
|
||||
headers = self._prepare_headers()
|
||||
options = {
|
||||
"url": "https://example.com",
|
||||
"excludes": [],
|
||||
"includes": [],
|
||||
"includePaths": [],
|
||||
"excludePaths": [],
|
||||
"limit": 1,
|
||||
"scrapeOptions": {"onlyMainContent": True},
|
||||
}
|
||||
|
@ -38,9 +38,8 @@ class WebsiteService:
|
||||
only_main_content = options.get("only_main_content", False)
|
||||
if not crawl_sub_pages:
|
||||
params = {
|
||||
"includes": [],
|
||||
"excludes": [],
|
||||
"generateImgAltText": True,
|
||||
"includePaths": [],
|
||||
"excludePaths": [],
|
||||
"limit": 1,
|
||||
"scrapeOptions": {"onlyMainContent": only_main_content},
|
||||
}
|
||||
@ -48,9 +47,8 @@ class WebsiteService:
|
||||
includes = options.get("includes").split(",") if options.get("includes") else []
|
||||
excludes = options.get("excludes").split(",") if options.get("excludes") else []
|
||||
params = {
|
||||
"includes": includes,
|
||||
"excludes": excludes,
|
||||
"generateImgAltText": True,
|
||||
"includePaths": includes,
|
||||
"excludePaths": excludes,
|
||||
"limit": options.get("limit", 1),
|
||||
"scrapeOptions": {"onlyMainContent": only_main_content},
|
||||
}
|
||||
|
@ -10,9 +10,8 @@ def test_firecrawl_web_extractor_crawl_mode(mocker):
|
||||
base_url = "https://api.firecrawl.dev"
|
||||
firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
|
||||
params = {
|
||||
"includes": [],
|
||||
"excludes": [],
|
||||
"generateImgAltText": True,
|
||||
"includePaths": [],
|
||||
"excludePaths": [],
|
||||
"maxDepth": 1,
|
||||
"limit": 1,
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user