mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-10 07:19:01 +08:00
refactor: Update Firecrawl API parameters and default settings (#13082)
This commit is contained in:
parent
d44882c1b5
commit
d0a21086bd
@ -13,9 +13,10 @@ class FirecrawlWebExtractor(BaseExtractor):
|
|||||||
api_key: The API key for Firecrawl.
|
api_key: The API key for Firecrawl.
|
||||||
base_url: The base URL for the Firecrawl API. Defaults to 'https://api.firecrawl.dev'.
|
base_url: The base URL for the Firecrawl API. Defaults to 'https://api.firecrawl.dev'.
|
||||||
mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
|
mode: The mode of operation. Defaults to 'crawl'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
|
||||||
|
only_main_content: Only return the main content of the page excluding headers, navs, footers, etc. Defaults to True.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
|
def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
|
||||||
"""Initialize with url, job_id, tenant_id, mode and only_main_content."""
|
"""Initialize with url, job_id, tenant_id, mode and only_main_content."""
|
||||||
self._url = url
|
self._url = url
|
||||||
self.job_id = job_id
|
self.job_id = job_id
|
||||||
|
@ -21,8 +21,8 @@ class FirecrawlAuth(ApiKeyAuthBase):
|
|||||||
headers = self._prepare_headers()
|
headers = self._prepare_headers()
|
||||||
options = {
|
options = {
|
||||||
"url": "https://example.com",
|
"url": "https://example.com",
|
||||||
"excludes": [],
|
"includePaths": [],
|
||||||
"includes": [],
|
"excludePaths": [],
|
||||||
"limit": 1,
|
"limit": 1,
|
||||||
"scrapeOptions": {"onlyMainContent": True},
|
"scrapeOptions": {"onlyMainContent": True},
|
||||||
}
|
}
|
||||||
|
@ -38,9 +38,8 @@ class WebsiteService:
|
|||||||
only_main_content = options.get("only_main_content", False)
|
only_main_content = options.get("only_main_content", False)
|
||||||
if not crawl_sub_pages:
|
if not crawl_sub_pages:
|
||||||
params = {
|
params = {
|
||||||
"includes": [],
|
"includePaths": [],
|
||||||
"excludes": [],
|
"excludePaths": [],
|
||||||
"generateImgAltText": True,
|
|
||||||
"limit": 1,
|
"limit": 1,
|
||||||
"scrapeOptions": {"onlyMainContent": only_main_content},
|
"scrapeOptions": {"onlyMainContent": only_main_content},
|
||||||
}
|
}
|
||||||
@ -48,9 +47,8 @@ class WebsiteService:
|
|||||||
includes = options.get("includes").split(",") if options.get("includes") else []
|
includes = options.get("includes").split(",") if options.get("includes") else []
|
||||||
excludes = options.get("excludes").split(",") if options.get("excludes") else []
|
excludes = options.get("excludes").split(",") if options.get("excludes") else []
|
||||||
params = {
|
params = {
|
||||||
"includes": includes,
|
"includePaths": includes,
|
||||||
"excludes": excludes,
|
"excludePaths": excludes,
|
||||||
"generateImgAltText": True,
|
|
||||||
"limit": options.get("limit", 1),
|
"limit": options.get("limit", 1),
|
||||||
"scrapeOptions": {"onlyMainContent": only_main_content},
|
"scrapeOptions": {"onlyMainContent": only_main_content},
|
||||||
}
|
}
|
||||||
|
@ -10,9 +10,8 @@ def test_firecrawl_web_extractor_crawl_mode(mocker):
|
|||||||
base_url = "https://api.firecrawl.dev"
|
base_url = "https://api.firecrawl.dev"
|
||||||
firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
|
firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
|
||||||
params = {
|
params = {
|
||||||
"includes": [],
|
"includePaths": [],
|
||||||
"excludes": [],
|
"excludePaths": [],
|
||||||
"generateImgAltText": True,
|
|
||||||
"maxDepth": 1,
|
"maxDepth": 1,
|
||||||
"limit": 1,
|
"limit": 1,
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user