refactor: Update Firecrawl API parameters and default settings (#13082)

Ademílson Tonato, 2025-01-29 03:21:05 +00:00, committed by GitHub
parent d44882c1b5
commit d0a21086bd
4 changed files with 10 additions and 12 deletions
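
Summary: this commit migrates the Firecrawl integration to the renamed crawl parameters (includes/excludes -> includePaths/excludePaths), drops the retired generateImgAltText flag, and flips the extractor's only_main_content default to True. A minimal before/after sketch of the request body, inferred from the hunks below (plain dict literals, not an official Firecrawl reference):

# Before: old-style parameter names.
old_params = {
    "includes": [],
    "excludes": [],
    "generateImgAltText": True,
    "limit": 1,
    "scrapeOptions": {"onlyMainContent": False},
}

# After: renamed path filters; generateImgAltText retired;
# onlyMainContent now defaults to True in the extractor.
new_params = {
    "includePaths": [],
    "excludePaths": [],
    "limit": 1,
    "scrapeOptions": {"onlyMainContent": True},
}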


@@ -13,9 +13,10 @@ class FirecrawlWebExtractor(BaseExtractor):
         api_key: The API key for Firecrawl.
         base_url: The base URL for the Firecrawl API. Defaults to 'https://api.firecrawl.dev'.
         mode: The mode of operation. Defaults to 'scrape'. Options are 'crawl', 'scrape' and 'crawl_return_urls'.
+        only_main_content: Only return the main content of the page excluding headers, navs, footers, etc.
     """

-    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = False):
+    def __init__(self, url: str, job_id: str, tenant_id: str, mode: str = "crawl", only_main_content: bool = True):
        """Initialize with url, api_key, base_url and mode."""
        self._url = url
        self.job_id = job_id
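
The only behavioral change in this hunk is the flipped default for only_main_content (False -> True); the docstring line is new. A standalone sketch of what the new default means for callers (build_scrape_options is a hypothetical helper, not repo code):

def build_scrape_options(only_main_content: bool = True) -> dict:
    # Matches the new extractor default: main content only, unless the
    # caller explicitly opts back in to headers/navs/footers.
    return {"onlyMainContent": only_main_content}

assert build_scrape_options() == {"onlyMainContent": True}
assert build_scrape_options(only_main_content=False) == {"onlyMainContent": False}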


@@ -21,8 +21,8 @@ class FirecrawlAuth(ApiKeyAuthBase):
        headers = self._prepare_headers()
        options = {
            "url": "https://example.com",
-            "excludes": [],
-            "includes": [],
+            "includePaths": [],
+            "excludePaths": [],
            "limit": 1,
            "scrapeOptions": {"onlyMainContent": True},
        }
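
FirecrawlAuth validates a key by issuing the cheapest possible request: a one-page crawl of example.com. A self-contained sketch of that probe; the diff only shows the request body, so the /v1/crawl path and Bearer-token header here are assumptions, not confirmed by this commit:

import requests

def validate_firecrawl_key(api_key: str) -> bool:
    # Assumed endpoint and auth scheme; the body mirrors the options
    # dict built above.
    options = {
        "url": "https://example.com",
        "includePaths": [],
        "excludePaths": [],
        "limit": 1,
        "scrapeOptions": {"onlyMainContent": True},
    }
    response = requests.post(
        "https://api.firecrawl.dev/v1/crawl",
        headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
        json=options,
        timeout=30,
    )
    return response.ok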


@@ -38,9 +38,8 @@ class WebsiteService:
        only_main_content = options.get("only_main_content", False)
        if not crawl_sub_pages:
            params = {
-                "includes": [],
-                "excludes": [],
-                "generateImgAltText": True,
+                "includePaths": [],
+                "excludePaths": [],
                "limit": 1,
                "scrapeOptions": {"onlyMainContent": only_main_content},
            }
@@ -48,9 +47,8 @@ class WebsiteService:
            includes = options.get("includes").split(",") if options.get("includes") else []
            excludes = options.get("excludes").split(",") if options.get("excludes") else []
            params = {
-                "includes": includes,
-                "excludes": excludes,
-                "generateImgAltText": True,
+                "includePaths": includes,
+                "excludePaths": excludes,
                "limit": options.get("limit", 1),
                "scrapeOptions": {"onlyMainContent": only_main_content},
            }
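
Note that the user-facing option keys ("includes" and "excludes", comma-separated strings) are unchanged; only the keys sent over the wire are renamed. A runnable condensation of the post-change mapping (build_crawl_params is an illustrative extraction, not repo code):

def build_crawl_params(options: dict) -> dict:
    only_main_content = options.get("only_main_content", False)
    if not options.get("crawl_sub_pages"):
        # Single-page mode: no path filters, hard limit of one page.
        return {
            "includePaths": [],
            "excludePaths": [],
            "limit": 1,
            "scrapeOptions": {"onlyMainContent": only_main_content},
        }
    # Sub-page crawl: split the comma-separated filters into lists.
    includes = options.get("includes").split(",") if options.get("includes") else []
    excludes = options.get("excludes").split(",") if options.get("excludes") else []
    return {
        "includePaths": includes,
        "excludePaths": excludes,
        "limit": options.get("limit", 1),
        "scrapeOptions": {"onlyMainContent": only_main_content},
    }

print(build_crawl_params({"crawl_sub_pages": True, "includes": "blog/*,docs/*", "limit": 5}))
# {'includePaths': ['blog/*', 'docs/*'], 'excludePaths': [], 'limit': 5,
#  'scrapeOptions': {'onlyMainContent': False}}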


@@ -10,9 +10,8 @@ def test_firecrawl_web_extractor_crawl_mode(mocker):
    base_url = "https://api.firecrawl.dev"
    firecrawl_app = FirecrawlApp(api_key=api_key, base_url=base_url)
    params = {
-        "includes": [],
-        "excludes": [],
-        "generateImgAltText": True,
+        "includePaths": [],
+        "excludePaths": [],
        "maxDepth": 1,
        "limit": 1,
    }
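
A hypothetical companion assertion (not part of this commit) that would catch the retired keys sneaking back into a fixture:

def test_params_use_renamed_keys():
    params = {
        "includePaths": [],
        "excludePaths": [],
        "maxDepth": 1,
        "limit": 1,
    }
    for retired_key in ("includes", "excludes", "generateImgAltText"):
        assert retired_key not in params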