Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 06:28:59 +08:00)
async functions
This commit is contained in:
parent 5db76992de
commit 0915db515c
@@ -2792,225 +2792,472 @@ class AsyncFirecrawlApp(FirecrawlApp):
            raise Exception(f'Failed to scrape URL. Error: {error_content}')

    async def batch_scrape_urls(
            self,
            urls: List[str],
            *,
            formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
            headers: Optional[Dict[str, str]] = None,
            include_tags: Optional[List[str]] = None,
            exclude_tags: Optional[List[str]] = None,
            only_main_content: Optional[bool] = None,
            wait_for: Optional[int] = None,
            timeout: Optional[int] = None,
            location: Optional[LocationConfig] = None,
            mobile: Optional[bool] = None,
            skip_tls_verification: Optional[bool] = None,
            remove_base64_images: Optional[bool] = None,
            block_ads: Optional[bool] = None,
            proxy: Optional[Literal["basic", "stealth"]] = None,
            extract: Optional[ExtractConfig] = None,
            json_options: Optional[ExtractConfig] = None,
            actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
            agent: Optional[AgentOptions] = None,
            poll_interval: Optional[int] = 2,
            idempotency_key: Optional[str] = None,
            **kwargs
    ) -> BatchScrapeStatusResponse:
"""
|
"""
|
||||||
Asynchronously scrape multiple URLs and monitor until completion.
|
Asynchronously scrape multiple URLs and monitor until completion.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls (List[str]): URLs to scrape
|
urls (List[str]): URLs to scrape
|
||||||
params (Optional[ScrapeParams]): See ScrapeParams model:
|
formats (Optional[List[Literal]]): Content formats to retrieve
|
||||||
Content Options:
|
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
||||||
* formats - Content formats to retrieve
|
include_tags (Optional[List[str]]): HTML tags to include
|
||||||
* includeTags - HTML tags to include
|
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
||||||
* excludeTags - HTML tags to exclude
|
only_main_content (Optional[bool]): Extract main content only
|
||||||
* onlyMainContent - Extract main content only
|
wait_for (Optional[int]): Wait time in milliseconds
|
||||||
|
timeout (Optional[int]): Request timeout in milliseconds
|
||||||
Request Options:
|
location (Optional[LocationConfig]): Location configuration
|
||||||
* headers - Custom HTTP headers
|
mobile (Optional[bool]): Use mobile user agent
|
||||||
* timeout - Request timeout (ms)
|
skip_tls_verification (Optional[bool]): Skip TLS verification
|
||||||
* mobile - Use mobile user agent
|
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
||||||
* proxy - Proxy type
|
block_ads (Optional[bool]): Block advertisements
|
||||||
|
proxy (Optional[Literal]): Proxy type to use
|
||||||
Extraction Options:
|
extract (Optional[ExtractConfig]): Content extraction config
|
||||||
* extract - Content extraction config
|
json_options (Optional[ExtractConfig]): JSON extraction config
|
||||||
* jsonOptions - JSON extraction config
|
actions (Optional[List[Union]]): Actions to perform
|
||||||
* actions - Actions to perform
|
agent (Optional[AgentOptions]): Agent configuration
|
||||||
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
||||||
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
||||||
|
**kwargs: Additional parameters to pass to the API
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
BatchScrapeStatusResponse with:
|
BatchScrapeStatusResponse with:
|
||||||
* Scraping status and progress
|
* Scraping status and progress
|
||||||
* Scraped content for each URL
|
* Scraped content for each URL
|
||||||
* Success/error information
|
* Success/error information
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
Exception: If batch scrape fails
|
Exception: If batch scrape fails
|
||||||
"""
|
"""
|
||||||
        scrape_params = {}

        # Add individual parameters
        if formats is not None:
            scrape_params['formats'] = formats
        if headers is not None:
            scrape_params['headers'] = headers
        if include_tags is not None:
            scrape_params['includeTags'] = include_tags
        if exclude_tags is not None:
            scrape_params['excludeTags'] = exclude_tags
        if only_main_content is not None:
            scrape_params['onlyMainContent'] = only_main_content
        if wait_for is not None:
            scrape_params['waitFor'] = wait_for
        if timeout is not None:
            scrape_params['timeout'] = timeout
        if location is not None:
            scrape_params['location'] = location.dict(exclude_none=True)
        if mobile is not None:
            scrape_params['mobile'] = mobile
        if skip_tls_verification is not None:
            scrape_params['skipTlsVerification'] = skip_tls_verification
        if remove_base64_images is not None:
            scrape_params['removeBase64Images'] = remove_base64_images
        if block_ads is not None:
            scrape_params['blockAds'] = block_ads
        if proxy is not None:
            scrape_params['proxy'] = proxy
        if extract is not None:
            if hasattr(extract.schema, 'schema'):
                extract.schema = extract.schema.schema()
            scrape_params['extract'] = extract.dict(exclude_none=True)
        if json_options is not None:
            if hasattr(json_options.schema, 'schema'):
                json_options.schema = json_options.schema.schema()
            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
        if actions is not None:
            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
        if agent is not None:
            scrape_params['agent'] = agent.dict(exclude_none=True)

        # Add any additional kwargs
        scrape_params.update(kwargs)

        # Create final params object
        final_params = ScrapeParams(**scrape_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['urls'] = urls
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/batch/scrape',
            params_dict,
            headers
        )

        if response.status_code == 200:
            try:
                id = response.json().get('id')
            except:
                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
        else:
            self._handle_error(response, 'start batch scrape job')
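    # Usage sketch (illustrative, not part of this commit): the new keyword-argument
    # form of batch_scrape_urls, assuming AsyncFirecrawlApp is importable from the
    # firecrawl package; the API key and URLs below are placeholders.
    #
    #   import asyncio
    #   from firecrawl import AsyncFirecrawlApp
    #
    #   async def main():
    #       app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    #       status = await app.batch_scrape_urls(
    #           ["https://example.com", "https://docs.firecrawl.dev"],
    #           formats=["markdown", "html"],
    #           only_main_content=True,
    #           poll_interval=2,
    #       )
    #       print(status)
    #
    #   asyncio.run(main())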
    async def async_batch_scrape_urls(
            self,
            urls: List[str],
            *,
            formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
            headers: Optional[Dict[str, str]] = None,
            include_tags: Optional[List[str]] = None,
            exclude_tags: Optional[List[str]] = None,
            only_main_content: Optional[bool] = None,
            wait_for: Optional[int] = None,
            timeout: Optional[int] = None,
            location: Optional[LocationConfig] = None,
            mobile: Optional[bool] = None,
            skip_tls_verification: Optional[bool] = None,
            remove_base64_images: Optional[bool] = None,
            block_ads: Optional[bool] = None,
            proxy: Optional[Literal["basic", "stealth"]] = None,
            extract: Optional[ExtractConfig] = None,
            json_options: Optional[ExtractConfig] = None,
            actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None,
            agent: Optional[AgentOptions] = None,
            idempotency_key: Optional[str] = None,
            **kwargs
    ) -> BatchScrapeResponse:
"""
|
"""
|
||||||
Initiate an asynchronous batch scrape job without waiting for completion.
|
Initiate a batch scrape job asynchronously.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
urls (List[str]): List of URLs to scrape
|
urls (List[str]): URLs to scrape
|
||||||
params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
|
formats (Optional[List[Literal]]): Content formats to retrieve
|
||||||
Content Options:
|
headers (Optional[Dict[str, str]]): Custom HTTP headers
|
||||||
* formats - Content formats to retrieve
|
include_tags (Optional[List[str]]): HTML tags to include
|
||||||
* includeTags - HTML tags to include
|
exclude_tags (Optional[List[str]]): HTML tags to exclude
|
||||||
* excludeTags - HTML tags to exclude
|
only_main_content (Optional[bool]): Extract main content only
|
||||||
* onlyMainContent - Extract main content only
|
wait_for (Optional[int]): Wait time in milliseconds
|
||||||
|
timeout (Optional[int]): Request timeout in milliseconds
|
||||||
Request Options:
|
location (Optional[LocationConfig]): Location configuration
|
||||||
* headers - Custom HTTP headers
|
mobile (Optional[bool]): Use mobile user agent
|
||||||
* timeout - Request timeout (ms)
|
skip_tls_verification (Optional[bool]): Skip TLS verification
|
||||||
* mobile - Use mobile user agent
|
remove_base64_images (Optional[bool]): Remove base64 encoded images
|
||||||
* proxy - Proxy type
|
block_ads (Optional[bool]): Block advertisements
|
||||||
|
proxy (Optional[Literal]): Proxy type to use
|
||||||
Extraction Options:
|
extract (Optional[ExtractConfig]): Content extraction config
|
||||||
* extract - Content extraction config
|
json_options (Optional[ExtractConfig]): JSON extraction config
|
||||||
* jsonOptions - JSON extraction config
|
actions (Optional[List[Union]]): Actions to perform
|
||||||
* actions - Actions to perform
|
agent (Optional[AgentOptions]): Agent configuration
|
||||||
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
||||||
|
**kwargs: Additional parameters to pass to the API
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
BatchScrapeResponse with:
|
BatchScrapeResponse with:
|
||||||
* success - Whether job started successfully
|
* success - Whether job started successfully
|
||||||
* id - Unique identifier for the job
|
* id - Unique identifier for the job
|
||||||
* url - Status check URL
|
* url - Status check URL
|
||||||
* error - Error message if start failed
|
* error - Error message if start failed
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
Exception: If job initiation fails
|
Exception: If job initiation fails
|
||||||
"""
|
"""
|
||||||
        scrape_params = {}

        # Add individual parameters
        if formats is not None:
            scrape_params['formats'] = formats
        if headers is not None:
            scrape_params['headers'] = headers
        if include_tags is not None:
            scrape_params['includeTags'] = include_tags
        if exclude_tags is not None:
            scrape_params['excludeTags'] = exclude_tags
        if only_main_content is not None:
            scrape_params['onlyMainContent'] = only_main_content
        if wait_for is not None:
            scrape_params['waitFor'] = wait_for
        if timeout is not None:
            scrape_params['timeout'] = timeout
        if location is not None:
            scrape_params['location'] = location.dict(exclude_none=True)
        if mobile is not None:
            scrape_params['mobile'] = mobile
        if skip_tls_verification is not None:
            scrape_params['skipTlsVerification'] = skip_tls_verification
        if remove_base64_images is not None:
            scrape_params['removeBase64Images'] = remove_base64_images
        if block_ads is not None:
            scrape_params['blockAds'] = block_ads
        if proxy is not None:
            scrape_params['proxy'] = proxy
        if extract is not None:
            if hasattr(extract.schema, 'schema'):
                extract.schema = extract.schema.schema()
            scrape_params['extract'] = extract.dict(exclude_none=True)
        if json_options is not None:
            if hasattr(json_options.schema, 'schema'):
                json_options.schema = json_options.schema.schema()
            scrape_params['jsonOptions'] = json_options.dict(exclude_none=True)
        if actions is not None:
            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
        if agent is not None:
            scrape_params['agent'] = agent.dict(exclude_none=True)

        # Add any additional kwargs
        scrape_params.update(kwargs)

        # Create final params object
        final_params = ScrapeParams(**scrape_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['urls'] = urls
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/batch/scrape',
            params_dict,
            headers
        )

        if response.status_code == 200:
            try:
                return BatchScrapeResponse(**response.json())
            except:
                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start batch scrape job')
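    # Usage sketch (illustrative, not part of this commit): async_batch_scrape_urls
    # only starts the job and returns a BatchScrapeResponse, so the caller keeps the
    # job id for later status checks; identifiers below are placeholders.
    #
    #   job = await app.async_batch_scrape_urls(
    #       ["https://example.com"],
    #       formats=["markdown"],
    #       idempotency_key="my-batch-2025-04-01",
    #   )
    #   if job.success:
    #       print(job.id, job.url)  # poll the status URL or id later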
    async def crawl_url(
            self,
            url: str,
            *,
            include_paths: Optional[List[str]] = None,
            exclude_paths: Optional[List[str]] = None,
            max_depth: Optional[int] = None,
            max_discovery_depth: Optional[int] = None,
            limit: Optional[int] = None,
            allow_backward_links: Optional[bool] = None,
            allow_external_links: Optional[bool] = None,
            ignore_sitemap: Optional[bool] = None,
            scrape_options: Optional[CommonOptions] = None,
            webhook: Optional[Union[str, WebhookConfig]] = None,
            deduplicate_similar_urls: Optional[bool] = None,
            ignore_query_parameters: Optional[bool] = None,
            regex_on_full_url: Optional[bool] = None,
            poll_interval: Optional[int] = 2,
            idempotency_key: Optional[str] = None,
            **kwargs
    ) -> CrawlStatusResponse:
"""
|
"""
|
||||||
Asynchronously crawl a website starting from a URL and monitor until completion.
|
Crawl a website starting from a URL.
|
||||||
|
|
||||||
Args:
|
|
||||||
url (str): Target URL to start crawling from
|
|
||||||
params (Optional[CrawlParams]): See CrawlParams model:
|
|
||||||
URL Discovery:
|
|
||||||
* includePaths - Patterns of URLs to include
|
|
||||||
* excludePaths - Patterns of URLs to exclude
|
|
||||||
* maxDepth - Maximum crawl depth
|
|
||||||
* maxDiscoveryDepth - Maximum depth for finding new URLs
|
|
||||||
* limit - Maximum pages to crawl
|
|
||||||
|
|
||||||
Link Following:
|
|
||||||
* allowBackwardLinks - Follow parent directory links
|
|
||||||
* allowExternalLinks - Follow external domain links
|
|
||||||
* ignoreSitemap - Skip sitemap.xml processing
|
|
||||||
|
|
||||||
Advanced:
|
|
||||||
* scrapeOptions - Page scraping configuration
|
|
||||||
* webhook - Notification webhook settings
|
|
||||||
* deduplicateSimilarURLs - Remove similar URLs
|
|
||||||
* ignoreQueryParameters - Ignore URL parameters
|
|
||||||
* regexOnFullURL - Apply regex to full URLs
|
|
||||||
poll_interval (int): Seconds between status checks (default: 2)
|
|
||||||
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
CrawlStatusResponse with:
|
|
||||||
* Crawling status and progress
|
|
||||||
* Crawled page contents
|
|
||||||
* Success/error information
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
Exception: If crawl fails
|
|
||||||
"""
|
|
||||||
headers = self._prepare_headers(idempotency_key)
|
|
||||||
json_data = {'url': url}
|
|
||||||
if params:
|
|
||||||
json_data.update(params)
|
|
||||||
json_data['origin'] = f"python-sdk@{version}"
|
|
||||||
|
|
||||||
endpoint = f'/v1/crawl'
|
|
||||||
response = await self._async_post_request(
|
|
||||||
f'{self.api_url}{endpoint}',
|
|
||||||
json_data,
|
|
||||||
headers
|
|
||||||
)
|
|
||||||
|
|
||||||
if response.get('success') and 'id' in response:
|
|
||||||
return await self._async_monitor_job_status(response['id'], headers, poll_interval)
|
|
||||||
else:
|
|
||||||
raise Exception(f'Failed to start crawl. Error: {response.get("error")}')
|
|
||||||
|
|
||||||
async def async_crawl_url(
|
|
||||||
self,
|
|
||||||
url: str,
|
|
||||||
params: Optional[CrawlParams] = None,
|
|
||||||
idempotency_key: Optional[str] = None) -> CrawlResponse:
|
|
||||||
"""
|
|
||||||
Initiate an asynchronous crawl job without waiting for completion.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
url (str): Target URL to start crawling from
|
url (str): Target URL to start crawling from
|
||||||
params (Optional[CrawlParams]): See CrawlParams model:
|
include_paths (Optional[List[str]]): Patterns of URLs to include
|
||||||
URL Discovery:
|
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
||||||
* includePaths - Patterns of URLs to include
|
max_depth (Optional[int]): Maximum crawl depth
|
||||||
* excludePaths - Patterns of URLs to exclude
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
||||||
* maxDepth - Maximum crawl depth
|
limit (Optional[int]): Maximum pages to crawl
|
||||||
* maxDiscoveryDepth - Maximum depth for finding new URLs
|
allow_backward_links (Optional[bool]): Follow parent directory links
|
||||||
* limit - Maximum pages to crawl
|
allow_external_links (Optional[bool]): Follow external domain links
|
||||||
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
||||||
Link Following:
|
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
||||||
* allowBackwardLinks - Follow parent directory links
|
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
||||||
* allowExternalLinks - Follow external domain links
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
||||||
* ignoreSitemap - Skip sitemap.xml processing
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
||||||
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
||||||
Advanced:
|
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
||||||
* scrapeOptions - Page scraping configuration
|
|
||||||
* webhook - Notification webhook settings
|
|
||||||
* deduplicateSimilarURLs - Remove similar URLs
|
|
||||||
* ignoreQueryParameters - Ignore URL parameters
|
|
||||||
* regexOnFullURL - Apply regex to full URLs
|
|
||||||
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
||||||
|
**kwargs: Additional parameters to pass to the API
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
CrawlResponse with:
|
CrawlStatusResponse with:
|
||||||
* success - Whether job started successfully
|
* Crawling status and progress
|
||||||
* id - Unique identifier for the job
|
* Crawled page contents
|
||||||
* url - Status check URL
|
* Success/error information
|
||||||
* error - Error message if start failed
|
|
||||||
|
|
||||||
Raises:
|
Raises:
|
||||||
Exception: If job initiation fails
|
Exception: If crawl fails
|
||||||
"""
|
"""
|
||||||
        crawl_params = {}

        # Add individual parameters
        if include_paths is not None:
            crawl_params['includePaths'] = include_paths
        if exclude_paths is not None:
            crawl_params['excludePaths'] = exclude_paths
        if max_depth is not None:
            crawl_params['maxDepth'] = max_depth
        if max_discovery_depth is not None:
            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
        if limit is not None:
            crawl_params['limit'] = limit
        if allow_backward_links is not None:
            crawl_params['allowBackwardLinks'] = allow_backward_links
        if allow_external_links is not None:
            crawl_params['allowExternalLinks'] = allow_external_links
        if ignore_sitemap is not None:
            crawl_params['ignoreSitemap'] = ignore_sitemap
        if scrape_options is not None:
            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
        if webhook is not None:
            crawl_params['webhook'] = webhook
        if deduplicate_similar_urls is not None:
            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
        if ignore_query_parameters is not None:
            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
        if regex_on_full_url is not None:
            crawl_params['regexOnFullURL'] = regex_on_full_url

        # Add any additional kwargs
        crawl_params.update(kwargs)

        # Create final params object
        final_params = CrawlParams(**crawl_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['url'] = url
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/crawl', params_dict, headers)

        if response.status_code == 200:
            try:
                id = response.json().get('id')
            except:
                raise Exception(f'Failed to parse Firecrawl response as JSON.')
            return self._monitor_job_status(id, headers, poll_interval)
        else:
            self._handle_error(response, 'start crawl job')
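    # Usage sketch (illustrative, not part of this commit): crawl_url blocks until the
    # crawl finishes, polling every poll_interval seconds; the path pattern, limit, and
    # the status/data fields read from CrawlStatusResponse are assumptions here.
    #
    #   result = await app.crawl_url(
    #       "https://example.com",
    #       include_paths=["/blog/.*"],
    #       limit=50,
    #       poll_interval=5,
    #   )
    #   print(result.status, len(result.data))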
    async def async_crawl_url(
            self,
            url: str,
            *,
            include_paths: Optional[List[str]] = None,
            exclude_paths: Optional[List[str]] = None,
            max_depth: Optional[int] = None,
            max_discovery_depth: Optional[int] = None,
            limit: Optional[int] = None,
            allow_backward_links: Optional[bool] = None,
            allow_external_links: Optional[bool] = None,
            ignore_sitemap: Optional[bool] = None,
            scrape_options: Optional[CommonOptions] = None,
            webhook: Optional[Union[str, WebhookConfig]] = None,
            deduplicate_similar_urls: Optional[bool] = None,
            ignore_query_parameters: Optional[bool] = None,
            regex_on_full_url: Optional[bool] = None,
            idempotency_key: Optional[str] = None,
            **kwargs
    ) -> CrawlResponse:
"""
|
||||||
|
Start an asynchronous crawl job.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url (str): Target URL to start crawling from
|
||||||
|
include_paths (Optional[List[str]]): Patterns of URLs to include
|
||||||
|
exclude_paths (Optional[List[str]]): Patterns of URLs to exclude
|
||||||
|
max_depth (Optional[int]): Maximum crawl depth
|
||||||
|
max_discovery_depth (Optional[int]): Maximum depth for finding new URLs
|
||||||
|
limit (Optional[int]): Maximum pages to crawl
|
||||||
|
allow_backward_links (Optional[bool]): Follow parent directory links
|
||||||
|
allow_external_links (Optional[bool]): Follow external domain links
|
||||||
|
ignore_sitemap (Optional[bool]): Skip sitemap.xml processing
|
||||||
|
scrape_options (Optional[CommonOptions]): Page scraping configuration
|
||||||
|
webhook (Optional[Union[str, WebhookConfig]]): Notification webhook settings
|
||||||
|
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
||||||
|
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
||||||
|
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
||||||
|
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
||||||
|
**kwargs: Additional parameters to pass to the API
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
CrawlResponse with:
|
||||||
|
* success - Whether crawl started successfully
|
||||||
|
* id - Unique identifier for the crawl job
|
||||||
|
* url - Status check URL for the crawl
|
||||||
|
* error - Error message if start failed
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
Exception: If crawl initiation fails
|
||||||
|
"""
|
||||||
        crawl_params = {}

        # Add individual parameters
        if include_paths is not None:
            crawl_params['includePaths'] = include_paths
        if exclude_paths is not None:
            crawl_params['excludePaths'] = exclude_paths
        if max_depth is not None:
            crawl_params['maxDepth'] = max_depth
        if max_discovery_depth is not None:
            crawl_params['maxDiscoveryDepth'] = max_discovery_depth
        if limit is not None:
            crawl_params['limit'] = limit
        if allow_backward_links is not None:
            crawl_params['allowBackwardLinks'] = allow_backward_links
        if allow_external_links is not None:
            crawl_params['allowExternalLinks'] = allow_external_links
        if ignore_sitemap is not None:
            crawl_params['ignoreSitemap'] = ignore_sitemap
        if scrape_options is not None:
            crawl_params['scrapeOptions'] = scrape_options.dict(exclude_none=True)
        if webhook is not None:
            crawl_params['webhook'] = webhook
        if deduplicate_similar_urls is not None:
            crawl_params['deduplicateSimilarURLs'] = deduplicate_similar_urls
        if ignore_query_parameters is not None:
            crawl_params['ignoreQueryParameters'] = ignore_query_parameters
        if regex_on_full_url is not None:
            crawl_params['regexOnFullURL'] = regex_on_full_url

        # Add any additional kwargs
        crawl_params.update(kwargs)

        # Create final params object
        final_params = CrawlParams(**crawl_params)
        params_dict = final_params.dict(exclude_none=True)
        params_dict['url'] = url
        params_dict['origin'] = f"python-sdk@{version}"

        # Make request
        headers = self._prepare_headers(idempotency_key)
        response = await self._async_post_request(
            f'{self.api_url}/v1/crawl',
            params_dict,
            headers
        )

        if response.status_code == 200:
            try:
                return CrawlResponse(**response.json())
            except:
                raise Exception(f'Failed to parse Firecrawl response as JSON.')
        else:
            self._handle_error(response, 'start crawl job')
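    # Usage sketch (illustrative, not part of this commit): async_crawl_url starts the
    # crawl and returns immediately; the id can then be passed to check_crawl_status
    # (defined just below). Values are placeholders.
    #
    #   job = await app.async_crawl_url(
    #       "https://example.com",
    #       exclude_paths=["/private/.*"],
    #       limit=100,
    #   )
    #   if job.success:
    #       status = await app.check_crawl_status(job.id)
    #       print(status.status)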
    async def check_crawl_status(self, id: str) -> CrawlStatusResponse:
        """
        Check the status and results of an asynchronous crawl job.