mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-17 23:35:56 +08:00
Add delay parameter to crawl options in all SDKs (#1514)
* Add delay parameter to crawl options in all SDKs

  Co-Authored-By: mogery@sideguide.dev <mogery@sideguide.dev>

* Update terminology from 'between crawl requests' to 'between scrapes'

  Co-Authored-By: mogery@sideguide.dev <mogery@sideguide.dev>

* Apply suggestions from code review

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: mogery@sideguide.dev <mogery@sideguide.dev>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in:
parent
411ecdf04b
commit
0512ad6bce
@ -215,6 +215,11 @@ export interface CrawlParams {
|
||||
deduplicateSimilarURLs?: boolean;
|
||||
ignoreQueryParameters?: boolean;
|
||||
regexOnFullURL?: boolean;
|
||||
/**
|
||||
* Delay in seconds between scrapes. This helps respect website rate limits.
|
||||
* If not provided, the crawler may use the robots.txt crawl delay if available.
|
||||
*/
|
||||
delay?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -259,6 +259,7 @@ class CrawlParams(pydantic.BaseModel):
|
||||
deduplicateSimilarURLs: Optional[bool] = None
|
||||
ignoreQueryParameters: Optional[bool] = None
|
||||
regexOnFullURL: Optional[bool] = None
|
||||
delay: Optional[int] = None # Delay in seconds between scrapes
|
||||
|
||||
class CrawlResponse(pydantic.BaseModel):
|
||||
"""Response from crawling operations."""
|
||||
@ -681,6 +682,7 @@ class FirecrawlApp:
|
||||
deduplicate_similar_urls: Optional[bool] = None,
|
||||
ignore_query_parameters: Optional[bool] = None,
|
||||
regex_on_full_url: Optional[bool] = None,
|
||||
delay: Optional[int] = None,
|
||||
poll_interval: Optional[int] = 2,
|
||||
idempotency_key: Optional[str] = None,
|
||||
**kwargs
|
||||
@ -703,6 +705,7 @@ class FirecrawlApp:
|
||||
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
||||
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
||||
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
||||
delay (Optional[int]): Delay in seconds between scrapes
|
||||
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
||||
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
||||
**kwargs: Additional parameters to pass to the API
|
||||
@ -748,6 +751,8 @@ class FirecrawlApp:
|
||||
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
||||
if regex_on_full_url is not None:
|
||||
crawl_params['regexOnFullURL'] = regex_on_full_url
|
||||
if delay is not None:
|
||||
crawl_params['delay'] = delay
|
||||
|
||||
# Add any additional kwargs
|
||||
crawl_params.update(kwargs)
|
||||
@ -788,6 +793,7 @@ class FirecrawlApp:
|
||||
deduplicate_similar_urls: Optional[bool] = None,
|
||||
ignore_query_parameters: Optional[bool] = None,
|
||||
regex_on_full_url: Optional[bool] = None,
|
||||
delay: Optional[int] = None,
|
||||
idempotency_key: Optional[str] = None,
|
||||
**kwargs
|
||||
) -> CrawlResponse:
|
||||
@ -854,6 +860,8 @@ class FirecrawlApp:
|
||||
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
||||
if regex_on_full_url is not None:
|
||||
crawl_params['regexOnFullURL'] = regex_on_full_url
|
||||
if delay is not None:
|
||||
crawl_params['delay'] = delay
|
||||
|
||||
# Add any additional kwargs
|
||||
crawl_params.update(kwargs)
|
||||
@ -3240,6 +3248,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
||||
deduplicate_similar_urls: Optional[bool] = None,
|
||||
ignore_query_parameters: Optional[bool] = None,
|
||||
regex_on_full_url: Optional[bool] = None,
|
||||
delay: Optional[int] = None,
|
||||
poll_interval: Optional[int] = 2,
|
||||
idempotency_key: Optional[str] = None,
|
||||
**kwargs
|
||||
@ -3262,6 +3271,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
||||
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
|
||||
ignore_query_parameters (Optional[bool]): Ignore URL parameters
|
||||
regex_on_full_url (Optional[bool]): Apply regex to full URLs
|
||||
delay (Optional[int]): Delay in seconds between scrapes
|
||||
poll_interval (Optional[int]): Seconds between status checks (default: 2)
|
||||
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
|
||||
**kwargs: Additional parameters to pass to the API
|
||||
@ -3307,6 +3317,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
||||
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
||||
if regex_on_full_url is not None:
|
||||
crawl_params['regexOnFullURL'] = regex_on_full_url
|
||||
if delay is not None:
|
||||
crawl_params['delay'] = delay
|
||||
|
||||
# Add any additional kwargs
|
||||
crawl_params.update(kwargs)
|
||||
@ -3348,6 +3360,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
||||
deduplicate_similar_urls: Optional[bool] = None,
|
||||
ignore_query_parameters: Optional[bool] = None,
|
||||
regex_on_full_url: Optional[bool] = None,
|
||||
delay: Optional[int] = None,
|
||||
poll_interval: Optional[int] = 2,
|
||||
idempotency_key: Optional[str] = None,
|
||||
**kwargs
|
||||
@ -3412,6 +3425,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
|
||||
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
|
||||
if regex_on_full_url is not None:
|
||||
crawl_params['regexOnFullURL'] = regex_on_full_url
|
||||
if delay is not None:
|
||||
crawl_params['delay'] = delay
|
||||
|
||||
# Add any additional kwargs
|
||||
crawl_params.update(kwargs)
|
||||
|
@ -138,6 +138,8 @@ pub struct CrawlOptions {
|
||||
#[serde(skip)]
|
||||
pub idempotency_key: Option<String>,
|
||||
|
||||
pub delay: Option<u32>,
|
||||
|
||||
/// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`)
|
||||
#[serde(skip)]
|
||||
pub poll_interval: Option<u64>,
|
||||
|
Loading…
x
Reference in New Issue
Block a user