Add delay parameter to crawl options in all SDKs (#1514)

* Add delay parameter to crawl options in all SDKs

Co-Authored-By: mogery@sideguide.dev <mogery@sideguide.dev>

* Update terminology from 'between crawl requests' to 'between scrapes'

Co-Authored-By: mogery@sideguide.dev <mogery@sideguide.dev>

* Apply suggestions from code review

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: mogery@sideguide.dev <mogery@sideguide.dev>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in:
devin-ai-integration[bot] 2025-05-02 18:00:15 +02:00 committed by GitHub
parent 411ecdf04b
commit 0512ad6bce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 22 additions and 0 deletions

View File

@@ -215,6 +215,11 @@ export interface CrawlParams {
deduplicateSimilarURLs?: boolean;
ignoreQueryParameters?: boolean;
regexOnFullURL?: boolean;
/**
* Delay in seconds between scrapes. This helps respect website rate limits.
* If not provided, the crawler may use the robots.txt crawl delay if available.
*/
delay?: number;
}
/**

View File

@@ -259,6 +259,7 @@ class CrawlParams(pydantic.BaseModel):
deduplicateSimilarURLs: Optional[bool] = None
ignoreQueryParameters: Optional[bool] = None
regexOnFullURL: Optional[bool] = None
delay: Optional[int] = None # Delay in seconds between scrapes
class CrawlResponse(pydantic.BaseModel):
"""Response from crawling operations."""
@@ -681,6 +682,7 @@ class FirecrawlApp:
deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None,
regex_on_full_url: Optional[bool] = None,
delay: Optional[int] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None,
**kwargs
@@ -703,6 +705,7 @@ class FirecrawlApp:
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
ignore_query_parameters (Optional[bool]): Ignore URL parameters
regex_on_full_url (Optional[bool]): Apply regex to full URLs
delay (Optional[int]): Delay in seconds between scrapes
poll_interval (Optional[int]): Seconds between status checks (default: 2)
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
**kwargs: Additional parameters to pass to the API
@@ -748,6 +751,8 @@ class FirecrawlApp:
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
if regex_on_full_url is not None:
crawl_params['regexOnFullURL'] = regex_on_full_url
if delay is not None:
crawl_params['delay'] = delay
# Add any additional kwargs
crawl_params.update(kwargs)
@@ -788,6 +793,7 @@ class FirecrawlApp:
deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None,
regex_on_full_url: Optional[bool] = None,
delay: Optional[int] = None,
idempotency_key: Optional[str] = None,
**kwargs
) -> CrawlResponse:
@@ -854,6 +860,8 @@ class FirecrawlApp:
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
if regex_on_full_url is not None:
crawl_params['regexOnFullURL'] = regex_on_full_url
if delay is not None:
crawl_params['delay'] = delay
# Add any additional kwargs
crawl_params.update(kwargs)
@@ -3240,6 +3248,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None,
regex_on_full_url: Optional[bool] = None,
delay: Optional[int] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None,
**kwargs
@@ -3262,6 +3271,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
deduplicate_similar_urls (Optional[bool]): Remove similar URLs
ignore_query_parameters (Optional[bool]): Ignore URL parameters
regex_on_full_url (Optional[bool]): Apply regex to full URLs
delay (Optional[int]): Delay in seconds between scrapes
poll_interval (Optional[int]): Seconds between status checks (default: 2)
idempotency_key (Optional[str]): Unique key to prevent duplicate requests
**kwargs: Additional parameters to pass to the API
@@ -3307,6 +3317,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
if regex_on_full_url is not None:
crawl_params['regexOnFullURL'] = regex_on_full_url
if delay is not None:
crawl_params['delay'] = delay
# Add any additional kwargs
crawl_params.update(kwargs)
@@ -3348,6 +3360,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
deduplicate_similar_urls: Optional[bool] = None,
ignore_query_parameters: Optional[bool] = None,
regex_on_full_url: Optional[bool] = None,
delay: Optional[int] = None,
poll_interval: Optional[int] = 2,
idempotency_key: Optional[str] = None,
**kwargs
@@ -3412,6 +3425,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
crawl_params['ignoreQueryParameters'] = ignore_query_parameters
if regex_on_full_url is not None:
crawl_params['regexOnFullURL'] = regex_on_full_url
if delay is not None:
crawl_params['delay'] = delay
# Add any additional kwargs
crawl_params.update(kwargs)

View File

@@ -138,6 +138,8 @@ pub struct CrawlOptions {
#[serde(skip)]
pub idempotency_key: Option<String>,
pub delay: Option<u32>,
/// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`)
#[serde(skip)]
pub poll_interval: Option<u64>,