From 0512ad6bce0ba31db183483184f646c4cdd8d648 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 2 May 2025 18:00:15 +0200 Subject: [PATCH] Add delay parameter to crawl options in all SDKs (#1514) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add delay parameter to crawl options in all SDKs Co-Authored-By: mogery@sideguide.dev * Update terminology from 'between crawl requests' to 'between scrapes' Co-Authored-By: mogery@sideguide.dev * Apply suggestions from code review --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: mogery@sideguide.dev Co-authored-by: Gergő Móricz --- apps/js-sdk/firecrawl/src/index.ts | 5 +++++ apps/python-sdk/firecrawl/firecrawl.py | 15 +++++++++++++++ apps/rust-sdk/src/crawl.rs | 2 ++ 3 files changed, 22 insertions(+) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index ed090e10..ccb64521 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -215,6 +215,11 @@ export interface CrawlParams { deduplicateSimilarURLs?: boolean; ignoreQueryParameters?: boolean; regexOnFullURL?: boolean; + /** + * Delay in seconds between scrapes. This helps respect website rate limits. + * If not provided, the crawler may use the robots.txt crawl delay if available. 
+ */ + delay?: number; } /** diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 0942fe45..d51c3208 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -259,6 +259,7 @@ class CrawlParams(pydantic.BaseModel): deduplicateSimilarURLs: Optional[bool] = None ignoreQueryParameters: Optional[bool] = None regexOnFullURL: Optional[bool] = None + delay: Optional[int] = None # Delay in seconds between scrapes class CrawlResponse(pydantic.BaseModel): """Response from crawling operations.""" @@ -681,6 +682,7 @@ class FirecrawlApp: deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, regex_on_full_url: Optional[bool] = None, + delay: Optional[int] = None, poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None, **kwargs @@ -703,6 +705,7 @@ class FirecrawlApp: deduplicate_similar_urls (Optional[bool]): Remove similar URLs ignore_query_parameters (Optional[bool]): Ignore URL parameters regex_on_full_url (Optional[bool]): Apply regex to full URLs + delay (Optional[int]): Delay in seconds between scrapes poll_interval (Optional[int]): Seconds between status checks (default: 2) idempotency_key (Optional[str]): Unique key to prevent duplicate requests **kwargs: Additional parameters to pass to the API @@ -748,6 +751,8 @@ class FirecrawlApp: crawl_params['ignoreQueryParameters'] = ignore_query_parameters if regex_on_full_url is not None: crawl_params['regexOnFullURL'] = regex_on_full_url + if delay is not None: + crawl_params['delay'] = delay # Add any additional kwargs crawl_params.update(kwargs) @@ -788,6 +793,7 @@ class FirecrawlApp: deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, regex_on_full_url: Optional[bool] = None, + delay: Optional[int] = None, idempotency_key: Optional[str] = None, **kwargs ) -> CrawlResponse: @@ -854,6 +860,8 @@ class FirecrawlApp: 
crawl_params['ignoreQueryParameters'] = ignore_query_parameters if regex_on_full_url is not None: crawl_params['regexOnFullURL'] = regex_on_full_url + if delay is not None: + crawl_params['delay'] = delay # Add any additional kwargs crawl_params.update(kwargs) @@ -3240,6 +3248,7 @@ class AsyncFirecrawlApp(FirecrawlApp): deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, regex_on_full_url: Optional[bool] = None, + delay: Optional[int] = None, poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None, **kwargs @@ -3262,6 +3271,7 @@ class AsyncFirecrawlApp(FirecrawlApp): deduplicate_similar_urls (Optional[bool]): Remove similar URLs ignore_query_parameters (Optional[bool]): Ignore URL parameters regex_on_full_url (Optional[bool]): Apply regex to full URLs + delay (Optional[int]): Delay in seconds between scrapes poll_interval (Optional[int]): Seconds between status checks (default: 2) idempotency_key (Optional[str]): Unique key to prevent duplicate requests **kwargs: Additional parameters to pass to the API @@ -3307,6 +3317,8 @@ class AsyncFirecrawlApp(FirecrawlApp): crawl_params['ignoreQueryParameters'] = ignore_query_parameters if regex_on_full_url is not None: crawl_params['regexOnFullURL'] = regex_on_full_url + if delay is not None: + crawl_params['delay'] = delay # Add any additional kwargs crawl_params.update(kwargs) @@ -3348,6 +3360,7 @@ class AsyncFirecrawlApp(FirecrawlApp): deduplicate_similar_urls: Optional[bool] = None, ignore_query_parameters: Optional[bool] = None, regex_on_full_url: Optional[bool] = None, + delay: Optional[int] = None, poll_interval: Optional[int] = 2, idempotency_key: Optional[str] = None, **kwargs @@ -3412,6 +3425,8 @@ class AsyncFirecrawlApp(FirecrawlApp): crawl_params['ignoreQueryParameters'] = ignore_query_parameters if regex_on_full_url is not None: crawl_params['regexOnFullURL'] = regex_on_full_url + if delay is not None: + crawl_params['delay'] = delay # Add any 
additional kwargs crawl_params.update(kwargs) diff --git a/apps/rust-sdk/src/crawl.rs b/apps/rust-sdk/src/crawl.rs index a5f30f40..b6522c45 100644 --- a/apps/rust-sdk/src/crawl.rs +++ b/apps/rust-sdk/src/crawl.rs @@ -138,6 +138,8 @@ pub struct CrawlOptions { #[serde(skip)] pub idempotency_key: Option<String>, + pub delay: Option<u32>, + /// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`) #[serde(skip)] pub poll_interval: Option<u64>,