Add delay parameter to crawl options in all SDKs (#1514)
* Add delay parameter to crawl options in all SDKs
  Co-Authored-By: mogery@sideguide.dev <mogery@sideguide.dev>

* Update terminology from 'between crawl requests' to 'between scrapes'
  Co-Authored-By: mogery@sideguide.dev <mogery@sideguide.dev>

* Apply suggestions from code review

---------

Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: mogery@sideguide.dev <mogery@sideguide.dev>
Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in:
parent: 411ecdf04b
commit: 0512ad6bce
JS/TS SDK — the `CrawlParams` interface:

```diff
@@ -215,6 +215,11 @@ export interface CrawlParams {
   deduplicateSimilarURLs?: boolean;
   ignoreQueryParameters?: boolean;
   regexOnFullURL?: boolean;
+  /**
+   * Delay in seconds between scrapes. This helps respect website rate limits.
+   * If not provided, the crawler may use the robots.txt crawl delay if available.
+   */
+  delay?: number;
 }
 
 /**
```
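For context on what the new option means: during a crawl, the crawler pauses `delay` seconds between consecutive scrapes. A purely illustrative Python sketch of that behaviour, not the crawler's actual implementation (`urllib` stands in for a real scrape and the function name is made up):

```python
import time
import urllib.request

def polite_crawl(urls, delay=None):
    """Illustration only: fetch pages sequentially, sleeping `delay` seconds between scrapes."""
    pages = []
    for i, url in enumerate(urls):
        with urllib.request.urlopen(url) as resp:  # stand-in for a real scrape
            pages.append(resp.read())
        if delay is not None and i < len(urls) - 1:
            time.sleep(delay)  # the pause the new `delay` option represents
    return pages
```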
Python SDK — the `CrawlParams` model and the polling crawl method on `FirecrawlApp`:

```diff
@@ -259,6 +259,7 @@ class CrawlParams(pydantic.BaseModel):
     deduplicateSimilarURLs: Optional[bool] = None
     ignoreQueryParameters: Optional[bool] = None
     regexOnFullURL: Optional[bool] = None
+    delay: Optional[int] = None  # Delay in seconds between scrapes
 
 class CrawlResponse(pydantic.BaseModel):
     """Response from crawling operations."""
@@ -681,6 +682,7 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -703,6 +705,7 @@ class FirecrawlApp:
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -748,6 +751,8 @@ class FirecrawlApp:
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
```
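A hedged usage sketch of the change above, assuming the Python SDK's `FirecrawlApp.crawl_url` entry point (the method name is taken from the SDK's public docs, not from this diff) and placeholder credentials:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Pause roughly 2 seconds between scrapes; poll the job status every 5 seconds.
result = app.crawl_url(
    "https://example.com",  # placeholder URL
    delay=2,
    poll_interval=5,
)
print(result)
```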
Python SDK (continued) — the `FirecrawlApp` method that returns a `CrawlResponse`:

```diff
@@ -788,6 +793,7 @@ class FirecrawlApp:
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         idempotency_key: Optional[str] = None,
         **kwargs
     ) -> CrawlResponse:
@@ -854,6 +860,8 @@ class FirecrawlApp:
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
```
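The two hunks above apply the same change to the non-polling variant that returns a `CrawlResponse`. A sketch of the start-then-check flow with the new option, assuming the SDK's `async_crawl_url` and `check_crawl_status` methods and the `id`/`status` fields on their response models (all taken from the SDK's public docs, not from this diff):

```python
import time

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key

# Start the crawl without waiting; the crawler pauses ~1 second between scrapes.
job = app.async_crawl_url("https://example.com", delay=1)  # placeholder URL

# Poll until the job reaches a terminal state (status values assumed from the docs).
status = app.check_crawl_status(job.id)
while status.status not in ("completed", "failed", "cancelled"):
    time.sleep(2)
    status = app.check_crawl_status(job.id)
print(status.status)
```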
Python SDK (continued) — the same changes applied to `AsyncFirecrawlApp`:

```diff
@@ -3240,6 +3248,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -3262,6 +3271,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
             deduplicate_similar_urls (Optional[bool]): Remove similar URLs
             ignore_query_parameters (Optional[bool]): Ignore URL parameters
             regex_on_full_url (Optional[bool]): Apply regex to full URLs
+            delay (Optional[int]): Delay in seconds between scrapes
             poll_interval (Optional[int]): Seconds between status checks (default: 2)
             idempotency_key (Optional[str]): Unique key to prevent duplicate requests
             **kwargs: Additional parameters to pass to the API
@@ -3307,6 +3317,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
```
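The async client accepts the same parameter; a sketch assuming `AsyncFirecrawlApp` is awaited with the same signature as its synchronous counterpart (placeholder key and URL):

```python
import asyncio

from firecrawl import AsyncFirecrawlApp

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    # Same parameters as the sync client; ~2 seconds between scrapes.
    result = await app.crawl_url("https://example.com", delay=2, poll_interval=5)
    print(result)

asyncio.run(main())
```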
Python SDK (continued) — the remaining `AsyncFirecrawlApp` method:

```diff
@@ -3348,6 +3360,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         deduplicate_similar_urls: Optional[bool] = None,
         ignore_query_parameters: Optional[bool] = None,
         regex_on_full_url: Optional[bool] = None,
+        delay: Optional[int] = None,
         poll_interval: Optional[int] = 2,
         idempotency_key: Optional[str] = None,
         **kwargs
@@ -3412,6 +3425,8 @@ class AsyncFirecrawlApp(FirecrawlApp):
             crawl_params['ignoreQueryParameters'] = ignore_query_parameters
         if regex_on_full_url is not None:
             crawl_params['regexOnFullURL'] = regex_on_full_url
+        if delay is not None:
+            crawl_params['delay'] = delay
 
         # Add any additional kwargs
         crawl_params.update(kwargs)
```
Rust SDK — the `CrawlOptions` struct:

```diff
@@ -138,6 +138,8 @@ pub struct CrawlOptions {
     #[serde(skip)]
     pub idempotency_key: Option<String>,
 
+    pub delay: Option<u32>,
+
     /// When using `FirecrawlApp::crawl_url`, this is how often the status of the job should be checked, in milliseconds. (default: `2000`)
     #[serde(skip)]
     pub poll_interval: Option<u64>,
```
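In every SDK the option ultimately lands as a `delay` field in the JSON body of the crawl request. A minimal sketch of the equivalent raw call, assuming the hosted v1 crawl endpoint and a bearer API key (endpoint and auth scheme taken from Firecrawl's public docs, not from this diff):

```python
import requests  # third-party: pip install requests

resp = requests.post(
    "https://api.firecrawl.dev/v1/crawl",  # assumed hosted endpoint
    headers={"Authorization": "Bearer fc-YOUR-API-KEY"},  # placeholder key
    json={
        "url": "https://example.com",  # placeholder URL
        "delay": 2,  # seconds to wait between scrapes
    },
    timeout=30,
)
print(resp.json())
```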