async scrape

rafaelmmiller 2025-04-18 01:06:58 -07:00
parent a655d24e7c
commit 8cd82b5600

@@ -2308,29 +2308,41 @@ class AsyncFirecrawlApp(FirecrawlApp):
     async def scrape_url(
             self,
             url: str,
-            params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]:
+            formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+            include_tags: Optional[List[str]] = None,
+            exclude_tags: Optional[List[str]] = None,
+            only_main_content: Optional[bool] = None,
+            wait_for: Optional[int] = None,
+            timeout: Optional[int] = None,
+            location: Optional[LocationConfig] = None,
+            mobile: Optional[bool] = None,
+            skip_tls_verification: Optional[bool] = None,
+            remove_base64_images: Optional[bool] = None,
+            block_ads: Optional[bool] = None,
+            proxy: Optional[Literal["basic", "stealth"]] = None,
+            extract: Optional[ExtractConfig] = None,
+            json_options: Optional[ExtractConfig] = None,
+            actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
         """
-        Asynchronously scrape and extract content from a URL.
+        Scrape and extract content from a URL asynchronously.
 
         Args:
             url (str): Target URL to scrape
-            params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
-                Content Options:
-                * formats - Content types to retrieve (markdown/html/etc)
-                * includeTags - HTML tags to include
-                * excludeTags - HTML tags to exclude
-                * onlyMainContent - Extract main content only
-
-                Request Options:
-                * headers - Custom HTTP headers
-                * timeout - Request timeout (ms)
-                * mobile - Use mobile user agent
-                * proxy - Proxy type (basic/stealth)
-
-                Extraction Options:
-                * extract - Content extraction settings
-                * jsonOptions - JSON extraction settings
-                * actions - Actions to perform
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait for a specific element to appear
+            timeout (Optional[int]): Request timeout (ms)
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 images
+            block_ads (Optional[bool]): Block ads
+            proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
+            extract (Optional[ExtractConfig]): Content extraction settings
+            json_options (Optional[ExtractConfig]): JSON extraction settings
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
 
         Returns:
             ScrapeResponse with:
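With this change, options that previously traveled in a single ScrapeParams object become individual keyword arguments. A minimal call-site sketch (the import path, constructor arguments, URL, and option values here are illustrative assumptions, not part of the commit):

import asyncio

from firecrawl import AsyncFirecrawlApp

async def main():
    # Hypothetical client setup; substitute a real API key
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")
    # Options are now passed directly as keyword arguments
    result = await app.scrape_url(
        "https://example.com",
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,  # request timeout in milliseconds, per the docstring
    )
    print(result)

asyncio.run(main())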
@@ -2343,19 +2355,52 @@ class AsyncFirecrawlApp(FirecrawlApp):
             Exception: If scraping fails
         """
         headers = self._prepare_headers()
-        scrape_params = {'url': url, 'origin': f'python-sdk@{version}'}
 
-        if params:
-            extract = params.get('extract', {})
-            if extract:
-                if 'schema' in extract and hasattr(extract['schema'], 'schema'):
-                    extract['schema'] = extract['schema'].schema()
-                scrape_params['extract'] = extract
-
-            for key, value in params.items():
-                if key not in ['extract']:
-                    scrape_params[key] = value
+        # Build scrape parameters
+        scrape_params = {
+            'url': url,
+            'origin': f"python-sdk@{version}"
+        }
 
+        # Add optional parameters if provided and not None
+        if formats:
+            scrape_params['formats'] = formats
+        if include_tags:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for:
+            scrape_params['waitFor'] = wait_for
+        if timeout:
+            scrape_params['timeout'] = timeout
+        if location:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy:
+            scrape_params['proxy'] = proxy
+        if extract:
+            extract_dict = extract.dict(exclude_none=True)
+            if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
+                extract_dict['schema'] = extract.schema.schema()  # Ensure pydantic model schema is converted
+            scrape_params['extract'] = extract_dict
+        if json_options:
+            json_options_dict = json_options.dict(exclude_none=True)
+            if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
+                json_options_dict['schema'] = json_options.schema.schema()  # Ensure pydantic model schema is converted
+            scrape_params['jsonOptions'] = json_options_dict
+        if actions:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+
+        # Make async request
         endpoint = f'/v1/scrape'
         response = await self._async_post_request(
             f'{self.api_url}{endpoint}',
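The builder maps the Python snake_case keyword arguments onto the API's camelCase field names and serializes pydantic models via dict(exclude_none=True). A rough sketch of the resulting request body, reusing the hypothetical app from the earlier example (option values invented for illustration):

# A call such as:
await app.scrape_url(
    "https://example.com",
    formats=["markdown"],
    include_tags=["article"],
    skip_tls_verification=True,
)
# would POST to /v1/scrape a JSON body along these lines:
# {
#   "url": "https://example.com",
#   "origin": "python-sdk@<version>",
#   "formats": ["markdown"],
#   "includeTags": ["article"],
#   "skipTlsVerification": true
# }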
@@ -2364,11 +2409,13 @@ class AsyncFirecrawlApp(FirecrawlApp):
         )
 
         if response.get('success') and 'data' in response:
-            return response['data']
+            return ScrapeResponse(**response['data'])
         elif "error" in response:
             raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
         else:
-            raise Exception(f'Failed to scrape URL. Error: {response}')
+            # Use the response content directly if possible, otherwise a generic message
+            error_content = response.get('error', str(response))
+            raise Exception(f'Failed to scrape URL. Error: {error_content}')
 
     async def batch_scrape_urls(
             self,
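Success now returns a ScrapeResponse model rather than a raw dict, and failures surface as a plain Exception whose message prefers the API's error field. A small caller-side sketch, again reusing the hypothetical app from above:

try:
    doc = await app.scrape_url("https://example.com", formats=["markdown"])
    # doc is a ScrapeResponse model instance, not a raw dict
    print(doc)
except Exception as err:
    # The message carries response['error'] when the API provided one
    print(f"Scrape failed: {err}")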