Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-04 13:40:37 +08:00)
commit 8cd82b5600 ("async scrape")
parent a655d24e7c
@@ -2308,29 +2308,41 @@ class AsyncFirecrawlApp(FirecrawlApp):
     async def scrape_url(
             self,
             url: str,
-            params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]:
+            formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+            include_tags: Optional[List[str]] = None,
+            exclude_tags: Optional[List[str]] = None,
+            only_main_content: Optional[bool] = None,
+            wait_for: Optional[int] = None,
+            timeout: Optional[int] = None,
+            location: Optional[LocationConfig] = None,
+            mobile: Optional[bool] = None,
+            skip_tls_verification: Optional[bool] = None,
+            remove_base64_images: Optional[bool] = None,
+            block_ads: Optional[bool] = None,
+            proxy: Optional[Literal["basic", "stealth"]] = None,
+            extract: Optional[ExtractConfig] = None,
+            json_options: Optional[ExtractConfig] = None,
+            actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None) -> ScrapeResponse[Any]:
         """
-        Asynchronously scrape and extract content from a URL.
+        Scrape and extract content from a URL asynchronously.
 
         Args:
             url (str): Target URL to scrape
-            params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
-                Content Options:
-                * formats - Content types to retrieve (markdown/html/etc)
-                * includeTags - HTML tags to include
-                * excludeTags - HTML tags to exclude
-                * onlyMainContent - Extract main content only
-
-                Request Options:
-                * headers - Custom HTTP headers
-                * timeout - Request timeout (ms)
-                * mobile - Use mobile user agent
-                * proxy - Proxy type (basic/stealth)
-
-                Extraction Options:
-                * extract - Content extraction settings
-                * jsonOptions - JSON extraction settings
-                * actions - Actions to perform
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            include_tags (Optional[List[str]]): HTML tags to include
+            exclude_tags (Optional[List[str]]): HTML tags to exclude
+            only_main_content (Optional[bool]): Extract main content only
+            wait_for (Optional[int]): Wait for a specific element to appear
+            timeout (Optional[int]): Request timeout (ms)
+            location (Optional[LocationConfig]): Location configuration
+            mobile (Optional[bool]): Use mobile user agent
+            skip_tls_verification (Optional[bool]): Skip TLS verification
+            remove_base64_images (Optional[bool]): Remove base64 images
+            block_ads (Optional[bool]): Block ads
+            proxy (Optional[Literal["basic", "stealth"]]): Proxy type (basic/stealth)
+            extract (Optional[ExtractConfig]): Content extraction settings
+            json_options (Optional[ExtractConfig]): JSON extraction settings
+            actions (Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]]): Actions to perform
 
         Returns:
             ScrapeResponse with:
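This first hunk replaces the single `params: Optional[ScrapeParams]` argument with explicit keyword arguments and rewrites the docstring to document each one. A minimal sketch of a call against the new signature, assuming `AsyncFirecrawlApp` is importable from the `firecrawl` package as in the SDK, with a placeholder API key and URL:

import asyncio

from firecrawl import AsyncFirecrawlApp  # assumed export; adjust to your install

async def main():
    app = AsyncFirecrawlApp(api_key="fc-YOUR-API-KEY")  # placeholder key
    # Options that previously lived in a ScrapeParams dict are now plain kwargs
    doc = await app.scrape_url(
        "https://example.com",          # placeholder URL
        formats=["markdown", "links"],
        only_main_content=True,
        timeout=30000,                  # milliseconds, per the docstring
    )
    print(doc)

asyncio.run(main())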
@@ -2340,22 +2352,55 @@ class AsyncFirecrawlApp(FirecrawlApp):
             * Success/error status
 
         Raises:
             Exception: If scraping fails
         """
         headers = self._prepare_headers()
-        scrape_params = {'url': url, 'origin': f'python-sdk@{version}'}
-
-        if params:
-            extract = params.get('extract', {})
-            if extract:
-                if 'schema' in extract and hasattr(extract['schema'], 'schema'):
-                    extract['schema'] = extract['schema'].schema()
-                scrape_params['extract'] = extract
-
-            for key, value in params.items():
-                if key not in ['extract']:
-                    scrape_params[key] = value
-
+
+        # Build scrape parameters
+        scrape_params = {
+            'url': url,
+            'origin': f"python-sdk@{version}"
+        }
+
+        # Add optional parameters if provided and not None
+        if formats:
+            scrape_params['formats'] = formats
+        if include_tags:
+            scrape_params['includeTags'] = include_tags
+        if exclude_tags:
+            scrape_params['excludeTags'] = exclude_tags
+        if only_main_content is not None:
+            scrape_params['onlyMainContent'] = only_main_content
+        if wait_for:
+            scrape_params['waitFor'] = wait_for
+        if timeout:
+            scrape_params['timeout'] = timeout
+        if location:
+            scrape_params['location'] = location.dict(exclude_none=True)
+        if mobile is not None:
+            scrape_params['mobile'] = mobile
+        if skip_tls_verification is not None:
+            scrape_params['skipTlsVerification'] = skip_tls_verification
+        if remove_base64_images is not None:
+            scrape_params['removeBase64Images'] = remove_base64_images
+        if block_ads is not None:
+            scrape_params['blockAds'] = block_ads
+        if proxy:
+            scrape_params['proxy'] = proxy
+        if extract:
+            extract_dict = extract.dict(exclude_none=True)
+            if 'schema' in extract_dict and hasattr(extract.schema, 'schema'):
+                extract_dict['schema'] = extract.schema.schema()  # Ensure pydantic model schema is converted
+            scrape_params['extract'] = extract_dict
+        if json_options:
+            json_options_dict = json_options.dict(exclude_none=True)
+            if 'schema' in json_options_dict and hasattr(json_options.schema, 'schema'):
+                json_options_dict['schema'] = json_options.schema.schema()  # Ensure pydantic model schema is converted
+            scrape_params['jsonOptions'] = json_options_dict
+        if actions:
+            scrape_params['actions'] = [action.dict(exclude_none=True) for action in actions]
+
+        # Make async request
         endpoint = f'/v1/scrape'
         response = await self._async_post_request(
             f'{self.api_url}{endpoint}',
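Two check styles coexist in this hunk: boolean flags are tested with `is not None`, so an explicit `False` still reaches the request payload, while `if formats:` and `if wait_for:` use truthiness and silently drop falsy values such as `[]` or `0`. A toy, self-contained sketch of that behaviour, where `build_scrape_params` is a hypothetical helper and not part of the SDK:

from typing import Any, Dict, Optional

def build_scrape_params(url: str,
                        only_main_content: Optional[bool] = None,
                        wait_for: Optional[int] = None) -> Dict[str, Any]:
    # Mirrors the hunk above: snake_case kwargs map to camelCase API keys
    params: Dict[str, Any] = {'url': url}
    if only_main_content is not None:   # keeps an explicit False
        params['onlyMainContent'] = only_main_content
    if wait_for:                        # drops 0, like the truthiness checks above
        params['waitFor'] = wait_for
    return params

assert build_scrape_params("https://example.com", only_main_content=False) == \
    {'url': 'https://example.com', 'onlyMainContent': False}
assert build_scrape_params("https://example.com", wait_for=0) == \
    {'url': 'https://example.com'}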
@@ -2364,11 +2409,13 @@ class AsyncFirecrawlApp(FirecrawlApp):
         )
 
         if response.get('success') and 'data' in response:
-            return response['data']
+            return ScrapeResponse(**response['data'])
         elif "error" in response:
             raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
         else:
-            raise Exception(f'Failed to scrape URL. Error: {response}')
+            # Use the response content directly if possible, otherwise a generic message
+            error_content = response.get('error', str(response))
+            raise Exception(f'Failed to scrape URL. Error: {error_content}')
 
     async def batch_scrape_urls(
             self,
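The success branch now wraps the raw `data` dict in a `ScrapeResponse` model rather than returning it directly, which gives callers attribute access and validation. A toy illustration of what the new return statement does, using a simplified stand-in for the SDK's pydantic model (the real one is the generic `ScrapeResponse[Any]` from the SDK's types):

from typing import Optional
from pydantic import BaseModel

class ScrapeResponse(BaseModel):  # simplified stand-in, not the SDK class
    markdown: Optional[str] = None
    html: Optional[str] = None

data = {'markdown': '# Example', 'html': '<h1>Example</h1>'}
doc = ScrapeResponse(**data)      # what the new return statement does
print(doc.markdown)               # attribute access instead of data['markdown']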