sdk(v3): scrape and async scrape ok

rafaelmmiller 2025-05-27 11:23:02 -03:00
parent 94220a772b
commit ab1e244693
6 changed files with 1263 additions and 943 deletions

View File

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions, LocationConfig # noqa
__version__ = "2.6.0"
__version__ = "3.0.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
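
A quick post-upgrade sanity check for consumers might look like this (a sketch; assumes the package imports as firecrawl, matching the logger name above):

import firecrawl
assert firecrawl.__version__ == "3.0.0"  # the version bumped in this commit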

File diff suppressed because it is too large

View File

@@ -0,0 +1,386 @@
import pydantic
from typing import Optional, List, Dict, Literal, Any, Union, Generic, TypeVar
from datetime import datetime
import warnings
T = TypeVar('T')
# Suppress Pydantic warnings about attribute shadowing
warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")
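
For context, a minimal repro of the warning family silenced above (illustrative; any Pydantic v2 model declaring a field named "json" or "schema" trips it at class-definition time):

from typing import Optional
import pydantic

class Doc(pydantic.BaseModel):
    json: Optional[str] = None  # warns: Field name "json" in "Doc" shadows an attribute in parent "BaseModel"
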
# class FirecrawlDocumentMetadata(pydantic.BaseModel):
# """Metadata for a Firecrawl document."""
# title: Optional[str] = None
# description: Optional[str] = None
# language: Optional[str] = None
# keywords: Optional[str] = None
# robots: Optional[str] = None
# ogTitle: Optional[str] = None
# ogDescription: Optional[str] = None
# ogUrl: Optional[str] = None
# ogImage: Optional[str] = None
# ogAudio: Optional[str] = None
# ogDeterminer: Optional[str] = None
# ogLocale: Optional[str] = None
# ogLocaleAlternate: Optional[List[str]] = None
# ogSiteName: Optional[str] = None
# ogVideo: Optional[str] = None
# dctermsCreated: Optional[str] = None
# dcDateCreated: Optional[str] = None
# dcDate: Optional[str] = None
# dctermsType: Optional[str] = None
# dcType: Optional[str] = None
# dctermsAudience: Optional[str] = None
# dctermsSubject: Optional[str] = None
# dcSubject: Optional[str] = None
# dcDescription: Optional[str] = None
# dctermsKeywords: Optional[str] = None
# modifiedTime: Optional[str] = None
# publishedTime: Optional[str] = None
# articleTag: Optional[str] = None
# articleSection: Optional[str] = None
# sourceURL: Optional[str] = None
# statusCode: Optional[int] = None
# error: Optional[str] = None
class AgentOptions(pydantic.BaseModel):
"""Configuration for the agent."""
model: Literal["FIRE-1"] = "FIRE-1"
prompt: Optional[str] = None
class AgentOptionsExtract(pydantic.BaseModel):
"""Configuration for the agent in extract operations."""
model: Literal["FIRE-1"] = "FIRE-1"
class ActionsResult(pydantic.BaseModel):
"""Result of actions performed during scraping."""
screenshots: List[str]
class ChangeTrackingData(pydantic.BaseModel):
"""
Data for the change tracking format.
"""
previous_scrape_at: Optional[str] = None
change_status: str # "new" | "same" | "changed" | "removed"
visibility: str # "visible" | "hidden"
diff: Optional[Dict[str, Any]] = None
json: Optional[Any] = None
class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
"""Document retrieved or processed by Firecrawl."""
url: Optional[str] = None
markdown: Optional[str] = None
html: Optional[str] = None
raw_html: Optional[str] = None
links: Optional[List[str]] = None
extract: Optional[T] = None
json: Optional[T] = None
screenshot: Optional[str] = None
metadata: Optional[Any] = None
actions: Optional[ActionsResult] = None
title: Optional[str] = None # v1 search only
description: Optional[str] = None # v1 search only
change_tracking: Optional[ChangeTrackingData] = None
class LocationConfig(pydantic.BaseModel):
"""Location configuration for scraping."""
country: Optional[str] = None
languages: Optional[List[str]] = None
class WebhookConfig(pydantic.BaseModel):
"""Configuration for webhooks."""
url: str
headers: Optional[Dict[str, str]] = None
metadata: Optional[Dict[str, str]] = None
events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None
class ChangeTrackingOptions(pydantic.BaseModel):
"""Configuration for change tracking."""
modes: Optional[List[Literal["git-diff", "json"]]] = None
schema: Optional[Any] = None
prompt: Optional[str] = None
class ScrapeOptions(pydantic.BaseModel):
"""Parameters for scraping operations."""
formats: Optional[List[Literal["markdown", "html", "raw_html", "links", "screenshot", "screenshot@full_page", "extract", "json", "change_tracking"]]] = None
headers: Optional[Dict[str, str]] = None
include_tags: Optional[List[str]] = None
exclude_tags: Optional[List[str]] = None
only_main_content: Optional[bool] = None
wait_for: Optional[int] = None
timeout: Optional[int] = None
location: Optional[LocationConfig] = None
mobile: Optional[bool] = None
skip_tls_verification: Optional[bool] = None
remove_base64_images: Optional[bool] = None
block_ads: Optional[bool] = None
proxy: Optional[Literal["basic", "stealth"]] = None
change_tracking_options: Optional[ChangeTrackingOptions] = None
class WaitAction(pydantic.BaseModel):
"""Wait action to perform during scraping."""
type: Literal["wait"] = pydantic.Field(default="wait")
milliseconds: Optional[int] = None
selector: Optional[str] = None
class ScreenshotAction(pydantic.BaseModel):
"""Screenshot action to perform during scraping."""
type: Literal["screenshot"] = pydantic.Field(default="screenshot")
full_page: Optional[bool] = None
class ClickAction(pydantic.BaseModel):
"""Click action to perform during scraping."""
type: Literal["click"] = pydantic.Field(default="click")
selector: str
class WriteAction(pydantic.BaseModel):
"""Write action to perform during scraping."""
type: Literal["write"] = pydantic.Field(default="write")
text: str
class PressAction(pydantic.BaseModel):
"""Press action to perform during scraping."""
type: Literal["press"] = pydantic.Field(default="press")
key: str
class ScrollAction(pydantic.BaseModel):
"""Scroll action to perform during scraping."""
type: Literal["scroll"] = pydantic.Field(default="scroll")
direction: Literal["up", "down"]
selector: Optional[str] = None
class ScrapeAction(pydantic.BaseModel):
"""Scrape action to perform during scraping."""
type: Literal["scrape"] = pydantic.Field(default="scrape")
class ExecuteJavascriptAction(pydantic.BaseModel):
"""Execute javascript action to perform during scraping."""
type: Literal["executeJavascript"] = pydantic.Field(default="executeJavascript")
script: str
class ExtractAgent(pydantic.BaseModel):
"""Configuration for the agent in extract operations."""
model: Literal["FIRE-1"] = "FIRE-1"
class JsonConfig(pydantic.BaseModel):
"""Configuration for extraction."""
prompt: Optional[str] = None
schema: Optional[Any] = None
system_prompt: Optional[str] = None
agent: Optional[ExtractAgent] = None
class ScrapeParams(ScrapeOptions):
"""Parameters for scraping operations."""
extract: Optional[JsonConfig] = None
json_options: Optional[JsonConfig] = None
actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
agent: Optional[AgentOptions] = None
webhook: Optional[WebhookConfig] = None
class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
"""Response from scraping operations."""
success: bool = True
warning: Optional[str] = None
error: Optional[str] = None
class BatchScrapeResponse(pydantic.BaseModel):
"""Response from batch scrape operations."""
id: Optional[str] = None
url: Optional[str] = None
success: bool = True
error: Optional[str] = None
invalid_urls: Optional[List[str]] = None
class BatchScrapeStatusResponse(pydantic.BaseModel):
"""Response from batch scrape status checks."""
success: bool = True
status: Literal["scraping", "completed", "failed", "cancelled"]
completed: int
total: int
credits_used: int
expires_at: datetime
next: Optional[str] = None
data: List[FirecrawlDocument]
class CrawlParams(pydantic.BaseModel):
"""Parameters for crawling operations."""
include_paths: Optional[List[str]] = None
exclude_paths: Optional[List[str]] = None
max_depth: Optional[int] = None
max_discovery_depth: Optional[int] = None
limit: Optional[int] = None
allow_backward_links: Optional[bool] = None
allow_external_links: Optional[bool] = None
ignore_sitemap: Optional[bool] = None
scrape_options: Optional[ScrapeOptions] = None
webhook: Optional[Union[str, WebhookConfig]] = None
deduplicate_similar_urls: Optional[bool] = None
ignore_query_parameters: Optional[bool] = None
regex_on_full_url: Optional[bool] = None
delay: Optional[int] = None # Delay in seconds between scrapes
class CrawlResponse(pydantic.BaseModel):
"""Response from crawling operations."""
id: Optional[str] = None
url: Optional[str] = None
success: bool = True
error: Optional[str] = None
class CrawlStatusResponse(pydantic.BaseModel):
"""Response from crawl status checks."""
success: bool = True
status: Literal["scraping", "completed", "failed", "cancelled"]
completed: int
total: int
credits_used: int
expires_at: datetime
next: Optional[str] = None
data: List[FirecrawlDocument]
class CrawlErrorsResponse(pydantic.BaseModel):
"""Response from crawl/batch scrape error monitoring."""
errors: List[Dict[str, str]] # {id: str, timestamp: str, url: str, error: str}
robots_blocked: List[str]
class MapParams(pydantic.BaseModel):
"""Parameters for mapping operations."""
search: Optional[str] = None
ignore_sitemap: Optional[bool] = None
include_subdomains: Optional[bool] = None
sitemap_only: Optional[bool] = None
limit: Optional[int] = None
timeout: Optional[int] = None
class MapResponse(pydantic.BaseModel):
"""Response from mapping operations."""
success: bool = True
links: Optional[List[str]] = None
error: Optional[str] = None
class ExtractParams(pydantic.BaseModel):
"""Parameters for extracting information from URLs."""
prompt: Optional[str] = None
schema: Optional[Any] = None
system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = None
enable_web_search: Optional[bool] = None
include_subdomains: Optional[bool] = None
origin: Optional[str] = None
show_sources: Optional[bool] = None
scrape_options: Optional[ScrapeOptions] = None
class ExtractResponse(pydantic.BaseModel, Generic[T]):
"""Response from extract operations."""
id: Optional[str] = None
status: Optional[Literal["processing", "completed", "failed"]] = None
expires_at: Optional[datetime] = None
success: bool = True
data: Optional[T] = None
error: Optional[str] = None
warning: Optional[str] = None
sources: Optional[List[str]] = None
class SearchParams(pydantic.BaseModel):
query: str
limit: Optional[int] = 5
tbs: Optional[str] = None
filter: Optional[str] = None
lang: Optional[str] = "en"
country: Optional[str] = "us"
location: Optional[str] = None
origin: Optional[str] = "api"
timeout: Optional[int] = 60000
scrape_options: Optional[ScrapeOptions] = None
class SearchResponse(pydantic.BaseModel):
"""Response from search operations."""
success: bool = True
data: List[FirecrawlDocument]
warning: Optional[str] = None
error: Optional[str] = None
class GenerateLLMsTextParams(pydantic.BaseModel):
"""
Parameters for the LLMs.txt generation operation.
"""
max_urls: Optional[int] = 10
show_full_text: Optional[bool] = False
__experimental_stream: Optional[bool] = None
class DeepResearchParams(pydantic.BaseModel):
"""
Parameters for the deep research operation.
"""
max_depth: Optional[int] = 7
time_limit: Optional[int] = 270
max_urls: Optional[int] = 20
analysis_prompt: Optional[str] = None
system_prompt: Optional[str] = None
__experimental_stream_steps: Optional[bool] = None
class DeepResearchResponse(pydantic.BaseModel):
"""
Response from the deep research operation.
"""
success: bool
id: str
error: Optional[str] = None
class DeepResearchStatusResponse(pydantic.BaseModel):
"""
Status response from the deep research operation.
"""
success: bool
data: Optional[Dict[str, Any]] = None
status: str
error: Optional[str] = None
expires_at: str
current_depth: int
max_depth: int
activities: List[Dict[str, Any]]
sources: List[Dict[str, Any]]
summaries: List[str]
class GenerateLLMsTextResponse(pydantic.BaseModel):
"""Response from LLMs.txt generation operations."""
success: bool = True
id: str
error: Optional[str] = None
class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
llmstxt: str
llmsfulltxt: Optional[str] = None
class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
"""Status response from LLMs.txt generation operations."""
success: bool = True
data: Optional[GenerateLLMsTextStatusResponseData] = None
status: Literal["processing", "completed", "failed"]
error: Optional[str] = None
expires_at: str
class SearchResponse(pydantic.BaseModel):
"""
Response from the search operation.
"""
success: bool
data: List[Dict[str, Any]]
warning: Optional[str] = None
error: Optional[str] = None
class ExtractParams(pydantic.BaseModel):
"""
Parameters for the extract operation.
"""
prompt: Optional[str] = None
schema: Optional[Any] = pydantic.Field(None, alias='schema')
system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False
enable_web_search: Optional[bool] = False
show_sources: Optional[bool] = False
agent: Optional[Dict[str, Any]] = None
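
Taken together, a scrape request built from these models might look like the sketch below (all names are the classes defined above; model_dump is Pydantic v2):

params = ScrapeParams(
    formats=["markdown", "links", "json"],
    only_main_content=True,
    location=LocationConfig(country="us", languages=["en"]),
    json_options=JsonConfig(
        prompt="Extract the page title",
        schema={"type": "object", "properties": {"title": {"type": "string"}}},
    ),
    actions=[WaitAction(milliseconds=500), ClickAction(selector="#submit")],
)
payload = params.model_dump(exclude_none=True)  # snake_case dict; see utils for the camelCase conversion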

View File

@@ -2,12 +2,10 @@
Utility functions for the Firecrawl SDK.
"""
import re
from typing import Any, Dict, List, Union, TypeVar, Generic, Optional, TypedDict
from typing import Any, Dict, List, Union, TypeVar, Generic, Optional, TypedDict, Literal
from .types import LocationConfig, JsonConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, AgentOptions
T = TypeVar('T')
class DeepResearchDataSource(TypedDict, total=False):
"""Type definition for a source in deep research data."""
url: str
@@ -113,3 +111,207 @@ def convert_to_dot_dict(data: Union[Dict[str, Any], List[Any], Any]) -> Union[Do
return [convert_to_dot_dict(item) for item in data]
else:
return data
def ensure_schema_dict(schema):
"""
Ensure a schema is a plain dict rather than a Pydantic model class, converting model classes to their JSON schema and recursing into nested dicts and lists.
"""
if schema is None:
return schema
if isinstance(schema, type):
# Pydantic v1/v2 model class
if hasattr(schema, 'model_json_schema'):
return schema.model_json_schema()
elif hasattr(schema, 'schema'):
return schema.schema()
if isinstance(schema, dict):
return {k: ensure_schema_dict(v) for k, v in schema.items()}
if isinstance(schema, (list, tuple)):
return [ensure_schema_dict(v) for v in schema]
return schema
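
Usage sketch: a Pydantic model class is converted to its JSON schema, and nested containers are walked recursively (assumes Pydantic v2's model_json_schema):

import pydantic

class Article(pydantic.BaseModel):
    title: str

ensure_schema_dict(Article)              # -> {'type': 'object', 'properties': {'title': ...}, ...}
ensure_schema_dict({"schema": Article})  # nested model classes are converted too
ensure_schema_dict({"type": "object"})   # plain dicts pass through unchanged
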
def parse_scrape_options(
formats: Optional[List[Literal["markdown", "html", "raw_html", "links", "screenshot", "screenshot@full_page", "extract", "json", "change_tracking"]]] = None,
include_tags: Optional[List[str]] = None,
exclude_tags: Optional[List[str]] = None,
only_main_content: Optional[bool] = None,
wait_for: Optional[int] = None,
timeout: Optional[int] = None,
location: Optional[LocationConfig] = None,
mobile: Optional[bool] = None,
skip_tls_verification: Optional[bool] = None,
remove_base64_images: Optional[bool] = None,
block_ads: Optional[bool] = None,
proxy: Optional[str] = None,
extract: Optional[JsonConfig] = None,
json_options: Optional[JsonConfig] = None,
actions: Optional[List[Union[WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction]]] = None,
change_tracking_options: Optional[ChangeTrackingOptions] = None,
agent: Optional[AgentOptions] = None,
**kwargs: Any
) -> Dict[str, Any]:
"""
Parse the scrape options and return a dictionary of parameters for the API request.
"""
scrape_params = {}
# Add optional parameters if provided and not None
if formats:
scrape_params['formats'] = scrape_formats_transform(formats)
if include_tags:
scrape_params['includeTags'] = include_tags
if exclude_tags:
scrape_params['excludeTags'] = exclude_tags
if only_main_content is not None:
scrape_params['onlyMainContent'] = only_main_content
if wait_for:
scrape_params['waitFor'] = wait_for
if timeout:
scrape_params['timeout'] = timeout
if location is not None:
scrape_params['location'] = {
"country": location.country,
"languages": location.languages
}
if mobile is not None:
scrape_params['mobile'] = mobile
if skip_tls_verification is not None:
scrape_params['skipTlsVerification'] = skip_tls_verification
if remove_base64_images is not None:
scrape_params['removeBase64Images'] = remove_base64_images
if block_ads is not None:
scrape_params['blockAds'] = block_ads
if proxy:
scrape_params['proxy'] = proxy
if extract is not None:
extract = ensure_schema_dict(extract)
if isinstance(extract, dict) and "schema" in extract:
extract["schema"] = ensure_schema_dict(extract["schema"])
scrape_params['extract'] = extract if isinstance(extract, dict) else extract.model_dump(exclude_none=True)
if json_options is not None:
json_options = ensure_schema_dict(json_options)
if isinstance(json_options, dict) and "schema" in json_options:
json_options["schema"] = ensure_schema_dict(json_options["schema"])
# Convert to dict if it's a JsonConfig object
if hasattr(json_options, 'model_dump'):
json_options_dict = json_options.model_dump(exclude_none=True)
else:
json_options_dict = json_options
# Convert snake_case to camelCase for API
json_options_api = {}
if 'prompt' in json_options_dict and json_options_dict['prompt'] is not None:
json_options_api['prompt'] = json_options_dict['prompt']
if 'schema' in json_options_dict and json_options_dict['schema'] is not None:
json_options_api['schema'] = json_options_dict['schema']
if 'system_prompt' in json_options_dict and json_options_dict['system_prompt'] is not None:
json_options_api['systemPrompt'] = json_options_dict['system_prompt']
if 'agent' in json_options_dict and json_options_dict['agent'] is not None:
json_options_api['agent'] = json_options_dict['agent']
scrape_params['jsonOptions'] = json_options_api
if actions:
scrape_params['actions'] = [action if isinstance(action, dict) else action.model_dump(exclude_none=True) for action in actions]
if change_tracking_options is not None:
change_tracking_dict = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.model_dump(exclude_none=True)
# Convert snake_case to camelCase for API
change_tracking_api = {}
if 'modes' in change_tracking_dict and change_tracking_dict['modes'] is not None:
change_tracking_api['modes'] = change_tracking_dict['modes']
if 'schema' in change_tracking_dict and change_tracking_dict['schema'] is not None:
change_tracking_api['schema'] = change_tracking_dict['schema']
if 'prompt' in change_tracking_dict and change_tracking_dict['prompt'] is not None:
change_tracking_api['prompt'] = change_tracking_dict['prompt']
scrape_params['changeTrackingOptions'] = change_tracking_api
if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
scrape_params['extract']['schema'] = ensure_schema_dict(scrape_params['extract']['schema'])
if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
scrape_params['jsonOptions']['schema'] = ensure_schema_dict(scrape_params['jsonOptions']['schema'])
if agent:
scrape_params['agent'] = agent if isinstance(agent, dict) else agent.model_dump(exclude_none=True)
# Add any additional kwargs
scrape_params.update(kwargs)
return scrape_params
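
For example, a call with a few snake_case options yields the camelCase payload the API expects:

parse_scrape_options(
    formats=["markdown", "raw_html"],
    only_main_content=True,
    wait_for=1000,
    skip_tls_verification=True,
)
# -> {'formats': ['markdown', 'rawHtml'], 'onlyMainContent': True,
#     'waitFor': 1000, 'skipTlsVerification': True}
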
def scrape_formats_transform(formats: List[str]) -> List[str]:
"""
Transform format names from snake_case to camelCase (API format). Mutates the input list in place.
"""
if 'screenshot@full_page' in formats:
formats.remove('screenshot@full_page')
formats.append('screenshot@fullPage')
if 'change_tracking' in formats:
formats.remove('change_tracking')
formats.append('changeTracking')
if 'raw_html' in formats:
formats.remove('raw_html')
formats.append('rawHtml')
return formats
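
Note the remove/append pattern: a renamed format moves to the end of the (mutated) input list, e.g.:

scrape_formats_transform(["raw_html", "markdown"])
# -> ['markdown', 'rawHtml']  (relative order is not preserved)
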
def scrape_formats_response_transform(data: Dict[str, Any]) -> Dict[str, Any]:
"""
Transform the data response formats from camelCase (API format) to snake_case.
"""
if 'changeTracking' in data:
data['change_tracking'] = data['changeTracking']
del data['changeTracking']
if 'rawHtml' in data:
data['raw_html'] = data['rawHtml']
del data['rawHtml']
return data
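
The response-side inverse, sketched:

scrape_formats_response_transform({"markdown": "# Hi", "rawHtml": "<p>Hi</p>"})
# -> {'markdown': '# Hi', 'raw_html': '<p>Hi</p>'}
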
def change_tracking_response_transform(data: Dict[str, Any]) -> Dict[str, Any]:
"""
Transform the change tracking data response from camelCase (API format) to snake_case.
"""
# Handle nested changeTracking object
if 'changeTracking' in data and isinstance(data['changeTracking'], dict):
change_tracking = data['changeTracking']
transformed_change_tracking = {}
# Transform all camelCase fields to snake_case
if 'previousScrapeAt' in change_tracking:
transformed_change_tracking['previous_scrape_at'] = change_tracking['previousScrapeAt']
if 'changeStatus' in change_tracking:
transformed_change_tracking['change_status'] = change_tracking['changeStatus']
if 'visibility' in change_tracking:
transformed_change_tracking['visibility'] = change_tracking['visibility']
if 'diff' in change_tracking:
transformed_change_tracking['diff'] = change_tracking['diff']
if 'json' in change_tracking:
transformed_change_tracking['json'] = change_tracking['json']
# Replace the camelCase changeTracking with snake_case change_tracking
data['change_tracking'] = transformed_change_tracking
del data['changeTracking']
# Handle top-level fields (for backward compatibility)
if 'changeStatus' in data:
data['change_status'] = data['changeStatus']
del data['changeStatus']
if 'previousScrapeAt' in data:
data['previous_scrape_at'] = data['previousScrapeAt']
del data['previousScrapeAt']
return data
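
A sketch of the nested case this handles:

change_tracking_response_transform({
    "changeTracking": {"previousScrapeAt": "2025-05-27T00:00:00Z", "changeStatus": "changed", "visibility": "visible"}
})
# -> {'change_tracking': {'previous_scrape_at': '2025-05-27T00:00:00Z',
#     'change_status': 'changed', 'visibility': 'visible'}}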

View File

@@ -16,7 +16,7 @@ API_KEY = os.getenv("TEST_API_KEY")
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
TEST_URL = 'example.com'
TEST_URL = 'https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions'
@pytest.mark.asyncio
async def test_scrape_url_async_simple():
@@ -24,109 +24,32 @@ async def test_scrape_url_async_simple():
TEST_URL,
formats=["markdown"]
)
# Basic response assertions
assert response.success
assert hasattr(response, "markdown")
assert "# Example Domain" in response.markdown
@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_extract_screenshot():
location = LocationConfig(country="us", languages=["en"])
extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
actions = [
WaitAction(type="wait", milliseconds=500),
ScreenshotAction(type="screenshot", fullPage=True),
WriteAction(type="write", text="test input"),
PressAction(type="press", key="Enter"),
ScrollAction(type="scroll", direction="down"),
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
]
response = await app.scrape_url(
url=TEST_URL,
formats=["markdown", "html", "raw_html", "links", "screenshot", "extract", "change_tracking"],
include_tags=["h1", "p"],
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=15000,
location=location,
mobile=True,
skip_tls_verification=True,
remove_base64_images=True,
block_ads=True,
proxy="basic",
extract=extract,
actions=actions,
change_tracking_options=change_tracking
)
assert response.success
assert hasattr(response, "markdown")
assert hasattr(response, "html")
assert hasattr(response, "raw_html")
assert hasattr(response, "links")
assert hasattr(response, "screenshot")
assert hasattr(response, "extract")
assert response.markdown is not None
assert isinstance(response.markdown, str)
assert len(response.markdown) > 0
assert "This page is used for end-to-end (e2e) testing with Firecrawl." in response.markdown
# Metadata assertions
assert hasattr(response, "metadata")
assert hasattr(response, "change_tracking")
@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
location = LocationConfig(country="us", languages=["en"])
extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
actions = [
WaitAction(type="wait", milliseconds=500),
ScreenshotAction(type="screenshot", fullPage=True),
WriteAction(type="write", text="test input"),
PressAction(type="press", key="Enter"),
ScrollAction(type="scroll", direction="down"),
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
]
response = await app.scrape_url(
url=TEST_URL,
formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
include_tags=["h1", "p"],
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=15000,
location=location,
mobile=True,
skip_tls_verification=True,
remove_base64_images=True,
block_ads=True,
proxy="basic",
extract=extract,
actions=actions,
change_tracking_options=change_tracking
)
assert response.success
assert hasattr(response, "markdown")
assert hasattr(response, "html")
assert hasattr(response, "raw_html")
assert hasattr(response, "links")
assert hasattr(response, "screenshot")
assert hasattr(response, "extract")
assert hasattr(response, "metadata")
assert hasattr(response, "change_tracking")
assert response.metadata is not None
@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_json_options():
location = LocationConfig(country="us", languages=["en"])
json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])
actions = [
WaitAction(type="wait", milliseconds=500),
ScreenshotAction(type="screenshot", fullPage=True),
WriteAction(type="write", text="test input"),
PressAction(type="press", key="Enter"),
ScrollAction(type="scroll", direction="down"),
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
WaitAction(milliseconds=500),
ScreenshotAction(full_page=True),
WriteAction(text="test input"),
PressAction(key="Enter"),
ScrollAction(direction="down"),
ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
]
response = await app.scrape_url(
url=TEST_URL,
@@ -135,7 +58,7 @@ async def test_scrape_url_async_all_params_with_json_options():
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=15000,
timeout=30000,
location=location,
mobile=True,
skip_tls_verification=True,
@@ -147,12 +70,149 @@ async def test_scrape_url_async_all_params_with_json_options():
change_tracking_options=change_tracking
)
# Basic response assertions
assert response.success
assert response.error is None
# Format assertions
assert hasattr(response, "markdown")
assert response.markdown is not None
assert isinstance(response.markdown, str)
assert len(response.markdown) > 0
assert hasattr(response, "html")
assert response.html is not None
assert isinstance(response.html, str)
assert len(response.html) > 0
assert hasattr(response, "raw_html")
assert response.raw_html is not None
assert isinstance(response.raw_html, str)
assert len(response.raw_html) > 0
assert hasattr(response, "links")
assert response.links is not None
assert isinstance(response.links, list)
assert hasattr(response, "screenshot")
assert response.screenshot is not None
assert isinstance(response.screenshot, str)
assert response.screenshot.startswith(("data:image/", "https://"))
assert hasattr(response, "json")
assert response.json is not None
assert isinstance(response.json, dict)
# Metadata assertions
assert hasattr(response, "metadata")
assert hasattr(response, "change_tracking")
assert response.metadata is not None
assert isinstance(response.metadata, dict)
# Change tracking assertions
assert hasattr(response, "change_tracking")
assert response.change_tracking is not None
assert hasattr(response.change_tracking, "previous_scrape_at")
assert hasattr(response.change_tracking, "change_status")
assert hasattr(response.change_tracking, "visibility")
assert hasattr(response.change_tracking, "diff")
assert hasattr(response.change_tracking, "json")
# Change status can be "changed", "same", or "new"
assert response.change_tracking.change_status in ["changed", "same", "new"]
assert response.change_tracking.visibility in ["visible", "hidden"]
# If status is "changed", diff and json should have data; if "same", they can be None
if response.change_tracking.change_status == "changed":
assert response.change_tracking.diff is not None
assert response.change_tracking.json is not None
# For "same" status, diff and json can be None (no changes detected)
@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_json_options_full_page_screenshot():
location = LocationConfig(country="us", languages=["en"])
json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])
actions = [
WaitAction(milliseconds=500),
ScreenshotAction(full_page=True),
WriteAction(text="test input"),
PressAction(key="Enter"),
ScrollAction(direction="down"),
ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
]
response = await app.scrape_url(
url=TEST_URL,
formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json", "change_tracking"],
include_tags=["h1", "p"],
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=30000,
location=location,
mobile=True,
skip_tls_verification=True,
remove_base64_images=True,
block_ads=True,
proxy="basic",
json_options=json_options,
actions=actions,
change_tracking_options=change_tracking
)
# Basic response assertions
assert response.success
assert response.error is None
# Format assertions
assert hasattr(response, "markdown")
assert response.markdown is not None
assert isinstance(response.markdown, str)
assert len(response.markdown) > 0
assert hasattr(response, "html")
assert response.html is not None
assert isinstance(response.html, str)
assert len(response.html) > 0
assert hasattr(response, "raw_html")
assert response.raw_html is not None
assert isinstance(response.raw_html, str)
assert len(response.raw_html) > 0
assert hasattr(response, "links")
assert response.links is not None
assert isinstance(response.links, list)
assert hasattr(response, "screenshot")
assert response.screenshot is not None
assert isinstance(response.screenshot, str)
assert response.screenshot.startswith(("data:image/", "https://"))
assert hasattr(response, "json")
assert response.json is not None
assert isinstance(response.json, dict)
# Metadata assertions
assert hasattr(response, "metadata")
assert response.metadata is not None
assert isinstance(response.metadata, dict)
# Change tracking assertions
assert hasattr(response, "change_tracking")
assert response.change_tracking is not None
assert hasattr(response.change_tracking, "previous_scrape_at")
assert hasattr(response.change_tracking, "change_status")
assert hasattr(response.change_tracking, "visibility")
assert hasattr(response.change_tracking, "diff")
assert hasattr(response.change_tracking, "json")
# Change status can be "changed", "same", or "new"
assert response.change_tracking.change_status in ["changed", "same", "new"]
assert response.change_tracking.visibility in ["visible", "hidden"]
# If status is "changed", diff and json should have data; if "same", they can be None
if response.change_tracking.change_status == "changed":
assert response.change_tracking.diff is not None
assert response.change_tracking.json is not None
# For "same" status, diff and json can be None (no changes detected)

View File

@@ -13,125 +13,49 @@ load_dotenv()
API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
API_KEY = os.getenv("TEST_API_KEY")
TEST_URL = 'https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions'
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
TEST_URL = 'example.com'
def test_scrape_url_simple():
response = app.scrape_url(
TEST_URL,
formats=["markdown"]
)
# Basic response assertions
assert response.success
assert hasattr(response, "markdown")
assert "# Example Domain" in response.markdown
def test_scrape_url_all_params_with_extract_screenshot():
location = LocationConfig(country="us", languages=["en"])
extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
actions = [
WaitAction(type="wait", milliseconds=500),
ScreenshotAction(type="screenshot", fullPage=True),
WriteAction(type="write", text="test input"),
PressAction(type="press", key="Enter"),
ScrollAction(type="scroll", direction="down"),
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
]
response = app.scrape_url(
url=TEST_URL,
formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
include_tags=["h1", "p"],
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=15000,
location=location,
mobile=True,
skip_tls_verification=True,
remove_base64_images=True,
block_ads=True,
proxy="basic",
extract=extract,
actions=actions,
change_tracking_options=change_tracking
)
assert response.success
assert hasattr(response, "markdown")
assert hasattr(response, "html")
assert hasattr(response, "rawHtml")
assert hasattr(response, "links")
assert hasattr(response, "screenshot")
assert hasattr(response, "extract")
assert response.markdown is not None
assert isinstance(response.markdown, str)
assert len(response.markdown) > 0
assert "This page is used for end-to-end (e2e) testing with Firecrawl." in response.markdown
# Metadata assertions
assert hasattr(response, "metadata")
assert hasattr(response, "changeTracking")
def test_scrape_url_all_params_with_extract_full_page_screenshot():
location = LocationConfig(country="us", languages=["en"])
extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
actions = [
WaitAction(type="wait", milliseconds=500),
ScreenshotAction(type="screenshot", fullPage=True),
WriteAction(type="write", text="test input"),
PressAction(type="press", key="Enter"),
ScrollAction(type="scroll", direction="down"),
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
]
response = app.scrape_url(
url=TEST_URL,
formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
include_tags=["h1", "p"],
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=15000,
location=location,
mobile=True,
skip_tls_verification=True,
remove_base64_images=True,
block_ads=True,
proxy="basic",
extract=extract,
actions=actions,
change_tracking_options=change_tracking
)
assert response.success
assert hasattr(response, "markdown")
assert hasattr(response, "html")
assert hasattr(response, "raw_html")
assert hasattr(response, "links")
assert hasattr(response, "screenshot")
assert hasattr(response, "extract")
assert hasattr(response, "metadata")
assert hasattr(response, "change_tracking")
assert response.metadata is not None
def test_scrape_url_all_params_with_json_options():
location = LocationConfig(country="us", languages=["en"])
json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"]) # , prompt="Track changes", schema=json_schema
actions = [
WaitAction(type="wait", milliseconds=500),
ScreenshotAction(type="screenshot", fullPage=True),
WriteAction(type="write", text="test input"),
PressAction(type="press", key="Enter"),
ScrollAction(type="scroll", direction="down"),
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
WaitAction(milliseconds=500),
ScreenshotAction(full_page=True),
WriteAction(text="test input"),
PressAction(key="Enter"),
ScrollAction(direction="down"),
ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
]
response = app.scrape_url(
url=TEST_URL,
formats=["markdown", "html", "raw_html", "links", "screenshot", "json", "change_tracking"],
include_tags=["h1", "p"],
include_tags=["p"],
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=15000,
timeout=30000,
location=location,
mobile=True,
skip_tls_verification=True,
@@ -143,12 +67,146 @@ def test_scrape_url_all_params_with_json_options():
change_tracking_options=change_tracking
)
# Basic response assertions
assert response.success
assert response.error is None
# Format assertions
assert hasattr(response, "markdown")
assert response.markdown is not None
assert isinstance(response.markdown, str)
assert len(response.markdown) > 0
assert hasattr(response, "html")
assert response.html is not None
assert isinstance(response.html, str)
assert len(response.html) > 0
assert hasattr(response, "raw_html")
assert response.raw_html is not None
assert isinstance(response.raw_html, str)
assert len(response.raw_html) > 0
assert hasattr(response, "links")
assert response.links is not None
assert isinstance(response.links, list)
assert hasattr(response, "screenshot")
assert response.screenshot is not None
assert isinstance(response.screenshot, str)
assert response.screenshot.startswith("https://")
assert hasattr(response, "json")
assert response.json is not None
assert isinstance(response.json, dict)
# Metadata assertions
assert hasattr(response, "metadata")
assert hasattr(response, "change_tracking")
assert response.metadata is not None
assert isinstance(response.metadata, dict)
# Change tracking assertions
assert hasattr(response, "change_tracking")
assert response.change_tracking is not None
assert hasattr(response.change_tracking, "previous_scrape_at")
assert hasattr(response.change_tracking, "change_status")
assert hasattr(response.change_tracking, "visibility")
assert hasattr(response.change_tracking, "diff")
assert hasattr(response.change_tracking, "json")
# Change status can be "changed", "same", or "new"
assert response.change_tracking.change_status in ["changed", "same", "new"]
assert response.change_tracking.visibility in ["visible", "hidden"]
# If status is "changed", diff and json should have data; if "same", they can be None
if response.change_tracking.change_status == "changed":
assert response.change_tracking.diff is not None
assert response.change_tracking.json is not None
# For "same" status, diff and json can be None (no changes detected)
def test_scrape_url_all_params_with_json_options_full_page_screenshot():
location = LocationConfig(country="us", languages=["en"])
json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"]) # , prompt="Track changes", schema=json_schema
actions = [
WaitAction(milliseconds=500),
ScreenshotAction(full_page=True),
WriteAction(text="test input"),
PressAction(key="Enter"),
ScrollAction(direction="down"),
ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
]
response = app.scrape_url(
url=TEST_URL,
formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json", "change_tracking"],
include_tags=["p"],
exclude_tags=["footer"],
only_main_content=True,
wait_for=1000,
timeout=30000,
location=location,
mobile=True,
skip_tls_verification=True,
remove_base64_images=True,
block_ads=True,
proxy="basic",
json_options=json_options,
actions=actions,
change_tracking_options=change_tracking
)
# Basic response assertions
assert response.success
assert response.error is None
# Format assertions
assert hasattr(response, "markdown")
assert response.markdown is not None
assert isinstance(response.markdown, str)
assert len(response.markdown) > 0
assert hasattr(response, "html")
assert response.html is not None
assert isinstance(response.html, str)
assert len(response.html) > 0
assert hasattr(response, "raw_html")
assert response.raw_html is not None
assert isinstance(response.raw_html, str)
assert len(response.raw_html) > 0
assert hasattr(response, "links")
assert response.links is not None
assert isinstance(response.links, list)
assert hasattr(response, "screenshot")
assert response.screenshot is not None
assert isinstance(response.screenshot, str)
assert response.screenshot.startswith("https://")
assert hasattr(response, "json")
assert response.json is not None
assert isinstance(response.json, dict)
# Metadata assertions
assert hasattr(response, "metadata")
assert response.metadata is not None
assert isinstance(response.metadata, dict)
# Change tracking assertions
assert hasattr(response, "change_tracking")
assert response.change_tracking is not None
assert hasattr(response.change_tracking, "previous_scrape_at")
assert hasattr(response.change_tracking, "change_status")
assert hasattr(response.change_tracking, "visibility")
assert hasattr(response.change_tracking, "diff")
assert hasattr(response.change_tracking, "json")
# Change status can be "changed", "same", or "new"
assert response.change_tracking.change_status in ["changed", "same", "new"]
assert response.change_tracking.visibility in ["visible", "hidden"]
if response.change_tracking.change_status == "changed":
assert response.change_tracking.diff is not None
assert response.change_tracking.json is not None