Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 23:55:54 +08:00)

sdk(v3): scrape and async scrape ok

This commit is contained in:
parent 94220a772b
commit ab1e244693
@@ -13,7 +13,7 @@ import os

 from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions, LocationConfig # noqa

-__version__ = "2.6.0"
+__version__ = "3.0.0"

 # Define the logger for the Firecrawl project
 logger: logging.Logger = logging.getLogger("firecrawl")
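For orientation, a minimal sketch of the sync and async scrape paths this commit covers ("scrape and async scrape ok"). It is illustrative only, not part of the diff: the endpoint, API key, and target URL are placeholders, and the calls mirror the e2e tests further down.

import asyncio
from firecrawl import FirecrawlApp, AsyncFirecrawlApp

# Placeholder endpoint/key; the e2e tests below read TEST_API_URL / TEST_API_KEY from the environment.
app = FirecrawlApp(api_url="http://localhost:3002", api_key="fc-YOUR-KEY")
doc = app.scrape_url("https://example.com", formats=["markdown"])
print(doc.markdown)

async def main():
    async_app = AsyncFirecrawlApp(api_url="http://localhost:3002", api_key="fc-YOUR-KEY")
    async_doc = await async_app.scrape_url("https://example.com", formats=["markdown"])
    print(async_doc.markdown)

asyncio.run(main())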
File diff suppressed because it is too large.

apps/python-sdk/firecrawl/types.py (new file, 386 lines)

@@ -0,0 +1,386 @@
import pydantic
from typing import Optional, List, Dict, Literal, Any, Union, Generic, TypeVar
from datetime import datetime
import warnings

T = TypeVar('T')

# Suppress Pydantic warnings about attribute shadowing
warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")

# class FirecrawlDocumentMetadata(pydantic.BaseModel):
#     """Metadata for a Firecrawl document."""
#     title: Optional[str] = None
#     description: Optional[str] = None
#     language: Optional[str] = None
#     keywords: Optional[str] = None
#     robots: Optional[str] = None
#     ogTitle: Optional[str] = None
#     ogDescription: Optional[str] = None
#     ogUrl: Optional[str] = None
#     ogImage: Optional[str] = None
#     ogAudio: Optional[str] = None
#     ogDeterminer: Optional[str] = None
#     ogLocale: Optional[str] = None
#     ogLocaleAlternate: Optional[List[str]] = None
#     ogSiteName: Optional[str] = None
#     ogVideo: Optional[str] = None
#     dctermsCreated: Optional[str] = None
#     dcDateCreated: Optional[str] = None
#     dcDate: Optional[str] = None
#     dctermsType: Optional[str] = None
#     dcType: Optional[str] = None
#     dctermsAudience: Optional[str] = None
#     dctermsSubject: Optional[str] = None
#     dcSubject: Optional[str] = None
#     dcDescription: Optional[str] = None
#     dctermsKeywords: Optional[str] = None
#     modifiedTime: Optional[str] = None
#     publishedTime: Optional[str] = None
#     articleTag: Optional[str] = None
#     articleSection: Optional[str] = None
#     sourceURL: Optional[str] = None
#     statusCode: Optional[int] = None
#     error: Optional[str] = None

class AgentOptions(pydantic.BaseModel):
    """Configuration for the agent."""
    model: Literal["FIRE-1"] = "FIRE-1"
    prompt: Optional[str] = None

class AgentOptionsExtract(pydantic.BaseModel):
    """Configuration for the agent in extract operations."""
    model: Literal["FIRE-1"] = "FIRE-1"

class ActionsResult(pydantic.BaseModel):
    """Result of actions performed during scraping."""
    screenshots: List[str]

class ChangeTrackingData(pydantic.BaseModel):
    """
    Data for the change tracking format.
    """
    previous_scrape_at: Optional[str] = None
    change_status: str  # "new" | "same" | "changed" | "removed"
    visibility: str  # "visible" | "hidden"
    diff: Optional[Dict[str, Any]] = None
    json: Optional[Any] = None

class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
    """Document retrieved or processed by Firecrawl."""
    url: Optional[str] = None
    markdown: Optional[str] = None
    html: Optional[str] = None
    raw_html: Optional[str] = None
    links: Optional[List[str]] = None
    extract: Optional[T] = None
    json: Optional[T] = None
    screenshot: Optional[str] = None
    metadata: Optional[Any] = None
    actions: Optional[ActionsResult] = None
    title: Optional[str] = None  # v1 search only
    description: Optional[str] = None  # v1 search only
    change_tracking: Optional[ChangeTrackingData] = None

class LocationConfig(pydantic.BaseModel):
    """Location configuration for scraping."""
    country: Optional[str] = None
    languages: Optional[List[str]] = None

class WebhookConfig(pydantic.BaseModel):
    """Configuration for webhooks."""
    url: str
    headers: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, str]] = None
    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None

class ChangeTrackingOptions(pydantic.BaseModel):
    """Configuration for change tracking."""
    modes: Optional[List[Literal["git-diff", "json"]]] = None
    schema: Optional[Any] = None
    prompt: Optional[str] = None

class ScrapeOptions(pydantic.BaseModel):
    """Parameters for scraping operations."""
    formats: Optional[List[Literal["markdown", "html", "raw_html", "links", "screenshot", "screenshot@full_page", "extract", "json", "change_tracking"]]] = None
    headers: Optional[Dict[str, str]] = None
    include_tags: Optional[List[str]] = None
    exclude_tags: Optional[List[str]] = None
    only_main_content: Optional[bool] = None
    wait_for: Optional[int] = None
    timeout: Optional[int] = None
    location: Optional[LocationConfig] = None
    mobile: Optional[bool] = None
    skip_tls_verification: Optional[bool] = None
    remove_base64_images: Optional[bool] = None
    block_ads: Optional[bool] = None
    proxy: Optional[Literal["basic", "stealth"]] = None
    change_tracking_options: Optional[ChangeTrackingOptions] = None

class WaitAction(pydantic.BaseModel):
    """Wait action to perform during scraping."""
    type: Literal["wait"] = pydantic.Field(default="wait")
    milliseconds: Optional[int] = None
    selector: Optional[str] = None

class ScreenshotAction(pydantic.BaseModel):
    """Screenshot action to perform during scraping."""
    type: Literal["screenshot"] = pydantic.Field(default="screenshot")
    full_page: Optional[bool] = None

class ClickAction(pydantic.BaseModel):
    """Click action to perform during scraping."""
    type: Literal["click"] = pydantic.Field(default="click")
    selector: str

class WriteAction(pydantic.BaseModel):
    """Write action to perform during scraping."""
    type: Literal["write"] = pydantic.Field(default="write")
    text: str

class PressAction(pydantic.BaseModel):
    """Press action to perform during scraping."""
    type: Literal["press"] = pydantic.Field(default="press")
    key: str

class ScrollAction(pydantic.BaseModel):
    """Scroll action to perform during scraping."""
    type: Literal["scroll"] = pydantic.Field(default="scroll")
    direction: Literal["up", "down"]
    selector: Optional[str] = None

class ScrapeAction(pydantic.BaseModel):
    """Scrape action to perform during scraping."""
    type: Literal["scrape"] = pydantic.Field(default="scrape")

class ExecuteJavascriptAction(pydantic.BaseModel):
    """Execute javascript action to perform during scraping."""
    type: Literal["executeJavascript"] = pydantic.Field(default="executeJavascript")
    script: str


class ExtractAgent(pydantic.BaseModel):
    """Configuration for the agent in extract operations."""
    model: Literal["FIRE-1"] = "FIRE-1"

class JsonConfig(pydantic.BaseModel):
    """Configuration for extraction."""
    prompt: Optional[str] = None
    schema: Optional[Any] = None
    system_prompt: Optional[str] = None
    agent: Optional[ExtractAgent] = None

class ScrapeParams(ScrapeOptions):
    """Parameters for scraping operations."""
    extract: Optional[JsonConfig] = None
    json_options: Optional[JsonConfig] = None
    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
    agent: Optional[AgentOptions] = None
    webhook: Optional[WebhookConfig] = None

class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
    """Response from scraping operations."""
    success: bool = True
    warning: Optional[str] = None
    error: Optional[str] = None

class BatchScrapeResponse(pydantic.BaseModel):
    """Response from batch scrape operations."""
    id: Optional[str] = None
    url: Optional[str] = None
    success: bool = True
    error: Optional[str] = None
    invalid_urls: Optional[List[str]] = None

class BatchScrapeStatusResponse(pydantic.BaseModel):
    """Response from batch scrape status checks."""
    success: bool = True
    status: Literal["scraping", "completed", "failed", "cancelled"]
    completed: int
    total: int
    credits_used: int
    expires_at: datetime
    next: Optional[str] = None
    data: List[FirecrawlDocument]

class CrawlParams(pydantic.BaseModel):
    """Parameters for crawling operations."""
    include_paths: Optional[List[str]] = None
    exclude_paths: Optional[List[str]] = None
    max_depth: Optional[int] = None
    max_discovery_depth: Optional[int] = None
    limit: Optional[int] = None
    allow_backward_links: Optional[bool] = None
    allow_external_links: Optional[bool] = None
    ignore_sitemap: Optional[bool] = None
    scrape_options: Optional[ScrapeOptions] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    deduplicate_similar_urls: Optional[bool] = None
    ignore_query_parameters: Optional[bool] = None
    regex_on_full_url: Optional[bool] = None
    delay: Optional[int] = None  # Delay in seconds between scrapes

class CrawlResponse(pydantic.BaseModel):
    """Response from crawling operations."""
    id: Optional[str] = None
    url: Optional[str] = None
    success: bool = True
    error: Optional[str] = None

class CrawlStatusResponse(pydantic.BaseModel):
    """Response from crawl status checks."""
    success: bool = True
    status: Literal["scraping", "completed", "failed", "cancelled"]
    completed: int
    total: int
    credits_used: int
    expires_at: datetime
    next: Optional[str] = None
    data: List[FirecrawlDocument]

class CrawlErrorsResponse(pydantic.BaseModel):
    """Response from crawl/batch scrape error monitoring."""
    errors: List[Dict[str, str]]  # {id: str, timestamp: str, url: str, error: str}
    robots_blocked: List[str]

class MapParams(pydantic.BaseModel):
    """Parameters for mapping operations."""
    search: Optional[str] = None
    ignore_sitemap: Optional[bool] = None
    include_subdomains: Optional[bool] = None
    sitemap_only: Optional[bool] = None
    limit: Optional[int] = None
    timeout: Optional[int] = None

class MapResponse(pydantic.BaseModel):
    """Response from mapping operations."""
    success: bool = True
    links: Optional[List[str]] = None
    error: Optional[str] = None

class ExtractParams(pydantic.BaseModel):
    """Parameters for extracting information from URLs."""
    prompt: Optional[str] = None
    schema: Optional[Any] = None
    system_prompt: Optional[str] = None
    allow_external_links: Optional[bool] = None
    enable_web_search: Optional[bool] = None
    include_subdomains: Optional[bool] = None
    origin: Optional[str] = None
    show_sources: Optional[bool] = None
    scrape_options: Optional[ScrapeOptions] = None

class ExtractResponse(pydantic.BaseModel, Generic[T]):
    """Response from extract operations."""
    id: Optional[str] = None
    status: Optional[Literal["processing", "completed", "failed"]] = None
    expires_at: Optional[datetime] = None
    success: bool = True
    data: Optional[T] = None
    error: Optional[str] = None
    warning: Optional[str] = None
    sources: Optional[List[str]] = None

class SearchParams(pydantic.BaseModel):
    query: str
    limit: Optional[int] = 5
    tbs: Optional[str] = None
    filter: Optional[str] = None
    lang: Optional[str] = "en"
    country: Optional[str] = "us"
    location: Optional[str] = None
    origin: Optional[str] = "api"
    timeout: Optional[int] = 60000
    scrape_options: Optional[ScrapeOptions] = None

class SearchResponse(pydantic.BaseModel):
    """Response from search operations."""
    success: bool = True
    data: List[FirecrawlDocument]
    warning: Optional[str] = None
    error: Optional[str] = None

class GenerateLLMsTextParams(pydantic.BaseModel):
    """
    Parameters for the LLMs.txt generation operation.
    """
    max_urls: Optional[int] = 10
    show_full_text: Optional[bool] = False
    __experimental_stream: Optional[bool] = None

class DeepResearchParams(pydantic.BaseModel):
    """
    Parameters for the deep research operation.
    """
    max_depth: Optional[int] = 7
    time_limit: Optional[int] = 270
    max_urls: Optional[int] = 20
    analysis_prompt: Optional[str] = None
    system_prompt: Optional[str] = None
    __experimental_stream_steps: Optional[bool] = None

class DeepResearchResponse(pydantic.BaseModel):
    """
    Response from the deep research operation.
    """
    success: bool
    id: str
    error: Optional[str] = None

class DeepResearchStatusResponse(pydantic.BaseModel):
    """
    Status response from the deep research operation.
    """
    success: bool
    data: Optional[Dict[str, Any]] = None
    status: str
    error: Optional[str] = None
    expires_at: str
    current_depth: int
    max_depth: int
    activities: List[Dict[str, Any]]
    sources: List[Dict[str, Any]]
    summaries: List[str]

class GenerateLLMsTextResponse(pydantic.BaseModel):
    """Response from LLMs.txt generation operations."""
    success: bool = True
    id: str
    error: Optional[str] = None

class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
    llmstxt: str
    llmsfulltxt: Optional[str] = None

class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
    """Status response from LLMs.txt generation operations."""
    success: bool = True
    data: Optional[GenerateLLMsTextStatusResponseData] = None
    status: Literal["processing", "completed", "failed"]
    error: Optional[str] = None
    expires_at: str

class SearchResponse(pydantic.BaseModel):
    """
    Response from the search operation.
    """
    success: bool
    data: List[Dict[str, Any]]
    warning: Optional[str] = None
    error: Optional[str] = None

class ExtractParams(pydantic.BaseModel):
    """
    Parameters for the extract operation.
    """
    prompt: Optional[str] = None
    schema: Optional[Any] = pydantic.Field(None, alias='schema')
    system_prompt: Optional[str] = None
    allow_external_links: Optional[bool] = False
    enable_web_search: Optional[bool] = False
    show_sources: Optional[bool] = False
    agent: Optional[Dict[str, Any]] = None
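As a rough illustration of how these models combine (not part of the diff; the firecrawl.types import path is assumed from the file location, and the schema/prompt values are the same ones the e2e tests below use):

from firecrawl.types import JsonConfig, ChangeTrackingOptions, LocationConfig, ScrapeParams

json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
params = ScrapeParams(
    formats=["markdown", "json", "change_tracking"],
    location=LocationConfig(country="us", languages=["en"]),
    json_options=JsonConfig(prompt="Extract the title as JSON", schema=json_schema),
    change_tracking_options=ChangeTrackingOptions(modes=["git-diff", "json"]),
)
# Pydantic v2 dump with None fields dropped, as the SDK helpers do internally.
print(params.model_dump(exclude_none=True))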
@@ -2,12 +2,10 @@
 Utility functions for the Firecrawl SDK.
 """
 import re
-from typing import Any, Dict, List, Union, TypeVar, Generic, Optional, TypedDict
+from typing import Any, Dict, List, Union, TypeVar, Generic, Optional, TypedDict, Literal
+from .types import LocationConfig, JsonConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, AgentOptions

 T = TypeVar('T')


 class DeepResearchDataSource(TypedDict, total=False):
     """Type definition for a source in deep research data."""
     url: str
@@ -113,3 +111,207 @@ def convert_to_dot_dict(data: Union[Dict[str, Any], List[Any], Any]) -> Union[Do
         return [convert_to_dot_dict(item) for item in data]
     else:
         return data
+
+
+def ensure_schema_dict(schema):
+    """
+    Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
+    """
+    if schema is None:
+        return schema
+    if isinstance(schema, type):
+        # Pydantic v1/v2 model class
+        if hasattr(schema, 'model_json_schema'):
+            return schema.model_json_schema()
+        elif hasattr(schema, 'schema'):
+            return schema.schema()
+    if isinstance(schema, dict):
+        return {k: ensure_schema_dict(v) for k, v in schema.items()}
+    if isinstance(schema, (list, tuple)):
+        return [ensure_schema_dict(v) for v in schema]
+    return schema
+
+
+def parse_scrape_options(
+    formats: Optional[List[Literal["markdown", "html", "raw_html", "links", "screenshot", "screenshot@full_page", "extract", "json", "change_tracking"]]] = None,
+    include_tags: Optional[List[str]] = None,
+    exclude_tags: Optional[List[str]] = None,
+    only_main_content: Optional[bool] = None,
+    wait_for: Optional[int] = None,
+    timeout: Optional[int] = None,
+    location: Optional[LocationConfig] = None,
+    mobile: Optional[bool] = None,
+    skip_tls_verification: Optional[bool] = None,
+    remove_base64_images: Optional[bool] = None,
+    block_ads: Optional[bool] = None,
+    proxy: Optional[str] = None,
+    extract: Optional[JsonConfig] = None,
+    json_options: Optional[JsonConfig] = None,
+    actions: Optional[List[Union[WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction]]] = None,
+    change_tracking_options: Optional[ChangeTrackingOptions] = None,
+    agent: Optional[AgentOptions] = None,
+    **kwargs: Any
+) -> Dict[str, Any]:
+    """
+    Parse the scrape options and return a dictionary of parameters for the API request.
+    """
+
+    scrape_params = {}
+
+    # Add optional parameters if provided and not None
+    if formats:
+        scrape_params['formats'] = scrape_formats_transform(formats)
+    if include_tags:
+        scrape_params['includeTags'] = include_tags
+    if exclude_tags:
+        scrape_params['excludeTags'] = exclude_tags
+    if only_main_content is not None:
+        scrape_params['onlyMainContent'] = only_main_content
+    if wait_for:
+        scrape_params['waitFor'] = wait_for
+    if timeout:
+        scrape_params['timeout'] = timeout
+    if location is not None:
+        scrape_params['location'] = {
+            "country": location.country,
+            "languages": location.languages
+        }
+    if mobile is not None:
+        scrape_params['mobile'] = mobile
+    if skip_tls_verification is not None:
+        scrape_params['skipTlsVerification'] = skip_tls_verification
+    if remove_base64_images is not None:
+        scrape_params['removeBase64Images'] = remove_base64_images
+    if block_ads is not None:
+        scrape_params['blockAds'] = block_ads
+    if proxy:
+        scrape_params['proxy'] = proxy
+    if extract is not None:
+        extract = ensure_schema_dict(extract)
+        if isinstance(extract, dict) and "schema" in extract:
+            extract["schema"] = ensure_schema_dict(extract["schema"])
+        scrape_params['extract'] = extract if isinstance(extract, dict) else extract.model_dump(exclude_none=True)
+    if json_options is not None:
+        json_options = ensure_schema_dict(json_options)
+        if isinstance(json_options, dict) and "schema" in json_options:
+            json_options["schema"] = ensure_schema_dict(json_options["schema"])
+
+        # Convert to dict if it's a JsonConfig object
+        if hasattr(json_options, 'dict'):
+            json_options_dict = json_options.model_dump(exclude_none=True)
+        else:
+            json_options_dict = json_options
+
+        # Convert snake_case to camelCase for API
+        json_options_api = {}
+        if 'prompt' in json_options_dict and json_options_dict['prompt'] is not None:
+            json_options_api['prompt'] = json_options_dict['prompt']
+        if 'schema' in json_options_dict and json_options_dict['schema'] is not None:
+            json_options_api['schema'] = json_options_dict['schema']
+        if 'system_prompt' in json_options_dict and json_options_dict['system_prompt'] is not None:
+            json_options_api['systemPrompt'] = json_options_dict['system_prompt']
+        if 'agent' in json_options_dict and json_options_dict['agent'] is not None:
+            json_options_api['agent'] = json_options_dict['agent']
+
+        scrape_params['jsonOptions'] = json_options_api
+    if actions:
+        scrape_params['actions'] = [action if isinstance(action, dict) else action.model_dump(exclude_none=True) for action in actions]
+    if change_tracking_options is not None:
+        change_tracking_dict = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.model_dump(exclude_none=True)
+
+        # Convert snake_case to camelCase for API
+        change_tracking_api = {}
+        if 'modes' in change_tracking_dict and change_tracking_dict['modes'] is not None:
+            change_tracking_api['modes'] = change_tracking_dict['modes']
+        if 'schema' in change_tracking_dict and change_tracking_dict['schema'] is not None:
+            change_tracking_api['schema'] = change_tracking_dict['schema']
+        if 'prompt' in change_tracking_dict and change_tracking_dict['prompt'] is not None:
+            change_tracking_api['prompt'] = change_tracking_dict['prompt']
+
+        scrape_params['changeTrackingOptions'] = change_tracking_api
+
+    if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
+        scrape_params['extract']['schema'] = ensure_schema_dict(scrape_params['extract']['schema'])
+    if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
+        scrape_params['jsonOptions']['schema'] = ensure_schema_dict(scrape_params['jsonOptions']['schema'])
+
+    if agent:
+        scrape_params['agent'] = agent
+
+    if 'agent' in scrape_params and scrape_params['agent'] and 'model' in scrape_params['agent']:
+        scrape_params['agent']['model'] = scrape_params['agent']['model'].value
+
+    # Add any additional kwargs
+    scrape_params.update(kwargs)
+
+    return scrape_params
+
+
+def scrape_formats_transform(formats: List[str]) -> List[str]:
+    """
+    Transform the formats from snake_case to camelCase (API format).
+    """
+    if 'screenshot@full_page' in formats:
+        formats.remove('screenshot@full_page')
+        formats.append('screenshot@fullPage')
+
+    if 'change_tracking' in formats:
+        formats.remove('change_tracking')
+        formats.append('changeTracking')
+
+    if 'raw_html' in formats:
+        formats.remove('raw_html')
+        formats.append('rawHtml')
+
+    return formats
+
+
+def scrape_formats_response_transform(data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Transform the data response formats from camelCase (API format) to snake_case.
+    """
+    if 'changeTracking' in data:
+        data['change_tracking'] = data['changeTracking']
+        del data['changeTracking']
+
+    if 'rawHtml' in data:
+        data['raw_html'] = data['rawHtml']
+        del data['rawHtml']
+
+    return data
+
+
+def change_tracking_response_transform(data: Dict[str, Any]) -> Dict[str, Any]:
+    """
+    Transform the change tracking data response from camelCase (API format) to snake_case.
+    """
+    # Handle nested changeTracking object
+    if 'changeTracking' in data and isinstance(data['changeTracking'], dict):
+        change_tracking = data['changeTracking']
+        transformed_change_tracking = {}
+
+        # Transform all camelCase fields to snake_case
+        if 'previousScrapeAt' in change_tracking:
+            transformed_change_tracking['previous_scrape_at'] = change_tracking['previousScrapeAt']
+
+        if 'changeStatus' in change_tracking:
+            transformed_change_tracking['change_status'] = change_tracking['changeStatus']
+
+        if 'visibility' in change_tracking:
+            transformed_change_tracking['visibility'] = change_tracking['visibility']
+
+        if 'diff' in change_tracking:
+            transformed_change_tracking['diff'] = change_tracking['diff']
+
+        if 'json' in change_tracking:
+            transformed_change_tracking['json'] = change_tracking['json']
+
+        # Replace the camelCase changeTracking with snake_case change_tracking
+        data['change_tracking'] = transformed_change_tracking
+        del data['changeTracking']
+
+    # Handle top-level fields (for backward compatibility)
+    if 'changeStatus' in data:
+        data['change_status'] = data['changeStatus']
+        del data['changeStatus']
+
+    if 'previousScrapeAt' in data:
+        data['previous_scrape_at'] = data['previousScrapeAt']
+        del data['previousScrapeAt']
+
+    return data
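A hedged sketch of what these helpers produce (not part of the diff; the firecrawl.utils import path is an assumption, and the snake_case-to-camelCase mapping follows parse_scrape_options above):

from firecrawl.types import JsonConfig, LocationConfig
from firecrawl.utils import parse_scrape_options  # assumed module path

payload = parse_scrape_options(
    formats=["markdown", "raw_html", "change_tracking"],
    only_main_content=True,
    location=LocationConfig(country="us", languages=["en"]),
    json_options=JsonConfig(prompt="Extract the title as JSON",
                            schema={"type": "object", "properties": {"title": {"type": "string"}}}),
)
# Expected shape: camelCase keys ready for the API, e.g.
# {'formats': ['markdown', 'rawHtml', 'changeTracking'], 'onlyMainContent': True,
#  'location': {'country': 'us', 'languages': ['en']},
#  'jsonOptions': {'prompt': 'Extract the title as JSON', 'schema': {...}}}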
@@ -16,7 +16,7 @@ API_KEY = os.getenv("TEST_API_KEY")

 app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

-TEST_URL = 'example.com'
+TEST_URL = 'https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions'

 @pytest.mark.asyncio
 async def test_scrape_url_async_simple():
@@ -24,109 +24,32 @@ async def test_scrape_url_async_simple():
         TEST_URL,
         formats=["markdown"]
     )

+    # Basic response assertions
     assert response.success
     assert hasattr(response, "markdown")
-    assert "# Example Domain" in response.markdown
+    assert response.markdown is not None
+    assert isinstance(response.markdown, str)
+    assert len(response.markdown) > 0
+    assert "This page is used for end-to-end (e2e) testing with Firecrawl." in response.markdown

-@pytest.mark.asyncio
-async def test_scrape_url_async_all_params_with_extract_screenshot():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    response = await app.scrape_url(
-        url=TEST_URL,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract", "change_tracking"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions,
-        change_tracking_options=change_tracking
-    )
-
-    assert response.success
-    assert hasattr(response, "markdown")
-    assert hasattr(response, "html")
-    assert hasattr(response, "raw_html")
-    assert hasattr(response, "links")
-    assert hasattr(response, "screenshot")
-    assert hasattr(response, "extract")
+    # Metadata assertions
     assert hasattr(response, "metadata")
-    assert hasattr(response, "change_tracking")
+    assert response.metadata is not None

-@pytest.mark.asyncio
-async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    response = await app.scrape_url(
-        url=TEST_URL,
-        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions,
-        change_tracking_options=change_tracking
-    )
-
-    assert response.success
-    assert hasattr(response, "markdown")
-    assert hasattr(response, "html")
-    assert hasattr(response, "raw_html")
-    assert hasattr(response, "links")
-    assert hasattr(response, "screenshot")
-    assert hasattr(response, "extract")
-    assert hasattr(response, "metadata")
-    assert hasattr(response, "change_tracking")

 @pytest.mark.asyncio
 async def test_scrape_url_async_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
-    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
     response = await app.scrape_url(
         url=TEST_URL,
@@ -135,7 +58,7 @@ async def test_scrape_url_async_all_params_with_json_options():
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,
@@ -147,12 +70,149 @@ async def test_scrape_url_async_all_params_with_json_options():
         change_tracking_options=change_tracking
     )

+    # Basic response assertions
     assert response.success
+    assert response.error is None
+
+    # Format assertions
     assert hasattr(response, "markdown")
+    assert response.markdown is not None
+    assert isinstance(response.markdown, str)
+    assert len(response.markdown) > 0
+
     assert hasattr(response, "html")
+    assert response.html is not None
+    assert isinstance(response.html, str)
+    assert len(response.html) > 0
+
     assert hasattr(response, "raw_html")
+    assert response.raw_html is not None
+    assert isinstance(response.raw_html, str)
+    assert len(response.raw_html) > 0
+
     assert hasattr(response, "links")
+    assert response.links is not None
+    assert isinstance(response.links, list)
+
     assert hasattr(response, "screenshot")
+    assert response.screenshot is not None
+    assert isinstance(response.screenshot, str)
+    assert response.screenshot.startswith(("data:image/", "https://"))
+
     assert hasattr(response, "json")
+    assert response.json is not None
+    assert isinstance(response.json, dict)
+
+    # Metadata assertions
     assert hasattr(response, "metadata")
+    assert response.metadata is not None
+    assert isinstance(response.metadata, dict)
+
+    # Change tracking assertions
     assert hasattr(response, "change_tracking")
+    assert response.change_tracking is not None
+    assert hasattr(response.change_tracking, "previous_scrape_at")
+    assert hasattr(response.change_tracking, "change_status")
+    assert hasattr(response.change_tracking, "visibility")
+    assert hasattr(response.change_tracking, "diff")
+    assert hasattr(response.change_tracking, "json")
+
+    # Change status can be either "changed" or "same"
+    assert response.change_tracking.change_status in ["changed", "same", "new"]
+    assert response.change_tracking.visibility in ["visible", "hidden"]
+
+    # If status is "changed", diff and json should have data; if "same", they can be None
+    if response.change_tracking.change_status == "changed":
+        assert response.change_tracking.diff is not None
+        assert response.change_tracking.json is not None
+    # For "same" status, diff and json can be None (no changes detected)
+
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    response = await app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json", "change_tracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    # Basic response assertions
+    assert response.success
+    assert response.error is None
+
+    # Format assertions
+    assert hasattr(response, "markdown")
+    assert response.markdown is not None
+    assert isinstance(response.markdown, str)
+    assert len(response.markdown) > 0
+
+    assert hasattr(response, "html")
+    assert response.html is not None
+    assert isinstance(response.html, str)
+    assert len(response.html) > 0
+
+    assert hasattr(response, "raw_html")
+    assert response.raw_html is not None
+    assert isinstance(response.raw_html, str)
+    assert len(response.raw_html) > 0
+
+    assert hasattr(response, "links")
+    assert response.links is not None
+    assert isinstance(response.links, list)
+
+    assert hasattr(response, "screenshot")
+    assert response.screenshot is not None
+    assert isinstance(response.screenshot, str)
+    assert response.screenshot.startswith(("data:image/", "https://"))
+
+    assert hasattr(response, "json")
+    assert response.json is not None
+    assert isinstance(response.json, dict)
+
+    # Metadata assertions
+    assert hasattr(response, "metadata")
+    assert response.metadata is not None
+    assert isinstance(response.metadata, dict)
+
+    # Change tracking assertions
+    assert hasattr(response, "change_tracking")
+    assert response.change_tracking is not None
+    assert hasattr(response.change_tracking, "previous_scrape_at")
+    assert hasattr(response.change_tracking, "change_status")
+    assert hasattr(response.change_tracking, "visibility")
+    assert hasattr(response.change_tracking, "diff")
+    assert hasattr(response.change_tracking, "json")
+
+    # Change status can be either "changed" or "same"
+    assert response.change_tracking.change_status in ["changed", "same", "new"]
+    assert response.change_tracking.visibility in ["visible", "hidden"]
+
+    # If status is "changed", diff and json should have data; if "same", they can be None
+    if response.change_tracking.change_status == "changed":
+        assert response.change_tracking.diff is not None
+        assert response.change_tracking.json is not None
+    # For "same" status, diff and json can be None (no changes detected)
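The test changes above drop the explicit type= argument from every action. A small sketch (illustrative, not from the diff) of why that works: the type field defaults via pydantic.Field in types.py, and the utils serialize actions with model_dump.

from firecrawl.types import WaitAction, ScreenshotAction

# The default type is filled in automatically, so callers only pass the payload fields.
print(WaitAction(milliseconds=500).model_dump(exclude_none=True))
# -> {'type': 'wait', 'milliseconds': 500}
print(ScreenshotAction(full_page=True).model_dump(exclude_none=True))
# -> {'type': 'screenshot', 'full_page': True}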
@@ -13,125 +13,49 @@ load_dotenv()

 API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
 API_KEY = os.getenv("TEST_API_KEY")
+TEST_URL = 'https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions'

 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

-TEST_URL = 'example.com'

 def test_scrape_url_simple():
     response = app.scrape_url(
         TEST_URL,
         formats=["markdown"]
     )

+    # Basic response assertions
     assert response.success
     assert hasattr(response, "markdown")
-    assert "# Example Domain" in response.markdown
+    assert response.markdown is not None
+    assert isinstance(response.markdown, str)
+    assert len(response.markdown) > 0
+    assert "This page is used for end-to-end (e2e) testing with Firecrawl." in response.markdown

-def test_scrape_url_all_params_with_extract_screenshot():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
-    ]
-    response = app.scrape_url(
-        url=TEST_URL,
-        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions,
-        change_tracking_options=change_tracking
-    )
-
-    assert response.success
-    assert hasattr(response, "markdown")
-    assert hasattr(response, "html")
-    assert hasattr(response, "rawHtml")
-    assert hasattr(response, "links")
-    assert hasattr(response, "screenshot")
-    assert hasattr(response, "extract")
+    # Metadata assertions
     assert hasattr(response, "metadata")
-    assert hasattr(response, "changeTracking")
+    assert response.metadata is not None

-def test_scrape_url_all_params_with_extract_full_page_screenshot():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    response = app.scrape_url(
-        url=TEST_URL,
-        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions,
-        change_tracking_options=change_tracking
-    )
-
-    assert response.success
-    assert hasattr(response, "markdown")
-    assert hasattr(response, "html")
-    assert hasattr(response, "raw_html")
-    assert hasattr(response, "links")
-    assert hasattr(response, "screenshot")
-    assert hasattr(response, "extract")
-    assert hasattr(response, "metadata")
-    assert hasattr(response, "change_tracking")

 def test_scrape_url_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
-    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"]) # , prompt="Track changes", schema=json_schema
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
     response = app.scrape_url(
         url=TEST_URL,
         formats=["markdown", "html", "raw_html", "links", "screenshot", "json", "change_tracking"],
-        include_tags=["h1", "p"],
+        include_tags=["p"],
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,
@@ -143,12 +67,146 @@ def test_scrape_url_all_params_with_json_options():
         change_tracking_options=change_tracking
     )

+    # Basic response assertions
     assert response.success
+    assert response.error is None
+
+    # Format assertions
     assert hasattr(response, "markdown")
+    assert response.markdown is not None
+    assert isinstance(response.markdown, str)
+    assert len(response.markdown) > 0
+
     assert hasattr(response, "html")
+    assert response.html is not None
+    assert isinstance(response.html, str)
+    assert len(response.html) > 0
+
     assert hasattr(response, "raw_html")
+    assert response.raw_html is not None
+    assert isinstance(response.raw_html, str)
+    assert len(response.raw_html) > 0
+
     assert hasattr(response, "links")
+    assert response.links is not None
+    assert isinstance(response.links, list)
+
     assert hasattr(response, "screenshot")
+    assert response.screenshot is not None
+    assert isinstance(response.screenshot, str)
+    assert response.screenshot.startswith("https://")
+
     assert hasattr(response, "json")
+    assert response.json is not None
+    assert isinstance(response.json, dict)
+
+    # Metadata assertions
     assert hasattr(response, "metadata")
+    assert response.metadata is not None
+    assert isinstance(response.metadata, dict)
+
+    # Change tracking assertions
     assert hasattr(response, "change_tracking")
+    assert response.change_tracking is not None
+    assert hasattr(response.change_tracking, "previous_scrape_at")
+    assert hasattr(response.change_tracking, "change_status")
+    assert hasattr(response.change_tracking, "visibility")
+    assert hasattr(response.change_tracking, "diff")
+    assert hasattr(response.change_tracking, "json")
+
+    # Change status can be either "changed" or "same"
+    assert response.change_tracking.change_status in ["changed", "same", "new"]
+    assert response.change_tracking.visibility in ["visible", "hidden"]
+
+    # If status is "changed", diff and json should have data; if "same", they can be None
+    if response.change_tracking.change_status == "changed":
+        assert response.change_tracking.diff is not None
+        assert response.change_tracking.json is not None
+    # For "same" status, diff and json can be None (no changes detected)
+
+
+def test_scrape_url_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"]) # , prompt="Track changes", schema=json_schema
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    response = app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json", "change_tracking"],
+        include_tags=["p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    # Basic response assertions
+    assert response.success
+    assert response.error is None
+
+    # Format assertions
+    assert hasattr(response, "markdown")
+    assert response.markdown is not None
+    assert isinstance(response.markdown, str)
+    assert len(response.markdown) > 0
+
+    assert hasattr(response, "html")
+    assert response.html is not None
+    assert isinstance(response.html, str)
+    assert len(response.html) > 0
+
+    assert hasattr(response, "raw_html")
+    assert response.raw_html is not None
+    assert isinstance(response.raw_html, str)
+    assert len(response.raw_html) > 0
+
+    assert hasattr(response, "links")
+    assert response.links is not None
+    assert isinstance(response.links, list)
+
+    assert hasattr(response, "screenshot")
+    assert response.screenshot is not None
+    assert isinstance(response.screenshot, str)
+    assert response.screenshot.startswith("https://")
+
+    assert hasattr(response, "json")
+    assert response.json is not None
+    assert isinstance(response.json, dict)
+
+    # Metadata assertions
+    assert hasattr(response, "metadata")
+    assert response.metadata is not None
+    assert isinstance(response.metadata, dict)
+
+    # Change tracking assertions
+    assert hasattr(response, "change_tracking")
+    assert response.change_tracking is not None
+    assert hasattr(response.change_tracking, "previous_scrape_at")
+    assert hasattr(response.change_tracking, "change_status")
+    assert hasattr(response.change_tracking, "visibility")
+    assert hasattr(response.change_tracking, "diff")
+    assert hasattr(response.change_tracking, "json")
+
+    # Change status can be either "changed" or "same"
+    assert response.change_tracking.change_status in ["changed", "same", "new"]
+    assert response.change_tracking.visibility in ["visible", "hidden"]
+
+    if response.change_tracking.change_status == "changed":
+        assert response.change_tracking.diff is not None
+        assert response.change_tracking.json is not None
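For reference, the snake_case shape these change-tracking assertions rely on after change_tracking_response_transform runs (field names come from ChangeTrackingData in types.py; the values here are illustrative, not from a real response):

example_change_tracking = {
    "previous_scrape_at": "2025-01-01T00:00:00Z",  # illustrative timestamp
    "change_status": "same",       # "new" | "same" | "changed" | "removed"
    "visibility": "visible",       # "visible" | "hidden"
    "diff": None,                  # populated when change_status == "changed"
    "json": None,                  # populated when a json change-tracking mode is used
}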