mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
sdk(v3): scrape and async scrape ok
This commit is contained in:
parent 94220a772b
commit ab1e244693
@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions, LocationConfig # noqa

__version__ = "2.6.0"
__version__ = "3.0.0"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
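A minimal usage sketch of the re-exported v3 client, modeled on the async tests added later in this commit; the API URL and key below are placeholder assumptions, not values from the diff:

import asyncio
from firecrawl import AsyncFirecrawlApp

async def main():
    # Placeholder credentials for illustration only.
    app = AsyncFirecrawlApp(api_url="http://localhost:3002", api_key="fc-YOUR-API-KEY")
    response = await app.scrape_url("https://example.com", formats=["markdown"])
    print(response.markdown)

asyncio.run(main())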
apps/python-sdk/firecrawl/types.py (new file, 386 lines)
@@ -0,0 +1,386 @@
import pydantic
from typing import Optional, List, Dict, Literal, Any, Union, Generic, TypeVar
from datetime import datetime
import warnings

T = TypeVar('T')

# Suppress Pydantic warnings about attribute shadowing
warnings.filterwarnings("ignore", message="Field name \"json\" in \"FirecrawlDocument\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"json\" in \"ChangeTrackingData\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"JsonConfig\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ExtractParams\" shadows an attribute in parent \"BaseModel\"")
warnings.filterwarnings("ignore", message="Field name \"schema\" in \"ChangeTrackingOptions\" shadows an attribute in parent \"BaseModel\"")

# class FirecrawlDocumentMetadata(pydantic.BaseModel):
#     """Metadata for a Firecrawl document."""
#     title: Optional[str] = None
#     description: Optional[str] = None
#     language: Optional[str] = None
#     keywords: Optional[str] = None
#     robots: Optional[str] = None
#     ogTitle: Optional[str] = None
#     ogDescription: Optional[str] = None
#     ogUrl: Optional[str] = None
#     ogImage: Optional[str] = None
#     ogAudio: Optional[str] = None
#     ogDeterminer: Optional[str] = None
#     ogLocale: Optional[str] = None
#     ogLocaleAlternate: Optional[List[str]] = None
#     ogSiteName: Optional[str] = None
#     ogVideo: Optional[str] = None
#     dctermsCreated: Optional[str] = None
#     dcDateCreated: Optional[str] = None
#     dcDate: Optional[str] = None
#     dctermsType: Optional[str] = None
#     dcType: Optional[str] = None
#     dctermsAudience: Optional[str] = None
#     dctermsSubject: Optional[str] = None
#     dcSubject: Optional[str] = None
#     dcDescription: Optional[str] = None
#     dctermsKeywords: Optional[str] = None
#     modifiedTime: Optional[str] = None
#     publishedTime: Optional[str] = None
#     articleTag: Optional[str] = None
#     articleSection: Optional[str] = None
#     sourceURL: Optional[str] = None
#     statusCode: Optional[int] = None
#     error: Optional[str] = None

class AgentOptions(pydantic.BaseModel):
    """Configuration for the agent."""
    model: Literal["FIRE-1"] = "FIRE-1"
    prompt: Optional[str] = None

class AgentOptionsExtract(pydantic.BaseModel):
    """Configuration for the agent in extract operations."""
    model: Literal["FIRE-1"] = "FIRE-1"

class ActionsResult(pydantic.BaseModel):
    """Result of actions performed during scraping."""
    screenshots: List[str]

class ChangeTrackingData(pydantic.BaseModel):
    """
    Data for the change tracking format.
    """
    previous_scrape_at: Optional[str] = None
    change_status: str  # "new" | "same" | "changed" | "removed"
    visibility: str  # "visible" | "hidden"
    diff: Optional[Dict[str, Any]] = None
    json: Optional[Any] = None

class FirecrawlDocument(pydantic.BaseModel, Generic[T]):
    """Document retrieved or processed by Firecrawl."""
    url: Optional[str] = None
    markdown: Optional[str] = None
    html: Optional[str] = None
    raw_html: Optional[str] = None
    links: Optional[List[str]] = None
    extract: Optional[T] = None
    json: Optional[T] = None
    screenshot: Optional[str] = None
    metadata: Optional[Any] = None
    actions: Optional[ActionsResult] = None
    title: Optional[str] = None  # v1 search only
    description: Optional[str] = None  # v1 search only
    change_tracking: Optional[ChangeTrackingData] = None

class LocationConfig(pydantic.BaseModel):
    """Location configuration for scraping."""
    country: Optional[str] = None
    languages: Optional[List[str]] = None

class WebhookConfig(pydantic.BaseModel):
    """Configuration for webhooks."""
    url: str
    headers: Optional[Dict[str, str]] = None
    metadata: Optional[Dict[str, str]] = None
    events: Optional[List[Literal["completed", "failed", "page", "started"]]] = None

class ChangeTrackingOptions(pydantic.BaseModel):
    """Configuration for change tracking."""
    modes: Optional[List[Literal["git-diff", "json"]]] = None
    schema: Optional[Any] = None
    prompt: Optional[str] = None

class ScrapeOptions(pydantic.BaseModel):
    """Parameters for scraping operations."""
    formats: Optional[List[Literal["markdown", "html", "raw_html", "links", "screenshot", "screenshot@full_page", "extract", "json", "change_tracking"]]] = None
    headers: Optional[Dict[str, str]] = None
    include_tags: Optional[List[str]] = None
    exclude_tags: Optional[List[str]] = None
    only_main_content: Optional[bool] = None
    wait_for: Optional[int] = None
    timeout: Optional[int] = None
    location: Optional[LocationConfig] = None
    mobile: Optional[bool] = None
    skip_tls_verification: Optional[bool] = None
    remove_base64_images: Optional[bool] = None
    block_ads: Optional[bool] = None
    proxy: Optional[Literal["basic", "stealth"]] = None
    change_tracking_options: Optional[ChangeTrackingOptions] = None
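A brief illustrative construction of the options model above, not part of types.py; the field values are placeholder assumptions:

# Illustrative only: a ScrapeOptions instance using a few of the snake_case fields defined above.
options = ScrapeOptions(
    formats=["markdown", "links", "change_tracking"],
    only_main_content=True,
    timeout=30000,
    location=LocationConfig(country="us", languages=["en"]),
    change_tracking_options=ChangeTrackingOptions(modes=["git-diff", "json"]),
)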
class WaitAction(pydantic.BaseModel):
    """Wait action to perform during scraping."""
    type: Literal["wait"] = pydantic.Field(default="wait")
    milliseconds: Optional[int] = None
    selector: Optional[str] = None

class ScreenshotAction(pydantic.BaseModel):
    """Screenshot action to perform during scraping."""
    type: Literal["screenshot"] = pydantic.Field(default="screenshot")
    full_page: Optional[bool] = None

class ClickAction(pydantic.BaseModel):
    """Click action to perform during scraping."""
    type: Literal["click"] = pydantic.Field(default="click")
    selector: str

class WriteAction(pydantic.BaseModel):
    """Write action to perform during scraping."""
    type: Literal["write"] = pydantic.Field(default="write")
    text: str

class PressAction(pydantic.BaseModel):
    """Press action to perform during scraping."""
    type: Literal["press"] = pydantic.Field(default="press")
    key: str

class ScrollAction(pydantic.BaseModel):
    """Scroll action to perform during scraping."""
    type: Literal["scroll"] = pydantic.Field(default="scroll")
    direction: Literal["up", "down"]
    selector: Optional[str] = None

class ScrapeAction(pydantic.BaseModel):
    """Scrape action to perform during scraping."""
    type: Literal["scrape"] = pydantic.Field(default="scrape")

class ExecuteJavascriptAction(pydantic.BaseModel):
    """Execute javascript action to perform during scraping."""
    type: Literal["executeJavascript"] = pydantic.Field(default="executeJavascript")
    script: str
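An illustrative actions list, not part of the file, mirroring the style the new tests use; each model fills in its own "type" discriminator by default:

# Illustrative only: the "type" field is defaulted by each action model.
actions = [
    WaitAction(milliseconds=500),
    ScreenshotAction(full_page=True),
    WriteAction(text="test input"),
    PressAction(key="Enter"),
    ScrollAction(direction="down"),
    ExecuteJavascriptAction(script="document.title"),
]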
class ExtractAgent(pydantic.BaseModel):
    """Configuration for the agent in extract operations."""
    model: Literal["FIRE-1"] = "FIRE-1"

class JsonConfig(pydantic.BaseModel):
    """Configuration for extraction."""
    prompt: Optional[str] = None
    schema: Optional[Any] = None
    system_prompt: Optional[str] = None
    agent: Optional[ExtractAgent] = None
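A small illustrative JsonConfig, not part of the file, following the pattern the e2e tests use; the schema dict is an assumed example:

# Illustrative only: a prompt plus a plain JSON-schema dict.
json_options = JsonConfig(
    prompt="Extract the title as JSON",
    schema={"type": "object", "properties": {"title": {"type": "string"}}},
)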
class ScrapeParams(ScrapeOptions):
    """Parameters for scraping operations."""
    extract: Optional[JsonConfig] = None
    json_options: Optional[JsonConfig] = None
    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
    agent: Optional[AgentOptions] = None
    webhook: Optional[WebhookConfig] = None

class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
    """Response from scraping operations."""
    success: bool = True
    warning: Optional[str] = None
    error: Optional[str] = None

class BatchScrapeResponse(pydantic.BaseModel):
    """Response from batch scrape operations."""
    id: Optional[str] = None
    url: Optional[str] = None
    success: bool = True
    error: Optional[str] = None
    invalid_urls: Optional[List[str]] = None

class BatchScrapeStatusResponse(pydantic.BaseModel):
    """Response from batch scrape status checks."""
    success: bool = True
    status: Literal["scraping", "completed", "failed", "cancelled"]
    completed: int
    total: int
    credits_used: int
    expires_at: datetime
    next: Optional[str] = None
    data: List[FirecrawlDocument]

class CrawlParams(pydantic.BaseModel):
    """Parameters for crawling operations."""
    include_paths: Optional[List[str]] = None
    exclude_paths: Optional[List[str]] = None
    max_depth: Optional[int] = None
    max_discovery_depth: Optional[int] = None
    limit: Optional[int] = None
    allow_backward_links: Optional[bool] = None
    allow_external_links: Optional[bool] = None
    ignore_sitemap: Optional[bool] = None
    scrape_options: Optional[ScrapeOptions] = None
    webhook: Optional[Union[str, WebhookConfig]] = None
    deduplicate_similar_urls: Optional[bool] = None
    ignore_query_parameters: Optional[bool] = None
    regex_on_full_url: Optional[bool] = None
    delay: Optional[int] = None  # Delay in seconds between scrapes

class CrawlResponse(pydantic.BaseModel):
    """Response from crawling operations."""
    id: Optional[str] = None
    url: Optional[str] = None
    success: bool = True
    error: Optional[str] = None

class CrawlStatusResponse(pydantic.BaseModel):
    """Response from crawl status checks."""
    success: bool = True
    status: Literal["scraping", "completed", "failed", "cancelled"]
    completed: int
    total: int
    credits_used: int
    expires_at: datetime
    next: Optional[str] = None
    data: List[FirecrawlDocument]

class CrawlErrorsResponse(pydantic.BaseModel):
    """Response from crawl/batch scrape error monitoring."""
    errors: List[Dict[str, str]]  # {id: str, timestamp: str, url: str, error: str}
    robots_blocked: List[str]

class MapParams(pydantic.BaseModel):
    """Parameters for mapping operations."""
    search: Optional[str] = None
    ignore_sitemap: Optional[bool] = None
    include_subdomains: Optional[bool] = None
    sitemap_only: Optional[bool] = None
    limit: Optional[int] = None
    timeout: Optional[int] = None

class MapResponse(pydantic.BaseModel):
    """Response from mapping operations."""
    success: bool = True
    links: Optional[List[str]] = None
    error: Optional[str] = None

class ExtractParams(pydantic.BaseModel):
    """Parameters for extracting information from URLs."""
    prompt: Optional[str] = None
    schema: Optional[Any] = None
    system_prompt: Optional[str] = None
    allow_external_links: Optional[bool] = None
    enable_web_search: Optional[bool] = None
    include_subdomains: Optional[bool] = None
    origin: Optional[str] = None
    show_sources: Optional[bool] = None
    scrape_options: Optional[ScrapeOptions] = None

class ExtractResponse(pydantic.BaseModel, Generic[T]):
    """Response from extract operations."""
    id: Optional[str] = None
    status: Optional[Literal["processing", "completed", "failed"]] = None
    expires_at: Optional[datetime] = None
    success: bool = True
    data: Optional[T] = None
    error: Optional[str] = None
    warning: Optional[str] = None
    sources: Optional[List[str]] = None

class SearchParams(pydantic.BaseModel):
    query: str
    limit: Optional[int] = 5
    tbs: Optional[str] = None
    filter: Optional[str] = None
    lang: Optional[str] = "en"
    country: Optional[str] = "us"
    location: Optional[str] = None
    origin: Optional[str] = "api"
    timeout: Optional[int] = 60000
    scrape_options: Optional[ScrapeOptions] = None

class SearchResponse(pydantic.BaseModel):
    """Response from search operations."""
    success: bool = True
    data: List[FirecrawlDocument]
    warning: Optional[str] = None
    error: Optional[str] = None

class GenerateLLMsTextParams(pydantic.BaseModel):
    """
    Parameters for the LLMs.txt generation operation.
    """
    max_urls: Optional[int] = 10
    show_full_text: Optional[bool] = False
    __experimental_stream: Optional[bool] = None

class DeepResearchParams(pydantic.BaseModel):
    """
    Parameters for the deep research operation.
    """
    max_depth: Optional[int] = 7
    time_limit: Optional[int] = 270
    max_urls: Optional[int] = 20
    analysis_prompt: Optional[str] = None
    system_prompt: Optional[str] = None
    __experimental_stream_steps: Optional[bool] = None

class DeepResearchResponse(pydantic.BaseModel):
    """
    Response from the deep research operation.
    """
    success: bool
    id: str
    error: Optional[str] = None

class DeepResearchStatusResponse(pydantic.BaseModel):
    """
    Status response from the deep research operation.
    """
    success: bool
    data: Optional[Dict[str, Any]] = None
    status: str
    error: Optional[str] = None
    expires_at: str
    current_depth: int
    max_depth: int
    activities: List[Dict[str, Any]]
    sources: List[Dict[str, Any]]
    summaries: List[str]

class GenerateLLMsTextResponse(pydantic.BaseModel):
    """Response from LLMs.txt generation operations."""
    success: bool = True
    id: str
    error: Optional[str] = None

class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
    llmstxt: str
    llmsfulltxt: Optional[str] = None

class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
    """Status response from LLMs.txt generation operations."""
    success: bool = True
    data: Optional[GenerateLLMsTextStatusResponseData] = None
    status: Literal["processing", "completed", "failed"]
    error: Optional[str] = None
    expires_at: str

class SearchResponse(pydantic.BaseModel):
    """
    Response from the search operation.
    """
    success: bool
    data: List[Dict[str, Any]]
    warning: Optional[str] = None
    error: Optional[str] = None

class ExtractParams(pydantic.BaseModel):
    """
    Parameters for the extract operation.
    """
    prompt: Optional[str] = None
    schema: Optional[Any] = pydantic.Field(None, alias='schema')
    system_prompt: Optional[str] = None
    allow_external_links: Optional[bool] = False
    enable_web_search: Optional[bool] = False
    show_sources: Optional[bool] = False
    agent: Optional[Dict[str, Any]] = None
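An illustrative composition of the crawl models above, not part of the file; the URL patterns, limits, and webhook address are assumptions for the sketch:

# Illustrative only: crawl parameters that nest the ScrapeOptions and WebhookConfig models.
crawl_params = CrawlParams(
    include_paths=["/blog/*"],
    limit=50,
    scrape_options=ScrapeOptions(formats=["markdown"], only_main_content=True),
    webhook=WebhookConfig(url="https://example.com/webhook", events=["completed"]),
)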
@@ -2,12 +2,10 @@
Utility functions for the Firecrawl SDK.
"""
import re
from typing import Any, Dict, List, Union, TypeVar, Generic, Optional, TypedDict
from typing import Any, Dict, List, Union, TypeVar, Generic, Optional, TypedDict, Literal
from .types import LocationConfig, JsonConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, AgentOptions

T = TypeVar('T')

class DeepResearchDataSource(TypedDict, total=False):
    """Type definition for a source in deep research data."""
    url: str
@@ -113,3 +111,207 @@ def convert_to_dot_dict(data: Union[Dict[str, Any], List[Any], Any]) -> Union[Do
        return [convert_to_dot_dict(item) for item in data]
    else:
        return data

def ensure_schema_dict(schema):
    """
    Utility to ensure a schema is a dict, not a Pydantic model class. Recursively checks dicts and lists.
    """
    if schema is None:
        return schema
    if isinstance(schema, type):
        # Pydantic v1/v2 model class
        if hasattr(schema, 'model_json_schema'):
            return schema.model_json_schema()
        elif hasattr(schema, 'schema'):
            return schema.schema()
    if isinstance(schema, dict):
        return {k: ensure_schema_dict(v) for k, v in schema.items()}
    if isinstance(schema, (list, tuple)):
        return [ensure_schema_dict(v) for v in schema]
    return schema
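A quick illustrative use of ensure_schema_dict, not part of the diff: a Pydantic model class should come back as its JSON-schema dict, while plain dicts pass through; the model below is an assumed example:

import pydantic

# Illustrative only: a throwaway model class is converted to a JSON-schema dict,
# while a plain dict is returned recursively unchanged.
class TitleSchema(pydantic.BaseModel):
    title: str

ensure_schema_dict(TitleSchema)         # -> {'properties': {'title': {...}}, 'required': ['title'], ...}
ensure_schema_dict({"type": "object"})  # -> {'type': 'object'}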
def parse_scrape_options(
    formats: Optional[List[Literal["markdown", "html", "raw_html", "links", "screenshot", "screenshot@full_page", "extract", "json", "change_tracking"]]] = None,
    include_tags: Optional[List[str]] = None,
    exclude_tags: Optional[List[str]] = None,
    only_main_content: Optional[bool] = None,
    wait_for: Optional[int] = None,
    timeout: Optional[int] = None,
    location: Optional[LocationConfig] = None,
    mobile: Optional[bool] = None,
    skip_tls_verification: Optional[bool] = None,
    remove_base64_images: Optional[bool] = None,
    block_ads: Optional[bool] = None,
    proxy: Optional[str] = None,
    extract: Optional[JsonConfig] = None,
    json_options: Optional[JsonConfig] = None,
    actions: Optional[List[Union[WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction]]] = None,
    change_tracking_options: Optional[ChangeTrackingOptions] = None,
    agent: Optional[AgentOptions] = None,
    **kwargs: Any
) -> Dict[str, Any]:
    """
    Parse the scrape options and return a dictionary of parameters for the API request.
    """
    scrape_params = {}

    # Add optional parameters if provided and not None
    if formats:
        scrape_params['formats'] = scrape_formats_transform(formats)
    if include_tags:
        scrape_params['includeTags'] = include_tags
    if exclude_tags:
        scrape_params['excludeTags'] = exclude_tags
    if only_main_content is not None:
        scrape_params['onlyMainContent'] = only_main_content
    if wait_for:
        scrape_params['waitFor'] = wait_for
    if timeout:
        scrape_params['timeout'] = timeout
    if location is not None:
        scrape_params['location'] = {
            "country": location.country,
            "languages": location.languages
        }
    if mobile is not None:
        scrape_params['mobile'] = mobile
    if skip_tls_verification is not None:
        scrape_params['skipTlsVerification'] = skip_tls_verification
    if remove_base64_images is not None:
        scrape_params['removeBase64Images'] = remove_base64_images
    if block_ads is not None:
        scrape_params['blockAds'] = block_ads
    if proxy:
        scrape_params['proxy'] = proxy
    if extract is not None:
        extract = ensure_schema_dict(extract)
        if isinstance(extract, dict) and "schema" in extract:
            extract["schema"] = ensure_schema_dict(extract["schema"])
        scrape_params['extract'] = extract if isinstance(extract, dict) else extract.model_dump(exclude_none=True)
    if json_options is not None:
        json_options = ensure_schema_dict(json_options)
        if isinstance(json_options, dict) and "schema" in json_options:
            json_options["schema"] = ensure_schema_dict(json_options["schema"])

        # Convert to dict if it's a JsonConfig object
        if hasattr(json_options, 'dict'):
            json_options_dict = json_options.model_dump(exclude_none=True)
        else:
            json_options_dict = json_options

        # Convert snake_case to camelCase for API
        json_options_api = {}
        if 'prompt' in json_options_dict and json_options_dict['prompt'] is not None:
            json_options_api['prompt'] = json_options_dict['prompt']
        if 'schema' in json_options_dict and json_options_dict['schema'] is not None:
            json_options_api['schema'] = json_options_dict['schema']
        if 'system_prompt' in json_options_dict and json_options_dict['system_prompt'] is not None:
            json_options_api['systemPrompt'] = json_options_dict['system_prompt']
        if 'agent' in json_options_dict and json_options_dict['agent'] is not None:
            json_options_api['agent'] = json_options_dict['agent']

        scrape_params['jsonOptions'] = json_options_api
    if actions:
        scrape_params['actions'] = [action if isinstance(action, dict) else action.model_dump(exclude_none=True) for action in actions]
    if change_tracking_options is not None:
        change_tracking_dict = change_tracking_options if isinstance(change_tracking_options, dict) else change_tracking_options.model_dump(exclude_none=True)

        # Convert snake_case to camelCase for API
        change_tracking_api = {}
        if 'modes' in change_tracking_dict and change_tracking_dict['modes'] is not None:
            change_tracking_api['modes'] = change_tracking_dict['modes']
        if 'schema' in change_tracking_dict and change_tracking_dict['schema'] is not None:
            change_tracking_api['schema'] = change_tracking_dict['schema']
        if 'prompt' in change_tracking_dict and change_tracking_dict['prompt'] is not None:
            change_tracking_api['prompt'] = change_tracking_dict['prompt']

        scrape_params['changeTrackingOptions'] = change_tracking_api

    if 'extract' in scrape_params and scrape_params['extract'] and 'schema' in scrape_params['extract']:
        scrape_params['extract']['schema'] = ensure_schema_dict(scrape_params['extract']['schema'])
    if 'jsonOptions' in scrape_params and scrape_params['jsonOptions'] and 'schema' in scrape_params['jsonOptions']:
        scrape_params['jsonOptions']['schema'] = ensure_schema_dict(scrape_params['jsonOptions']['schema'])

    if agent:
        scrape_params['agent'] = agent

    if 'agent' in scrape_params and scrape_params['agent'] and 'model' in scrape_params['agent']:
        scrape_params['agent']['model'] = scrape_params['agent']['model'].value

    # Add any additional kwargs
    scrape_params.update(kwargs)

    return scrape_params
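A small illustrative call, not part of the diff, showing the snake_case inputs mapped onto camelCase request keys; the argument values are placeholder assumptions:

# Illustrative only: snake_case arguments come back as camelCase request fields.
params = parse_scrape_options(
    formats=["markdown", "raw_html"],
    include_tags=["h1", "p"],
    only_main_content=True,
    wait_for=1000,
)
# Expected shape (roughly): {'formats': ['markdown', 'rawHtml'],
#                            'includeTags': ['h1', 'p'],
#                            'onlyMainContent': True, 'waitFor': 1000}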
def scrape_formats_transform(formats: List[str]) -> List[str]:
    """
    Transform the formats from snake_case to camelCase (API format).
    """
    if 'screenshot@full_page' in formats:
        formats.remove('screenshot@full_page')
        formats.append('screenshot@fullPage')

    if 'change_tracking' in formats:
        formats.remove('change_tracking')
        formats.append('changeTracking')

    if 'raw_html' in formats:
        formats.remove('raw_html')
        formats.append('rawHtml')

    return formats

def scrape_formats_response_transform(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Transform the data response formats from camelCase (API format) to snake_case.
    """
    if 'changeTracking' in data:
        data['change_tracking'] = data['changeTracking']
        del data['changeTracking']

    if 'rawHtml' in data:
        data['raw_html'] = data['rawHtml']
        del data['rawHtml']

    return data

def change_tracking_response_transform(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Transform the change tracking data response from camelCase (API format) to snake_case.
    """
    # Handle nested changeTracking object
    if 'changeTracking' in data and isinstance(data['changeTracking'], dict):
        change_tracking = data['changeTracking']
        transformed_change_tracking = {}

        # Transform all camelCase fields to snake_case
        if 'previousScrapeAt' in change_tracking:
            transformed_change_tracking['previous_scrape_at'] = change_tracking['previousScrapeAt']

        if 'changeStatus' in change_tracking:
            transformed_change_tracking['change_status'] = change_tracking['changeStatus']

        if 'visibility' in change_tracking:
            transformed_change_tracking['visibility'] = change_tracking['visibility']

        if 'diff' in change_tracking:
            transformed_change_tracking['diff'] = change_tracking['diff']

        if 'json' in change_tracking:
            transformed_change_tracking['json'] = change_tracking['json']

        # Replace the camelCase changeTracking with snake_case change_tracking
        data['change_tracking'] = transformed_change_tracking
        del data['changeTracking']

    # Handle top-level fields (for backward compatibility)
    if 'changeStatus' in data:
        data['change_status'] = data['changeStatus']
        del data['changeStatus']

    if 'previousScrapeAt' in data:
        data['previous_scrape_at'] = data['previousScrapeAt']
        del data['previousScrapeAt']

    return data
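An illustrative round-trip of the change-tracking response transform above, not part of the diff; the payload is a made-up fragment:

# Illustrative only: nested camelCase API fields renamed to the SDK's snake_case.
raw = {"changeTracking": {"changeStatus": "same", "visibility": "visible", "previousScrapeAt": "2025-01-01T00:00:00Z"}}
change_tracking_response_transform(raw)
# -> {'change_tracking': {'change_status': 'same', 'visibility': 'visible',
#                         'previous_scrape_at': '2025-01-01T00:00:00Z'}}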
@@ -16,7 +16,7 @@ API_KEY = os.getenv("TEST_API_KEY")
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

TEST_URL = 'example.com'
TEST_URL = 'https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions'

@pytest.mark.asyncio
async def test_scrape_url_async_simple():
@@ -24,109 +24,32 @@ async def test_scrape_url_async_simple():
        TEST_URL,
        formats=["markdown"]
    )

    # Basic response assertions
    assert response.success
    assert hasattr(response, "markdown")
    assert "# Example Domain" in response.markdown

@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_extract_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
    ]
    response = await app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract", "change_tracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )

    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "raw_html")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert response.markdown is not None
    assert isinstance(response.markdown, str)
    assert len(response.markdown) > 0
    assert "This page is used for end-to-end (e2e) testing with Firecrawl." in response.markdown

    # Metadata assertions
    assert hasattr(response, "metadata")
    assert hasattr(response, "change_tracking")

@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
    ]
    response = await app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )

    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "raw_html")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert hasattr(response, "metadata")
    assert hasattr(response, "change_tracking")
    assert response.metadata is not None

@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_json_options():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
        WaitAction(milliseconds=500),
        ScreenshotAction(full_page=True),
        WriteAction(text="test input"),
        PressAction(key="Enter"),
        ScrollAction(direction="down"),
        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
    ]
    response = await app.scrape_url(
        url=TEST_URL,
@@ -135,7 +58,7 @@ async def test_scrape_url_async_all_params_with_json_options():
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        timeout=30000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
@@ -147,12 +70,149 @@ async def test_scrape_url_async_all_params_with_json_options():
        change_tracking_options=change_tracking
    )

    # Basic response assertions
    assert response.success
    assert response.error is None

    # Format assertions
    assert hasattr(response, "markdown")
    assert response.markdown is not None
    assert isinstance(response.markdown, str)
    assert len(response.markdown) > 0

    assert hasattr(response, "html")
    assert response.html is not None
    assert isinstance(response.html, str)
    assert len(response.html) > 0

    assert hasattr(response, "raw_html")
    assert response.raw_html is not None
    assert isinstance(response.raw_html, str)
    assert len(response.raw_html) > 0

    assert hasattr(response, "links")
    assert response.links is not None
    assert isinstance(response.links, list)

    assert hasattr(response, "screenshot")
    assert response.screenshot is not None
    assert isinstance(response.screenshot, str)
    assert response.screenshot.startswith(("data:image/", "https://"))

    assert hasattr(response, "json")
    assert response.json is not None
    assert isinstance(response.json, dict)

    # Metadata assertions
    assert hasattr(response, "metadata")
    assert hasattr(response, "change_tracking")
    assert response.metadata is not None
    assert isinstance(response.metadata, dict)

    # Change tracking assertions
    assert hasattr(response, "change_tracking")
    assert response.change_tracking is not None
    assert hasattr(response.change_tracking, "previous_scrape_at")
    assert hasattr(response.change_tracking, "change_status")
    assert hasattr(response.change_tracking, "visibility")
    assert hasattr(response.change_tracking, "diff")
    assert hasattr(response.change_tracking, "json")

    # Change status can be either "changed" or "same"
    assert response.change_tracking.change_status in ["changed", "same", "new"]
    assert response.change_tracking.visibility in ["visible", "hidden"]

    # If status is "changed", diff and json should have data; if "same", they can be None
    if response.change_tracking.change_status == "changed":
        assert response.change_tracking.diff is not None
        assert response.change_tracking.json is not None
    # For "same" status, diff and json can be None (no changes detected)

@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_json_options_full_page_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])
    actions = [
        WaitAction(milliseconds=500),
        ScreenshotAction(full_page=True),
        WriteAction(text="test input"),
        PressAction(key="Enter"),
        ScrollAction(direction="down"),
        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
    ]
    response = await app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json", "change_tracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=30000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        json_options=json_options,
        actions=actions,
        change_tracking_options=change_tracking
    )

    # Basic response assertions
    assert response.success
    assert response.error is None

    # Format assertions
    assert hasattr(response, "markdown")
    assert response.markdown is not None
    assert isinstance(response.markdown, str)
    assert len(response.markdown) > 0

    assert hasattr(response, "html")
    assert response.html is not None
    assert isinstance(response.html, str)
    assert len(response.html) > 0

    assert hasattr(response, "raw_html")
    assert response.raw_html is not None
    assert isinstance(response.raw_html, str)
    assert len(response.raw_html) > 0

    assert hasattr(response, "links")
    assert response.links is not None
    assert isinstance(response.links, list)

    assert hasattr(response, "screenshot")
    assert response.screenshot is not None
    assert isinstance(response.screenshot, str)
    assert response.screenshot.startswith(("data:image/", "https://"))

    assert hasattr(response, "json")
    assert response.json is not None
    assert isinstance(response.json, dict)

    # Metadata assertions
    assert hasattr(response, "metadata")
    assert response.metadata is not None
    assert isinstance(response.metadata, dict)

    # Change tracking assertions
    assert hasattr(response, "change_tracking")
    assert response.change_tracking is not None
    assert hasattr(response.change_tracking, "previous_scrape_at")
    assert hasattr(response.change_tracking, "change_status")
    assert hasattr(response.change_tracking, "visibility")
    assert hasattr(response.change_tracking, "diff")
    assert hasattr(response.change_tracking, "json")

    # Change status can be either "changed" or "same"
    assert response.change_tracking.change_status in ["changed", "same", "new"]
    assert response.change_tracking.visibility in ["visible", "hidden"]

    # If status is "changed", diff and json should have data; if "same", they can be None
    if response.change_tracking.change_status == "changed":
        assert response.change_tracking.diff is not None
        assert response.change_tracking.json is not None
    # For "same" status, diff and json can be None (no changes detected)
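A hedged note on running these e2e tests locally, not from the diff: they read TEST_API_URL and TEST_API_KEY from the environment via dotenv, so an invocation sketch against a reachable Firecrawl instance could look like the following; only the env var names come from the diff, the values and test selection are assumptions:

# Assumed invocation sketch; adjust values to your own Firecrawl deployment.
import os
import pytest

os.environ.setdefault("TEST_API_URL", "http://localhost:3002")
os.environ.setdefault("TEST_API_KEY", "fc-YOUR-API-KEY")
pytest.main(["-x", "-k", "scrape_url"])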
@@ -13,125 +13,49 @@ load_dotenv()
API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
API_KEY = os.getenv("TEST_API_KEY")
TEST_URL = 'https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions'

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

TEST_URL = 'example.com'

def test_scrape_url_simple():
    response = app.scrape_url(
        TEST_URL,
        formats=["markdown"]
    )

    # Basic response assertions
    assert response.success
    assert hasattr(response, "markdown")
    assert "# Example Domain" in response.markdown

def test_scrape_url_all_params_with_extract_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    response = app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )

    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "rawHtml")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert response.markdown is not None
    assert isinstance(response.markdown, str)
    assert len(response.markdown) > 0
    assert "This page is used for end-to-end (e2e) testing with Firecrawl." in response.markdown

    # Metadata assertions
    assert hasattr(response, "metadata")
    assert hasattr(response, "changeTracking")

def test_scrape_url_all_params_with_extract_full_page_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
    ]
    response = app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )

    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "raw_html")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert hasattr(response, "metadata")
    assert hasattr(response, "change_tracking")
    assert response.metadata is not None

def test_scrape_url_all_params_with_json_options():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])  # , prompt="Track changes", schema=json_schema
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
        WaitAction(milliseconds=500),
        ScreenshotAction(full_page=True),
        WriteAction(text="test input"),
        PressAction(key="Enter"),
        ScrollAction(direction="down"),
        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
    ]
    response = app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "raw_html", "links", "screenshot", "json", "change_tracking"],
        include_tags=["h1", "p"],
        include_tags=["p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        timeout=30000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
@@ -143,12 +67,146 @@ def test_scrape_url_all_params_with_json_options():
        change_tracking_options=change_tracking
    )

    # Basic response assertions
    assert response.success
    assert response.error is None

    # Format assertions
    assert hasattr(response, "markdown")
    assert response.markdown is not None
    assert isinstance(response.markdown, str)
    assert len(response.markdown) > 0

    assert hasattr(response, "html")
    assert response.html is not None
    assert isinstance(response.html, str)
    assert len(response.html) > 0

    assert hasattr(response, "raw_html")
    assert response.raw_html is not None
    assert isinstance(response.raw_html, str)
    assert len(response.raw_html) > 0

    assert hasattr(response, "links")
    assert response.links is not None
    assert isinstance(response.links, list)

    assert hasattr(response, "screenshot")
    assert response.screenshot is not None
    assert isinstance(response.screenshot, str)
    assert response.screenshot.startswith("https://")

    assert hasattr(response, "json")
    assert response.json is not None
    assert isinstance(response.json, dict)

    # Metadata assertions
    assert hasattr(response, "metadata")
    assert hasattr(response, "change_tracking")
    assert response.metadata is not None
    assert isinstance(response.metadata, dict)

    # Change tracking assertions
    assert hasattr(response, "change_tracking")
    assert response.change_tracking is not None
    assert hasattr(response.change_tracking, "previous_scrape_at")
    assert hasattr(response.change_tracking, "change_status")
    assert hasattr(response.change_tracking, "visibility")
    assert hasattr(response.change_tracking, "diff")
    assert hasattr(response.change_tracking, "json")

    # Change status can be either "changed" or "same"
    assert response.change_tracking.change_status in ["changed", "same", "new"]
    assert response.change_tracking.visibility in ["visible", "hidden"]

    # If status is "changed", diff and json should have data; if "same", they can be None
    if response.change_tracking.change_status == "changed":
        assert response.change_tracking.diff is not None
        assert response.change_tracking.json is not None
    # For "same" status, diff and json can be None (no changes detected)

def test_scrape_url_all_params_with_json_options_full_page_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"])  # , prompt="Track changes", schema=json_schema
    actions = [
        WaitAction(milliseconds=500),
        ScreenshotAction(full_page=True),
        WriteAction(text="test input"),
        PressAction(key="Enter"),
        ScrollAction(direction="down"),
        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
    ]
    response = app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json", "change_tracking"],
        include_tags=["p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=30000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        json_options=json_options,
        actions=actions,
        change_tracking_options=change_tracking
    )

    # Basic response assertions
    assert response.success
    assert response.error is None

    # Format assertions
    assert hasattr(response, "markdown")
    assert response.markdown is not None
    assert isinstance(response.markdown, str)
    assert len(response.markdown) > 0

    assert hasattr(response, "html")
    assert response.html is not None
    assert isinstance(response.html, str)
    assert len(response.html) > 0

    assert hasattr(response, "raw_html")
    assert response.raw_html is not None
    assert isinstance(response.raw_html, str)
    assert len(response.raw_html) > 0

    assert hasattr(response, "links")
    assert response.links is not None
    assert isinstance(response.links, list)

    assert hasattr(response, "screenshot")
    assert response.screenshot is not None
    assert isinstance(response.screenshot, str)
    assert response.screenshot.startswith("https://")

    assert hasattr(response, "json")
    assert response.json is not None
    assert isinstance(response.json, dict)

    # Metadata assertions
    assert hasattr(response, "metadata")
    assert response.metadata is not None
    assert isinstance(response.metadata, dict)

    # Change tracking assertions
    assert hasattr(response, "change_tracking")
    assert response.change_tracking is not None
    assert hasattr(response.change_tracking, "previous_scrape_at")
    assert hasattr(response.change_tracking, "change_status")
    assert hasattr(response.change_tracking, "visibility")
    assert hasattr(response.change_tracking, "diff")
    assert hasattr(response.change_tracking, "json")

    # Change status can be either "changed" or "same"
    assert response.change_tracking.change_status in ["changed", "same", "new"]
    assert response.change_tracking.visibility in ["visible", "hidden"]

    if response.change_tracking.change_status == "changed":
        assert response.change_tracking.diff is not None
        assert response.change_tracking.json is not None