sdk(v3): async batch scrape (async and sync clients)

commit c5b64bd294 (parent 2e14859de0)

@@ -930,89 +930,32 @@ class FirecrawlApp:
         # Validate any additional kwargs
         self._validate_kwargs(kwargs, "async_batch_scrape_urls")

-        scrape_params = {}
-
-        # Add individual parameters
-        if formats is not None:
-            scrape_params['formats'] = formats
-        if headers is not None:
-            scrape_params['headers'] = headers
-        if include_tags is not None:
-            scrape_params['includeTags'] = include_tags
-        if exclude_tags is not None:
-            scrape_params['excludeTags'] = exclude_tags
-        if only_main_content is not None:
-            scrape_params['onlyMainContent'] = only_main_content
-        if wait_for is not None:
-            scrape_params['waitFor'] = wait_for
-        if timeout is not None:
-            scrape_params['timeout'] = timeout
-        if location is not None:
-            scrape_params['location'] = location.model_dump(exclude_none=True)
-        if mobile is not None:
-            scrape_params['mobile'] = mobile
-        if skip_tls_verification is not None:
-            scrape_params['skipTlsVerification'] = skip_tls_verification
-        if remove_base64_images is not None:
-            scrape_params['removeBase64Images'] = remove_base64_images
-        if block_ads is not None:
-            scrape_params['blockAds'] = block_ads
-        if proxy is not None:
-            scrape_params['proxy'] = proxy
-        if extract is not None:
-            extract = ensure_schema_dict(extract)
-            if isinstance(extract, dict) and "schema" in extract:
-                extract["schema"] = ensure_schema_dict(extract["schema"])
-            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.model_dump(exclude_none=True)
-        if json_options is not None:
-            json_options = ensure_schema_dict(json_options)
-            if isinstance(json_options, dict) and "schema" in json_options:
-                json_options["schema"] = ensure_schema_dict(json_options["schema"])
-
-            # Convert to dict if it's a JsonConfig object
-            if hasattr(json_options, 'dict'):
-                json_options_dict = json_options.model_dump(exclude_none=True)
-            else:
-                json_options_dict = json_options
-
-            # Convert snake_case to camelCase for API
-            json_options_api = {}
-            if 'prompt' in json_options_dict and json_options_dict['prompt'] is not None:
-                json_options_api['prompt'] = json_options_dict['prompt']
-            if 'schema' in json_options_dict and json_options_dict['schema'] is not None:
-                json_options_api['schema'] = json_options_dict['schema']
-            if 'system_prompt' in json_options_dict and json_options_dict['system_prompt'] is not None:
-                json_options_api['systemPrompt'] = json_options_dict['system_prompt']
-            if 'agent' in json_options_dict and json_options_dict['agent'] is not None:
-                json_options_api['agent'] = json_options_dict['agent']
-
-            scrape_params['jsonOptions'] = json_options_api
-        if actions is not None:
-            scrape_params['actions'] = [action.model_dump(exclude_none=True) for action in actions]
-        if agent is not None:
-            scrape_params['agent'] = agent.model_dump(exclude_none=True)
-
-        # Add any additional kwargs
-        scrape_params.update(kwargs)
-
-        # Create final params object
-        final_params = ScrapeParams(**scrape_params)
-        params_dict = final_params.model_dump(exclude_none=True)
-        params_dict['urls'] = urls
-        params_dict['origin'] = f"python-sdk@{version}"
-
-        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
-            params_dict['extract']['schema'] = ensure_schema_dict(params_dict['extract']['schema'])
-        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
-            params_dict['jsonOptions']['schema'] = ensure_schema_dict(params_dict['jsonOptions']['schema'])
-
-        # Apply format transformation for API
-        if 'formats' in params_dict and params_dict['formats']:
-            params_dict['formats'] = scrape_formats_transform(params_dict['formats'])
+        scrape_params = parse_scrape_options(
+            formats=formats,
+            include_tags=include_tags,
+            exclude_tags=exclude_tags,
+            only_main_content=only_main_content,
+            wait_for=wait_for,
+            timeout=timeout,
+            location=location,
+            mobile=mobile,
+            skip_tls_verification=skip_tls_verification,
+            remove_base64_images=remove_base64_images,
+            block_ads=block_ads,
+            proxy=proxy,
+            extract=extract,
+            json_options=json_options,
+            actions=actions,
+            agent=agent,
+            **kwargs
+        )
+
+        scrape_params['urls'] = urls
+        scrape_params['origin'] = f"python-sdk@{version}"

         # Make request
         headers = self._prepare_headers(idempotency_key)
-        response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+        response = self._post_request(f'{self.api_url}/v1/batch/scrape', scrape_params, headers)

         if response.status_code == 200:
             try:

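Both hunks replace the hand-rolled parameter marshalling with a single call to parse_scrape_options, whose implementation is not included in this commit. Judging only from the call site and from the code it replaces, such a helper has to drop None values, serialise pydantic models, and map snake_case argument names onto the camelCase keys the v1 API expects. A minimal sketch under those assumptions (not the actual Firecrawl implementation):

    # Hypothetical sketch of a parse_scrape_options-style helper, inferred from
    # the call site above; the real firecrawl helper may differ substantially.
    def parse_scrape_options(**options):
        key_map = {  # snake_case -> camelCase, as in the removed code
            'include_tags': 'includeTags',
            'exclude_tags': 'excludeTags',
            'only_main_content': 'onlyMainContent',
            'wait_for': 'waitFor',
            'skip_tls_verification': 'skipTlsVerification',
            'remove_base64_images': 'removeBase64Images',
            'block_ads': 'blockAds',
            'json_options': 'jsonOptions',
        }
        payload = {}
        for name, value in options.items():
            if value is None:
                continue
            if hasattr(value, 'model_dump'):      # pydantic models -> dicts
                value = value.model_dump(exclude_none=True)
            elif isinstance(value, list):         # e.g. a list of action models
                value = [v.model_dump(exclude_none=True) if hasattr(v, 'model_dump') else v
                         for v in value]
            payload[key_map.get(name, name)] = value
        return payload

With the marshalling pushed into that helper, each call site only attaches the request-level urls and origin fields and posts scrape_params directly, which is why the _post_request/_async_post_request calls now send scrape_params instead of the old params_dict.
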
@@ -2747,97 +2690,40 @@ class AsyncFirecrawlApp(FirecrawlApp):
         # Validate any additional kwargs
         self._validate_kwargs(kwargs, "async_batch_scrape_urls")

-        scrape_params = {}
-
-        # Add individual parameters
-        if formats is not None:
-            scrape_params['formats'] = formats
-        if headers is not None:
-            scrape_params['headers'] = headers
-        if include_tags is not None:
-            scrape_params['includeTags'] = include_tags
-        if exclude_tags is not None:
-            scrape_params['excludeTags'] = exclude_tags
-        if only_main_content is not None:
-            scrape_params['onlyMainContent'] = only_main_content
-        if wait_for is not None:
-            scrape_params['waitFor'] = wait_for
-        if timeout is not None:
-            scrape_params['timeout'] = timeout
-        if location is not None:
-            scrape_params['location'] = location.model_dump(exclude_none=True)
-        if mobile is not None:
-            scrape_params['mobile'] = mobile
-        if skip_tls_verification is not None:
-            scrape_params['skipTlsVerification'] = skip_tls_verification
-        if remove_base64_images is not None:
-            scrape_params['removeBase64Images'] = remove_base64_images
-        if block_ads is not None:
-            scrape_params['blockAds'] = block_ads
-        if proxy is not None:
-            scrape_params['proxy'] = proxy
-        if extract is not None:
-            extract = ensure_schema_dict(extract)
-            if isinstance(extract, dict) and "schema" in extract:
-                extract["schema"] = ensure_schema_dict(extract["schema"])
-            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.model_dump(exclude_none=True)
-        if json_options is not None:
-            json_options = ensure_schema_dict(json_options)
-            if isinstance(json_options, dict) and "schema" in json_options:
-                json_options["schema"] = ensure_schema_dict(json_options["schema"])
-
-            # Convert to dict if it's a JsonConfig object
-            if hasattr(json_options, 'dict'):
-                json_options_dict = json_options.model_dump(exclude_none=True)
-            else:
-                json_options_dict = json_options
-
-            # Convert snake_case to camelCase for API
-            json_options_api = {}
-            if 'prompt' in json_options_dict and json_options_dict['prompt'] is not None:
-                json_options_api['prompt'] = json_options_dict['prompt']
-            if 'schema' in json_options_dict and json_options_dict['schema'] is not None:
-                json_options_api['schema'] = json_options_dict['schema']
-            if 'system_prompt' in json_options_dict and json_options_dict['system_prompt'] is not None:
-                json_options_api['systemPrompt'] = json_options_dict['system_prompt']
-            if 'agent' in json_options_dict and json_options_dict['agent'] is not None:
-                json_options_api['agent'] = json_options_dict['agent']
-
-            scrape_params['jsonOptions'] = json_options_api
-        if actions is not None:
-            scrape_params['actions'] = [action.model_dump(exclude_none=True) for action in actions]
-        if agent is not None:
-            scrape_params['agent'] = agent.model_dump(exclude_none=True)
-
-        # Add any additional kwargs
-        scrape_params.update(kwargs)
-
-        # Create final params object
-        final_params = ScrapeParams(**scrape_params)
-        params_dict = final_params.model_dump(exclude_none=True)
-        params_dict['urls'] = urls
-        params_dict['origin'] = f"python-sdk@{version}"
-
-        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
-            params_dict['extract']['schema'] = ensure_schema_dict(params_dict['extract']['schema'])
-        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
-            params_dict['jsonOptions']['schema'] = ensure_schema_dict(params_dict['jsonOptions']['schema'])
-
-        # Apply format transformation for API
-        if 'formats' in params_dict and params_dict['formats']:
-            params_dict['formats'] = scrape_formats_transform(params_dict['formats'])
+        scrape_params = parse_scrape_options(
+            formats=formats,
+            include_tags=include_tags,
+            exclude_tags=exclude_tags,
+            only_main_content=only_main_content,
+            wait_for=wait_for,
+            timeout=timeout,
+            location=location,
+            mobile=mobile,
+            skip_tls_verification=skip_tls_verification,
+            remove_base64_images=remove_base64_images,
+            block_ads=block_ads,
+            proxy=proxy,
+            extract=extract,
+            json_options=json_options,
+            actions=actions,
+            agent=agent,
+            **kwargs
+        )
+
+        scrape_params['urls'] = urls
+        scrape_params['origin'] = f"python-sdk@{version}"

         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
             f'{self.api_url}/v1/batch/scrape',
-            params_dict,
+            scrape_params,
             headers
         )

-        if response.get('status_code') == 200:
+        if response.get('success'):
             try:
-                return BatchScrapeResponse(**response.json())
+                return BatchScrapeResponse(**response)
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:

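The async hunk also changes the response handling: instead of checking a status code and calling .json(), it now checks the success flag of the already-parsed dict returned by _async_post_request and builds BatchScrapeResponse from it directly. For orientation, a usage sketch of the async client that mirrors the E2E tests in this commit (the endpoint, API key, and example URL are placeholders, not values from the diff):

    # Async usage sketch mirroring the new E2E tests; placeholder credentials.
    import asyncio
    from firecrawl.firecrawl import AsyncFirecrawlApp

    async def main():
        app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev", api_key="fc-...")
        job = await app.async_batch_scrape_urls(
            ["https://example.com"],
            formats=["markdown"],
        )
        # async_batch_scrape_urls only enqueues the job; poll until it completes.
        while True:
            status = await app.check_batch_scrape_status(job.id)
            if status.status == "completed":
                break
            await asyncio.sleep(1)
        for doc in status.data:
            print(doc.metadata["url"], len(doc.markdown))

    asyncio.run(main())
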
@@ -1,8 +1,7 @@
 import os
 import pytest
-import sys
 from dotenv import load_dotenv
-import asyncio
+import sys

 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
 from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction

@@ -15,16 +14,17 @@ API_KEY = os.getenv("TEST_API_KEY")
 app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

 TEST_URLS = [
-    "https://example.com",
-    "https://www.iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination"
 ]


 async def wait_for_batch_completion(app, job_id, timeout=60):
     for _ in range(timeout):
         status = await app.check_batch_scrape_status(job_id)
         if status.status == "completed":
             return status
-        await asyncio.sleep(1)
+        import time; time.sleep(1)
     raise TimeoutError("Batch scrape did not complete in time")

 @pytest.mark.asyncio

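Both test modules build their client from environment variables loaded with python-dotenv. Only the TEST_API_KEY name is visible in the hunk headers, so the URL variable name in this reconstruction is an assumption:

    # Test-module setup as implied by the hunks above; TEST_API_URL is an
    # assumption, only TEST_API_KEY appears in this diff.
    import os
    from dotenv import load_dotenv
    from firecrawl.firecrawl import AsyncFirecrawlApp

    load_dotenv()
    API_URL = os.getenv("TEST_API_URL")
    API_KEY = os.getenv("TEST_API_KEY")
    app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
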
@@ -35,55 +35,31 @@ async def test_async_batch_scrape_urls_simple():
     )
     assert job.id is not None
     status = await wait_for_batch_completion(app, job.id)

+    # Basic response assertions
     assert status.success
     assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
     assert len(status.data) == len(TEST_URLS)
-    assert any("Example Domain" in doc.markdown for doc in status.data)
-    assert any("Internet Assigned Numbers" in doc.markdown for doc in status.data)
-
-@pytest.mark.asyncio
-async def test_async_batch_scrape_urls_all_params_with_extract():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    job = await app.async_batch_scrape_urls(
-        TEST_URLS,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions
-    )
-    assert job.id is not None
-    status = await wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
+
+    # Content assertions for each document
     for doc in status.data:
         assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
-        assert hasattr(doc, "raw_html")
-        assert hasattr(doc, "links")
-        assert hasattr(doc, "screenshot")
-        assert hasattr(doc, "extract")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
         assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+    # Check that we got content from both test pages
+    markdown_contents = [doc.markdown for doc in status.data]
+    assert any("This page is used for end-to-end (e2e) testing with Firecrawl." in content for content in markdown_contents)
+    assert any("Numbered Pagination" in content for content in markdown_contents)

 @pytest.mark.asyncio
 async def test_async_batch_scrape_urls_all_params_with_json_options():

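The same per-document checks recur in several of the rewritten tests. A compact helper in the spirit of those assertions might look like the following; it is a sketch for illustration only and is not part of the commit:

    # Hypothetical helper summarising the per-document assertions used above.
    def assert_basic_document(doc, expected_urls):
        assert doc.markdown is not None and isinstance(doc.markdown, str)
        assert len(doc.markdown) > 0
        assert doc.metadata is not None and isinstance(doc.metadata, dict)
        assert doc.metadata["url"] in expected_urls
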
@@ -91,12 +67,12 @@ async def test_async_batch_scrape_urls_all_params_with_json_options():
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
     job = await app.async_batch_scrape_urls(
         TEST_URLS,

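Both test files now construct the action models without the explicit type= discriminator. The model definitions are not part of this diff, but the new call style implies each action's type field has a default value. A hypothetical pydantic sketch of that pattern (the real Firecrawl action models may be defined differently):

    # Hypothetical sketch: a model that defaults its discriminator so callers
    # may omit type=...; not taken from the firecrawl source.
    from typing import Literal
    from pydantic import BaseModel

    class WaitAction(BaseModel):
        type: Literal["wait"] = "wait"  # defaulted discriminator
        milliseconds: int

    WaitAction(milliseconds=500)                # new test usage
    WaitAction(type="wait", milliseconds=500)   # old call style still validates
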
@@ -105,7 +81,7 @@ async def test_async_batch_scrape_urls_all_params_with_json_options():
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,

@@ -117,14 +93,136 @@ async def test_async_batch_scrape_urls_all_params_with_json_options():
     )
     assert job.id is not None
     status = await wait_for_batch_completion(app, job.id)

+    # Basic response assertions
     assert status.success
     assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
     assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
     for doc in status.data:
+        # Markdown assertions
         assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
         assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
         assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions
         assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
         assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+@pytest.mark.asyncio
+async def test_async_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = await app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert job.id is not None
+    status = await wait_for_batch_completion(app, job.id)
+
+    # Basic response assertions
+    assert status.success
+    assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
+    assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in status.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS

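The new full-page variant differs from the previous json_options test only in the screenshot entry of the formats list; the inline "@full_page" suffix appears to be how a full-page capture is requested (an inference from the test name and formats string, not documented elsewhere in this diff):

    # Formats as used by the two new json_options tests above.
    FORMATS_VIEWPORT = ["markdown", "html", "raw_html", "links", "screenshot", "json"]
    FORMATS_FULL_PAGE = ["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"]
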
@@ -14,8 +14,8 @@ API_KEY = os.getenv("TEST_API_KEY")
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

 TEST_URLS = [
-    "https://example.com",
-    "https://www.iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination"
 ]

 def wait_for_batch_completion(app, job_id, timeout=60):

@@ -33,66 +33,43 @@ def test_async_batch_scrape_urls_simple():
     )
     assert job.id is not None
     status = wait_for_batch_completion(app, job.id)

+    # Basic response assertions
     assert status.success
     assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
     assert len(status.data) == len(TEST_URLS)
-    assert any("Example Domain" in doc.markdown for doc in status.data)
-    assert any("Internet Assigned Numbers Authority" in doc.markdown for doc in status.data)
-
-def test_async_batch_scrape_urls_all_params_with_extract():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    job = app.async_batch_scrape_urls(
-        TEST_URLS,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions
-    )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
+
+    # Content assertions for each document
     for doc in status.data:
         assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
-        assert hasattr(doc, "raw_html")
-        assert hasattr(doc, "links")
-        assert hasattr(doc, "screenshot")
-        assert hasattr(doc, "extract")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
         assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+    # Check that we got content from both test pages
+    markdown_contents = [doc.markdown for doc in status.data]
+    assert any("This page is used for end-to-end (e2e) testing with Firecrawl." in content for content in markdown_contents)
+    assert any("Numbered Pagination" in content for content in markdown_contents)

 def test_async_batch_scrape_urls_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
     job = app.async_batch_scrape_urls(
         TEST_URLS,

@@ -101,7 +78,7 @@ def test_async_batch_scrape_urls_all_params_with_json_options():
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,

@@ -113,14 +90,135 @@ def test_async_batch_scrape_urls_all_params_with_json_options():
     )
     assert job.id is not None
     status = wait_for_batch_completion(app, job.id)

+    # Basic response assertions
     assert status.success
     assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
     assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
     for doc in status.data:
+        # Markdown assertions
         assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
         assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
         assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
         assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions
         assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
         assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+def test_async_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert job.id is not None
+    status = wait_for_batch_completion(app, job.id)
+
+    # Basic response assertions
+    assert status.success
+    assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
+    assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in status.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS

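For completeness, a sync-client sketch of the JSON-extraction path the tests above exercise. The endpoint, API key, example URL, and the availability of check_batch_scrape_status on the sync client are assumptions (only the async tests show that call directly):

    # Sync usage sketch based on the sync E2E tests in this commit.
    import time
    from firecrawl.firecrawl import FirecrawlApp, JsonConfig

    app = FirecrawlApp(api_url="https://api.firecrawl.dev", api_key="fc-...")
    schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    job = app.async_batch_scrape_urls(        # enqueues the job, returns a handle
        ["https://example.com"],
        formats=["markdown", "json"],
        json_options=JsonConfig(prompt="Extract the title as JSON", schema=schema),
    )
    status = app.check_batch_scrape_status(job.id)
    while status.status != "completed":
        time.sleep(1)
        status = app.check_batch_scrape_status(job.id)
    for doc in status.data:
        print(doc.json.get("title"))          # "title" follows the schema above
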