mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-15 00:25:55 +08:00
e2e tests for python methods
This commit is contained in:
parent
f0b0afc5d6
commit
06cf6be9d5
@ -1,9 +1,11 @@
|
||||
import os
|
||||
import pytest
|
||||
import sys
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl.firecrawl import AsyncFirecrawlApp
|
||||
import asyncio
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
|
||||
from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -12,7 +14,117 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
TEST_URLS = [
|
||||
"https://example.com",
|
||||
"https://www.iana.org"
|
||||
]
|
||||
|
||||
async def wait_for_batch_completion(app, job_id, timeout=60):
|
||||
for _ in range(timeout):
|
||||
status = await app.check_batch_scrape_status(job_id)
|
||||
if status.status == "completed":
|
||||
return status
|
||||
await asyncio.sleep(1)
|
||||
raise TimeoutError("Batch scrape did not complete in time")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_batch_scrape_urls_async_placeholder():
|
||||
# TODO: Implement async E2E tests for async_batch_scrape_urls
|
||||
assert True
|
||||
async def test_async_batch_scrape_urls_simple():
|
||||
job = await app.async_batch_scrape_urls(
|
||||
TEST_URLS,
|
||||
formats=["markdown"]
|
||||
)
|
||||
assert job.id is not None
|
||||
status = await wait_for_batch_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) == len(TEST_URLS)
|
||||
assert any("Example Domain" in doc.markdown for doc in status.data)
|
||||
assert any("Internet Assigned Numbers" in doc.markdown for doc in status.data)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_batch_scrape_urls_all_params_with_extract():
|
||||
location = LocationConfig(country="us", languages=["en"])
|
||||
extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
|
||||
extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
|
||||
actions = [
|
||||
WaitAction(type="wait", milliseconds=500),
|
||||
ScreenshotAction(type="screenshot", fullPage=True),
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
job = await app.async_batch_scrape_urls(
|
||||
TEST_URLS,
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
wait_for=1000,
|
||||
timeout=15000,
|
||||
location=location,
|
||||
mobile=True,
|
||||
skip_tls_verification=True,
|
||||
remove_base64_images=True,
|
||||
block_ads=True,
|
||||
proxy="basic",
|
||||
extract=extract,
|
||||
actions=actions
|
||||
)
|
||||
assert job.id is not None
|
||||
status = await wait_for_batch_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) == len(TEST_URLS)
|
||||
for doc in status.data:
|
||||
assert hasattr(doc, "markdown")
|
||||
assert hasattr(doc, "html")
|
||||
assert hasattr(doc, "raw_html")
|
||||
assert hasattr(doc, "links")
|
||||
assert hasattr(doc, "screenshot")
|
||||
assert hasattr(doc, "extract")
|
||||
assert hasattr(doc, "metadata")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_batch_scrape_urls_all_params_with_json_options():
|
||||
location = LocationConfig(country="us", languages=["en"])
|
||||
json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
|
||||
json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
|
||||
actions = [
|
||||
WaitAction(type="wait", milliseconds=500),
|
||||
ScreenshotAction(type="screenshot", fullPage=True),
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
job = await app.async_batch_scrape_urls(
|
||||
TEST_URLS,
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
wait_for=1000,
|
||||
timeout=15000,
|
||||
location=location,
|
||||
mobile=True,
|
||||
skip_tls_verification=True,
|
||||
remove_base64_images=True,
|
||||
block_ads=True,
|
||||
proxy="basic",
|
||||
json_options=json_options,
|
||||
actions=actions
|
||||
)
|
||||
assert job.id is not None
|
||||
status = await wait_for_batch_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) == len(TEST_URLS)
|
||||
for doc in status.data:
|
||||
assert hasattr(doc, "markdown")
|
||||
assert hasattr(doc, "html")
|
||||
assert hasattr(doc, "raw_html")
|
||||
assert hasattr(doc, "links")
|
||||
assert hasattr(doc, "screenshot")
|
||||
assert hasattr(doc, "json")
|
||||
assert hasattr(doc, "metadata")
|
@ -1,7 +1,11 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
import asyncio
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
|
||||
from firecrawl.firecrawl import AsyncFirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,7 +14,81 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
async def wait_for_crawl_completion(app, job_id, timeout=60):
|
||||
for _ in range(timeout):
|
||||
status = await app.check_crawl_status(job_id)
|
||||
if status.status == "completed":
|
||||
return status
|
||||
await asyncio.sleep(1)
|
||||
raise TimeoutError("Crawl did not complete in time")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_crawl_url_async_placeholder():
|
||||
# TODO: Implement async E2E tests for async_crawl_url
|
||||
assert True
|
||||
async def test_async_crawl_url_simple():
|
||||
job = await app.async_crawl_url(
|
||||
TEST_URL,
|
||||
limit=1,
|
||||
scrape_options=ScrapeOptions(formats=["markdown"])
|
||||
)
|
||||
assert job.id is not None
|
||||
status = await wait_for_crawl_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) >= 1
|
||||
assert any("Example Domain" in doc.markdown for doc in status.data)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_async_crawl_url_all_params():
|
||||
location = LocationConfig(country="us", languages=["en"])
|
||||
actions = [
|
||||
WaitAction(type="wait", milliseconds=500),
|
||||
ScreenshotAction(type="screenshot", fullPage=True),
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
scrape_options = ScrapeOptions(
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
wait_for=1000,
|
||||
timeout=15000,
|
||||
location=location,
|
||||
mobile=True,
|
||||
skip_tls_verification=True,
|
||||
remove_base64_images=True,
|
||||
block_ads=True,
|
||||
proxy="basic",
|
||||
actions=actions
|
||||
)
|
||||
job = await app.async_crawl_url(
|
||||
TEST_URL,
|
||||
include_paths=["/"],
|
||||
exclude_paths=["/notarealpage"],
|
||||
max_depth=2,
|
||||
max_discovery_depth=2,
|
||||
limit=2,
|
||||
allow_backward_links=True,
|
||||
allow_external_links=True,
|
||||
ignore_sitemap=True,
|
||||
scrape_options=scrape_options,
|
||||
deduplicate_similar_urls=True,
|
||||
ignore_query_parameters=True,
|
||||
regex_on_full_url=True,
|
||||
delay=1
|
||||
)
|
||||
assert job.id is not None
|
||||
status = await wait_for_crawl_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) >= 1
|
||||
for doc in status.data:
|
||||
assert hasattr(doc, "markdown")
|
||||
assert hasattr(doc, "html")
|
||||
assert hasattr(doc, "raw_html")
|
||||
assert hasattr(doc, "links")
|
||||
assert hasattr(doc, "screenshot")
|
||||
assert hasattr(doc, "metadata")
|
||||
|
@ -42,11 +42,11 @@ async def test_batch_scrape_urls_async_all_params_with_extract():
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
result = await app.batch_scrape_urls(
|
||||
TEST_URLS,
|
||||
formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract"],
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
@ -68,7 +68,7 @@ async def test_batch_scrape_urls_async_all_params_with_extract():
|
||||
for doc in result.data:
|
||||
assert hasattr(doc, "markdown")
|
||||
assert hasattr(doc, "html")
|
||||
assert hasattr(doc, "rawHtml")
|
||||
assert hasattr(doc, "raw_html")
|
||||
assert hasattr(doc, "links")
|
||||
assert hasattr(doc, "screenshot")
|
||||
assert hasattr(doc, "extract")
|
||||
@ -85,11 +85,11 @@ async def test_batch_scrape_urls_async_all_params_with_json_options():
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
result = await app.batch_scrape_urls(
|
||||
TEST_URLS,
|
||||
formats=["markdown", "html", "rawHtml", "links", "screenshot", "json"],
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
@ -110,7 +110,7 @@ async def test_batch_scrape_urls_async_all_params_with_json_options():
|
||||
for doc in result.data:
|
||||
assert hasattr(doc, "markdown")
|
||||
assert hasattr(doc, "html")
|
||||
assert hasattr(doc, "rawHtml")
|
||||
assert hasattr(doc, "raw_html")
|
||||
assert hasattr(doc, "links")
|
||||
assert hasattr(doc, "screenshot")
|
||||
assert hasattr(doc, "json")
|
||||
|
@ -1,16 +0,0 @@
|
||||
import os
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
API_URL = "https://api.firecrawl.dev"
|
||||
API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_check_crawl_status_async_placeholder():
|
||||
# TODO: Implement async E2E tests for check_crawl_status
|
||||
assert True
|
@ -1,7 +1,11 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
import asyncio
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
|
||||
from firecrawl.firecrawl import AsyncFirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,7 +14,81 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
TEST_URL = "https://example.com"
|
||||
|
||||
async def wait_for_crawl_completion(app, job_id, timeout=60):
|
||||
for _ in range(timeout):
|
||||
status = await app.check_crawl_status(job_id)
|
||||
if status.status == "completed":
|
||||
return status
|
||||
await asyncio.sleep(1)
|
||||
raise TimeoutError("Crawl did not complete in time")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_crawl_url_async_placeholder():
|
||||
# TODO: Implement async E2E tests for crawl_url
|
||||
assert True
|
||||
async def test_crawl_url_async_simple():
|
||||
job = await app.async_crawl_url(
|
||||
TEST_URL,
|
||||
limit=1,
|
||||
scrape_options=ScrapeOptions(formats=["markdown"])
|
||||
)
|
||||
assert job.id is not None
|
||||
status = await wait_for_crawl_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) >= 1
|
||||
assert any("Example Domain" in doc.markdown for doc in status.data)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_crawl_url_async_all_params():
|
||||
location = LocationConfig(country="us", languages=["en"])
|
||||
actions = [
|
||||
WaitAction(type="wait", milliseconds=500),
|
||||
ScreenshotAction(type="screenshot", fullPage=True),
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
scrape_options = ScrapeOptions(
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
wait_for=1000,
|
||||
timeout=15000,
|
||||
location=location,
|
||||
mobile=True,
|
||||
skip_tls_verification=True,
|
||||
remove_base64_images=True,
|
||||
block_ads=True,
|
||||
proxy="basic",
|
||||
actions=actions
|
||||
)
|
||||
job = await app.async_crawl_url(
|
||||
TEST_URL,
|
||||
include_paths=["/"],
|
||||
exclude_paths=["/notarealpage"],
|
||||
max_depth=2,
|
||||
max_discovery_depth=2,
|
||||
limit=2,
|
||||
allow_backward_links=True,
|
||||
allow_external_links=True,
|
||||
ignore_sitemap=True,
|
||||
scrape_options=scrape_options,
|
||||
deduplicate_similar_urls=True,
|
||||
ignore_query_parameters=True,
|
||||
regex_on_full_url=True,
|
||||
delay=1
|
||||
)
|
||||
assert job.id is not None
|
||||
status = await wait_for_crawl_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) >= 1
|
||||
for doc in status.data:
|
||||
assert hasattr(doc, "markdown")
|
||||
assert hasattr(doc, "html")
|
||||
assert hasattr(doc, "raw_html")
|
||||
assert hasattr(doc, "links")
|
||||
assert hasattr(doc, "screenshot")
|
||||
assert hasattr(doc, "metadata")
|
@ -1,7 +1,10 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
|
||||
from firecrawl.firecrawl import AsyncFirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -11,6 +14,35 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_deep_research_async_placeholder():
|
||||
# TODO: Implement async E2E tests for deep_research
|
||||
assert True
|
||||
async def test_deep_research_async_simple():
|
||||
result = await app.deep_research("What is the capital of France?", max_urls=2)
|
||||
assert hasattr(result, "status")
|
||||
assert result.status == "completed"
|
||||
assert hasattr(result, "success")
|
||||
assert result.success
|
||||
assert hasattr(result, "data")
|
||||
assert result.data is not None
|
||||
assert any("Paris" in str(result.data) or "France" in str(result.data) for _ in [0])
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_deep_research_async_all_params():
|
||||
result = await app.deep_research(
|
||||
"What are the latest advancements in AI?",
|
||||
max_depth=2,
|
||||
time_limit=60,
|
||||
max_urls=3,
|
||||
analysis_prompt="Summarize the most important recent AI advancements.",
|
||||
system_prompt="You are an expert AI researcher."
|
||||
)
|
||||
assert hasattr(result, "status")
|
||||
assert result.status == "completed"
|
||||
assert hasattr(result, "success")
|
||||
assert result.success
|
||||
assert hasattr(result, "data")
|
||||
assert hasattr(result, "activities")
|
||||
assert isinstance(result.activities, list)
|
||||
assert result.data is not None
|
||||
assert hasattr(result.data, "sources")
|
||||
assert isinstance(result.data.sources, list)
|
||||
assert hasattr(result.data, "final_analysis")
|
||||
assert isinstance(result.data.final_analysis, str)
|
@ -1,7 +1,12 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
|
||||
from firecrawl.firecrawl import AsyncFirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,7 +15,85 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
class ExtractSchema(BaseModel):
|
||||
title: str
|
||||
description: str
|
||||
links: List[str]
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_async_placeholder():
|
||||
# TODO: Implement async E2E tests for extract
|
||||
assert True
|
||||
async def test_extract_async_simple_schema_class():
|
||||
result = await app.extract(
|
||||
["https://example.com"],
|
||||
prompt="Extract the title, description, and links from the website",
|
||||
schema=ExtractSchema
|
||||
)
|
||||
assert result.success
|
||||
assert result.data is not None
|
||||
assert hasattr(result.data, "title")
|
||||
assert hasattr(result.data, "description")
|
||||
assert hasattr(result.data, "links")
|
||||
assert isinstance(result.data.links, list)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_async_simple_schema_dict():
|
||||
result = await app.extract(
|
||||
["https://example.com"],
|
||||
prompt="Extract the title, description, and links from the website",
|
||||
schema=ExtractSchema.schema()
|
||||
)
|
||||
assert result.success
|
||||
assert result.data is not None
|
||||
assert hasattr(result.data, "title")
|
||||
assert hasattr(result.data, "description")
|
||||
assert hasattr(result.data, "links")
|
||||
assert isinstance(result.data.links, list)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_async_simple_schema_json():
|
||||
# Try both model_json_schema (Pydantic v2) and schema_json (Pydantic v1)
|
||||
schema = None
|
||||
if hasattr(ExtractSchema, "model_json_schema"):
|
||||
schema = ExtractSchema.model_json_schema()
|
||||
elif hasattr(ExtractSchema, "schema_json"):
|
||||
schema = ExtractSchema.schema_json()
|
||||
else:
|
||||
pytest.skip("No JSON schema export method available on ExtractSchema")
|
||||
result = await app.extract(
|
||||
["https://example.com"],
|
||||
prompt="Extract the title, description, and links from the website",
|
||||
schema=schema
|
||||
)
|
||||
assert result.success
|
||||
assert result.data is not None
|
||||
assert hasattr(result.data, "title")
|
||||
assert hasattr(result.data, "description")
|
||||
assert hasattr(result.data, "links")
|
||||
assert isinstance(result.data.links, list)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_extract_async_all_params():
|
||||
class AllParamsSchema(BaseModel):
|
||||
title: str
|
||||
description: str
|
||||
links: List[str]
|
||||
author: str
|
||||
schema = AllParamsSchema.schema()
|
||||
result = await app.extract(
|
||||
["https://www.iana.org"],
|
||||
prompt="Extract the title, description, links, and author from the website",
|
||||
schema=schema,
|
||||
system_prompt="You are a helpful extraction agent.",
|
||||
allow_external_links=True,
|
||||
enable_web_search=True,
|
||||
show_sources=True,
|
||||
agent={"model": "FIRE-1"}
|
||||
)
|
||||
assert result.success
|
||||
assert result.data is not None
|
||||
assert hasattr(result.data, "title")
|
||||
assert hasattr(result.data, "description")
|
||||
assert hasattr(result.data, "links")
|
||||
assert hasattr(result.data, "author")
|
||||
assert isinstance(result.data.links, list)
|
||||
assert hasattr(result, "sources")
|
||||
assert isinstance(result.sources, list)
|
@ -1,7 +1,10 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import AsyncFirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
|
||||
from firecrawl.firecrawl import AsyncFirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -11,6 +14,30 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_llms_text_async_placeholder():
|
||||
# TODO: Implement async E2E tests for generate_llms_text
|
||||
assert True
|
||||
async def test_generate_llms_text_async_simple():
|
||||
result = await app.generate_llms_text("https://example.com")
|
||||
assert hasattr(result, "status")
|
||||
assert result.status == "completed"
|
||||
assert hasattr(result, "data")
|
||||
assert result.data is not None
|
||||
assert hasattr(result.data, "llmstxt")
|
||||
assert isinstance(result.data.llmstxt, str)
|
||||
assert len(result.data.llmstxt) > 0
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_generate_llms_text_async_all_params():
|
||||
result = await app.generate_llms_text(
|
||||
"https://www.iana.org",
|
||||
max_urls=5,
|
||||
show_full_text=True,
|
||||
experimental_stream=True
|
||||
)
|
||||
assert hasattr(result, "status")
|
||||
assert result.status == "completed"
|
||||
assert hasattr(result, "data")
|
||||
assert result.data is not None
|
||||
assert hasattr(result.data, "llmstxt")
|
||||
assert isinstance(result.data.llmstxt, str)
|
||||
assert len(result.data.llmstxt) > 0
|
||||
assert hasattr(result.data, "llmsfulltxt")
|
||||
assert result.data.llmsfulltxt is None or isinstance(result.data.llmsfulltxt, str)
|
@ -40,11 +40,11 @@ async def test_scrape_url_async_all_params_with_extract_screenshot():
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
response = await app.scrape_url(
|
||||
url=TEST_URL,
|
||||
formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "extract", "change_tracking"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
@ -64,12 +64,12 @@ async def test_scrape_url_async_all_params_with_extract_screenshot():
|
||||
assert response.success
|
||||
assert hasattr(response, "markdown")
|
||||
assert hasattr(response, "html")
|
||||
assert hasattr(response, "rawHtml")
|
||||
assert hasattr(response, "raw_html")
|
||||
assert hasattr(response, "links")
|
||||
assert hasattr(response, "screenshot")
|
||||
assert hasattr(response, "extract")
|
||||
assert hasattr(response, "metadata")
|
||||
assert hasattr(response, "changeTracking")
|
||||
assert hasattr(response, "change_tracking")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
|
||||
@ -83,11 +83,11 @@ async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
response = await app.scrape_url(
|
||||
url=TEST_URL,
|
||||
formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
@ -107,12 +107,12 @@ async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
|
||||
assert response.success
|
||||
assert hasattr(response, "markdown")
|
||||
assert hasattr(response, "html")
|
||||
assert hasattr(response, "rawHtml")
|
||||
assert hasattr(response, "raw_html")
|
||||
assert hasattr(response, "links")
|
||||
assert hasattr(response, "screenshot")
|
||||
assert hasattr(response, "extract")
|
||||
assert hasattr(response, "metadata")
|
||||
assert hasattr(response, "changeTracking")
|
||||
assert hasattr(response, "change_tracking")
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_scrape_url_async_all_params_with_json_options():
|
||||
@ -126,11 +126,11 @@ async def test_scrape_url_async_all_params_with_json_options():
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
response = await app.scrape_url(
|
||||
url=TEST_URL,
|
||||
formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "json", "change_tracking"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
@ -150,9 +150,9 @@ async def test_scrape_url_async_all_params_with_json_options():
|
||||
assert response.success
|
||||
assert hasattr(response, "markdown")
|
||||
assert hasattr(response, "html")
|
||||
assert hasattr(response, "rawHtml")
|
||||
assert hasattr(response, "raw_html")
|
||||
assert hasattr(response, "links")
|
||||
assert hasattr(response, "screenshot")
|
||||
assert hasattr(response, "json")
|
||||
assert hasattr(response, "metadata")
|
||||
assert hasattr(response, "changeTracking")
|
||||
assert hasattr(response, "change_tracking")
|
@ -1,7 +1,10 @@
|
||||
import os
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
from firecrawl.firecrawl import FirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,6 +13,114 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_async_batch_scrape_urls_placeholder():
|
||||
# TODO: Implement E2E tests for async_batch_scrape_urls
|
||||
assert True
|
||||
TEST_URLS = [
|
||||
"https://example.com",
|
||||
"https://www.iana.org"
|
||||
]
|
||||
|
||||
def wait_for_batch_completion(app, job_id, timeout=60):
|
||||
for _ in range(timeout):
|
||||
status = app.check_batch_scrape_status(job_id)
|
||||
if status.status == "completed":
|
||||
return status
|
||||
import time; time.sleep(1)
|
||||
raise TimeoutError("Batch scrape did not complete in time")
|
||||
|
||||
def test_async_batch_scrape_urls_simple():
|
||||
job = app.async_batch_scrape_urls(
|
||||
TEST_URLS,
|
||||
formats=["markdown"]
|
||||
)
|
||||
assert job.id is not None
|
||||
status = wait_for_batch_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) == len(TEST_URLS)
|
||||
assert any("Example Domain" in doc.markdown for doc in status.data)
|
||||
assert any("Internet Assigned Numbers Authority" in doc.markdown for doc in status.data)
|
||||
|
||||
def test_async_batch_scrape_urls_all_params_with_extract():
|
||||
location = LocationConfig(country="us", languages=["en"])
|
||||
extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
|
||||
extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
|
||||
actions = [
|
||||
WaitAction(type="wait", milliseconds=500),
|
||||
ScreenshotAction(type="screenshot", fullPage=True),
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
job = app.async_batch_scrape_urls(
|
||||
TEST_URLS,
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
wait_for=1000,
|
||||
timeout=15000,
|
||||
location=location,
|
||||
mobile=True,
|
||||
skip_tls_verification=True,
|
||||
remove_base64_images=True,
|
||||
block_ads=True,
|
||||
proxy="basic",
|
||||
extract=extract,
|
||||
actions=actions
|
||||
)
|
||||
assert job.id is not None
|
||||
status = wait_for_batch_completion(app, job.id)
|
||||
assert status.success
|
||||
assert hasattr(status, "data")
|
||||
assert len(status.data) == len(TEST_URLS)
|
||||
for doc in status.data:
|
||||
assert hasattr(doc, "markdown")
|
||||
assert hasattr(doc, "html")
|
||||
assert hasattr(doc, "raw_html")
|
||||
assert hasattr(doc, "links")
|
||||
assert hasattr(doc, "screenshot")
|
||||
assert hasattr(doc, "extract")
|
||||
assert hasattr(doc, "metadata")
|
||||
|
||||
def test_async_batch_scrape_urls_all_params_with_json_options():
    """Start a batch scrape with every option plus ``json_options`` and check each returned document."""
    geo = LocationConfig(country="us", languages=["en"])
    title_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_config = JsonConfig(prompt="Extract the title as JSON", schema=title_schema)
    browser_steps = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();"),
    ]
    request_options = {
        "formats": ["markdown", "html", "raw_html", "links", "screenshot", "json"],
        "include_tags": ["h1", "p"],
        "exclude_tags": ["footer"],
        "only_main_content": True,
        "wait_for": 1000,
        "timeout": 15000,
        "location": geo,
        "mobile": True,
        "skip_tls_verification": True,
        "remove_base64_images": True,
        "block_ads": True,
        "proxy": "basic",
        "json_options": json_config,
        "actions": browser_steps,
    }

    job = app.async_batch_scrape_urls(TEST_URLS, **request_options)
    assert job.id is not None

    status = wait_for_batch_completion(app, job.id)
    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) == len(TEST_URLS)

    # Every requested format, plus metadata, must be exposed on each document.
    expected_fields = ("markdown", "html", "raw_html", "links", "screenshot", "json", "metadata")
    for doc in status.data:
        for field_name in expected_fields:
            assert hasattr(doc, field_name)
|
@ -1,7 +1,10 @@
|
||||
import os
|
||||
import pytest
|
||||
import sys
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
from firecrawl.firecrawl import FirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,6 +13,79 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_async_crawl_url_placeholder():
    """Stub test: performs no real checks and always passes."""
    # TODO: Implement E2E tests for async_crawl_url
    assert True
|
||||
# Start URL passed to app.async_crawl_url by the crawl tests below.
TEST_URL = "https://example.com"
|
||||
|
||||
def wait_for_crawl_completion(app, job_id, timeout=60, poll_interval=1):
    """Poll a crawl job until it reports completion.

    Args:
        app: Client object exposing ``check_crawl_status(job_id)``.
        job_id: Identifier of the crawl job to poll.
        timeout: Maximum number of status checks before giving up. With the
            default one-second interval this is roughly a limit in seconds.
        poll_interval: Seconds to sleep between consecutive status checks.

    Returns:
        The first status object whose ``status`` attribute equals ``"completed"``.

    Raises:
        RuntimeError: If the job reports a terminal failure state.
        TimeoutError: If the job has not completed after ``timeout`` checks.
    """
    import time  # imported once per call instead of on every loop iteration

    for attempt in range(timeout):
        status = app.check_crawl_status(job_id)
        if status.status == "completed":
            return status
        # Fail fast instead of polling a dead job until the timeout expires.
        # NOTE(review): terminal state names assumed from the Firecrawl job states — confirm.
        if status.status in ("failed", "cancelled"):
            raise RuntimeError(f"Crawl ended with status {status.status!r}")
        # No point sleeping after the final check; we are about to give up.
        if attempt < timeout - 1:
            time.sleep(poll_interval)
    raise TimeoutError("Crawl did not complete in time")
|
||||
|
||||
def test_crawl_url_simple():
    """Start a one-page markdown crawl, wait for it, and check the page text came back."""
    markdown_only = ScrapeOptions(formats=["markdown"])
    job = app.async_crawl_url(TEST_URL, limit=1, scrape_options=markdown_only)
    assert job.id is not None

    status = wait_for_crawl_completion(app, job.id)
    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) >= 1
    assert any("Example Domain" in page.markdown for page in status.data)
|
||||
|
||||
def test_crawl_url_all_params():
    """Start a crawl with every crawl and scrape option set and check each returned document."""
    geo = LocationConfig(country="us", languages=["en"])
    browser_steps = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();"),
    ]
    page_options = ScrapeOptions(
        formats=["markdown", "html", "raw_html", "links", "screenshot"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=geo,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        actions=browser_steps,
    )
    crawl_options = {
        "include_paths": ["/"],
        "exclude_paths": ["/notarealpage"],
        "max_depth": 2,
        "max_discovery_depth": 2,
        "limit": 2,
        "allow_backward_links": True,
        "allow_external_links": True,
        "ignore_sitemap": True,
        "scrape_options": page_options,
        "deduplicate_similar_urls": True,
        "ignore_query_parameters": True,
        "regex_on_full_url": True,
        "delay": 1,
    }

    job = app.async_crawl_url(TEST_URL, **crawl_options)
    assert job.id is not None

    status = wait_for_crawl_completion(app, job.id)
    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) >= 1

    # Every requested format, plus metadata, must be exposed on each document.
    expected_fields = ("markdown", "html", "raw_html", "links", "screenshot", "metadata")
    for doc in status.data:
        for field_name in expected_fields:
            assert hasattr(doc, field_name)
|
@ -1,7 +1,10 @@
|
||||
import os
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
import sys
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
from firecrawl.firecrawl import FirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,6 +13,114 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_batch_scrape_urls_placeholder():
    """Stub test: performs no real checks and always passes."""
    # TODO: Implement E2E tests for batch_scrape_urls
    assert True
|
||||
# URLs submitted together by the batch scrape tests below; the tests expect
# one result document per entry.
TEST_URLS = [
    "https://example.com",
    "https://www.iana.org"
]
|
||||
|
||||
def wait_for_batch_completion(app, job_id, timeout=60, poll_interval=1):
    """Poll a batch scrape job until it reports completion.

    Args:
        app: Client object exposing ``check_batch_scrape_status(job_id)``.
        job_id: Identifier of the batch scrape job to poll.
        timeout: Maximum number of status checks before giving up. With the
            default one-second interval this is roughly a limit in seconds.
        poll_interval: Seconds to sleep between consecutive status checks.

    Returns:
        The first status object whose ``status`` attribute equals ``"completed"``.

    Raises:
        RuntimeError: If the job reports a terminal failure state.
        TimeoutError: If the job has not completed after ``timeout`` checks.
    """
    import time  # imported once per call instead of on every loop iteration

    for attempt in range(timeout):
        status = app.check_batch_scrape_status(job_id)
        if status.status == "completed":
            return status
        # Fail fast instead of polling a dead job until the timeout expires.
        # NOTE(review): terminal state names assumed from the Firecrawl job states — confirm.
        if status.status in ("failed", "cancelled"):
            raise RuntimeError(f"Batch scrape ended with status {status.status!r}")
        # No point sleeping after the final check; we are about to give up.
        if attempt < timeout - 1:
            time.sleep(poll_interval)
    raise TimeoutError("Batch scrape did not complete in time")
|
||||
|
||||
def test_batch_scrape_urls_simple():
    """Start a markdown-only batch scrape, wait for it, and check both pages came back."""
    job = app.async_batch_scrape_urls(TEST_URLS, formats=["markdown"])
    assert job.id is not None

    status = wait_for_batch_completion(app, job.id)
    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) == len(TEST_URLS)

    # Each phrase must appear in at least one returned document.
    assert any("Example Domain" in page.markdown for page in status.data)
    assert any("Internet Assigned Numbers Authority" in page.markdown for page in status.data)
|
||||
|
||||
def test_batch_scrape_urls_all_params_with_extract():
    """Start a batch scrape with every option plus ``extract`` and check each returned document."""
    geo = LocationConfig(country="us", languages=["en"])
    title_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract_config = JsonConfig(prompt="Extract the title", schema=title_schema)
    browser_steps = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();"),
    ]
    request_options = {
        "formats": ["markdown", "html", "raw_html", "links", "screenshot", "extract"],
        "include_tags": ["h1", "p"],
        "exclude_tags": ["footer"],
        "only_main_content": True,
        "wait_for": 1000,
        "timeout": 15000,
        "location": geo,
        "mobile": True,
        "skip_tls_verification": True,
        "remove_base64_images": True,
        "block_ads": True,
        "proxy": "basic",
        "extract": extract_config,
        "actions": browser_steps,
    }

    job = app.async_batch_scrape_urls(TEST_URLS, **request_options)
    assert job.id is not None

    status = wait_for_batch_completion(app, job.id)
    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) == len(TEST_URLS)

    # Every requested format, plus metadata, must be exposed on each document.
    expected_fields = ("markdown", "html", "raw_html", "links", "screenshot", "extract", "metadata")
    for doc in status.data:
        for field_name in expected_fields:
            assert hasattr(doc, field_name)
|
||||
|
||||
def test_batch_scrape_urls_all_params_with_json_options():
    """Start a batch scrape with every option plus ``json_options`` and check each returned document."""
    geo = LocationConfig(country="us", languages=["en"])
    title_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_config = JsonConfig(prompt="Extract the title as JSON", schema=title_schema)
    browser_steps = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();"),
    ]
    request_options = {
        "formats": ["markdown", "html", "raw_html", "links", "screenshot", "json"],
        "include_tags": ["h1", "p"],
        "exclude_tags": ["footer"],
        "only_main_content": True,
        "wait_for": 1000,
        "timeout": 15000,
        "location": geo,
        "mobile": True,
        "skip_tls_verification": True,
        "remove_base64_images": True,
        "block_ads": True,
        "proxy": "basic",
        "json_options": json_config,
        "actions": browser_steps,
    }

    job = app.async_batch_scrape_urls(TEST_URLS, **request_options)
    assert job.id is not None

    status = wait_for_batch_completion(app, job.id)
    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) == len(TEST_URLS)

    # Every requested format, plus metadata, must be exposed on each document.
    expected_fields = ("markdown", "html", "raw_html", "links", "screenshot", "json", "metadata")
    for doc in status.data:
        for field_name in expected_fields:
            assert hasattr(doc, field_name)
|
@ -1,15 +0,0 @@
|
||||
import os
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
API_URL = "https://api.firecrawl.dev"
|
||||
API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_check_crawl_status_placeholder():
    """Stub test: performs no real checks and always passes."""
    # TODO: Implement E2E tests for check_crawl_status
    assert True
|
@ -1,7 +1,9 @@
|
||||
import os
|
||||
import pytest
|
||||
import sys
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
from firecrawl.firecrawl import FirecrawlApp, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction, ScrapeOptions
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,6 +12,67 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_crawl_url_placeholder():
    """Stub test: performs no real checks and always passes."""
    # TODO: Implement E2E tests for crawl_url
    assert True
|
||||
# Start URL passed to app.crawl_url by the crawl tests below.
TEST_URL = "https://example.com"
|
||||
|
||||
def test_crawl_url_simple():
    """Run a one-page markdown crawl and check the page text came back."""
    markdown_only = ScrapeOptions(formats=["markdown"])
    status = app.crawl_url(TEST_URL, limit=1, scrape_options=markdown_only)

    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) >= 1
    assert any("Example Domain" in page.markdown for page in status.data)
|
||||
|
||||
def test_crawl_url_all_params():
    """Run a crawl with every crawl and scrape option set and check each returned document."""
    geo = LocationConfig(country="us", languages=["en"])
    browser_steps = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();"),
    ]
    page_options = ScrapeOptions(
        formats=["markdown", "html", "raw_html", "links", "screenshot"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=geo,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        actions=browser_steps,
    )
    crawl_options = {
        "include_paths": ["/"],
        "exclude_paths": ["/notarealpage"],
        "max_depth": 2,
        "max_discovery_depth": 2,
        "limit": 2,
        "allow_backward_links": True,
        "allow_external_links": True,
        "ignore_sitemap": True,
        "scrape_options": page_options,
        "deduplicate_similar_urls": True,
        "ignore_query_parameters": True,
        "regex_on_full_url": True,
        "delay": 1,
    }

    status = app.crawl_url(TEST_URL, **crawl_options)
    assert status.success
    assert hasattr(status, "data")
    assert len(status.data) >= 1

    # Every requested format, plus metadata, must be exposed on each document.
    expected_fields = ("markdown", "html", "raw_html", "links", "screenshot", "metadata")
    for doc in status.data:
        for field_name in expected_fields:
            assert hasattr(doc, field_name)
|
@ -1,7 +1,10 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,6 +13,34 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_deep_research_placeholder():
    """Stub test: performs no real checks and always passes."""
    # TODO: Implement E2E tests for deep_research
    assert True
|
||||
def test_deep_research_simple():
    """Run a minimal deep-research query and check the result mentions the expected topic."""
    result = app.deep_research("What is the capital of France?", max_urls=2)
    assert hasattr(result, "status")
    assert result.status == "completed"
    assert hasattr(result, "success")
    assert result.success
    assert hasattr(result, "data")
    assert result.data is not None
    # Stringify the payload once; either keyword appearing anywhere in it passes.
    rendered = str(result.data)
    assert "Paris" in rendered or "France" in rendered
|
||||
|
||||
def test_deep_research_all_params():
    """Run deep research with every option set and check the shape of the result."""
    research_options = {
        "max_depth": 2,
        "time_limit": 60,
        "max_urls": 3,
        "analysis_prompt": "Summarize the most important recent AI advancements.",
        "system_prompt": "You are an expert AI researcher.",
    }
    result = app.deep_research("What are the latest advancements in AI?", **research_options)

    assert hasattr(result, "status")
    assert result.status == "completed"
    assert hasattr(result, "success")
    assert result.success
    assert hasattr(result, "data")
    assert hasattr(result, "activities")
    assert isinstance(result.activities, list)
    assert result.data is not None

    # The payload must carry a list of sources and a textual final analysis.
    report = result.data
    assert hasattr(report, "sources")
    assert isinstance(report.sources, list)
    assert hasattr(report, "final_analysis")
    assert isinstance(report.final_analysis, str)
|
@ -1,7 +1,12 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
from pydantic import BaseModel
|
||||
from typing import List
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,6 +15,80 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_extract_placeholder():
    """Stub test: performs no real checks and always passes."""
    # TODO: Implement E2E tests for extract
    assert True
|
||||
# Schema handed to app.extract by the extract tests below, either as the class
# itself or as an exported schema.
# NOTE(review): documented with comments rather than a class docstring because
# pydantic presumably copies a model docstring into the generated schema — confirm.
class ExtractSchema(BaseModel):
    title: str
    description: str
    links: List[str]
|
||||
|
||||
def test_extract_simple_schema_class():
    """Extract from one page using the model class itself as the schema."""
    targets = ["https://example.com"]
    result = app.extract(
        targets,
        prompt="Extract the title, description, and links from the website",
        schema=ExtractSchema,
    )

    assert result.success
    assert result.data is not None
    extracted = result.data
    for field_name in ("title", "description", "links"):
        assert hasattr(extracted, field_name)
    assert isinstance(extracted.links, list)
|
||||
|
||||
def test_extract_simple_schema_dict():
    """Extract from one page using the output of ``ExtractSchema.schema()`` as the schema."""
    targets = ["https://example.com"]
    exported_schema = ExtractSchema.schema()
    result = app.extract(
        targets,
        prompt="Extract the title, description, and links from the website",
        schema=exported_schema,
    )

    assert result.success
    assert result.data is not None
    extracted = result.data
    for field_name in ("title", "description", "links"):
        assert hasattr(extracted, field_name)
    assert isinstance(extracted.links, list)
|
||||
|
||||
def test_extract_simple_schema_json():
    """Extract from one page using whichever JSON schema export ``ExtractSchema`` offers."""
    # Prefer model_json_schema, fall back to schema_json, skip when neither exists.
    has_model_export = hasattr(ExtractSchema, "model_json_schema")
    if not has_model_export and not hasattr(ExtractSchema, "schema_json"):
        pytest.skip("No JSON schema export method available on ExtractSchema")
    if has_model_export:
        exported_schema = ExtractSchema.model_json_schema()
    else:
        exported_schema = ExtractSchema.schema_json()

    result = app.extract(
        ["https://example.com"],
        prompt="Extract the title, description, and links from the website",
        schema=exported_schema,
    )

    assert result.success
    assert result.data is not None
    extracted = result.data
    for field_name in ("title", "description", "links"):
        assert hasattr(extracted, field_name)
    assert isinstance(extracted.links, list)
|
||||
|
||||
def test_extract_all_params():
    """Extract with every option set and check the extracted fields and the sources list."""
    class AllParamsSchema(BaseModel):
        title: str
        description: str
        links: List[str]
        author: str

    extract_options = {
        "prompt": "Extract the title, description, links, and author from the website",
        "schema": AllParamsSchema.schema(),
        "system_prompt": "You are a helpful extraction agent.",
        "allow_external_links": True,
        "enable_web_search": True,
        "show_sources": True,
        "agent": {"model": "FIRE-1"},
    }
    result = app.extract(["https://www.iana.org"], **extract_options)

    assert result.success
    assert result.data is not None
    extracted = result.data
    for field_name in ("title", "description", "links", "author"):
        assert hasattr(extracted, field_name)
    assert isinstance(extracted.links, list)

    # The call set show_sources=True, so the result must expose a sources list.
    assert hasattr(result, "sources")
    assert isinstance(result.sources, list)
|
@ -1,7 +1,10 @@
|
||||
import os
|
||||
import sys
|
||||
import pytest
|
||||
from dotenv import load_dotenv
|
||||
from firecrawl import FirecrawlApp
|
||||
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
load_dotenv()
|
||||
|
||||
@ -10,6 +13,29 @@ API_KEY = os.getenv("TEST_API_KEY")
|
||||
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
|
||||
|
||||
def test_generate_llms_text_placeholder():
    """Stub test: performs no real checks and always passes."""
    # TODO: Implement E2E tests for generate_llms_text
    assert True
|
||||
def test_generate_llms_text_simple():
    """Generate llms text for one URL with default options and check it is a non-empty string."""
    result = app.generate_llms_text("https://example.com")

    assert hasattr(result, "status")
    assert result.status == "completed"
    assert hasattr(result, "data")
    assert result.data is not None

    payload = result.data
    assert hasattr(payload, "llmstxt")
    assert isinstance(payload.llmstxt, str)
    assert len(payload.llmstxt) > 0
|
||||
|
||||
def test_generate_llms_text_all_params():
    """Generate llms text with every option set and check both text fields."""
    generation_options = {
        "max_urls": 5,
        "show_full_text": True,
        "experimental_stream": True,
    }
    result = app.generate_llms_text("https://www.iana.org", **generation_options)

    assert hasattr(result, "status")
    assert result.status == "completed"
    assert hasattr(result, "data")
    assert result.data is not None

    payload = result.data
    assert hasattr(payload, "llmstxt")
    assert isinstance(payload.llmstxt, str)
    assert len(payload.llmstxt) > 0
    # The full text attribute must exist but may be None.
    assert hasattr(payload, "llmsfulltxt")
    assert payload.llmsfulltxt is None or isinstance(payload.llmsfulltxt, str)
|
@ -80,11 +80,11 @@ def test_scrape_url_all_params_with_extract_full_page_screenshot():
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
response = app.scrape_url(
|
||||
url=TEST_URL,
|
||||
formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "extract", "change_tracking"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
@ -104,12 +104,12 @@ def test_scrape_url_all_params_with_extract_full_page_screenshot():
|
||||
assert response.success
|
||||
assert hasattr(response, "markdown")
|
||||
assert hasattr(response, "html")
|
||||
assert hasattr(response, "rawHtml")
|
||||
assert hasattr(response, "raw_html")
|
||||
assert hasattr(response, "links")
|
||||
assert hasattr(response, "screenshot")
|
||||
assert hasattr(response, "extract")
|
||||
assert hasattr(response, "metadata")
|
||||
assert hasattr(response, "changeTracking")
|
||||
assert hasattr(response, "change_tracking")
|
||||
|
||||
def test_scrape_url_all_params_with_json_options():
|
||||
location = LocationConfig(country="us", languages=["en"])
|
||||
@ -122,11 +122,11 @@ def test_scrape_url_all_params_with_json_options():
|
||||
WriteAction(type="write", text="test input"),
|
||||
PressAction(type="press", key="Enter"),
|
||||
ScrollAction(type="scroll", direction="down"),
|
||||
ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
|
||||
ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
|
||||
]
|
||||
response = app.scrape_url(
|
||||
url=TEST_URL,
|
||||
formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
|
||||
formats=["markdown", "html", "raw_html", "links", "screenshot", "json", "change_tracking"],
|
||||
include_tags=["h1", "p"],
|
||||
exclude_tags=["footer"],
|
||||
only_main_content=True,
|
||||
@ -146,9 +146,9 @@ def test_scrape_url_all_params_with_json_options():
|
||||
assert response.success
|
||||
assert hasattr(response, "markdown")
|
||||
assert hasattr(response, "html")
|
||||
assert hasattr(response, "rawHtml")
|
||||
assert hasattr(response, "raw_html")
|
||||
assert hasattr(response, "links")
|
||||
assert hasattr(response, "screenshot")
|
||||
assert hasattr(response, "json")
|
||||
assert hasattr(response, "metadata")
|
||||
assert hasattr(response, "changeTracking")
|
||||
assert hasattr(response, "change_tracking")
|
Loading…
x
Reference in New Issue
Block a user