Enhance Firecrawl SDK: add LocationConfig to the __init__.py exports and remove the 'content' type from ScrapeOptions formats. Introduce new E2E tests for async and standard scraping functionality, including batch scraping and URL mapping.

rafaelmmiller 2025-05-09 18:32:05 -03:00
parent f17b1cdc86
commit f0b0afc5d6
22 changed files with 750 additions and 11 deletions

View File

@@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
import logging
import os
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
+from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions, LocationConfig # noqa
__version__ = "2.6.0"
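
With LocationConfig now exported from the package root, callers can pin a scrape's geo and language context without importing from the internal firecrawl.firecrawl module. A minimal sketch of the new import path (the country/languages values mirror the E2E tests below; the FIRECRAWL_API_KEY env var is an assumption for illustration):

import os
from firecrawl import FirecrawlApp, LocationConfig

# Hypothetical key handling, for illustration only.
app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY"))

# Same field names the E2E tests below exercise.
location = LocationConfig(country="us", languages=["en"])

doc = app.scrape_url(
    "https://example.com",
    formats=["markdown"],
    location=location,
)
print(doc.markdown)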

View File

@@ -144,7 +144,7 @@ class ChangeTrackingOptions(pydantic.BaseModel):
class ScrapeOptions(pydantic.BaseModel):
    """Parameters for scraping operations."""
-    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
    headers: Optional[Dict[str, str]] = None
    includeTags: Optional[List[str]] = None
    excludeTags: Optional[List[str]] = None
@@ -448,7 +448,7 @@ class FirecrawlApp:
        self,
        url: str,
        *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
@@ -470,7 +470,7 @@ class FirecrawlApp:
        Args:
            url (str): Target URL to scrape
-            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
            include_tags (Optional[List[str]]): HTML tags to include
            exclude_tags (Optional[List[str]]): HTML tags to exclude
            only_main_content (Optional[bool]): Extract main content only
@@ -1179,7 +1179,7 @@ class FirecrawlApp:
        self,
        urls: List[str],
        *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
@@ -1313,7 +1313,7 @@ class FirecrawlApp:
        self,
        urls: List[str],
        *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
@@ -1445,7 +1445,7 @@ class FirecrawlApp:
        self,
        urls: List[str],
        *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
@@ -2844,7 +2844,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
        self,
        url: str,
        *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
        only_main_content: Optional[bool] = None,
@@ -2865,7 +2865,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
        Args:
            url (str): Target URL to scrape
-            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
            include_tags (Optional[List[str]]): HTML tags to include
            exclude_tags (Optional[List[str]]): HTML tags to exclude
            only_main_content (Optional[bool]): Extract main content only
@@ -2972,7 +2972,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
        self,
        urls: List[str],
        *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
@@ -3111,7 +3111,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
        self,
        urls: List[str],
        *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
        headers: Optional[Dict[str, str]] = None,
        include_tags: Optional[List[str]] = None,
        exclude_tags: Optional[List[str]] = None,
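
Because formats is typed as a List of Literal values, dropping "content" means the stale value is now rejected at pydantic validation time instead of reaching the API. A hedged sketch of the behavior change (the exact error message varies by pydantic version):

import pydantic
from firecrawl import ScrapeOptions

# Still valid: "content" was the only literal removed from the set.
opts = ScrapeOptions(formats=["markdown", "links"])
print(opts.formats)

# The removed literal now fails model validation up front.
try:
    ScrapeOptions(formats=["content"])
except pydantic.ValidationError as err:
    print("rejected:", err.errors()[0]["type"])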

View File

@@ -0,0 +1,18 @@
import os
import pytest
import sys
from dotenv import load_dotenv
from firecrawl.firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

@pytest.mark.asyncio
async def test_async_batch_scrape_urls_async_placeholder():
    # TODO: Implement async E2E tests for async_batch_scrape_urls
    assert True

View File

@@ -0,0 +1,16 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

@pytest.mark.asyncio
async def test_async_crawl_url_async_placeholder():
    # TODO: Implement async E2E tests for async_crawl_url
    assert True

View File

@@ -0,0 +1,117 @@
import sys
import os
import subprocess
import time
import pytest
from dotenv import load_dotenv

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction

load_dotenv()

API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

TEST_URLS = [
    "https://example.com",
    "https://iana.org"
]

@pytest.mark.asyncio
async def test_batch_scrape_urls_async_simple():
    result = await app.batch_scrape_urls(
        TEST_URLS,
        formats=["markdown"]
    )
    assert result.success
    assert hasattr(result, "data")
    assert len(result.data) == len(TEST_URLS)
    assert any("Example Domain" in doc.markdown for doc in result.data)

@pytest.mark.asyncio
async def test_batch_scrape_urls_async_all_params_with_extract():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    result = await app.batch_scrape_urls(
        TEST_URLS,
        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions
    )
    assert result.success
    assert hasattr(result, "data")
    assert len(result.data) == len(TEST_URLS)
    for doc in result.data:
        assert hasattr(doc, "markdown")
        assert hasattr(doc, "html")
        assert hasattr(doc, "rawHtml")
        assert hasattr(doc, "links")
        assert hasattr(doc, "screenshot")
        assert hasattr(doc, "extract")
        assert hasattr(doc, "metadata")

@pytest.mark.asyncio
async def test_batch_scrape_urls_async_all_params_with_json_options():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    result = await app.batch_scrape_urls(
        TEST_URLS,
        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        json_options=json_options,
        actions=actions
    )
    assert result.success
    assert hasattr(result, "data")
    assert len(result.data) == len(TEST_URLS)
    for doc in result.data:
        assert hasattr(doc, "markdown")
        assert hasattr(doc, "html")
        assert hasattr(doc, "rawHtml")
        assert hasattr(doc, "links")
        assert hasattr(doc, "screenshot")
        assert hasattr(doc, "json")
        assert hasattr(doc, "metadata")
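
These E2E suites talk to a live backend, so TEST_API_KEY (and optionally TEST_API_URL) must be set in the environment or a .env file, and the async tests additionally need pytest-asyncio installed. A hedged way to drive a suite programmatically (the directory path is illustrative, not taken from this commit):

import os
import sys
import pytest

if not os.getenv("TEST_API_KEY"):
    sys.exit("TEST_API_KEY is required for the E2E suites")

# Illustrative path; point this at wherever the E2E tests live in your checkout.
sys.exit(pytest.main(["-x", "firecrawl/__tests__/e2e_withAuth_async"]))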

View File

@@ -0,0 +1,16 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

@pytest.mark.asyncio
async def test_check_crawl_status_async_placeholder():
    # TODO: Implement async E2E tests for check_crawl_status
    assert True

View File

@@ -0,0 +1,16 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

@pytest.mark.asyncio
async def test_crawl_url_async_placeholder():
    # TODO: Implement async E2E tests for crawl_url
    assert True

View File

@@ -0,0 +1,16 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

@pytest.mark.asyncio
async def test_deep_research_async_placeholder():
    # TODO: Implement async E2E tests for deep_research
    assert True

View File

@@ -0,0 +1,16 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

@pytest.mark.asyncio
async def test_extract_async_placeholder():
    # TODO: Implement async E2E tests for extract
    assert True

View File

@@ -0,0 +1,16 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

@pytest.mark.asyncio
async def test_generate_llms_text_async_placeholder():
    # TODO: Implement async E2E tests for generate_llms_text
    assert True

View File

@@ -0,0 +1,40 @@
import sys
import os
import subprocess
import time
import pytest
from dotenv import load_dotenv

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from firecrawl.firecrawl import AsyncFirecrawlApp

load_dotenv()

API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

TEST_URL = "example.com"

@pytest.mark.asyncio
async def test_map_url_async_simple():
    result = await app.map_url(TEST_URL)
    assert result is not None
    assert result.success
    assert hasattr(result, "links")
    assert any("example.com" in url for url in result.links)

@pytest.mark.asyncio
async def test_map_url_async_all_params():
    result = await app.map_url(
        TEST_URL,
        search="test",
        sitemap_only=False,
        include_subdomains=False,
        limit=10
    )
    assert result is not None
    assert result.success
    assert hasattr(result, "links")
    assert any("example.com" in url for url in result.links)

View File

@@ -0,0 +1,158 @@
import sys
import os
import subprocess
import time
import pytest
from dotenv import load_dotenv

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))

from firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions
from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction

load_dotenv()

API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
API_KEY = os.getenv("TEST_API_KEY")

app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

TEST_URL = 'example.com'

@pytest.mark.asyncio
async def test_scrape_url_async_simple():
    response = await app.scrape_url(
        TEST_URL,
        formats=["markdown"]
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert "# Example Domain" in response.markdown

@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_extract_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    response = await app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "rawHtml")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert hasattr(response, "metadata")
    assert hasattr(response, "changeTracking")

@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    response = await app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "rawHtml")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert hasattr(response, "metadata")
    assert hasattr(response, "changeTracking")

@pytest.mark.asyncio
async def test_scrape_url_async_all_params_with_json_options():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    response = await app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        json_options=json_options,
        actions=actions,
        change_tracking_options=change_tracking
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "rawHtml")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "json")
    assert hasattr(response, "metadata")
    assert hasattr(response, "changeTracking")

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_async_batch_scrape_urls_placeholder():
    # TODO: Implement E2E tests for async_batch_scrape_urls
    assert True

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_async_crawl_url_placeholder():
    # TODO: Implement E2E tests for async_crawl_url
    assert True

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_batch_scrape_urls_placeholder():
    # TODO: Implement E2E tests for batch_scrape_urls
    assert True

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_check_crawl_status_placeholder():
    # TODO: Implement E2E tests for check_crawl_status
    assert True

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_crawl_url_placeholder():
    # TODO: Implement E2E tests for crawl_url
    assert True

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_deep_research_placeholder():
    # TODO: Implement E2E tests for deep_research
    assert True

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_extract_placeholder():
    # TODO: Implement E2E tests for extract
    assert True

View File

@@ -0,0 +1,15 @@
import os
import pytest
from dotenv import load_dotenv
from firecrawl import FirecrawlApp

load_dotenv()

API_URL = "https://api.firecrawl.dev"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

def test_generate_llms_text_placeholder():
    # TODO: Implement E2E tests for generate_llms_text
    assert True

View File

@@ -0,0 +1,36 @@
import sys
import os
import pytest
from dotenv import load_dotenv

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

from firecrawl.firecrawl import FirecrawlApp

load_dotenv()

API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

TEST_URL = "example.com"

def test_map_url_simple():
    result = app.map_url(TEST_URL)
    assert result is not None
    assert result.success
    assert hasattr(result, "links")
    assert any("example.com" in url for url in result.links)

def test_map_url_all_params():
    result = app.map_url(
        TEST_URL,
        search="test",
        sitemap_only=False,
        include_subdomains=False,
        limit=10
    )
    assert result is not None
    assert result.success
    assert hasattr(result, "links")
    assert any("example.com" in url for url in result.links)

View File

@@ -0,0 +1,154 @@
import sys
import os
import subprocess
import time
import pytest
from dotenv import load_dotenv

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

from firecrawl import FirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions
from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction

load_dotenv()

API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
API_KEY = os.getenv("TEST_API_KEY")

app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

TEST_URL = 'example.com'

def test_scrape_url_simple():
    response = app.scrape_url(
        TEST_URL,
        formats=["markdown"]
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert "# Example Domain" in response.markdown

def test_scrape_url_all_params_with_extract_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    response = app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "rawHtml")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert hasattr(response, "metadata")
    assert hasattr(response, "changeTracking")

def test_scrape_url_all_params_with_extract_full_page_screenshot():
    location = LocationConfig(country="us", languages=["en"])
    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    response = app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        extract=extract,
        actions=actions,
        change_tracking_options=change_tracking
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "rawHtml")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "extract")
    assert hasattr(response, "metadata")
    assert hasattr(response, "changeTracking")

def test_scrape_url_all_params_with_json_options():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
    actions = [
        WaitAction(type="wait", milliseconds=500),
        ScreenshotAction(type="screenshot", fullPage=True),
        WriteAction(type="write", text="test input"),
        PressAction(type="press", key="Enter"),
        ScrollAction(type="scroll", direction="down"),
        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
    ]
    response = app.scrape_url(
        url=TEST_URL,
        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
        timeout=15000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
        remove_base64_images=True,
        block_ads=True,
        proxy="basic",
        json_options=json_options,
        actions=actions,
        change_tracking_options=change_tracking
    )
    assert response.success
    assert hasattr(response, "markdown")
    assert hasattr(response, "html")
    assert hasattr(response, "rawHtml")
    assert hasattr(response, "links")
    assert hasattr(response, "screenshot")
    assert hasattr(response, "json")
    assert hasattr(response, "metadata")
    assert hasattr(response, "changeTracking")