Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 06:25:56 +08:00)
Enhance Firecrawl SDK: add the LocationConfig export to __init__.py and remove the 'content' type from ScrapeOptions formats. Introduce new E2E tests for the async and standard scraping clients, including batch scraping and URL mapping.
This commit is contained in:
commit f0b0afc5d6
parent f17b1cdc86
apps/python-sdk/firecrawl/__init__.py
@@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/
 import logging
 import os
 
-from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa
+from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions, LocationConfig # noqa
 
 __version__ = "2.6.0"
 
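With LocationConfig re-exported from the package root, callers can import it directly from firecrawl instead of reaching into firecrawl.firecrawl. A minimal usage sketch, mirroring the new E2E tests introduced below (the key comes from an environment variable, as in those tests):

    import os
    from firecrawl import FirecrawlApp, LocationConfig

    app = FirecrawlApp(api_url="https://api.firecrawl.dev", api_key=os.getenv("TEST_API_KEY"))

    # Scrape as if browsing from the US with English preferred
    location = LocationConfig(country="us", languages=["en"])
    response = app.scrape_url("https://example.com", formats=["markdown"], location=location)
    print(response.markdown)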
apps/python-sdk/firecrawl/firecrawl.py
@@ -144,7 +144,7 @@ class ChangeTrackingOptions(pydantic.BaseModel):
 
 class ScrapeOptions(pydantic.BaseModel):
     """Parameters for scraping operations."""
-    formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
+    formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None
     headers: Optional[Dict[str, str]] = None
     includeTags: Optional[List[str]] = None
     excludeTags: Optional[List[str]] = None
@@ -448,7 +448,7 @@ class FirecrawlApp:
         self,
         url: str,
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
         only_main_content: Optional[bool] = None,
@@ -470,7 +470,7 @@ class FirecrawlApp:
 
         Args:
             url (str): Target URL to scrape
-            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
             include_tags (Optional[List[str]]): HTML tags to include
             exclude_tags (Optional[List[str]]): HTML tags to exclude
             only_main_content (Optional[bool]): Extract main content only
@@ -1179,7 +1179,7 @@ class FirecrawlApp:
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -1313,7 +1313,7 @@ class FirecrawlApp:
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -1445,7 +1445,7 @@ class FirecrawlApp:
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -2844,7 +2844,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         self,
         url: str,
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
         only_main_content: Optional[bool] = None,
@@ -2865,7 +2865,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
         Args:
             url (str): Target URL to scrape
-            formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
+            formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc)
             include_tags (Optional[List[str]]): HTML tags to include
             exclude_tags (Optional[List[str]]): HTML tags to exclude
             only_main_content (Optional[bool]): Extract main content only
@@ -2972,7 +2972,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
@@ -3111,7 +3111,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
         self,
         urls: List[str],
         *,
-        formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
+        formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None,
         headers: Optional[Dict[str, str]] = None,
         include_tags: Optional[List[str]] = None,
         exclude_tags: Optional[List[str]] = None,
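Since ScrapeOptions is a pydantic model, dropping "content" from the formats Literal means a stray "content" entry now fails validation when the model is constructed; "markdown" (or "rawHtml") is the closest replacement. A minimal migration sketch:

    from firecrawl import ScrapeOptions

    # Before this commit, "content" was an accepted format literal:
    #   ScrapeOptions(formats=["content"])
    # After it, pydantic raises a ValidationError for "content".
    opts = ScrapeOptions(formats=["markdown", "rawHtml"])  # accepted literals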
apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+import os
+import pytest
+import sys
+
+from dotenv import load_dotenv
+from firecrawl.firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_async_batch_scrape_urls_async_placeholder():
+    # TODO: Implement async E2E tests for async_batch_scrape_urls
+    assert True
apps/python-sdk/tests/e2e/async/test_async_crawl.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_async_crawl_url_async_placeholder():
+    # TODO: Implement async E2E tests for async_crawl_url
+    assert True
apps/python-sdk/tests/e2e/async/test_batch_scrape.py (new file, 117 lines)
@@ -0,0 +1,117 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URLS = [
+    "https://example.com",
+    "https://iana.org"
+]
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_async_simple():
+    result = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown"]
+    )
+    assert result.success
+    assert hasattr(result, "data")
+    assert len(result.data) == len(TEST_URLS)
+    assert any("Example Domain" in doc.markdown for doc in result.data)
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_async_all_params_with_extract():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    result = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions
+    )
+    assert result.success
+    assert hasattr(result, "data")
+
+    assert len(result.data) == len(TEST_URLS)
+    for doc in result.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "rawHtml")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "extract")
+        assert hasattr(doc, "metadata")
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_async_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    result = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert result.success
+    assert hasattr(result, "data")
+    assert len(result.data) == len(TEST_URLS)
+    for doc in result.data:
+        assert hasattr(doc, "markdown")
+        assert hasattr(doc, "html")
+        assert hasattr(doc, "rawHtml")
+        assert hasattr(doc, "links")
+        assert hasattr(doc, "screenshot")
+        assert hasattr(doc, "json")
+        assert hasattr(doc, "metadata")
apps/python-sdk/tests/e2e/async/test_check_crawl_status.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_check_crawl_status_async_placeholder():
+    # TODO: Implement async E2E tests for check_crawl_status
+    assert True
apps/python-sdk/tests/e2e/async/test_crawl.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_crawl_url_async_placeholder():
+    # TODO: Implement async E2E tests for crawl_url
+    assert True
apps/python-sdk/tests/e2e/async/test_deep_research.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_deep_research_async_placeholder():
+    # TODO: Implement async E2E tests for deep_research
+    assert True
apps/python-sdk/tests/e2e/async/test_extract.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_extract_async_placeholder():
+    # TODO: Implement async E2E tests for extract
+    assert True
apps/python-sdk/tests/e2e/async/test_generate_llms_text.py (new file, 16 lines)
@@ -0,0 +1,16 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+@pytest.mark.asyncio
+async def test_generate_llms_text_async_placeholder():
+    # TODO: Implement async E2E tests for generate_llms_text
+    assert True
apps/python-sdk/tests/e2e/async/test_map.py (new file, 40 lines)
@@ -0,0 +1,40 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl.firecrawl import AsyncFirecrawlApp
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = "example.com"
+
+@pytest.mark.asyncio
+async def test_map_url_async_simple():
+    result = await app.map_url(TEST_URL)
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
+
+@pytest.mark.asyncio
+async def test_map_url_async_all_params():
+    result = await app.map_url(
+        TEST_URL,
+        search="test",
+        sitemap_only=False,
+        include_subdomains=False,
+        limit=10
+    )
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
apps/python-sdk/tests/e2e/async/test_scrape.py (new file, 158 lines)
@@ -0,0 +1,158 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
+from firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions
+from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = 'example.com'
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_simple():
+    response = await app.scrape_url(
+        TEST_URL,
+        formats=["markdown"]
+    )
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert "# Example Domain" in response.markdown
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_all_params_with_extract_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = await app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_all_params_with_extract_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = await app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+@pytest.mark.asyncio
+async def test_scrape_url_async_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = await app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "json")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
apps/python-sdk/tests/e2e/test_async_batch_scrape.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_async_batch_scrape_urls_placeholder():
+    # TODO: Implement E2E tests for async_batch_scrape_urls
+    assert True
apps/python-sdk/tests/e2e/test_async_crawl.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_async_crawl_url_placeholder():
+    # TODO: Implement E2E tests for async_crawl_url
+    assert True
apps/python-sdk/tests/e2e/test_batch_scrape.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_batch_scrape_urls_placeholder():
+    # TODO: Implement E2E tests for batch_scrape_urls
+    assert True
apps/python-sdk/tests/e2e/test_check_crawl_status.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_check_crawl_status_placeholder():
+    # TODO: Implement E2E tests for check_crawl_status
+    assert True
apps/python-sdk/tests/e2e/test_crawl.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_crawl_url_placeholder():
+    # TODO: Implement E2E tests for crawl_url
+    assert True
apps/python-sdk/tests/e2e/test_deep_research.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_deep_research_placeholder():
+    # TODO: Implement E2E tests for deep_research
+    assert True
apps/python-sdk/tests/e2e/test_extract.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_extract_placeholder():
+    # TODO: Implement E2E tests for extract
+    assert True
apps/python-sdk/tests/e2e/test_generate_llms_text.py (new file, 15 lines)
@@ -0,0 +1,15 @@
+import os
+import pytest
+from dotenv import load_dotenv
+from firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = "https://api.firecrawl.dev"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+def test_generate_llms_text_placeholder():
+    # TODO: Implement E2E tests for generate_llms_text
+    assert True
apps/python-sdk/tests/e2e/test_map.py (new file, 36 lines)
@@ -0,0 +1,36 @@
+import sys
+import os
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl.firecrawl import FirecrawlApp
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = "example.com"
+
+def test_map_url_simple():
+    result = app.map_url(TEST_URL)
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
+
+def test_map_url_all_params():
+    result = app.map_url(
+        TEST_URL,
+        search="test",
+        sitemap_only=False,
+        include_subdomains=False,
+        limit=10
+    )
+    assert result is not None
+    assert result.success
+    assert hasattr(result, "links")
+    assert any("example.com" in url for url in result.links)
apps/python-sdk/tests/e2e/test_scrape.py (new file, 154 lines)
@@ -0,0 +1,154 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl import FirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions
+from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = 'example.com'
+
+def test_scrape_url_simple():
+    response = app.scrape_url(
+        TEST_URL,
+        formats=["markdown"]
+    )
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert "# Example Domain" in response.markdown
+
+def test_scrape_url_all_params_with_extract_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+def test_scrape_url_all_params_with_extract_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        extract=extract,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "extract")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
+
+def test_scrape_url_all_params_with_json_options():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema)
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();")
+    ]
+    response = app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions,
+        change_tracking_options=change_tracking
+    )
+
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert hasattr(response, "html")
+    assert hasattr(response, "rawHtml")
+    assert hasattr(response, "links")
+    assert hasattr(response, "screenshot")
+    assert hasattr(response, "json")
+    assert hasattr(response, "metadata")
+    assert hasattr(response, "changeTracking")
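Note on running these suites: all of the new tests load credentials via python-dotenv, with TEST_API_KEY required; the implemented scrape/map/batch tests also honor TEST_API_URL (falling back to http://localhost:3002), while the placeholder suites point at https://api.firecrawl.dev. The async tests are marked @pytest.mark.asyncio, so pytest-asyncio must be installed; a typical invocation from apps/python-sdk would be `pytest tests/e2e`, assuming the package's test dependencies are present.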