From f0b0afc5d67ed801611813279d5dda42f30c70cd Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 9 May 2025 18:32:05 -0300 Subject: [PATCH] Enhance Firecrawl SDK: Add LocationConfig to __init__.py and update ScrapeOptions formats to remove 'content' type. Introduce new E2E tests for async and standard scraping functionalities, including batch scraping and URL mapping. --- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 20 +-- .../e2e/async/test_async_batch_scrape.py | 18 ++ .../tests/e2e/async/test_async_crawl.py | 16 ++ .../tests/e2e/async/test_batch_scrape.py | 117 +++++++++++++ .../e2e/async/test_check_crawl_status.py | 16 ++ apps/python-sdk/tests/e2e/async/test_crawl.py | 16 ++ .../tests/e2e/async/test_deep_research.py | 16 ++ .../tests/e2e/async/test_extract.py | 16 ++ .../e2e/async/test_generate_llms_text.py | 16 ++ apps/python-sdk/tests/e2e/async/test_map.py | 40 +++++ .../python-sdk/tests/e2e/async/test_scrape.py | 158 ++++++++++++++++++ .../tests/e2e/test_async_batch_scrape.py | 15 ++ apps/python-sdk/tests/e2e/test_async_crawl.py | 15 ++ .../python-sdk/tests/e2e/test_batch_scrape.py | 15 ++ .../tests/e2e/test_check_crawl_status.py | 15 ++ apps/python-sdk/tests/e2e/test_crawl.py | 15 ++ .../tests/e2e/test_deep_research.py | 15 ++ apps/python-sdk/tests/e2e/test_extract.py | 15 ++ .../tests/e2e/test_generate_llms_text.py | 15 ++ apps/python-sdk/tests/e2e/test_map.py | 36 ++++ apps/python-sdk/tests/e2e/test_scrape.py | 154 +++++++++++++++++ 22 files changed, 750 insertions(+), 11 deletions(-) create mode 100644 apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py create mode 100644 apps/python-sdk/tests/e2e/async/test_async_crawl.py create mode 100644 apps/python-sdk/tests/e2e/async/test_batch_scrape.py create mode 100644 apps/python-sdk/tests/e2e/async/test_check_crawl_status.py create mode 100644 apps/python-sdk/tests/e2e/async/test_crawl.py create mode 
100644 apps/python-sdk/tests/e2e/async/test_deep_research.py create mode 100644 apps/python-sdk/tests/e2e/async/test_extract.py create mode 100644 apps/python-sdk/tests/e2e/async/test_generate_llms_text.py create mode 100644 apps/python-sdk/tests/e2e/async/test_map.py create mode 100644 apps/python-sdk/tests/e2e/async/test_scrape.py create mode 100644 apps/python-sdk/tests/e2e/test_async_batch_scrape.py create mode 100644 apps/python-sdk/tests/e2e/test_async_crawl.py create mode 100644 apps/python-sdk/tests/e2e/test_batch_scrape.py create mode 100644 apps/python-sdk/tests/e2e/test_check_crawl_status.py create mode 100644 apps/python-sdk/tests/e2e/test_crawl.py create mode 100644 apps/python-sdk/tests/e2e/test_deep_research.py create mode 100644 apps/python-sdk/tests/e2e/test_extract.py create mode 100644 apps/python-sdk/tests/e2e/test_generate_llms_text.py create mode 100644 apps/python-sdk/tests/e2e/test_map.py create mode 100644 apps/python-sdk/tests/e2e/test_scrape.py diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 3c7c8b3b..a4548185 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -11,7 +11,7 @@ For more information visit https://github.com/firecrawl/ import logging import os -from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions # noqa +from .firecrawl import FirecrawlApp, AsyncFirecrawlApp, JsonConfig, ScrapeOptions, ChangeTrackingOptions, LocationConfig # noqa __version__ = "2.6.0" diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 6961c44a..1df2fcc9 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -144,7 +144,7 @@ class ChangeTrackingOptions(pydantic.BaseModel): class ScrapeOptions(pydantic.BaseModel): """Parameters for scraping operations.""" - formats: Optional[List[Literal["markdown", "html", "rawHtml", 
"content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None headers: Optional[Dict[str, str]] = None includeTags: Optional[List[str]] = None excludeTags: Optional[List[str]] = None @@ -448,7 +448,7 @@ class FirecrawlApp: self, url: str, *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, only_main_content: Optional[bool] = None, @@ -470,7 +470,7 @@ class FirecrawlApp: Args: url (str): Target URL to scrape - formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) + formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) include_tags (Optional[List[str]]): HTML tags to include exclude_tags (Optional[List[str]]): HTML tags to exclude only_main_content (Optional[bool]): Extract main content only @@ -1179,7 +1179,7 @@ class FirecrawlApp: self, urls: List[str], *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, headers: Optional[Dict[str, str]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, @@ 
-1313,7 +1313,7 @@ class FirecrawlApp: self, urls: List[str], *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, headers: Optional[Dict[str, str]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, @@ -1445,7 +1445,7 @@ class FirecrawlApp: self, urls: List[str], *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, headers: Optional[Dict[str, str]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, @@ -2844,7 +2844,7 @@ class AsyncFirecrawlApp(FirecrawlApp): self, url: str, *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json", "changeTracking"]]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, only_main_content: Optional[bool] = None, @@ -2865,7 +2865,7 @@ class AsyncFirecrawlApp(FirecrawlApp): Args: url (str): Target URL to scrape - formats (Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) + formats (Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]]): Content types to retrieve (markdown/html/etc) include_tags (Optional[List[str]]): 
HTML tags to include exclude_tags (Optional[List[str]]): HTML tags to exclude only_main_content (Optional[bool]): Extract main content only @@ -2972,7 +2972,7 @@ class AsyncFirecrawlApp(FirecrawlApp): self, urls: List[str], *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, headers: Optional[Dict[str, str]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, @@ -3111,7 +3111,7 @@ class AsyncFirecrawlApp(FirecrawlApp): self, urls: List[str], *, - formats: Optional[List[Literal["markdown", "html", "rawHtml", "content", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, + formats: Optional[List[Literal["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage", "extract", "json"]]] = None, headers: Optional[Dict[str, str]] = None, include_tags: Optional[List[str]] = None, exclude_tags: Optional[List[str]] = None, diff --git a/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py new file mode 100644 index 00000000..ce366a40 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py @@ -0,0 +1,18 @@ +import os +import pytest +import sys + +from dotenv import load_dotenv +from firecrawl.firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +@pytest.mark.asyncio +async def test_async_batch_scrape_urls_async_placeholder(): + # TODO: Implement async E2E tests for async_batch_scrape_urls + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_async_crawl.py 
b/apps/python-sdk/tests/e2e/async/test_async_crawl.py new file mode 100644 index 00000000..02d05f01 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_async_crawl.py @@ -0,0 +1,16 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +@pytest.mark.asyncio +async def test_async_crawl_url_async_placeholder(): + # TODO: Implement async E2E tests for async_crawl_url + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_batch_scrape.py b/apps/python-sdk/tests/e2e/async/test_batch_scrape.py new file mode 100644 index 00000000..a5dde5d5 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_batch_scrape.py @@ -0,0 +1,117 @@ +import sys +import os +import subprocess +import time +import pytest +from dotenv import load_dotenv + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) +from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction + +load_dotenv() + +API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +TEST_URLS = [ + "https://example.com", + "https://iana.org" +] + +@pytest.mark.asyncio +async def test_batch_scrape_urls_async_simple(): + result = await app.batch_scrape_urls( + TEST_URLS, + formats=["markdown"] + ) + assert result.success + assert hasattr(result, "data") + assert len(result.data) == len(TEST_URLS) + assert any("Example Domain" in doc.markdown for doc in result.data) + +@pytest.mark.asyncio +async def test_batch_scrape_urls_async_all_params_with_extract(): + location = LocationConfig(country="us", languages=["en"]) + 
extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}} + extract = JsonConfig(prompt="Extract the title", schema=extract_schema) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();") + ] + result = await app.batch_scrape_urls( + TEST_URLS, + formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + extract=extract, + actions=actions + ) + assert result.success + assert hasattr(result, "data") + + assert len(result.data) == len(TEST_URLS) + for doc in result.data: + assert hasattr(doc, "markdown") + assert hasattr(doc, "html") + assert hasattr(doc, "rawHtml") + assert hasattr(doc, "links") + assert hasattr(doc, "screenshot") + assert hasattr(doc, "extract") + assert hasattr(doc, "metadata") + +@pytest.mark.asyncio +async def test_batch_scrape_urls_async_all_params_with_json_options(): + location = LocationConfig(country="us", languages=["en"]) + json_schema = {"type": "object", "properties": {"title": {"type": "string"}}} + json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; 
getTitle();") + ] + result = await app.batch_scrape_urls( + TEST_URLS, + formats=["markdown", "html", "rawHtml", "links", "screenshot", "json"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + json_options=json_options, + actions=actions + ) + assert result.success + assert hasattr(result, "data") + assert len(result.data) == len(TEST_URLS) + for doc in result.data: + assert hasattr(doc, "markdown") + assert hasattr(doc, "html") + assert hasattr(doc, "rawHtml") + assert hasattr(doc, "links") + assert hasattr(doc, "screenshot") + assert hasattr(doc, "json") + assert hasattr(doc, "metadata") \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py b/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py new file mode 100644 index 00000000..ac929fde --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_check_crawl_status.py @@ -0,0 +1,16 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +@pytest.mark.asyncio +async def test_check_crawl_status_async_placeholder(): + # TODO: Implement async E2E tests for check_crawl_status + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_crawl.py b/apps/python-sdk/tests/e2e/async/test_crawl.py new file mode 100644 index 00000000..fec46b1a --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_crawl.py @@ -0,0 +1,16 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = 
AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +@pytest.mark.asyncio +async def test_crawl_url_async_placeholder(): + # TODO: Implement async E2E tests for crawl_url + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_deep_research.py b/apps/python-sdk/tests/e2e/async/test_deep_research.py new file mode 100644 index 00000000..66cb51d5 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_deep_research.py @@ -0,0 +1,16 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +@pytest.mark.asyncio +async def test_deep_research_async_placeholder(): + # TODO: Implement async E2E tests for deep_research + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_extract.py b/apps/python-sdk/tests/e2e/async/test_extract.py new file mode 100644 index 00000000..4a059f13 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_extract.py @@ -0,0 +1,16 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +@pytest.mark.asyncio +async def test_extract_async_placeholder(): + # TODO: Implement async E2E tests for extract + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py b/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py new file mode 100644 index 00000000..00d6b987 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_generate_llms_text.py @@ -0,0 +1,16 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = 
os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +@pytest.mark.asyncio +async def test_generate_llms_text_async_placeholder(): + # TODO: Implement async E2E tests for generate_llms_text + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_map.py b/apps/python-sdk/tests/e2e/async/test_map.py new file mode 100644 index 00000000..b034e9f0 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_map.py @@ -0,0 +1,40 @@ +import sys +import os +import subprocess +import time +import pytest +from dotenv import load_dotenv + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../'))) +from firecrawl.firecrawl import AsyncFirecrawlApp + +load_dotenv() + +API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +TEST_URL = "example.com" + +@pytest.mark.asyncio +async def test_map_url_async_simple(): + result = await app.map_url(TEST_URL) + assert result is not None + assert result.success + assert hasattr(result, "links") + assert any("example.com" in url for url in result.links) + +@pytest.mark.asyncio +async def test_map_url_async_all_params(): + result = await app.map_url( + TEST_URL, + search="test", + sitemap_only=False, + include_subdomains=False, + limit=10 + ) + assert result is not None + assert result.success + assert hasattr(result, "links") + assert any("example.com" in url for url in result.links) \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/async/test_scrape.py b/apps/python-sdk/tests/e2e/async/test_scrape.py new file mode 100644 index 00000000..ae0e0a46 --- /dev/null +++ b/apps/python-sdk/tests/e2e/async/test_scrape.py @@ -0,0 +1,158 @@ +import sys +import os +import subprocess +import time +import pytest +from dotenv import load_dotenv + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 
'../../../'))) +from firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions +from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction + +load_dotenv() + +API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002" +API_KEY = os.getenv("TEST_API_KEY") + +app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY) + +TEST_URL = 'example.com' + +@pytest.mark.asyncio +async def test_scrape_url_async_simple(): + response = await app.scrape_url( + TEST_URL, + formats=["markdown"] + ) + assert response.success + assert hasattr(response, "markdown") + assert "# Example Domain" in response.markdown + +@pytest.mark.asyncio +async def test_scrape_url_async_all_params_with_extract_screenshot(): + location = LocationConfig(country="us", languages=["en"]) + extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}} + extract = JsonConfig(prompt="Extract the title", schema=extract_schema) + change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();") + ] + response = await app.scrape_url( + url=TEST_URL, + formats=["markdown", "html", "rawHtml", "links", "screenshot", "extract", "changeTracking"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + extract=extract, + actions=actions, + change_tracking_options=change_tracking + ) + + assert 
response.success + assert hasattr(response, "markdown") + assert hasattr(response, "html") + assert hasattr(response, "rawHtml") + assert hasattr(response, "links") + assert hasattr(response, "screenshot") + assert hasattr(response, "extract") + assert hasattr(response, "metadata") + assert hasattr(response, "changeTracking") + +@pytest.mark.asyncio +async def test_scrape_url_async_all_params_with_extract_full_page_screenshot(): + location = LocationConfig(country="us", languages=["en"]) + extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}} + extract = JsonConfig(prompt="Extract the title", schema=extract_schema) + change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=extract_schema) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();") + ] + response = await app.scrape_url( + url=TEST_URL, + formats=["markdown", "html", "rawHtml", "links", "screenshot@fullPage", "extract", "changeTracking"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + extract=extract, + actions=actions, + change_tracking_options=change_tracking + ) + + assert response.success + assert hasattr(response, "markdown") + assert hasattr(response, "html") + assert hasattr(response, "rawHtml") + assert hasattr(response, "links") + assert hasattr(response, "screenshot") + assert hasattr(response, "extract") + assert hasattr(response, "metadata") + assert hasattr(response, "changeTracking") + +@pytest.mark.asyncio +async 
def test_scrape_url_async_all_params_with_json_options(): + location = LocationConfig(country="us", languages=["en"]) + json_schema = {"type": "object", "properties": {"title": {"type": "string"}}} + json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema) + change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=json_schema) + actions = [ + WaitAction(type="wait", milliseconds=500), + ScreenshotAction(type="screenshot", fullPage=True), + WriteAction(type="write", text="test input"), + PressAction(type="press", key="Enter"), + ScrollAction(type="scroll", direction="down"), + ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();") + ] + response = await app.scrape_url( + url=TEST_URL, + formats=["markdown", "html", "rawHtml", "links", "screenshot", "json", "changeTracking"], + include_tags=["h1", "p"], + exclude_tags=["footer"], + only_main_content=True, + wait_for=1000, + timeout=15000, + location=location, + mobile=True, + skip_tls_verification=True, + remove_base64_images=True, + block_ads=True, + proxy="basic", + json_options=json_options, + actions=actions, + change_tracking_options=change_tracking + ) + + assert response.success + assert hasattr(response, "markdown") + assert hasattr(response, "html") + assert hasattr(response, "rawHtml") + assert hasattr(response, "links") + assert hasattr(response, "screenshot") + assert hasattr(response, "json") + assert hasattr(response, "metadata") + assert hasattr(response, "changeTracking") \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py new file mode 100644 index 00000000..d3fc8c6c --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL 
= "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +def test_async_batch_scrape_urls_placeholder(): + # TODO: Implement E2E tests for async_batch_scrape_urls + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_async_crawl.py b/apps/python-sdk/tests/e2e/test_async_crawl.py new file mode 100644 index 00000000..a7b3dc06 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_async_crawl.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +def test_async_crawl_url_placeholder(): + # TODO: Implement E2E tests for async_crawl_url + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_batch_scrape.py b/apps/python-sdk/tests/e2e/test_batch_scrape.py new file mode 100644 index 00000000..ff8aa0af --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_batch_scrape.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +def test_batch_scrape_urls_placeholder(): + # TODO: Implement E2E tests for batch_scrape_urls + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_check_crawl_status.py b/apps/python-sdk/tests/e2e/test_check_crawl_status.py new file mode 100644 index 00000000..22b1a4d4 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_check_crawl_status.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, 
api_key=API_KEY) + +def test_check_crawl_status_placeholder(): + # TODO: Implement E2E tests for check_crawl_status + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_crawl.py b/apps/python-sdk/tests/e2e/test_crawl.py new file mode 100644 index 00000000..bf758919 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_crawl.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +def test_crawl_url_placeholder(): + # TODO: Implement E2E tests for crawl_url + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_deep_research.py b/apps/python-sdk/tests/e2e/test_deep_research.py new file mode 100644 index 00000000..463aea64 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_deep_research.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +def test_deep_research_placeholder(): + # TODO: Implement E2E tests for deep_research + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_extract.py b/apps/python-sdk/tests/e2e/test_extract.py new file mode 100644 index 00000000..4e925041 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_extract.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +def test_extract_placeholder(): + # TODO: Implement E2E tests for extract + assert True \ No newline at end of file diff --git 
a/apps/python-sdk/tests/e2e/test_generate_llms_text.py b/apps/python-sdk/tests/e2e/test_generate_llms_text.py new file mode 100644 index 00000000..fe9928e1 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_generate_llms_text.py @@ -0,0 +1,15 @@ +import os +import pytest +from dotenv import load_dotenv +from firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = "https://api.firecrawl.dev" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +def test_generate_llms_text_placeholder(): + # TODO: Implement E2E tests for generate_llms_text + assert True \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_map.py b/apps/python-sdk/tests/e2e/test_map.py new file mode 100644 index 00000000..da9bb4d5 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_map.py @@ -0,0 +1,36 @@ +import sys +import os +import pytest +from dotenv import load_dotenv + +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../'))) +from firecrawl.firecrawl import FirecrawlApp + +load_dotenv() + +API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002" +API_KEY = os.getenv("TEST_API_KEY") + +app = FirecrawlApp(api_url=API_URL, api_key=API_KEY) + +TEST_URL = "example.com" + +def test_map_url_simple(): + result = app.map_url(TEST_URL) + assert result is not None + assert result.success + assert hasattr(result, "links") + assert any("example.com" in url for url in result.links) + +def test_map_url_all_params(): + result = app.map_url( + TEST_URL, + search="test", + sitemap_only=False, + include_subdomains=False, + limit=10 + ) + assert result is not None + assert result.success + assert hasattr(result, "links") + assert any("example.com" in url for url in result.links) \ No newline at end of file diff --git a/apps/python-sdk/tests/e2e/test_scrape.py b/apps/python-sdk/tests/e2e/test_scrape.py new file mode 100644 index 00000000..f20a80f7 --- /dev/null +++ b/apps/python-sdk/tests/e2e/test_scrape.py @@ -0,0 
+1,116 @@
+import sys
+import os
+import subprocess
+import time
+import pytest
+from dotenv import load_dotenv
+
+# Put the SDK root (two directories up) first on sys.path so the checkout's
+# firecrawl package is the one imported.
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))
+from firecrawl import FirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions
+from firecrawl.firecrawl import WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+
+load_dotenv()
+
+API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_KEY = os.getenv("TEST_API_KEY")
+
+app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
+
+TEST_URL = 'example.com'
+
+
+def _scrape_with_all_params(screenshot_format, llm_format, llm_kwarg, llm_prompt):
+    """Scrape TEST_URL with every shared option set and return the response.
+
+    Only the screenshot format, the structured-output format and the keyword
+    that carries its JsonConfig (``extract`` or ``json_options``) differ
+    between the "all params" tests, so they are the only arguments.
+    """
+    schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    actions = [
+        WaitAction(type="wait", milliseconds=500),
+        ScreenshotAction(type="screenshot", fullPage=True),
+        WriteAction(type="write", text="test input"),
+        PressAction(type="press", key="Enter"),
+        ScrollAction(type="scroll", direction="down"),
+        ExecuteJavascriptAction(type="executeJavascript", script="function getTitle() { return document.title; }; getTitle();"),
+    ]
+    change_tracking = ChangeTrackingOptions(modes=["git-diff", "json"], prompt="Track changes", schema=schema)
+    return app.scrape_url(
+        url=TEST_URL,
+        formats=["markdown", "html", "rawHtml", "links", screenshot_format, llm_format, "changeTracking"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=15000,
+        location=LocationConfig(country="us", languages=["en"]),
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        actions=actions,
+        change_tracking_options=change_tracking,
+        **{llm_kwarg: JsonConfig(prompt=llm_prompt, schema=schema)},
+    )
+
+
+def _assert_all_fields_present(response, llm_field):
+    """Assert the scrape succeeded and exposes every requested output field."""
+    assert response.success
+    # NOTE(review): hasattr may pass even when a field is None if the response
+    # model declares these attributes; consider asserting "is not None" once
+    # confirmed against the SDK's response type.
+    expected = (
+        "markdown", "html", "rawHtml", "links",
+        "screenshot", llm_field, "metadata", "changeTracking",
+    )
+    for field in expected:
+        assert hasattr(response, field), f"response has no {field!r} attribute"
+
+
+def test_scrape_url_simple():
+    """Scrape TEST_URL as markdown and check the page heading is present."""
+    response = app.scrape_url(
+        TEST_URL,
+        formats=["markdown"]
+    )
+    assert response.success
+    assert hasattr(response, "markdown")
+    assert "# Example Domain" in response.markdown
+
+
+def test_scrape_url_all_params_with_extract_screenshot():
+    """All options with the "extract" and "screenshot" formats."""
+    response = _scrape_with_all_params(
+        screenshot_format="screenshot",
+        llm_format="extract",
+        llm_kwarg="extract",
+        llm_prompt="Extract the title",
+    )
+    _assert_all_fields_present(response, "extract")
+
+
+def test_scrape_url_all_params_with_extract_full_page_screenshot():
+    """All options with the "extract" and "screenshot@fullPage" formats."""
+    response = _scrape_with_all_params(
+        screenshot_format="screenshot@fullPage",
+        llm_format="extract",
+        llm_kwarg="extract",
+        llm_prompt="Extract the title",
+    )
+    _assert_all_fields_present(response, "extract")
+
+
+def test_scrape_url_all_params_with_json_options():
+    """All options with the "json" and "screenshot" formats."""
+    response = _scrape_with_all_params(
+        screenshot_format="screenshot",
+        llm_format="json",
+        llm_kwarg="json_options",
+        llm_prompt="Extract the title as JSON",
+    )
+    _assert_all_fields_present(response, "json")