mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 18:45:54 +08:00
sdk(v3): batch scrape sync and async -- this test is not reliable because of a bug in batch scrape
parent ab1e244693
commit 2e14859de0
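The change set below touches three files: the v3 SDK's async batch-scrape status polling, and the async and sync e2e test suites. For orientation, a minimal sketch of the two call shapes the tests exercise, built only from names that appear in this diff (a valid TEST_API_KEY is assumed; signatures in the released SDK may differ):

import asyncio
import os
from firecrawl.firecrawl import FirecrawlApp, AsyncFirecrawlApp

API_KEY = os.getenv("TEST_API_KEY")

# Sync: batch_scrape_urls blocks, polling until the batch job finishes.
app = FirecrawlApp(api_url="https://api.firecrawl.dev", api_key=API_KEY)
result = app.batch_scrape_urls(["https://example.com"], formats=["markdown"])
print(result.success, len(result.data))

# Async: AsyncFirecrawlApp exposes the same method as a coroutine.
async def main():
    async_app = AsyncFirecrawlApp(api_url="https://api.firecrawl.dev", api_key=API_KEY)
    result = await async_app.batch_scrape_urls(["https://example.com"], formats=["markdown"])
    print(result.success, len(result.data))

asyncio.run(main())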
@@ -3355,68 +3355,48 @@ class AsyncFirecrawlApp(FirecrawlApp):
         headers = self._prepare_headers()
         response = await self._async_get_request(f'{self.api_url}{endpoint}', headers)
 
-        if response.get('success'):
-            try:
-                status_data = response.json()
-            except:
-                raise Exception(f'Failed to parse Firecrawl response as JSON.')
-
-            while status_data['status'] != 'completed':
-                time.sleep(poll_interval)
+        try:
+            while response.get('status') != 'completed':
+                await asyncio.sleep(poll_interval)
                 response = await self._async_get_request(f'{self.api_url}{endpoint}', headers)
-                if response.get('success'):
-                    status_data = response.get('data')
-                else:
-                    self._handle_error(response, 'check batch scrape status')
 
-            if 'data' in status_data:
-                data = status_data['data']
-                while 'next' in status_data:
-                    if len(status_data['data']) == 0:
+            if 'data' in response:
+                data = response.get('data')
+                while 'next' in response:
+                    if len(response.get('data')) == 0:
                         break
-                    next_url = status_data.get('next')
+                    next_url = response.get('next')
                     if not next_url:
                         logger.warning("Expected 'next' URL is missing.")
                         break
                     try:
-                        status_response = await self._async_get_request(next_url, headers)
-                        if not status_response.get('success'):
-                            logger.error(f"Failed to fetch next page: {status_response.status_code}")
-                            break
-                        try:
-                            next_data = status_response.json()
-                        except:
-                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
-                        data.extend(next_data.get('data', []))
-                        status_data = next_data
+                        response = await self._async_get_request(next_url, headers)
+                        next_data = response.get('data')
+                        if next_data:
+                            data.extend(next_data.get('data', []))
                     except Exception as e:
                         logger.error(f"Error during pagination request: {e}")
                         break
 
             # Apply format transformations to each document in the data
             if data:
                 for document in data:
                     scrape_formats_response_transform(document)
 
             response = {
-                'status': status_data.get('status'),
-                'total': status_data.get('total'),
-                'completed': status_data.get('completed'),
-                'credits_used': status_data.get('creditsUsed'),
-                'expires_at': status_data.get('expiresAt'),
+                'status': response.get('status'),
+                'total': response.get('total'),
+                'completed': response.get('completed'),
+                'credits_used': response.get('creditsUsed'),
+                'expires_at': response.get('expiresAt'),
                 'data': data,
-                'next': status_data.get('next'),
-                'error': status_data.get('error')
+                'next': response.get('next'),
+                'error': response.get('error')
             }
 
-            if 'error' in status_data:
-                response['error'] = status_data['error']
-
-            if 'next' in status_data:
-                response['next'] = status_data['next']
-
             return BatchScrapeStatusResponse(**response)
-        else:
+        except Exception as e:
             await self._handle_error(response, 'check batch scrape status')
 
     async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
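The rewritten method above collapses to one pattern: poll the status endpoint until the job reports completed, then walk the 'next' cursor and accumulate every page's data, with any failure funneled into the single except branch that calls _handle_error. A standalone sketch of that loop, with fetch standing in (hypothetically) for the SDK's _async_get_request plus header handling:

import asyncio
from typing import Awaitable, Callable

async def poll_and_paginate(
    fetch: Callable[[str], Awaitable[dict]],  # hypothetical stand-in for _async_get_request
    status_url: str,
    poll_interval: float = 2.0,
) -> list:
    # Poll until the job reports completion.
    response = await fetch(status_url)
    while response.get('status') != 'completed':
        await asyncio.sleep(poll_interval)
        response = await fetch(status_url)

    # Follow the 'next' cursor, accumulating each page's documents.
    data = response.get('data') or []
    while response.get('next'):
        response = await fetch(response['next'])
        page = response.get('data')
        if not page:
            break
        data.extend(page)
    return data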
@@ -1,100 +1,76 @@
-import sys
 import os
-import subprocess
-import time
 import pytest
 from dotenv import load_dotenv
+import sys
 
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
-from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
 
 load_dotenv()
 
-API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_URL = "https://api.firecrawl.dev"
 API_KEY = os.getenv("TEST_API_KEY")
 
 app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
 TEST_URLS = [
-    "https://example.com",
-    "https://iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination"
 ]
 
 @pytest.mark.asyncio
-async def test_batch_scrape_urls_async_simple():
-    result = await app.batch_scrape_urls(
+async def test_batch_scrape_urls_simple():
+    response = await app.batch_scrape_urls(
         TEST_URLS,
         formats=["markdown"]
     )
-    assert result.success
-    assert hasattr(result, "data")
-    assert len(result.data) == len(TEST_URLS)
-    assert any("Example Domain" in doc.markdown for doc in result.data)
 
-@pytest.mark.asyncio
-async def test_batch_scrape_urls_async_all_params_with_extract():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    result = await app.batch_scrape_urls(
-        TEST_URLS,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions
-    )
-    assert result.success
-    assert hasattr(result, "data")
-
-    assert len(result.data) == len(TEST_URLS)
-    for doc in result.data:
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+
+    # Content assertions for each document
+    for doc in response.data:
         assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
-        assert hasattr(doc, "raw_html")
-        assert hasattr(doc, "links")
-        assert hasattr(doc, "screenshot")
-        assert hasattr(doc, "extract")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
         assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+    # Check that we got content from both test pages
+    markdown_contents = [doc.markdown for doc in response.data]
+    assert any("This page is used for end-to-end (e2e) testing with Firecrawl." in content for content in markdown_contents)
+    assert any("Numbered Pagination" in content for content in markdown_contents)
 
 @pytest.mark.asyncio
-async def test_batch_scrape_urls_async_all_params_with_json_options():
+async def test_batch_scrape_urls_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
-    result = await app.batch_scrape_urls(
+    response = await app.batch_scrape_urls(
         TEST_URLS,
         formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
         include_tags=["h1", "p"],
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,
@@ -104,14 +80,134 @@ async def test_batch_scrape_urls_async_all_params_with_json_options():
         json_options=json_options,
         actions=actions
     )
-    assert result.success
-    assert hasattr(result, "data")
-    assert len(result.data) == len(TEST_URLS)
-    for doc in result.data:
+
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
         assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
         assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
         assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
         assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions
         assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
         assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    response = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
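The assertions in these tests lean on the SDK returning typed document objects rather than raw dicts. A rough stand-in for the shape they assume, with fields taken directly from the assertions above (the real SDK classes are richer; this sketch is illustrative only):

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Document:  # illustrative stand-in for the SDK's document model
    markdown: Optional[str] = None      # set when "markdown" is requested
    html: Optional[str] = None          # cleaned HTML
    raw_html: Optional[str] = None      # unprocessed HTML
    links: Optional[list] = None        # URLs found on the page
    screenshot: Optional[str] = None    # https:// URL of the stored screenshot
    json: Optional[dict] = None         # structured output from json_options
    metadata: dict = field(default_factory=dict)  # includes the source 'url'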
@@ -14,94 +14,61 @@ API_KEY = os.getenv("TEST_API_KEY")
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
 TEST_URLS = [
-    "https://example.com",
-    "https://www.iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination"
 ]
 
-def wait_for_batch_completion(app, job_id, timeout=60):
-    for _ in range(timeout):
-        status = app.check_batch_scrape_status(job_id)
-        if status.status == "completed":
-            return status
-        import time; time.sleep(1)
-    raise TimeoutError("Batch scrape did not complete in time")
-
 def test_batch_scrape_urls_simple():
-    job = app.async_batch_scrape_urls(
+    response = app.batch_scrape_urls(
         TEST_URLS,
         formats=["markdown"]
     )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
-    assert any("Example Domain" in doc.markdown for doc in status.data)
-    assert any("Internet Assigned Numbers Authority" in doc.markdown for doc in status.data)
 
-def test_batch_scrape_urls_all_params_with_extract():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    job = app.async_batch_scrape_urls(
-        TEST_URLS,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions
-    )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
-    for doc in status.data:
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+
+    # Content assertions for each document
+    for doc in response.data:
         assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
-        assert hasattr(doc, "raw_html")
-        assert hasattr(doc, "links")
-        assert hasattr(doc, "screenshot")
-        assert hasattr(doc, "extract")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
         assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+    # Check that we got content from both test pages
+    markdown_contents = [doc.markdown for doc in response.data]
+    assert any("This page is used for end-to-end (e2e) testing with Firecrawl." in content for content in markdown_contents)
+    assert any("Numbered Pagination" in content for content in markdown_contents)
 
 def test_batch_scrape_urls_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
-    job = app.async_batch_scrape_urls(
+    response = app.batch_scrape_urls(
         TEST_URLS,
         formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
         include_tags=["h1", "p"],
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,
@@ -111,16 +78,133 @@ def test_batch_scrape_urls_all_params_with_json_options():
         json_options=json_options,
         actions=actions
     )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
-    for doc in status.data:
+
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
         assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
         assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
         assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
         assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions
         assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
         assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+def test_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    response = app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
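The deleted wait_for_batch_completion helper is no longer needed in these tests because batch_scrape_urls now blocks until the job completes, but the same pattern still applies when a job is started with async_batch_scrape_urls. A deadline-based variant, sketched with the same names the deleted helper used:

import time

def wait_for_batch_completion(app, job_id, timeout=60, poll_interval=1.0):
    """Poll check_batch_scrape_status until the job completes or the deadline passes."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        status = app.check_batch_scrape_status(job_id)
        if status.status == "completed":
            return status
        time.sleep(poll_interval)
    raise TimeoutError("Batch scrape did not complete in time")

job = app.async_batch_scrape_urls(TEST_URLS, formats=["markdown"])
status = wait_for_batch_completion(app, job.id)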