sdk(v3): batch scrape sync and async -- this test is not reliable because of a bug in batch scrape

2025-08-14 15:25:54 +08:00 · 2025-05-27 11:57:57 -03:00 · 2025-05-27 11:57:57 -03:00 · 2e14859de0
commit 2e14859de0
parent ab1e244693
3 changed files with 353 additions and 193 deletions
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@ -3355,68 +3355,48 @@ class AsyncFirecrawlApp(FirecrawlApp):
        headers = self._prepare_headers()
        response = await self._async_get_request(f'{self.api_url}{endpoint}', headers)

-        if response.get('success'):
-            try:
-                status_data = response.json()
-            except:
-                raise Exception(f'Failed to parse Firecrawl response as JSON.')
-            
-            while status_data['status'] != 'completed':
-                time.sleep(poll_interval)
+        try:
+            while response.get('status') != 'completed':
+                await asyncio.sleep(poll_interval)
                response = await self._async_get_request(f'{self.api_url}{endpoint}', headers)
-                if response.get('success'):
-                    status_data = response.get('data')
-                else:
-                    self._handle_error(response, 'check batch scrape status')

-            if 'data' in status_data:
-                data = status_data['data']
-                while 'next' in status_data:
-                    if len(status_data['data']) == 0:
-                        break
-                    next_url = status_data.get('next')
-                    if not next_url:
-                        logger.warning("Expected 'next' URL is missing.")
-                        break
-                    try:
-                        status_response = await self._async_get_request(next_url, headers)
-                        if not status_response.get('success'):
-                            logger.error(f"Failed to fetch next page: {status_response.status_code}")
+                if 'data' in response:
+                    data = response.get('data')
+                    while 'next' in response:
+                        if len(response.get('data')) == 0:
+                            break
+                        next_url = response.get('next')
+                        if not next_url:
+                            logger.warning("Expected 'next' URL is missing.")
                            break
                        try:
-                            next_data = status_response.json()
-                        except:
-                            raise Exception(f'Failed to parse Firecrawl response as JSON.')
-                        data.extend(next_data.get('data', []))
-                        status_data = next_data
-                    except Exception as e:
-                        logger.error(f"Error during pagination request: {e}")
-                        break
+                            response = await self._async_get_request(next_url, headers)
+                            next_data = response.get('data')
+                            if next_data:
+                                data.extend(next_data.get('data', []))
+                        except Exception as e:
+                            logger.error(f"Error during pagination request: {e}")
+                            break

-                # Apply format transformations to each document in the data
-                if data:
-                    for document in data:
-                        scrape_formats_response_transform(document)
+                    # Apply format transformations to each document in the data
+                    if data:
+                        for document in data:
+                            scrape_formats_response_transform(document)

            response = {
-                'status': status_data.get('status'),
-                'total': status_data.get('total'),
-                'completed': status_data.get('completed'),
-                'credits_used': status_data.get('creditsUsed'),
-                'expires_at': status_data.get('expiresAt'),
+                'status': response.get('status'),
+                'total': response.get('total'),
+                'completed': response.get('completed'),
+                'credits_used': response.get('creditsUsed'),
+                'expires_at': response.get('expiresAt'),
                'data': data,
-                'next': status_data.get('next'),
-                'error': status_data.get('error')
+                'next': response.get('next'),
+                'error': response.get('error')
            }

-            if 'error' in status_data:
-                response['error'] = status_data['error']
-
-            if 'next' in status_data:
-                response['next'] = status_data['next']
-
            return BatchScrapeStatusResponse(**response)
-        else:
+        
+        except Exception as e:
            await self._handle_error(response, 'check batch scrape status')

    async def check_batch_scrape_errors(self, id: str) -> CrawlErrorsResponse:
--- a/apps/python-sdk/tests/e2e/async/test_batch_scrape.py
+++ b/apps/python-sdk/tests/e2e/async/test_batch_scrape.py
@ -1,100 +1,76 @@
-import sys
 import os
-import subprocess
-import time
 import pytest
 from dotenv import load_dotenv
+import sys

 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
-from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, ChangeTrackingOptions, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
+from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction

 load_dotenv()

-API_URL = os.getenv("TEST_API_URL") or "http://localhost:3002"
+API_URL = "https://api.firecrawl.dev"
 API_KEY = os.getenv("TEST_API_KEY")

 app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)

 TEST_URLS = [
-    "https://example.com",
-    "https://iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination"
 ]

@pytest.mark.asyncio
-async def test_batch_scrape_urls_async_simple():
-    result = await app.batch_scrape_urls(
+async def test_batch_scrape_urls_simple():
+    response = await app.batch_scrape_urls(
        TEST_URLS,
        formats=["markdown"]
    )
-    assert result.success
-    assert hasattr(result, "data")
-    assert len(result.data) == len(TEST_URLS)
-    assert any("Example Domain" in doc.markdown for doc in result.data)
-
-@pytest.mark.asyncio
-async def test_batch_scrape_urls_async_all_params_with_extract():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    result = await app.batch_scrape_urls(
-        TEST_URLS,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions
-    )
-    assert result.success
-    assert hasattr(result, "data")
-
-    assert len(result.data) == len(TEST_URLS)
-    for doc in result.data:
+    
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+    
+    # Content assertions for each document
+    for doc in response.data:
        assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
-        assert hasattr(doc, "raw_html")
-        assert hasattr(doc, "links")
-        assert hasattr(doc, "screenshot")
-        assert hasattr(doc, "extract")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+        
        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+    
+    # Check that we got content from both test pages
+    markdown_contents = [doc.markdown for doc in response.data]
+    assert any("This page is used for end-to-end (e2e) testing with Firecrawl." in content for content in markdown_contents)
+    assert any("Numbered Pagination" in content for content in markdown_contents)

@pytest.mark.asyncio
-async def test_batch_scrape_urls_async_all_params_with_json_options():
+async def test_batch_scrape_urls_all_params_with_json_options():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
    ]
-    result = await app.batch_scrape_urls(
+    response = await app.batch_scrape_urls(
        TEST_URLS,
        formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
-        timeout=15000,
+        timeout=30000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
@ -104,14 +80,134 @@ async def test_batch_scrape_urls_async_all_params_with_json_options():
        json_options=json_options,
        actions=actions
    )
-    assert result.success
-    assert hasattr(result, "data")
-    assert len(result.data) == len(TEST_URLS)
-    for doc in result.data:
+    
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+    
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+        
+        # HTML assertions
        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+        
+        # Raw HTML assertions
        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+        
+        # Links assertions
        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+        
+        # Screenshot assertions
        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+        
+        # JSON assertions
        assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata") 
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+        
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+@pytest.mark.asyncio
+async def test_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    response = await app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+    
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+        
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+        
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+        
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+        
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+        
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+        
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
--- a/apps/python-sdk/tests/e2e/test_batch_scrape.py
+++ b/apps/python-sdk/tests/e2e/test_batch_scrape.py
@ -14,94 +14,61 @@ API_KEY = os.getenv("TEST_API_KEY")
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)

 TEST_URLS = [
-    "https://example.com",
-    "https://www.iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination"
 ]

-def wait_for_batch_completion(app, job_id, timeout=60):
-    for _ in range(timeout):
-        status = app.check_batch_scrape_status(job_id)
-        if status.status == "completed":
-            return status
-        import time; time.sleep(1)
-    raise TimeoutError("Batch scrape did not complete in time")
-
 def test_batch_scrape_urls_simple():
-    job = app.async_batch_scrape_urls(
+    response = app.batch_scrape_urls(
        TEST_URLS,
        formats=["markdown"]
    )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
-    assert any("Example Domain" in doc.markdown for doc in status.data)
-    assert any("Internet Assigned Numbers Authority" in doc.markdown for doc in status.data)
-
-def test_batch_scrape_urls_all_params_with_extract():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    job = app.async_batch_scrape_urls(
-        TEST_URLS,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions
-    )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
-    for doc in status.data:
+    
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+    
+    # Content assertions for each document
+    for doc in response.data:
        assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
-        assert hasattr(doc, "raw_html")
-        assert hasattr(doc, "links")
-        assert hasattr(doc, "screenshot")
-        assert hasattr(doc, "extract")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+        
        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+    
+    # Check that we got content from both test pages
+    markdown_contents = [doc.markdown for doc in response.data]
+    assert any("This page is used for end-to-end (e2e) testing with Firecrawl." in content for content in markdown_contents)
+    assert any("Numbered Pagination" in content for content in markdown_contents)

 def test_batch_scrape_urls_all_params_with_json_options():
    location = LocationConfig(country="us", languages=["en"])
    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
    ]
-    job = app.async_batch_scrape_urls(
+    response = app.batch_scrape_urls(
        TEST_URLS,
        formats=["markdown", "html", "raw_html", "links", "screenshot", "json"],
        include_tags=["h1", "p"],
        exclude_tags=["footer"],
        only_main_content=True,
        wait_for=1000,
-        timeout=15000,
+        timeout=30000,
        location=location,
        mobile=True,
        skip_tls_verification=True,
@ -111,16 +78,133 @@ def test_batch_scrape_urls_all_params_with_json_options():
        json_options=json_options,
        actions=actions
    )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
-    for doc in status.data:
+    
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+    
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+        
+        # HTML assertions
        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+        
+        # Raw HTML assertions
        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+        
+        # Links assertions
        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+        
+        # Screenshot assertions
        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+        
+        # JSON assertions
        assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata") 
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+        
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+def test_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(full_page=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    response = app.batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    
+    # Basic response assertions
+    assert response.success
+    assert hasattr(response, "data")
+    assert response.data is not None
+    assert isinstance(response.data, list)
+    assert len(response.data) == len(TEST_URLS)
+    
+    # Detailed content assertions for each document
+    for doc in response.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+        
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+        
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+        
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+        
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+        
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+        
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS