From c5b64bd2945bd09a18dc8fabc432cac270f011b4 Mon Sep 17 00:00:00 2001
From: rafaelmmiller <8574157+rafaelmmiller@users.noreply.github.com>
Date: Tue, 27 May 2025 15:47:05 -0300
Subject: [PATCH] sdk(v3): async batch scrape (async and sync clients)

---
 apps/python-sdk/firecrawl/firecrawl.py        | 210 ++++-------------
 .../e2e/async/test_async_batch_scrape.py      | 212 +++++++++++++-----
 .../tests/e2e/test_async_batch_scrape.py      | 202 ++++++++++++-----
 3 files changed, 353 insertions(+), 271 deletions(-)

diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 784355b3..756ff555 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -930,89 +930,32 @@ class FirecrawlApp:
         # Validate any additional kwargs
         self._validate_kwargs(kwargs, "async_batch_scrape_urls")
 
-        scrape_params = {}
-
-        # Add individual parameters
-        if formats is not None:
-            scrape_params['formats'] = formats
-        if headers is not None:
-            scrape_params['headers'] = headers
-        if include_tags is not None:
-            scrape_params['includeTags'] = include_tags
-        if exclude_tags is not None:
-            scrape_params['excludeTags'] = exclude_tags
-        if only_main_content is not None:
-            scrape_params['onlyMainContent'] = only_main_content
-        if wait_for is not None:
-            scrape_params['waitFor'] = wait_for
-        if timeout is not None:
-            scrape_params['timeout'] = timeout
-        if location is not None:
-            scrape_params['location'] = location.model_dump(exclude_none=True)
-        if mobile is not None:
-            scrape_params['mobile'] = mobile
-        if skip_tls_verification is not None:
-            scrape_params['skipTlsVerification'] = skip_tls_verification
-        if remove_base64_images is not None:
-            scrape_params['removeBase64Images'] = remove_base64_images
-        if block_ads is not None:
-            scrape_params['blockAds'] = block_ads
-        if proxy is not None:
-            scrape_params['proxy'] = proxy
-        if extract is not None:
-            extract = ensure_schema_dict(extract)
-            if isinstance(extract, dict) and "schema" in extract:
-                extract["schema"] = ensure_schema_dict(extract["schema"])
-            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.model_dump(exclude_none=True)
-        if json_options is not None:
-            json_options = ensure_schema_dict(json_options)
-            if isinstance(json_options, dict) and "schema" in json_options:
-                json_options["schema"] = ensure_schema_dict(json_options["schema"])
-
-            # Convert to dict if it's a JsonConfig object
-            if hasattr(json_options, 'dict'):
-                json_options_dict = json_options.model_dump(exclude_none=True)
-            else:
-                json_options_dict = json_options
-
-            # Convert snake_case to camelCase for API
-            json_options_api = {}
-            if 'prompt' in json_options_dict and json_options_dict['prompt'] is not None:
-                json_options_api['prompt'] = json_options_dict['prompt']
-            if 'schema' in json_options_dict and json_options_dict['schema'] is not None:
-                json_options_api['schema'] = json_options_dict['schema']
-            if 'system_prompt' in json_options_dict and json_options_dict['system_prompt'] is not None:
-                json_options_api['systemPrompt'] = json_options_dict['system_prompt']
-            if 'agent' in json_options_dict and json_options_dict['agent'] is not None:
-                json_options_api['agent'] = json_options_dict['agent']
-
-            scrape_params['jsonOptions'] = json_options_api
-        if actions is not None:
-            scrape_params['actions'] = [action.model_dump(exclude_none=True) for action in actions]
-        if agent is not None:
-            scrape_params['agent'] = agent.model_dump(exclude_none=True)
-
-        # Add any additional kwargs
-        scrape_params.update(kwargs)
-
-        # Create final params object
-        final_params = ScrapeParams(**scrape_params)
-        params_dict = final_params.model_dump(exclude_none=True)
-        params_dict['urls'] = urls
-        params_dict['origin'] = f"python-sdk@{version}"
-
-        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
-            params_dict['extract']['schema'] = ensure_schema_dict(params_dict['extract']['schema'])
-        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
-            params_dict['jsonOptions']['schema'] = ensure_schema_dict(params_dict['jsonOptions']['schema'])
-
-        # Apply format transformation for API
-        if 'formats' in params_dict and params_dict['formats']:
-            params_dict['formats'] = scrape_formats_transform(params_dict['formats'])
+        scrape_params = parse_scrape_options(
+            formats=formats,
+            include_tags=include_tags,
+            exclude_tags=exclude_tags,
+            only_main_content=only_main_content,
+            wait_for=wait_for,
+            timeout=timeout,
+            location=location,
+            mobile=mobile,
+            skip_tls_verification=skip_tls_verification,
+            remove_base64_images=remove_base64_images,
+            block_ads=block_ads,
+            proxy=proxy,
+            extract=extract,
+            json_options=json_options,
+            actions=actions,
+            agent=agent,
+            **kwargs
+        )
+
+        scrape_params['urls'] = urls
+        scrape_params['origin'] = f"python-sdk@{version}"
 
         # Make request
         headers = self._prepare_headers(idempotency_key)
-        response = self._post_request(f'{self.api_url}/v1/batch/scrape', params_dict, headers)
+        response = self._post_request(f'{self.api_url}/v1/batch/scrape', scrape_params, headers)
 
         if response.status_code == 200:
             try:
@@ -2747,97 +2690,40 @@ class AsyncFirecrawlApp(FirecrawlApp):
         # Validate any additional kwargs
         self._validate_kwargs(kwargs, "async_batch_scrape_urls")
 
-        scrape_params = {}
-
-        # Add individual parameters
-        if formats is not None:
-            scrape_params['formats'] = formats
-        if headers is not None:
-            scrape_params['headers'] = headers
-        if include_tags is not None:
-            scrape_params['includeTags'] = include_tags
-        if exclude_tags is not None:
-            scrape_params['excludeTags'] = exclude_tags
-        if only_main_content is not None:
-            scrape_params['onlyMainContent'] = only_main_content
-        if wait_for is not None:
-            scrape_params['waitFor'] = wait_for
-        if timeout is not None:
-            scrape_params['timeout'] = timeout
-        if location is not None:
-            scrape_params['location'] = location.model_dump(exclude_none=True)
-        if mobile is not None:
-            scrape_params['mobile'] = mobile
-        if skip_tls_verification is not None:
-            scrape_params['skipTlsVerification'] = skip_tls_verification
-        if remove_base64_images is not None:
-            scrape_params['removeBase64Images'] = remove_base64_images
-        if block_ads is not None:
-            scrape_params['blockAds'] = block_ads
-        if proxy is not None:
-            scrape_params['proxy'] = proxy
-        if extract is not None:
-            extract = ensure_schema_dict(extract)
-            if isinstance(extract, dict) and "schema" in extract:
-                extract["schema"] = ensure_schema_dict(extract["schema"])
-            scrape_params['extract'] = extract if isinstance(extract, dict) else extract.model_dump(exclude_none=True)
-        if json_options is not None:
-            json_options = ensure_schema_dict(json_options)
-            if isinstance(json_options, dict) and "schema" in json_options:
-                json_options["schema"] = ensure_schema_dict(json_options["schema"])
-
-            # Convert to dict if it's a JsonConfig object
-            if hasattr(json_options, 'dict'):
-                json_options_dict = json_options.model_dump(exclude_none=True)
-            else:
-                json_options_dict = json_options
-
-            # Convert snake_case to camelCase for API
-            json_options_api = {}
-            if 'prompt' in json_options_dict and json_options_dict['prompt'] is not None:
-                json_options_api['prompt'] = json_options_dict['prompt']
-            if 'schema' in json_options_dict and json_options_dict['schema'] is not None:
-                json_options_api['schema'] = json_options_dict['schema']
-            if 'system_prompt' in json_options_dict and json_options_dict['system_prompt'] is not None:
-                json_options_api['systemPrompt'] = json_options_dict['system_prompt']
-            if 'agent' in json_options_dict and json_options_dict['agent'] is not None:
-                json_options_api['agent'] = json_options_dict['agent']
-
-            scrape_params['jsonOptions'] = json_options_api
-        if actions is not None:
-            scrape_params['actions'] = [action.model_dump(exclude_none=True) for action in actions]
-        if agent is not None:
-            scrape_params['agent'] = agent.model_dump(exclude_none=True)
-
-        # Add any additional kwargs
-        scrape_params.update(kwargs)
-
-        # Create final params object
-        final_params = ScrapeParams(**scrape_params)
-        params_dict = final_params.model_dump(exclude_none=True)
-        params_dict['urls'] = urls
-        params_dict['origin'] = f"python-sdk@{version}"
-
-        if 'extract' in params_dict and params_dict['extract'] and 'schema' in params_dict['extract']:
-            params_dict['extract']['schema'] = ensure_schema_dict(params_dict['extract']['schema'])
-        if 'jsonOptions' in params_dict and params_dict['jsonOptions'] and 'schema' in params_dict['jsonOptions']:
-            params_dict['jsonOptions']['schema'] = ensure_schema_dict(params_dict['jsonOptions']['schema'])
-
-        # Apply format transformation for API
-        if 'formats' in params_dict and params_dict['formats']:
-            params_dict['formats'] = scrape_formats_transform(params_dict['formats'])
+        scrape_params = parse_scrape_options(
+            formats=formats,
+            include_tags=include_tags,
+            exclude_tags=exclude_tags,
+            only_main_content=only_main_content,
+            wait_for=wait_for,
+            timeout=timeout,
+            location=location,
+            mobile=mobile,
+            skip_tls_verification=skip_tls_verification,
+            remove_base64_images=remove_base64_images,
+            block_ads=block_ads,
+            proxy=proxy,
+            extract=extract,
+            json_options=json_options,
+            actions=actions,
+            agent=agent,
+            **kwargs
+        )
+
+        scrape_params['urls'] = urls
+        scrape_params['origin'] = f"python-sdk@{version}"
 
         # Make request
         headers = self._prepare_headers(idempotency_key)
         response = await self._async_post_request(
             f'{self.api_url}/v1/batch/scrape',
-            params_dict,
+            scrape_params,
             headers
         )
 
-        if response.get('status_code') == 200:
+        if response.get('success'):
             try:
-                return BatchScrapeResponse(**response.json())
+                return BatchScrapeResponse(**response)
             except:
                 raise Exception(f'Failed to parse Firecrawl response as JSON.')
         else:
diff --git a/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
index f28a525c..75250350 100644
--- a/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
+++ b/apps/python-sdk/tests/e2e/async/test_async_batch_scrape.py
@@ -1,8 +1,7 @@
 import os
 import pytest
-import sys
 from dotenv import load_dotenv
-import asyncio
+import sys
 
 sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')))
 from firecrawl.firecrawl import AsyncFirecrawlApp, JsonConfig, LocationConfig, WaitAction, ScreenshotAction, WriteAction, PressAction, ScrollAction, ExecuteJavascriptAction
@@ -15,16 +14,17 @@ API_KEY = os.getenv("TEST_API_KEY")
 app = AsyncFirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
 TEST_URLS = [
-    "https://example.com",
-    "https://www.iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
"https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions", + "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination" ] + async def wait_for_batch_completion(app, job_id, timeout=60): for _ in range(timeout): status = await app.check_batch_scrape_status(job_id) if status.status == "completed": return status - await asyncio.sleep(1) + import time; time.sleep(1) raise TimeoutError("Batch scrape did not complete in time") @pytest.mark.asyncio @@ -35,55 +35,31 @@ async def test_async_batch_scrape_urls_simple(): ) assert job.id is not None status = await wait_for_batch_completion(app, job.id) + + # Basic response assertions assert status.success assert hasattr(status, "data") + assert status.data is not None + assert isinstance(status.data, list) assert len(status.data) == len(TEST_URLS) - assert any("Example Domain" in doc.markdown for doc in status.data) - assert any("Internet Assigned Numbers" in doc.markdown for doc in status.data) - -@pytest.mark.asyncio -async def test_async_batch_scrape_urls_all_params_with_extract(): - location = LocationConfig(country="us", languages=["en"]) - extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}} - extract = JsonConfig(prompt="Extract the title", schema=extract_schema) - actions = [ - WaitAction(type="wait", milliseconds=500), - ScreenshotAction(type="screenshot", fullPage=True), - WriteAction(type="write", text="test input"), - PressAction(type="press", key="Enter"), - ScrollAction(type="scroll", direction="down"), - ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();") - ] - job = await app.async_batch_scrape_urls( - TEST_URLS, - formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"], - include_tags=["h1", "p"], - exclude_tags=["footer"], - only_main_content=True, - wait_for=1000, - timeout=15000, - location=location, - mobile=True, - skip_tls_verification=True, - remove_base64_images=True, - block_ads=True, - proxy="basic", - extract=extract, - actions=actions - ) - assert job.id is not None - status = await wait_for_batch_completion(app, job.id) - assert status.success - assert hasattr(status, "data") - assert len(status.data) == len(TEST_URLS) + + # Content assertions for each document for doc in status.data: assert hasattr(doc, "markdown") - assert hasattr(doc, "html") - assert hasattr(doc, "raw_html") - assert hasattr(doc, "links") - assert hasattr(doc, "screenshot") - assert hasattr(doc, "extract") + assert doc.markdown is not None + assert isinstance(doc.markdown, str) + assert len(doc.markdown) > 0 + assert hasattr(doc, "metadata") + assert doc.metadata is not None + assert isinstance(doc.metadata, dict) + assert "url" in doc.metadata + assert doc.metadata["url"] in TEST_URLS + + # Check that we got content from both test pages + markdown_contents = [doc.markdown for doc in status.data] + assert any("This page is used for end-to-end (e2e) testing with Firecrawl." 
+    assert any("Numbered Pagination" in content for content in markdown_contents)
 
 @pytest.mark.asyncio
 async def test_async_batch_scrape_urls_all_params_with_json_options():
@@ -91,12 +67,12 @@ async def test_async_batch_scrape_urls_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
     job = await app.async_batch_scrape_urls(
         TEST_URLS,
@@ -105,7 +81,7 @@ async def test_async_batch_scrape_urls_all_params_with_json_options():
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,
@@ -117,14 +93,136 @@ async def test_async_batch_scrape_urls_all_params_with_json_options():
     )
     assert job.id is not None
     status = await wait_for_batch_completion(app, job.id)
+
+    # Basic response assertions
     assert status.success
     assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
     assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
     for doc in status.data:
+        # Markdown assertions
         assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
         assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
         assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
         assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions
         assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
         assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata")
\ No newline at end of file
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+@pytest.mark.asyncio
+async def test_async_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = await app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert job.id is not None
+    status = await wait_for_batch_completion(app, job.id)
+
+    # Basic response assertions
+    assert status.success
+    assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
+    assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in status.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
\ No newline at end of file
diff --git a/apps/python-sdk/tests/e2e/test_async_batch_scrape.py b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py
index b05617fe..b8d0ae86 100644
--- a/apps/python-sdk/tests/e2e/test_async_batch_scrape.py
+++ b/apps/python-sdk/tests/e2e/test_async_batch_scrape.py
@@ -14,8 +14,8 @@ API_KEY = os.getenv("TEST_API_KEY")
 app = FirecrawlApp(api_url=API_URL, api_key=API_KEY)
 
 TEST_URLS = [
-    "https://example.com",
-    "https://www.iana.org"
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/actions",
+    "https://firecrawl-e2e-test-git-main-rafaelsideguides-projects.vercel.app/numbered-pagination"
 ]
 
 def wait_for_batch_completion(app, job_id, timeout=60):
@@ -33,66 +33,43 @@ def test_async_batch_scrape_urls_simple():
     )
     assert job.id is not None
     status = wait_for_batch_completion(app, job.id)
+
+    # Basic response assertions
     assert status.success
     assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
     assert len(status.data) == len(TEST_URLS)
-    assert any("Example Domain" in doc.markdown for doc in status.data)
-    assert any("Internet Assigned Numbers Authority" in doc.markdown for doc in status.data)
-
-def test_async_batch_scrape_urls_all_params_with_extract():
-    location = LocationConfig(country="us", languages=["en"])
-    extract_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
-    extract = JsonConfig(prompt="Extract the title", schema=extract_schema)
-    actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
-    ]
-    job = app.async_batch_scrape_urls(
-        TEST_URLS,
-        formats=["markdown", "html", "raw_html", "links", "screenshot", "extract"],
-        include_tags=["h1", "p"],
-        exclude_tags=["footer"],
-        only_main_content=True,
-        wait_for=1000,
-        timeout=15000,
-        location=location,
-        mobile=True,
-        skip_tls_verification=True,
-        remove_base64_images=True,
-        block_ads=True,
-        proxy="basic",
-        extract=extract,
-        actions=actions
-    )
-    assert job.id is not None
-    status = wait_for_batch_completion(app, job.id)
-    assert status.success
-    assert hasattr(status, "data")
-    assert len(status.data) == len(TEST_URLS)
+
+    # Content assertions for each document
     for doc in status.data:
         assert hasattr(doc, "markdown")
-        assert hasattr(doc, "html")
-        assert hasattr(doc, "raw_html")
-        assert hasattr(doc, "links")
-        assert hasattr(doc, "screenshot")
-        assert hasattr(doc, "extract")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+    # Check that we got content from both test pages
+    markdown_contents = [doc.markdown for doc in status.data]
+    assert any("This page is used for end-to-end (e2e) testing with Firecrawl." in content for content in markdown_contents)
+    assert any("Numbered Pagination" in content for content in markdown_contents)
 
 def test_async_batch_scrape_urls_all_params_with_json_options():
     location = LocationConfig(country="us", languages=["en"])
     json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
     json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
     actions = [
-        WaitAction(type="wait", milliseconds=500),
-        ScreenshotAction(type="screenshot", fullPage=True),
-        WriteAction(type="write", text="test input"),
-        PressAction(type="press", key="Enter"),
-        ScrollAction(type="scroll", direction="down"),
-        ExecuteJavascriptAction(type="execute_javascript", script="function get_title() { return document.title; }; get_title();")
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
     ]
     job = app.async_batch_scrape_urls(
         TEST_URLS,
@@ -101,7 +78,7 @@ def test_async_batch_scrape_urls_all_params_with_json_options():
         exclude_tags=["footer"],
         only_main_content=True,
         wait_for=1000,
-        timeout=15000,
+        timeout=30000,
         location=location,
         mobile=True,
         skip_tls_verification=True,
@@ -113,14 +90,135 @@ def test_async_batch_scrape_urls_all_params_with_json_options():
     )
     assert job.id is not None
     status = wait_for_batch_completion(app, job.id)
+
+    # Basic response assertions
     assert status.success
     assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
     assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
     for doc in status.data:
+        # Markdown assertions
         assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
         assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
         assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
         assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions
         assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
         assert hasattr(doc, "json")
-        assert hasattr(doc, "metadata")
\ No newline at end of file
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
+
+def test_async_batch_scrape_urls_all_params_with_json_options_full_page_screenshot():
+    location = LocationConfig(country="us", languages=["en"])
+    json_schema = {"type": "object", "properties": {"title": {"type": "string"}}}
+    json_options = JsonConfig(prompt="Extract the title as JSON", schema=json_schema)
+    actions = [
+        WaitAction(milliseconds=500),
+        ScreenshotAction(fullPage=True),
+        WriteAction(text="test input"),
+        PressAction(key="Enter"),
+        ScrollAction(direction="down"),
+        ExecuteJavascriptAction(script="function get_title() { return document.title; }; get_title();")
+    ]
+    job = app.async_batch_scrape_urls(
+        TEST_URLS,
+        formats=["markdown", "html", "raw_html", "links", "screenshot@full_page", "json"],
+        include_tags=["h1", "p"],
+        exclude_tags=["footer"],
+        only_main_content=True,
+        wait_for=1000,
+        timeout=30000,
+        location=location,
+        mobile=True,
+        skip_tls_verification=True,
+        remove_base64_images=True,
+        block_ads=True,
+        proxy="basic",
+        json_options=json_options,
+        actions=actions
+    )
+    assert job.id is not None
+    status = wait_for_batch_completion(app, job.id)
+
+    # Basic response assertions
+    assert status.success
+    assert hasattr(status, "data")
+    assert status.data is not None
+    assert isinstance(status.data, list)
+    assert len(status.data) == len(TEST_URLS)
+
+    # Detailed content assertions for each document
+    for doc in status.data:
+        # Markdown assertions
+        assert hasattr(doc, "markdown")
+        assert doc.markdown is not None
+        assert isinstance(doc.markdown, str)
+        assert len(doc.markdown) > 0
+
+        # HTML assertions
+        assert hasattr(doc, "html")
+        assert doc.html is not None
+        assert isinstance(doc.html, str)
+        assert len(doc.html) > 0
+
+        # Raw HTML assertions
+        assert hasattr(doc, "raw_html")
+        assert doc.raw_html is not None
+        assert isinstance(doc.raw_html, str)
+        assert len(doc.raw_html) > 0
+
+        # Links assertions
+        assert hasattr(doc, "links")
+        assert doc.links is not None
+        assert isinstance(doc.links, list)
+
+        # Screenshot assertions (full page)
+        assert hasattr(doc, "screenshot")
+        assert doc.screenshot is not None
+        assert isinstance(doc.screenshot, str)
+        assert doc.screenshot.startswith("https://")
+
+        # JSON assertions
+        assert hasattr(doc, "json")
+        assert doc.json is not None
+        assert isinstance(doc.json, dict)
+
+        # Metadata assertions
+        assert hasattr(doc, "metadata")
+        assert doc.metadata is not None
+        assert isinstance(doc.metadata, dict)
+        assert "url" in doc.metadata
+        assert doc.metadata["url"] in TEST_URLS
\ No newline at end of file