From a4686e3c8c3e79507d9f8a68f2d66ec916337d5f Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Wed, 21 Aug 2024 15:56:48 -0300
Subject: [PATCH] Fix e2e tests and add v1 crawl id/status handling to the Python SDK

---
 .../firecrawl/__tests__/e2e_withAuth/test.py  |  6 +-
 apps/python-sdk/firecrawl/firecrawl.py        | 80 +++++++++++++++----
 2 files changed, 70 insertions(+), 16 deletions(-)

diff --git a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
index 457c206a..8945d74d 100644
--- a/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/e2e_withAuth/test.py
@@ -7,7 +7,7 @@ from dotenv import load_dotenv
 
 load_dotenv()
 
-API_URL = "http://127.0.0.1:3002";
+API_URL = "http://127.0.0.1:3002"
 ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
 TEST_API_KEY = os.getenv('TEST_API_KEY')
 
@@ -46,6 +46,8 @@ def test_successful_response_with_valid_preview_token():
 def test_scrape_url_e2e():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
     response = app.scrape_url('https://roastmywebsite.ai')
+    print(response)
+
     assert response is not None
     assert 'content' in response
     assert 'markdown' in response
@@ -145,7 +147,7 @@ def test_search_invalid_api_key():
 
 def test_llm_extraction():
     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
-    response = app.scrape_url("https://mendable.ai", {
+    response = app.scrape_url("https://firecrawl.dev", {
         'extractorOptions': {
             'mode': 'llm-extraction',
             'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 25c9663e..f67afbdb 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -155,20 +155,30 @@ class FirecrawlApp:
             json_data.update(params)
         response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
         if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, poll_interval)
+            if self.version == 'v0':
+                id = response.json().get('jobId')
             else:
-                return {'jobId': job_id}
+                id = response.json().get('id')
+
+            if wait_until_done:
+                check_url = None
+                if self.version == 'v1':
+                    check_url = response.json().get('url')
+                return self._monitor_job_status(id, headers, poll_interval, check_url)
+            else:
+                if self.version == 'v0':
+                    return {'jobId': id}
+                else:
+                    return {'id': id}
         else:
             self._handle_error(response, 'start crawl job')
 
-    def check_crawl_status(self, job_id: str) -> Any:
+    def check_crawl_status(self, id: str) -> Any:
         """
         Check the status of a crawl job using the Firecrawl API.
 
         Args:
-            job_id (str): The ID of the crawl job.
+            id (str): The ID of the crawl job.
 
         Returns:
             Any: The status of the crawl job.
@@ -176,11 +186,38 @@ class FirecrawlApp:
         Raises:
             Exception: If the status check request fails.
         """
-        endpoint = f'/{self.version}/crawl/status/{job_id}'
+
+        if self.version == 'v0':
+            endpoint = f'/{self.version}/crawl/status/{id}'
+        else:
+            endpoint = f'/{self.version}/crawl/{id}'
+
         headers = self._prepare_headers()
         response = self._get_request(f'{self.api_url}{endpoint}', headers)
         if response.status_code == 200:
-            return response.json()
+            data = response.json()
+            if self.version == 'v0':
+                return {
+                    'success': True,
+                    'status': data.get('status'),
+                    'current': data.get('current'),
+                    'current_url': data.get('current_url'),
+                    'current_step': data.get('current_step'),
+                    'total': data.get('total'),
+                    'data': data.get('data'),
+                    'partial_data': data.get('partial_data') if not data.get('data') else None,
+                }
+            elif self.version == 'v1':
+                return {
+                    'success': True,
+                    'status': data.get('status'),
+                    'totalCount': data.get('totalCount'),
+                    'creditsUsed': data.get('creditsUsed'),
+                    'expiresAt': data.get('expiresAt'),
+                    'next': data.get('next'),
+                    'data': data.get('data'),
+                    'error': data.get('error')
+                }
         else:
             self._handle_error(response, 'check crawl status')
 
@@ -292,15 +329,15 @@ class FirecrawlApp:
                 return response
         return response
 
-    def _monitor_job_status(self, job_id: str, headers: Dict[str, str], poll_interval: int) -> Any:
+    def _monitor_job_status(self, id: str, headers: Dict[str, str], poll_interval: int, check_url: Optional[str] = None) -> Any:
         """
         Monitor the status of a crawl job until completion.
 
         Args:
-            job_id (str): The ID of the crawl job.
+            id (str): The ID of the crawl job.
             headers (Dict[str, str]): The headers to include in the status check requests.
             poll_interval (int): Secounds between status checks.
-
+            check_url (Optional[str]): Full status URL to poll for the crawl job; when omitted, the versioned status endpoint is built from the job ID.
         Returns:
             Any: The crawl results if the job is completed successfully.
 
@@ -308,15 +345,30 @@ class FirecrawlApp:
             Exception: If the job fails or an error occurs during status checks.
         """
         while True:
-            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
+            api_url = ''
+            if (self.version == 'v0'):
+                if check_url:
+                    api_url = check_url
+                else:
+                    api_url = f'{self.api_url}/v0/crawl/status/{id}'
+            else:
+                if check_url:
+                    api_url = check_url
+                else:
+                    api_url = f'{self.api_url}/v1/crawl/{id}'
+
+            status_response = self._get_request(api_url, headers)
             if status_response.status_code == 200:
                 status_data = status_response.json()
                 if status_data['status'] == 'completed':
                     if 'data' in status_data:
-                        return status_data['data']
+                        if self.version == 'v0':
+                            return status_data['data']
+                        else:
+                            return status_data
                     else:
                         raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
+                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting', 'scraping']:
                     poll_interval=max(poll_interval,2)
                     time.sleep(poll_interval)  # Wait for the specified interval before checking again
                 else: