Nick:

2025-08-14 06:15:53 +08:00 · 2024-08-27 11:58:42 -03:00 · 2024-08-27 11:58:42 -03:00 · 170a8ebfe5
commit 170a8ebfe5
parent d30119707f
10 changed files with 2 additions and 496 deletions
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@ -7,16 +7,14 @@ import { mapController } from "../controllers/v1/map";
 import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
 import { RateLimiterMode } from "../types";
 import { authenticateUser } from "../controllers/auth";
-import { Logger } from "../lib/logger";
 import { createIdempotencyKey } from "../services/idempotency/create";
 import { validateIdempotencyKey } from "../services/idempotency/validate";
-import { ZodError } from "zod";
 import { checkTeamCredits } from "../services/billing/credit_billing";
-import { v4 as uuidv4 } from "uuid";
 import expressWs from "express-ws";
 import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { crawlCancelController } from "../controllers/v1/crawl-cancel";
+import { Logger } from "../lib/logger";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@ -33,6 +31,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
            }
            const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
            if (!success) {
+                Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
                return res.status(402).json({ success: false, error: "Insufficient credits" });
            }
            req.account = { remainingCredits }
--- a/apps/python-sdk/build/lib/firecrawl/__init__.py
+++ b/apps/python-sdk/build/lib/firecrawl/__init__.py
@ -1 +0,0 @@
-from .firecrawl import FirecrawlApp
--- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py
+++ b/apps/python-sdk/build/lib/firecrawl/firecrawl.py
@ -1,299 +0,0 @@
-"""
-FirecrawlApp Module
-
-This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs. The module uses requests for HTTP communication
-and handles retries for certain HTTP status codes.
-
-Classes:
-    - FirecrawlApp: Main class for interacting with the Firecrawl API.
-"""
-
-import os
-import time
-from typing import Any, Dict, Optional
-
-import requests
-
-
-class FirecrawlApp:
-    """
-    Initialize the FirecrawlApp instance.
-
-    Args:
-        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
-        api_url (Optional[str]): Base URL for the Firecrawl API.
-    """
-    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
-        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
-        if self.api_key is None:
-            raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
-        """
-        Scrape the specified URL using the Firecrawl API.
-
-        Args:
-            url (str): The URL to scrape.
-            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
-
-        Returns:
-            Any: The scraped data if the request is successful.
-
-        Raises:
-            Exception: If the scrape request fails.
-        """
-
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
-        # Prepare the base scrape parameters with the URL
-        scrape_params = {'url': url}
-
-        # If there are additional params, process them
-        if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
-
-            # Include any other params directly at the top level of scrape_params
-            for key, value in params.items():
-                if key != 'extractorOptions':
-                    scrape_params[key] = value
-        # Make the POST request with the prepared headers and JSON data
-        response = requests.post(
-            f'{self.api_url}/v0/scrape',
-            headers=headers,
-            json=scrape_params,
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] and 'data' in response:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
-
-    def search(self, query, params=None):
-        """
-        Perform a search using the Firecrawl API.
-
-        Args:
-            query (str): The search query.
-            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
-
-        Returns:
-            Any: The search results if the request is successful.
-
-        Raises:
-            Exception: If the search request fails.
-        """
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
-        json_data = {'query': query}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.api_url}/v0/search',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            
-            if response['success'] and 'data' in response:
-                return response['data']
-            else:
-                raise Exception(f'Failed to search. Error: {response["error"]}')
-
-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to search. Status code: {response.status_code}')
-
-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
-        """
-        Initiate a crawl job for the specified URL using the Firecrawl API.
-
-        Args:
-            url (str): The URL to crawl.
-            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
-            wait_until_done (bool): Whether to wait until the crawl job is completed.
-            timeout (int): Timeout between status checks when waiting for job completion.
-            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
-
-        Returns:
-            Any: The crawl job ID or the crawl results if waiting until completion.
-
-        Raises:
-            Exception: If the crawl job initiation or monitoring fails.
-        """
-        headers = self._prepare_headers(idempotency_key)
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
-
-    def check_crawl_status(self, job_id):
-        """
-        Check the status of a crawl job using the Firecrawl API.
-
-        Args:
-            job_id (str): The ID of the crawl job.
-
-        Returns:
-            Any: The status of the crawl job.
-
-        Raises:
-            Exception: If the status check request fails.
-        """
-        headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
-
-    def _prepare_headers(self, idempotency_key=None):
-        """
-        Prepare the headers for API requests.
-
-        Args:
-            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
-
-        Returns:
-            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
-        """
-        if idempotency_key:
-            return {
-                'Content-Type': 'application/json',
-                'Authorization': f'Bearer {self.api_key}',
-                'x-idempotency-key': idempotency_key
-            }
-
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}',
-        }
-
-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
-        """
-        Make a POST request with retries.
-
-        Args:
-            url (str): The URL to send the POST request to.
-            data (Dict[str, Any]): The JSON data to include in the POST request.
-            headers (Dict[str, str]): The headers to include in the POST request.
-            retries (int): Number of retries for the request.
-            backoff_factor (float): Backoff factor for retries.
-
-        Returns:
-            requests.Response: The response from the POST request.
-
-        Raises:
-            requests.RequestException: If the request fails after the specified retries.
-        """
-        for attempt in range(retries):
-            response = requests.post(url, headers=headers, json=data)
-            if response.status_code == 502:
-                time.sleep(backoff_factor * (2 ** attempt))
-            else:
-                return response
-        return response
-
-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
-        """
-        Make a GET request with retries.
-
-        Args:
-            url (str): The URL to send the GET request to.
-            headers (Dict[str, str]): The headers to include in the GET request.
-            retries (int): Number of retries for the request.
-            backoff_factor (float): Backoff factor for retries.
-
-        Returns:
-            requests.Response: The response from the GET request.
-
-        Raises:
-            requests.RequestException: If the request fails after the specified retries.
-        """
-        for attempt in range(retries):
-            response = requests.get(url, headers=headers)
-            if response.status_code == 502:
-                time.sleep(backoff_factor * (2 ** attempt))
-            else:
-                return response
-        return response
-
-    def _monitor_job_status(self, job_id, headers, timeout):
-        """
-        Monitor the status of a crawl job until completion.
-
-        Args:
-            job_id (str): The ID of the crawl job.
-            headers (Dict[str, str]): The headers to include in the status check requests.
-            timeout (int): Timeout between status checks.
-
-        Returns:
-            Any: The crawl results if the job is completed successfully.
-
-        Raises:
-            Exception: If the job fails or an error occurs during status checks.
-        """
-        while True:
-            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
-                    timeout=max(timeout,2)
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        """
-        Handle errors from API responses.
-
-        Args:
-            response (requests.Response): The response object from the API request.
-            action (str): Description of the action that was being performed.
-
-        Raises:
-            Exception: An exception with a message containing the status code and error details from the response.
-        """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
--- a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz
+++ b/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz
--- a/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl
+++ b/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl
--- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
+++ b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
@ -1,179 +0,0 @@
-Metadata-Version: 2.1
-Name: firecrawl-py
-Version: 0.0.12
-Summary: Python SDK for Firecrawl API
-Home-page: https://github.com/mendableai/firecrawl
-Author: Mendable.ai
-Author-email: nick@mendable.ai
-License: GNU General Public License v3 (GPLv3)
-Project-URL: Documentation, https://docs.firecrawl.dev
-Project-URL: Source, https://github.com/mendableai/firecrawl
-Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
-Keywords: SDK API firecrawl
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: Environment :: Web Environment
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
-Classifier: Natural Language :: English
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Topic :: Internet
-Classifier: Topic :: Internet :: WWW/HTTP
-Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
-Classifier: Topic :: Software Development
-Classifier: Topic :: Software Development :: Libraries
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Classifier: Topic :: Text Processing
-Classifier: Topic :: Text Processing :: Indexing
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-
-# Firecrawl Python SDK
-
-The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
-
-## Installation
-
-To install the Firecrawl Python SDK, you can use pip:
-
-```bash
-pip install firecrawl-py
-```
-
-## Usage
-
-1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
-2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
-
-
-Here's an example of how to use the SDK:
-
-```python
-from firecrawl import FirecrawlApp
-
-# Initialize the FirecrawlApp with your API key
-app = FirecrawlApp(api_key='your_api_key')
-
-# Scrape a single URL
-url = 'https://mendable.ai'
-scraped_data = app.scrape_url(url)
-
-# Crawl a website
-crawl_url = 'https://mendable.ai'
-params = {
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params)
-```
-
-### Scraping a URL
-
-To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
-
-```python
-url = 'https://example.com'
-scraped_data = app.scrape_url(url)
-```
-### Extracting structured data from a URL
-
-With LLM extraction, you can easily extract structured data from any URL. We also support pydantic schemas to make this easier. Here is how to use it:
-
-```python
-class ArticleSchema(BaseModel):
-    title: str
-    points: int 
-    by: str
-    commentsURL: str
-
-class TopArticlesSchema(BaseModel):
-    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
-
-data = app.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': TopArticlesSchema.model_json_schema(),
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
-print(data["llm_extraction"])
-```
-
-### Search for a query
-
-Use the `search` method to search the web, get the most relevant results, scrape each page, and return the markdown.
-
-```python
-query = 'what is mendable?'
-search_result = app.search(query)
-```
-
-### Crawling a Website
-
-To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
-
-The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will poll the status of the crawl job every `timeout` seconds (values below 2 are raised to 2) until the job is completed; `timeout` is the interval between status checks, not an overall deadline. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
-
-```python
-crawl_url = 'https://example.com'
-params = {
-    'crawlerOptions': {
-        'excludes': ['blog/*'],
-        'includes': [], # leave empty for all pages
-        'limit': 1000,
-    },
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
-```
-
-If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
-
-### Checking Crawl Status
-
-To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
-
-```python
-job_id = crawl_result['jobId']
-status = app.check_crawl_status(job_id)
-```
-
-## Error Handling
-
-The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
-
-## Running the Tests with Pytest
-
-To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.
-
-### Running the Tests
-
-To run the tests, execute the following commands:
-
-Install pytest:
-```bash
-pip install pytest
-```
-
-Run:
-```bash
-pytest firecrawl/__tests__/e2e_withAuth/test.py
-```
-
-
-## Contributing
-
-Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
-
-## License
-
-The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
--- a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt
+++ b/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt
@ -1,9 +0,0 @@
-README.md
-setup.py
-firecrawl/__init__.py
-firecrawl/firecrawl.py
-firecrawl_py.egg-info/PKG-INFO
-firecrawl_py.egg-info/SOURCES.txt
-firecrawl_py.egg-info/dependency_links.txt
-firecrawl_py.egg-info/requires.txt
-firecrawl_py.egg-info/top_level.txt
--- a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt
+++ b/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt
@ -1 +0,0 @@
-
--- a/apps/python-sdk/firecrawl_py.egg-info/requires.txt
+++ b/apps/python-sdk/firecrawl_py.egg-info/requires.txt
@ -1,3 +0,0 @@
-requests
-pytest
-python-dotenv
--- a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt
+++ b/apps/python-sdk/firecrawl_py.egg-info/top_level.txt
@ -1 +0,0 @@
-firecrawl