diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index a9f4d396..27da0a1a 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -7,16 +7,14 @@ import { mapController } from "../controllers/v1/map";
 import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
 import { RateLimiterMode } from "../types";
 import { authenticateUser } from "../controllers/auth";
-import { Logger } from "../lib/logger";
 import { createIdempotencyKey } from "../services/idempotency/create";
 import { validateIdempotencyKey } from "../services/idempotency/validate";
-import { ZodError } from "zod";
 import { checkTeamCredits } from "../services/billing/credit_billing";
-import { v4 as uuidv4 } from "uuid";
 import expressWs from "express-ws";
 import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { crawlCancelController } from "../controllers/v1/crawl-cancel";
+import { Logger } from "../lib/logger";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -33,6 +31,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
     }
     const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
     if (!success) {
+      Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
       return res.status(402).json({ success: false, error: "Insufficient credits" });
     }
     req.account = { remainingCredits }
diff --git a/apps/python-sdk/build/lib/firecrawl/__init__.py b/apps/python-sdk/build/lib/firecrawl/__init__.py
deleted file mode 100644
index e7f8063d..00000000
--- a/apps/python-sdk/build/lib/firecrawl/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-from .firecrawl import FirecrawlApp
diff --git a/apps/python-sdk/build/lib/firecrawl/firecrawl.py b/apps/python-sdk/build/lib/firecrawl/firecrawl.py
deleted file mode 100644
index 3f50c798..00000000
--- a/apps/python-sdk/build/lib/firecrawl/firecrawl.py
+++ /dev/null
@@ -1,299 +0,0 @@
-"""
-FirecrawlApp Module
-
-This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
-It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
-and check the status of these jobs. The module uses requests for HTTP communication
-and handles retries for certain HTTP status codes.
-
-Classes:
-    - FirecrawlApp: Main class for interacting with the Firecrawl API.
-"""
-
-import os
-import time
-from typing import Any, Dict, Optional
-
-import requests
-
-
-class FirecrawlApp:
-    """
-    Initialize the FirecrawlApp instance.
-
-    Args:
-        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
-        api_url (Optional[str]): Base URL for the Firecrawl API.
-    """
-    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
-        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
-        if self.api_key is None:
-            raise ValueError('No API key provided')
-        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
-    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
-        """
-        Scrape the specified URL using the Firecrawl API.
-
-        Args:
-            url (str): The URL to scrape.
-            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
-
-        Returns:
-            Any: The scraped data if the request is successful.
-
-        Raises:
-            Exception: If the scrape request fails.
-        """
-
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
-        # Prepare the base scrape parameters with the URL
-        scrape_params = {'url': url}
-
-        # If there are additional params, process them
-        if params:
-            # Initialize extractorOptions if present
-            extractor_options = params.get('extractorOptions', {})
-            # Check and convert the extractionSchema if it's a Pydantic model
-            if 'extractionSchema' in extractor_options:
-                if hasattr(extractor_options['extractionSchema'], 'schema'):
-                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
-                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
-                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
-                # Update the scrape_params with the processed extractorOptions
-                scrape_params['extractorOptions'] = extractor_options
-
-            # Include any other params directly at the top level of scrape_params
-            for key, value in params.items():
-                if key != 'extractorOptions':
-                    scrape_params[key] = value
-        # Make the POST request with the prepared headers and JSON data
-        response = requests.post(
-            f'{self.api_url}/v0/scrape',
-            headers=headers,
-            json=scrape_params,
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] and 'data' in response:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
-        elif response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
-
-    def search(self, query, params=None):
-        """
-        Perform a search using the Firecrawl API.
-
-        Args:
-            query (str): The search query.
-            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
-
-        Returns:
-            Any: The search results if the request is successful.
-
-        Raises:
-            Exception: If the search request fails.
-        """
-        headers = {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
-        json_data = {'query': query}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.api_url}/v0/search',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-
-            if response['success'] and 'data' in response:
-                return response['data']
-            else:
-                raise Exception(f'Failed to search. Error: {response["error"]}')
-
-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to search. Status code: {response.status_code}')
-
-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
-        """
-        Initiate a crawl job for the specified URL using the Firecrawl API.
-
-        Args:
-            url (str): The URL to crawl.
-            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
-            wait_until_done (bool): Whether to wait until the crawl job is completed.
-            timeout (int): Timeout between status checks when waiting for job completion.
-            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
-
-        Returns:
-            Any: The crawl job ID or the crawl results if waiting until completion.
-
-        Raises:
-            Exception: If the crawl job initiation or monitoring fails.
-        """
-        headers = self._prepare_headers(idempotency_key)
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
-
-    def check_crawl_status(self, job_id):
-        """
-        Check the status of a crawl job using the Firecrawl API.
-
-        Args:
-            job_id (str): The ID of the crawl job.
-
-        Returns:
-            Any: The status of the crawl job.
-
-        Raises:
-            Exception: If the status check request fails.
-        """
-        headers = self._prepare_headers()
-        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
-
-    def _prepare_headers(self, idempotency_key=None):
-        """
-        Prepare the headers for API requests.
-
-        Args:
-            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
-
-        Returns:
-            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
-        """
-        if idempotency_key:
-            return {
-                'Content-Type': 'application/json',
-                'Authorization': f'Bearer {self.api_key}',
-                'x-idempotency-key': idempotency_key
-            }
-
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}',
-        }
-
-    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
-        """
-        Make a POST request with retries.
-
-        Args:
-            url (str): The URL to send the POST request to.
-            data (Dict[str, Any]): The JSON data to include in the POST request.
-            headers (Dict[str, str]): The headers to include in the POST request.
-            retries (int): Number of retries for the request.
-            backoff_factor (float): Backoff factor for retries.
-
-        Returns:
-            requests.Response: The response from the POST request.
-
-        Raises:
-            requests.RequestException: If the request fails after the specified retries.
-        """
-        for attempt in range(retries):
-            response = requests.post(url, headers=headers, json=data)
-            if response.status_code == 502:
-                time.sleep(backoff_factor * (2 ** attempt))
-            else:
-                return response
-        return response
-
-    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
-        """
-        Make a GET request with retries.
-
-        Args:
-            url (str): The URL to send the GET request to.
-            headers (Dict[str, str]): The headers to include in the GET request.
-            retries (int): Number of retries for the request.
-            backoff_factor (float): Backoff factor for retries.
-
-        Returns:
-            requests.Response: The response from the GET request.
-
-        Raises:
-            requests.RequestException: If the request fails after the specified retries.
-        """
-        for attempt in range(retries):
-            response = requests.get(url, headers=headers)
-            if response.status_code == 502:
-                time.sleep(backoff_factor * (2 ** attempt))
-            else:
-                return response
-        return response
-
-    def _monitor_job_status(self, job_id, headers, timeout):
-        """
-        Monitor the status of a crawl job until completion.
-
-        Args:
-            job_id (str): The ID of the crawl job.
-            headers (Dict[str, str]): The headers to include in the status check requests.
-            timeout (int): Timeout between status checks.
-
-        Returns:
-            Any: The crawl results if the job is completed successfully.
-
-        Raises:
-            Exception: If the job fails or an error occurs during status checks.
-        """
-        while True:
-            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
-                    timeout=max(timeout,2)
-                    time.sleep(timeout)  # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        """
-        Handle errors from API responses.
-
-        Args:
-            response (requests.Response): The response object from the API request.
-            action (str): Description of the action that was being performed.
-
-        Raises:
-            Exception: An exception with a message containing the status code and error details from the response.
-        """
-        if response.status_code in [402, 408, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
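
The deleted `_post_request` and `_get_request` helpers above retry only on HTTP 502, sleeping `backoff_factor * (2 ** attempt)` seconds between attempts (0.5 s, 1 s, 2 s with the defaults) and returning any non-502 response immediately. A minimal standalone sketch of that schedule; the function name is illustrative, not part of the SDK surface:

```python
import time

import requests


def get_with_backoff(url, headers, retries=3, backoff_factor=0.5):
    """Retry a GET on HTTP 502 with exponential backoff, mirroring the deleted helpers."""
    response = None
    for attempt in range(retries):
        response = requests.get(url, headers=headers)
        if response.status_code != 502:
            return response
        # Sleep backoff_factor * 2**attempt seconds: 0.5, 1.0, 2.0 by default.
        time.sleep(backoff_factor * (2 ** attempt))
    # After exhausting retries, hand the last 502 response back to the caller.
    return response
```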
diff --git a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz b/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz
deleted file mode 100644
index 83cd7221..00000000
Binary files a/apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz and /dev/null differ
diff --git a/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl b/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl
deleted file mode 100644
index b96c8f48..00000000
Binary files a/apps/python-sdk/dist/firecrawl_py-0.0.12-py3-none-any.whl and /dev/null differ
diff --git a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO b/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
deleted file mode 100644
index 288eb7a5..00000000
--- a/apps/python-sdk/firecrawl_py.egg-info/PKG-INFO
+++ /dev/null
@@ -1,179 +0,0 @@
-Metadata-Version: 2.1
-Name: firecrawl-py
-Version: 0.0.12
-Summary: Python SDK for Firecrawl API
-Home-page: https://github.com/mendableai/firecrawl
-Author: Mendable.ai
-Author-email: nick@mendable.ai
-License: GNU General Public License v3 (GPLv3)
-Project-URL: Documentation, https://docs.firecrawl.dev
-Project-URL: Source, https://github.com/mendableai/firecrawl
-Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
-Keywords: SDK API firecrawl
-Classifier: Development Status :: 5 - Production/Stable
-Classifier: Environment :: Web Environment
-Classifier: Intended Audience :: Developers
-Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
-Classifier: Natural Language :: English
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python
-Classifier: Programming Language :: Python :: 3
-Classifier: Programming Language :: Python :: 3.8
-Classifier: Programming Language :: Python :: 3.9
-Classifier: Programming Language :: Python :: 3.10
-Classifier: Topic :: Internet
-Classifier: Topic :: Internet :: WWW/HTTP
-Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
-Classifier: Topic :: Software Development
-Classifier: Topic :: Software Development :: Libraries
-Classifier: Topic :: Software Development :: Libraries :: Python Modules
-Classifier: Topic :: Text Processing
-Classifier: Topic :: Text Processing :: Indexing
-Requires-Python: >=3.8
-Description-Content-Type: text/markdown
-
-# Firecrawl Python SDK
-
-The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
-
-## Installation
-
-To install the Firecrawl Python SDK, you can use pip:
-
-```bash
-pip install firecrawl-py
-```
-
-## Usage
-
-1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
-2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
-
-
-Here's an example of how to use the SDK:
-
-```python
-from firecrawl import FirecrawlApp
-
-# Initialize the FirecrawlApp with your API key
-app = FirecrawlApp(api_key='your_api_key')
-
-# Scrape a single URL
-url = 'https://mendable.ai'
-scraped_data = app.scrape_url(url)
-
-# Crawl a website
-crawl_url = 'https://mendable.ai'
-params = {
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params)
-```
-
-### Scraping a URL
-
-To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

-```python
-url = 'https://example.com'
-scraped_data = app.scrape_url(url)
-```
-### Extracting structured data from a URL
-
-With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how you can use it:
-
-```python
-class ArticleSchema(BaseModel):
-    title: str
-    points: int
-    by: str
-    commentsURL: str
-
-class TopArticlesSchema(BaseModel):
-    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
-
-data = app.scrape_url('https://news.ycombinator.com', {
-    'extractorOptions': {
-        'extractionSchema': TopArticlesSchema.model_json_schema(),
-        'mode': 'llm-extraction'
-    },
-    'pageOptions':{
-        'onlyMainContent': True
-    }
-})
-print(data["llm_extraction"])
-```
-
-### Search for a query
-
-Used to search the web, get the most relevant results, scrape each page and return the markdown.
-
-```python
-query = 'what is mendable?'
-search_result = app.search(query)
-```
-
-### Crawling a Website
-
-To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
-
-The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job until it is completed or the specified `timeout` (in seconds) is reached. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.
-
-```python
-crawl_url = 'https://example.com'
-params = {
-    'crawlerOptions': {
-        'excludes': ['blog/*'],
-        'includes': [], # leave empty for all pages
-        'limit': 1000,
-    },
-    'pageOptions': {
-        'onlyMainContent': True
-    }
-}
-crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
-```
-
-If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
-
-### Checking Crawl Status
-
-To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.
-
-```python
-job_id = crawl_result['jobId']
-status = app.check_crawl_status(job_id)
-```
-
-## Error Handling
-
-The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
-
-## Running the Tests with Pytest
-
-To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.
-
-### Running the Tests
-
-To run the tests, execute the following commands:
-
-Install pytest:
-```bash
-pip install pytest
-```
-
-Run:
-```bash
-pytest firecrawl/__tests__/e2e_withAuth/test.py
-```
-
-
-## Contributing
-
-Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
-
-## License
-
-The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
diff --git a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt b/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt
deleted file mode 100644
index c25567c5..00000000
--- a/apps/python-sdk/firecrawl_py.egg-info/SOURCES.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-README.md
-setup.py
-firecrawl/__init__.py
-firecrawl/firecrawl.py
-firecrawl_py.egg-info/PKG-INFO
-firecrawl_py.egg-info/SOURCES.txt
-firecrawl_py.egg-info/dependency_links.txt
-firecrawl_py.egg-info/requires.txt
-firecrawl_py.egg-info/top_level.txt
\ No newline at end of file
diff --git a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt b/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt
deleted file mode 100644
index 8b137891..00000000
--- a/apps/python-sdk/firecrawl_py.egg-info/dependency_links.txt
+++ /dev/null
@@ -1 +0,0 @@
-
diff --git a/apps/python-sdk/firecrawl_py.egg-info/requires.txt b/apps/python-sdk/firecrawl_py.egg-info/requires.txt
deleted file mode 100644
index c8d341f5..00000000
--- a/apps/python-sdk/firecrawl_py.egg-info/requires.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-requests
-pytest
-python-dotenv
diff --git a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt b/apps/python-sdk/firecrawl_py.egg-info/top_level.txt
deleted file mode 100644
index 8bce1a1f..00000000
--- a/apps/python-sdk/firecrawl_py.egg-info/top_level.txt
+++ /dev/null
@@ -1 +0,0 @@
-firecrawl
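
The deleted README and client sources above imply a few usage patterns that had no worked example. First, the `wait_until_done=False` path pairs `crawl_url` with `check_crawl_status` in a polling loop; a sketch assuming the v0 response shapes shown in the deleted client (`jobId`, `status`, `data`):

```python
import time

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

# Start the crawl without blocking; only the job ID comes back.
job_id = app.crawl_url('https://example.com', wait_until_done=False)['jobId']

while True:
    status = app.check_crawl_status(job_id)
    if status['status'] == 'completed':
        documents = status.get('data', [])
        break
    # These are the in-progress states the deleted client polls through.
    if status['status'] not in ['active', 'paused', 'pending', 'queued', 'waiting']:
        raise RuntimeError(f"Crawl failed or was stopped: {status['status']}")
    time.sleep(2)  # The deleted client enforces a minimum 2-second wait.
```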
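
`crawl_url` also accepts an `idempotency_key`, described in the deleted docstrings as a unique UUID key and sent as the `x-idempotency-key` header. A sketch of supplying one; generating it with `uuid4` is an assumption consistent with that description, not something the deleted code enforces:

```python
import uuid

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

# Reusing the same key on a retried request lets the server deduplicate it;
# the client forwards it as the `x-idempotency-key` header.
key = str(uuid.uuid4())
job = app.crawl_url('https://example.com', wait_until_done=False, idempotency_key=key)
```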
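
The Error Handling section promises descriptive exceptions, and the deleted client raises plain `Exception` with the HTTP status code and the API's `error` field embedded in the message, so a broad handler is the matching pattern:

```python
from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key='your_api_key')

try:
    data = app.scrape_url('https://example.com')
except Exception as exc:
    # Messages look like: "Failed to scrape URL. Status code: 402. Error: ...".
    print(f'Scrape failed: {exc}')
```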
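
Finally, the deleted `__init__` resolves the base URL from an `api_url` argument or the `FIRECRAWL_API_URL` environment variable before falling back to `https://api.firecrawl.dev`, so a self-hosted endpoint needs no code changes; the localhost URL below is purely illustrative:

```python
from firecrawl import FirecrawlApp

# Equivalent to exporting FIRECRAWL_API_URL before constructing the app.
app = FirecrawlApp(api_key='your_api_key', api_url='http://localhost:3002')
```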