Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
Synced 2025-08-14 04:26:06 +08:00

Nick:

This commit is contained in:
parent d30119707f
commit 170a8ebfe5
@@ -7,16 +7,14 @@ import { mapController } from "../controllers/v1/map";
 import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
 import { RateLimiterMode } from "../types";
 import { authenticateUser } from "../controllers/auth";
-import { Logger } from "../lib/logger";
 import { createIdempotencyKey } from "../services/idempotency/create";
 import { validateIdempotencyKey } from "../services/idempotency/validate";
-import { ZodError } from "zod";
 import { checkTeamCredits } from "../services/billing/credit_billing";
-import { v4 as uuidv4 } from "uuid";
 import expressWs from "express-ws";
 import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
 import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 import { crawlCancelController } from "../controllers/v1/crawl-cancel";
+import { Logger } from "../lib/logger";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -33,6 +31,7 @@ function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: R
   }
   const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
   if (!success) {
+    Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
     return res.status(402).json({ success: false, error: "Insufficient credits" });
   }
   req.account = { remainingCredits }
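When this middleware rejects a request, the API responds with HTTP 402 and the JSON error body shown above. A minimal client-side sketch of handling that response (a hypothetical snippet, not part of this commit; the endpoint and payload shapes follow the SDK code below):

```python
import requests

resp = requests.post(
    'https://api.firecrawl.dev/v0/crawl',
    headers={'Authorization': 'Bearer your_api_key', 'Content-Type': 'application/json'},
    json={'url': 'https://example.com'},
)
if resp.status_code == 402:
    # Matches the middleware's body: {"success": false, "error": "Insufficient credits"}
    print('Out of credits:', resp.json().get('error'))
```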
@@ -1 +0,0 @@
from .firecrawl import FirecrawlApp
@@ -1,299 +0,0 @@
"""
FirecrawlApp Module

This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
and check the status of these jobs. The module uses requests for HTTP communication
and handles retries for certain HTTP status codes.

Classes:
    - FirecrawlApp: Main class for interacting with the Firecrawl API.
"""

import os
import time
from typing import Any, Dict, Optional

import requests

class FirecrawlApp:
    """
    Initialize the FirecrawlApp instance.

    Args:
        api_key (Optional[str]): API key for authenticating with the Firecrawl API.
        api_url (Optional[str]): Base URL for the Firecrawl API.
    """
    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        if self.api_key is None:
            raise ValueError('No API key provided')
        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')

    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
        Scrape the specified URL using the Firecrawl API.

        Args:
            url (str): The URL to scrape.
            params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.

        Returns:
            Any: The scraped data if the request is successful.

        Raises:
            Exception: If the scrape request fails.
        """
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
        }
        # Prepare the base scrape parameters with the URL
        scrape_params = {'url': url}

        # If there are additional params, process them
        if params:
            # Initialize extractorOptions if present
            extractor_options = params.get('extractorOptions', {})
            # Check and convert the extractionSchema if it's a Pydantic model
            if 'extractionSchema' in extractor_options:
                if hasattr(extractor_options['extractionSchema'], 'schema'):
                    extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
                # Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
                extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
                # Update the scrape_params with the processed extractorOptions
                scrape_params['extractorOptions'] = extractor_options

            # Include any other params directly at the top level of scrape_params
            for key, value in params.items():
                if key != 'extractorOptions':
                    scrape_params[key] = value
        # Make the POST request with the prepared headers and JSON data
        response = requests.post(
            f'{self.api_url}/v0/scrape',
            headers=headers,
            json=scrape_params,
        )
        if response.status_code == 200:
            response = response.json()
            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
        elif response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
    def search(self, query, params=None):
        """
        Perform a search using the Firecrawl API.

        Args:
            query (str): The search query.
            params (Optional[Dict[str, Any]]): Additional parameters for the search request.

        Returns:
            Any: The search results if the request is successful.

        Raises:
            Exception: If the search request fails.
        """
        headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}'
        }
        json_data = {'query': query}
        if params:
            json_data.update(params)
        response = requests.post(
            f'{self.api_url}/v0/search',
            headers=headers,
            json=json_data
        )
        if response.status_code == 200:
            response = response.json()
            if response['success'] and 'data' in response:
                return response['data']
            else:
                raise Exception(f'Failed to search. Error: {response["error"]}')
        elif response.status_code in [402, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Failed to search. Status code: {response.status_code}')
    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
        """
        Initiate a crawl job for the specified URL using the Firecrawl API.

        Args:
            url (str): The URL to crawl.
            params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
            wait_until_done (bool): Whether to wait until the crawl job is completed.
            timeout (int): Timeout between status checks when waiting for job completion.
            idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.

        Returns:
            Any: The crawl job ID or the crawl results if waiting until completion.

        Raises:
            Exception: If the crawl job initiation or monitoring fails.
        """
        headers = self._prepare_headers(idempotency_key)
        json_data = {'url': url}
        if params:
            json_data.update(params)
        response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
        if response.status_code == 200:
            job_id = response.json().get('jobId')
            if wait_until_done:
                return self._monitor_job_status(job_id, headers, timeout)
            else:
                return {'jobId': job_id}
        else:
            self._handle_error(response, 'start crawl job')
    def check_crawl_status(self, job_id):
        """
        Check the status of a crawl job using the Firecrawl API.

        Args:
            job_id (str): The ID of the crawl job.

        Returns:
            Any: The status of the crawl job.

        Raises:
            Exception: If the status check request fails.
        """
        headers = self._prepare_headers()
        response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
        if response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, 'check crawl status')
    def _prepare_headers(self, idempotency_key=None):
        """
        Prepare the headers for API requests.

        Args:
            idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.

        Returns:
            Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
        """
        if idempotency_key:
            return {
                'Content-Type': 'application/json',
                'Authorization': f'Bearer {self.api_key}',
                'x-idempotency-key': idempotency_key
            }

        return {
            'Content-Type': 'application/json',
            'Authorization': f'Bearer {self.api_key}',
        }
    def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
        """
        Make a POST request with retries.

        Args:
            url (str): The URL to send the POST request to.
            data (Dict[str, Any]): The JSON data to include in the POST request.
            headers (Dict[str, str]): The headers to include in the POST request.
            retries (int): Number of retries for the request.
            backoff_factor (float): Backoff factor for retries.

        Returns:
            requests.Response: The response from the POST request.

        Raises:
            requests.RequestException: If the request fails after the specified retries.
        """
        for attempt in range(retries):
            response = requests.post(url, headers=headers, json=data)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                return response
        return response
    def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
        """
        Make a GET request with retries.

        Args:
            url (str): The URL to send the GET request to.
            headers (Dict[str, str]): The headers to include in the GET request.
            retries (int): Number of retries for the request.
            backoff_factor (float): Backoff factor for retries.

        Returns:
            requests.Response: The response from the GET request.

        Raises:
            requests.RequestException: If the request fails after the specified retries.
        """
        for attempt in range(retries):
            response = requests.get(url, headers=headers)
            if response.status_code == 502:
                time.sleep(backoff_factor * (2 ** attempt))
            else:
                return response
        return response
    def _monitor_job_status(self, job_id, headers, timeout):
        """
        Monitor the status of a crawl job until completion.

        Args:
            job_id (str): The ID of the crawl job.
            headers (Dict[str, str]): The headers to include in the status check requests.
            timeout (int): Timeout between status checks.

        Returns:
            Any: The crawl results if the job is completed successfully.

        Raises:
            Exception: If the job fails or an error occurs during status checks.
        """
        while True:
            status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
            if status_response.status_code == 200:
                status_data = status_response.json()
                if status_data['status'] == 'completed':
                    if 'data' in status_data:
                        return status_data['data']
                    else:
                        raise Exception('Crawl job completed but no data was returned')
                elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
                    timeout = max(timeout, 2)
                    time.sleep(timeout)  # Wait for the specified timeout before checking again
                else:
                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
            else:
                self._handle_error(status_response, 'check crawl status')
    def _handle_error(self, response, action):
        """
        Handle errors from API responses.

        Args:
            response (requests.Response): The response object from the API request.
            action (str): Description of the action that was being performed.

        Raises:
            Exception: An exception with a message containing the status code and error details from the response.
        """
        if response.status_code in [402, 408, 409, 500]:
            error_message = response.json().get('error', 'Unknown error occurred')
            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
        else:
            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
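For reference, the retry helpers above sleep `backoff_factor * (2 ** attempt)` seconds after each HTTP 502 response, so with the defaults (`retries=3`, `backoff_factor=0.5`) the delays are 0.5 s, 1 s, and 2 s. A quick sketch of that arithmetic:

```python
retries, backoff_factor = 3, 0.5  # defaults from _post_request/_get_request above

for attempt in range(retries):
    delay = backoff_factor * (2 ** attempt)
    print(f'attempt {attempt}: sleep {delay}s on HTTP 502')
# attempt 0: sleep 0.5s; attempt 1: sleep 1.0s; attempt 2: sleep 2.0s
```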
BIN  apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz (vendored)
Binary file not shown.
@@ -1,179 +0,0 @@
Metadata-Version: 2.1
Name: firecrawl-py
Version: 0.0.12
Summary: Python SDK for Firecrawl API
Home-page: https://github.com/mendableai/firecrawl
Author: Mendable.ai
Author-email: nick@mendable.ai
License: GNU General Public License v3 (GPLv3)
Project-URL: Documentation, https://docs.firecrawl.dev
Project-URL: Source, https://github.com/mendableai/firecrawl
Project-URL: Tracker, https://github.com/mendableai/firecrawl/issues
Keywords: SDK API firecrawl
Classifier: Development Status :: 5 - Production/Stable
Classifier: Environment :: Web Environment
Classifier: Intended Audience :: Developers
Classifier: License :: OSI Approved :: GNU General Public License v3 (GPLv3)
Classifier: Natural Language :: English
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: Topic :: Internet
Classifier: Topic :: Internet :: WWW/HTTP
Classifier: Topic :: Internet :: WWW/HTTP :: Indexing/Search
Classifier: Topic :: Software Development
Classifier: Topic :: Software Development :: Libraries
Classifier: Topic :: Software Development :: Libraries :: Python Modules
Classifier: Topic :: Text Processing
Classifier: Topic :: Text Processing :: Indexing
Requires-Python: >=3.8
Description-Content-Type: text/markdown

# Firecrawl Python SDK

The Firecrawl Python SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.

## Installation

To install the Firecrawl Python SDK, you can use pip:

```bash
pip install firecrawl-py
```

## Usage

1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

Here's an example of how to use the SDK:

```python
from firecrawl import FirecrawlApp

# Initialize the FirecrawlApp with your API key
app = FirecrawlApp(api_key='your_api_key')

# Scrape a single URL
url = 'https://mendable.ai'
scraped_data = app.scrape_url(url)

# Crawl a website
crawl_url = 'https://mendable.ai'
params = {
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params)
```
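If `FIRECRAWL_API_KEY` is already set in your environment, the key argument can be omitted: the constructor shown in `firecrawl.py` above falls back to `os.getenv('FIRECRAWL_API_KEY')`. A minimal sketch of that path:

```python
import os

from firecrawl import FirecrawlApp

os.environ['FIRECRAWL_API_KEY'] = 'your_api_key'  # normally exported in your shell instead
app = FirecrawlApp()  # picks the key up from the environment
```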
### Scraping a URL

To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
```
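`scrape_url` also accepts the same optional `params` dictionary used elsewhere in this README; for example, to keep only the main page content (a sketch reusing the `pageOptions` shape shown above):

```python
scraped_data = app.scrape_url('https://example.com', {
    'pageOptions': {
        'onlyMainContent': True
    }
})
```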
### Extracting structured data from a URL

With LLM extraction, you can easily extract structured data from any URL. We support Pydantic schemas to make this easier for you. Here is how you use it:

```python
from typing import List

from pydantic import BaseModel, Field

class ArticleSchema(BaseModel):
    title: str
    points: int
    by: str
    commentsURL: str

class TopArticlesSchema(BaseModel):
    top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")

data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema.model_json_schema(),
        'mode': 'llm-extraction'
    },
    'pageOptions': {
        'onlyMainContent': True
    }
})
print(data["llm_extraction"])
```
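Note that the `FirecrawlApp.scrape_url` code above also handles schema objects directly: if the value passed as `extractionSchema` has a `.schema()` method, the SDK calls it for you and defaults `mode` to `'llm-extraction'`. So passing the model class itself should work as well (a sketch based on that code path):

```python
data = app.scrape_url('https://news.ycombinator.com', {
    'extractorOptions': {
        'extractionSchema': TopArticlesSchema  # the SDK calls .schema() and sets mode itself
    }
})
```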
### Search for a query

Use this to search the web, get the most relevant results, scrape each page, and return the markdown.

```python
query = 'what is mendable?'
search_result = app.search(query)
```
### Crawling a Website

To crawl a website, use the `crawl_url` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

The `wait_until_done` parameter determines whether the method should wait for the crawl job to complete before returning the result. If set to `True`, the method will periodically check the status of the crawl job, waiting `timeout` seconds (minimum 2) between checks, until the job completes. If set to `False`, the method will return immediately with the job ID, and you can manually check the status of the crawl job using the `check_crawl_status` method.

```python
crawl_url = 'https://example.com'
params = {
    'crawlerOptions': {
        'excludes': ['blog/*'],
        'includes': [], # leave empty for all pages
        'limit': 1000,
    },
    'pageOptions': {
        'onlyMainContent': True
    }
}
crawl_result = app.crawl_url(crawl_url, params=params, wait_until_done=True, timeout=5)
```

If `wait_until_done` is set to `True`, the `crawl_url` method will return the crawl result once the job is completed. If the job fails or is stopped, an exception will be raised.
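`crawl_url` also accepts an `idempotency_key`, which `_prepare_headers` (above) sends as the `x-idempotency-key` header; reusing the same key lets the API deduplicate retried submissions of the same crawl. A hedged sketch:

```python
import uuid

idempotency_key = str(uuid.uuid4())
# Retrying with the same key should not start a second crawl job.
crawl_result = app.crawl_url(crawl_url, params=params, idempotency_key=idempotency_key)
```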
### Checking Crawl Status

To check the status of a crawl job, use the `check_crawl_status` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```python
job_id = crawl_result['jobId']
status = app.check_crawl_status(job_id)
```
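With `wait_until_done=False`, you can combine the two methods into your own polling loop. A minimal sketch, assuming the status values used by `_monitor_job_status` above (`'completed'`, `'active'`, `'paused'`, `'pending'`, `'queued'`, `'waiting'`):

```python
import time

job = app.crawl_url('https://example.com', wait_until_done=False)
job_id = job['jobId']

while True:
    status = app.check_crawl_status(job_id)
    if status['status'] == 'completed':
        data = status.get('data')  # scraped documents, per the status payload above
        break
    elif status['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
        time.sleep(2)  # same minimum interval the SDK's own monitor uses
    else:
        raise RuntimeError(f"Crawl failed or was stopped: {status['status']}")
```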
## Error Handling

The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
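Since the SDK raises plain `Exception` instances with descriptive messages (see `_handle_error` above), a simple `try`/`except` is enough to surface them:

```python
try:
    scraped_data = app.scrape_url('https://example.com')
except Exception as e:
    # e.g. "Failed to scrape URL. Status code: 402. Error: Insufficient credits"
    print(f'Request failed: {e}')
```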
## Running the Tests with Pytest

To ensure the functionality of the Firecrawl Python SDK, we have included end-to-end tests using `pytest`. These tests cover various aspects of the SDK, including URL scraping, web searching, and website crawling.

### Running the Tests

To run the tests, execute the following commands:

Install pytest:
```bash
pip install pytest
```

Run:
```bash
pytest firecrawl/__tests__/e2e_withAuth/test.py
```
## Contributing

Contributions to the Firecrawl Python SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.

## License

The Firecrawl Python SDK is open-source and released under the [MIT License](https://opensource.org/licenses/MIT).
@@ -1,9 +0,0 @@
README.md
setup.py
firecrawl/__init__.py
firecrawl/firecrawl.py
firecrawl_py.egg-info/PKG-INFO
firecrawl_py.egg-info/SOURCES.txt
firecrawl_py.egg-info/dependency_links.txt
firecrawl_py.egg-info/requires.txt
firecrawl_py.egg-info/top_level.txt
@@ -1,3 +0,0 @@
requests
pytest
python-dotenv
@@ -1 +0,0 @@
firecrawl