commit e7db5a2d5b (parent 86f41460e0)

    tomkosms review
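In summary, the diff below tightens typing across the Python SDK: the single catch-all Action model is split into one pydantic model per action type, loosely typed Dict[str, Any] parameters on the scrape/crawl/map methods are replaced with the corresponding pydantic param models, long signatures are reflowed to one parameter per line, and the async HTTP helpers accept any 2xx status instead of only 200.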
@@ -108,16 +108,46 @@ class CrawlScrapeOptions(pydantic.BaseModel):
     blockAds: Optional[bool] = None
     proxy: Optional[Literal["basic", "stealth"]] = None
 
-class Action(pydantic.BaseModel):
-    """Action to perform during scraping."""
-    type: Literal["wait", "click", "screenshot", "write", "press", "scroll", "scrape", "executeJavascript"]
-    milliseconds: Optional[int] = None
+class WaitAction(pydantic.BaseModel):
+    """Wait action to perform during scraping."""
+    type: Literal["wait"]
+    milliseconds: int
     selector: Optional[str] = None
+
+class ScreenshotAction(pydantic.BaseModel):
+    """Screenshot action to perform during scraping."""
+    type: Literal["screenshot"]
     fullPage: Optional[bool] = None
-    text: Optional[str] = None
-    key: Optional[str] = None
-    direction: Optional[Literal["up", "down"]] = None
-    script: Optional[str] = None
+
+class ClickAction(pydantic.BaseModel):
+    """Click action to perform during scraping."""
+    type: Literal["click"]
+    selector: str
+
+class WriteAction(pydantic.BaseModel):
+    """Write action to perform during scraping."""
+    type: Literal["write"]
+    text: str
+
+class PressAction(pydantic.BaseModel):
+    """Press action to perform during scraping."""
+    type: Literal["press"]
+    key: str
+
+class ScrollAction(pydantic.BaseModel):
+    """Scroll action to perform during scraping."""
+    type: Literal["scroll"]
+    direction: Literal["up", "down"]
+    selector: Optional[str] = None
+
+class ScrapeAction(pydantic.BaseModel):
+    """Scrape action to perform during scraping."""
+    type: Literal["scrape"]
+
+class ExecuteJavascriptAction(pydantic.BaseModel):
+    """Execute javascript action to perform during scraping."""
+    type: Literal["executeJavascript"]
+    script: str
 
 class ExtractConfig(pydantic.BaseModel):
     """Configuration for extraction."""
@@ -129,7 +159,7 @@ class ScrapeParams(CrawlScrapeOptions):
     """Parameters for scraping operations."""
     extract: Optional[ExtractConfig] = None
     jsonOptions: Optional[ExtractConfig] = None
-    actions: Optional[List[Action]] = None
+    actions: Optional[List[Union[WaitAction, ScreenshotAction, ClickAction, WriteAction, PressAction, ScrollAction, ScrapeAction, ExecuteJavascriptAction]]] = None
 
 class ScrapeResponse(FirecrawlDocument[T], Generic[T]):
     """Response from scraping operations."""
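A minimal usage sketch of the split models (the selector and timing values are illustrative, not from the diff): each action is now validated against its own schema instead of one Action model where every field was optional, so a missing required field fails at construction time rather than at the API.

# Hypothetical caller-side construction of a typed action list.
actions = [
    WaitAction(type="wait", milliseconds=2000),         # milliseconds now required
    ClickAction(type="click", selector="#load-more"),   # selector now required
    ScrollAction(type="scroll", direction="down"),
    ScreenshotAction(type="screenshot", fullPage=True),
]
params = ScrapeParams(actions=actions)

# Misuse is caught locally, e.g. WaitAction(type="wait") raises a
# pydantic ValidationError because milliseconds has no default.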
@@ -240,7 +270,7 @@ class SearchParams(pydantic.BaseModel):
     location: Optional[str] = None
     origin: Optional[str] = "api"
     timeout: Optional[int] = 60000
-    scrapeOptions: Optional[Dict[str, Any]] = None
+    scrapeOptions: Optional[CrawlScrapeOptions] = None
 
 class SearchResponse(pydantic.BaseModel):
     """Response from search operations."""
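For illustration, what the stricter scrapeOptions field means for callers; the query field is an assumption about SearchParams, since this hunk only shows location, origin, timeout, and scrapeOptions:

params = SearchParams(
    query="web scraping",          # assumed field, not shown in this hunk
    timeout=60000,
    scrapeOptions=CrawlScrapeOptions(
        blockAds=True,
        proxy="stealth",           # checked against Literal["basic", "stealth"]
    ),
)

A mistyped value such as proxy="stealh" is now rejected at construction instead of being forwarded to the API inside an arbitrary dict.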
@@ -295,10 +325,14 @@ class GenerateLLMsTextResponse(pydantic.BaseModel):
     id: str
     error: Optional[str] = None
 
+class GenerateLLMsTextStatusResponseData(pydantic.BaseModel):
+    llmstxt: str
+    llmsfulltxt: Optional[str] = None
+
 class GenerateLLMsTextStatusResponse(pydantic.BaseModel):
     """Status response from LLMs.txt generation operations."""
     success: bool = True
-    data: Optional[Dict[str, str]] = None # {llmstxt: str, llmsfulltxt?: str}
+    data: Optional[GenerateLLMsTextStatusResponseData] = None
     status: Literal["processing", "completed", "failed"]
     error: Optional[str] = None
     expiresAt: str
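The caller-side effect, sketched under the assumption that status_response is a GenerateLLMsTextStatusResponse: the data payload moves from an unchecked dict to attribute access with an explicit required/optional split.

# Before: data was Optional[Dict[str, str]] and keys were unchecked.
#     llmstxt = status_response.data["llmstxt"]       # KeyError-prone

# After: llmstxt is guaranteed present whenever data is set.
if status_response.data is not None:
    print(status_response.data.llmstxt)
    if status_response.data.llmsfulltxt is not None:  # optional field
        print(status_response.data.llmsfulltxt)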
@@ -322,13 +356,16 @@ class FirecrawlApp:
 
         logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")
 
-    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> ScrapeResponse[Any]:
+    def scrape_url(
+            self,
+            url: str,
+            params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]:
         """
         Scrape and extract content from a URL.
 
         Args:
             url (str): Target URL to scrape
-            params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration:
+            params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
             Content Options:
             * formats - Content types to retrieve (markdown/html/etc)
             * includeTags - HTML tags to include
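A hedged sketch of the new synchronous signature in use; the formats option comes from the docstring's content options, and result.markdown assumes FirecrawlDocument exposes the requested formats as attributes:

app = FirecrawlApp(api_key="fc-...")   # placeholder key
result = app.scrape_url(
    "https://example.com",
    params=ScrapeParams(formats=["markdown", "html"]),
)
print(result.markdown)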
@@ -410,7 +447,10 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse:
+    def search(
+            self,
+            query: str,
+            params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse:
         """
         Search for content using Firecrawl.
 
@@ -520,14 +560,18 @@ class FirecrawlApp:
             self._handle_error(response, 'start crawl job')
 
 
-    def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> CrawlResponse:
+    def async_crawl_url(
+            self,
+            url: str,
+            params: Optional[CrawlParams] = None,
+            idempotency_key: Optional[str] = None) -> CrawlResponse:
         """
         Start an asynchronous crawl job.
 
         Args:
             url (str): Target URL to start crawling from
 
-            params (Optional[Dict[str, Any]]): See CrawlParams model:
+            params (Optional[CrawlParams]): See CrawlParams model:
 
             URL Discovery:
             * includePaths - Patterns of URLs to include
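Similarly for crawls, a sketch assuming CrawlParams carries the URL-discovery fields named in the docstring (includePaths, excludePaths); the patterns, key, and the id attribute on the response are illustrative:

crawl_job = app.async_crawl_url(
    "https://example.com",
    params=CrawlParams(
        includePaths=["/docs/*"],
        excludePaths=["/docs/archive/*"],
    ),
    idempotency_key="crawl-docs-001",   # hypothetical key
)
print(crawl_job.id)   # assuming CrawlResponse carries the job id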
@@ -754,7 +798,10 @@ class FirecrawlApp:
         else:
             raise Exception("Crawl job failed to start")
 
-    def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> MapResponse:
+    def map_url(
+            self,
+            url: str,
+            params: Optional[MapParams] = None) -> MapResponse:
         """
         Map and discover links from a URL.
 
@@ -1891,7 +1938,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
                     if response.status == 502:
                         await asyncio.sleep(backoff_factor * (2 ** attempt))
                         continue
-                    if response.status != 200:
+                    if response.status >= 300:
                         await self._handle_error(response, "make POST request")
                     return await response.json()
             except aiohttp.ClientError as e:
@@ -1930,7 +1977,7 @@ class AsyncFirecrawlApp(FirecrawlApp):
                     if response.status == 502:
                         await asyncio.sleep(backoff_factor * (2 ** attempt))
                         continue
-                    if response.status != 200:
+                    if response.status >= 300: # Accept any 2xx status code as success
                         await self._handle_error(response, "make GET request")
                     return await response.json()
             except aiohttp.ClientError as e:
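The same one-line change lands in both the POST and GET helpers. Its effect, isolated as a predicate: the old check sent every non-200 response (including 201 Created, 202 Accepted, and 204 No Content) to the error handler, while the new one treats the whole 2xx range as success. A minimal sketch:

def is_error(status: int) -> bool:
    # Old: anything but exactly 200 was an error.
    #     return status != 200
    # New: only redirects, client errors, and server errors go to the
    # error handler, so 201/202/204 responses are accepted as success.
    return status >= 300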
@@ -2060,13 +2107,16 @@ class AsyncFirecrawlApp(FirecrawlApp):
         else:
             raise Exception("Batch scrape job failed to start")
 
-    async def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> ScrapeResponse[Any]:
+    async def scrape_url(
+            self,
+            url: str,
+            params: Optional[ScrapeParams] = None) -> ScrapeResponse[Any]:
         """
         Asynchronously scrape and extract content from a URL.
 
         Args:
             url (str): Target URL to scrape
-            params (Optional[Dict[str, Any]]): See ScrapeParams model for configuration:
+            params (Optional[ScrapeParams]): See ScrapeParams model for configuration:
             Content Options:
             * formats - Content types to retrieve (markdown/html/etc)
             * includeTags - HTML tags to include
@@ -2122,7 +2172,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
         else:
             raise Exception(f'Failed to scrape URL. Error: {response}')
 
-    async def batch_scrape_urls(self, urls: List[str], params: Optional[ScrapeParams] = None) -> BatchScrapeStatusResponse:
+    async def batch_scrape_urls(
+            self,
+            urls: List[str],
+            params: Optional[ScrapeParams] = None) -> BatchScrapeStatusResponse:
         """
         Asynchronously scrape multiple URLs and monitor until completion.
 
@@ -2282,13 +2335,17 @@ class AsyncFirecrawlApp(FirecrawlApp):
         else:
             raise Exception(f'Failed to start crawl. Error: {response.get("error")}')
 
-    async def async_crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, idempotency_key: Optional[str] = None) -> CrawlResponse:
+    async def async_crawl_url(
+            self,
+            url: str,
+            params: Optional[CrawlParams] = None,
+            idempotency_key: Optional[str] = None) -> CrawlResponse:
         """
         Initiate an asynchronous crawl job without waiting for completion.
 
         Args:
             url (str): Target URL to start crawling from
-            params (Optional[Dict[str, Any]]): See CrawlParams model:
+            params (Optional[CrawlParams]): See CrawlParams model:
             URL Discovery:
             * includePaths - Patterns of URLs to include
             * excludePaths - Patterns of URLs to exclude
@@ -2442,13 +2499,16 @@ class AsyncFirecrawlApp(FirecrawlApp):
         else:
             raise Exception(f'Job failed or was stopped. Status: {status_data["status"]}')
 
-    async def map_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> MapResponse:
+    async def map_url(
+            self,
+            url: str,
+            params: Optional[MapParams] = None) -> MapResponse:
         """
         Asynchronously map and discover links from a URL.
 
         Args:
             url (str): Target URL to map
-            params (Optional[Dict[str, Any]]): See MapParams model:
+            params (Optional[MapParams]): See MapParams model:
             Discovery Options:
             * search - Filter pattern for URLs
             * ignoreSitemap - Skip sitemap.xml
@@ -2486,7 +2546,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
         else:
             raise Exception(f'Failed to map URL. Error: {response}')
 
-    async def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
+    async def extract(
+            self,
+            urls: List[str],
+            params: Optional[ExtractParams] = None) -> ExtractResponse[Any]:
         """
         Asynchronously extract structured information from URLs.
 
@@ -2792,7 +2855,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
         except Exception as e:
             raise ValueError(str(e))
 
-    async def generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse:
+    async def generate_llms_text(
+            self,
+            url: str,
+            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextStatusResponse:
         """
         Generate LLMs.txt for a given URL and monitor until completion.
 
@@ -2843,7 +2909,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
         return {'success': False, 'error': 'LLMs.txt generation job terminated unexpectedly'}
 
-    async def async_generate_llms_text(self, url: str, params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse:
+    async def async_generate_llms_text(
+            self,
+            url: str,
+            params: Optional[Union[Dict[str, Any], GenerateLLMsTextParams]] = None) -> GenerateLLMsTextResponse:
         """
         Initiate an asynchronous LLMs.txt generation job without waiting for completion.
 
@@ -2996,7 +3065,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
 
         return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
 
-    async def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse:
+    async def async_deep_research(
+            self,
+            query: str,
+            params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> DeepResearchResponse:
         """
         Initiate an asynchronous deep research job without waiting for completion.
 
@@ -3069,7 +3141,10 @@ class AsyncFirecrawlApp(FirecrawlApp):
         except Exception as e:
             raise ValueError(str(e))
 
-    async def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse:
+    async def search(
+            self,
+            query: str,
+            params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> SearchResponse:
         """
         Asynchronously search for content using Firecrawl.
 