mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 04:35:52 +08:00
commit 9e87d05b77

2 .github/workflows/fly.yml vendored
@@ -132,7 +132,7 @@ jobs:
      working-directory: ./apps/python-sdk
    - name: Run E2E tests for Python SDK
      run: |
        pytest firecrawl/__tests__/e2e_withAuth/test.py
        pytest firecrawl/__tests__/v1/e2e_withAuth/test.py
      working-directory: ./apps/python-sdk

  js-sdk-tests:
2 .gitignore vendored

@@ -19,3 +19,5 @@ apps/test-suite/load-test-results/test-run-report.json

apps/playwright-service-ts/node_modules/
apps/playwright-service-ts/package-lock.json

*.pyc
.rdb
8 .gitmodules vendored

@@ -1,6 +1,6 @@

[submodule "apps/go-sdk/firecrawl"]
	path = apps/go-sdk/firecrawl
[submodule "apps/go-sdk/firecrawl-go"]
	path = apps/go-sdk/firecrawl-go
	url = https://github.com/mendableai/firecrawl-go
[submodule "apps/go-sdk/examples"]
	path = apps/go-sdk/examples
[submodule "apps/go-sdk/firecrawl-go-examples"]
	path = apps/go-sdk/firecrawl-go-examples
	url = https://github.com/mendableai/firecrawl-go-examples
@@ -44,7 +44,6 @@ BULL_AUTH_KEY=

LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs
300 README.md

@@ -6,7 +6,7 @@ _This repository is in its early development stages. We are still merging custom

## What is Firecrawl?

[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required.
[Firecrawl](https://firecrawl.dev?ref=github) is an API service that takes a URL, crawls it, and converts it into clean markdown or structured data. We crawl all accessible subpages and give you clean data for each. No sitemap required. Check out our [documentation](https://docs.firecrawl.dev).

_Pst. hey, you, join our stargazers :)_

@@ -41,18 +41,26 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge

Used to crawl a URL and all accessible subpages. This submits a crawl job and returns a job ID to check the status of the crawl.

```bash
curl -X POST https://api.firecrawl.dev/v0/crawl \
curl -X POST https://api.firecrawl.dev/v1/crawl \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -H 'Authorization: Bearer fc-YOUR_API_KEY' \
    -d '{
      "url": "https://mendable.ai"
      "url": "https://docs.firecrawl.dev",
      "limit": 100,
      "scrapeOptions": {
        "formats": ["markdown", "html"]
      }
    }'
```

Returns a jobId
Returns a crawl job id and the url to check the status of the crawl.

```json
{ "jobId": "1234-5678-9101" }
{
  "success": true,
  "id": "123-456-789",
  "url": "https://api.firecrawl.dev/v1/crawl/123-456-789"
}
```
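The crawl endpoint is asynchronous: you submit a job, then poll the returned `url` until it finishes. As a rough sketch (not part of the original README), the same flow in Python with `requests` might look like the following; the `status`, `data`, and `metadata.sourceURL` fields are taken from the status-response example shown in the next section, and the exact field names should be double-checked against the current API docs.

```python
import time
import requests

API_KEY = "fc-YOUR_API_KEY"  # placeholder token, as in the curl examples
headers = {"Authorization": f"Bearer {API_KEY}", "Content-Type": "application/json"}

# Submit the crawl job (mirrors the curl request above)
job = requests.post(
    "https://api.firecrawl.dev/v1/crawl",
    headers=headers,
    json={
        "url": "https://docs.firecrawl.dev",
        "limit": 100,
        "scrapeOptions": {"formats": ["markdown", "html"]},
    },
).json()

# Poll the status URL returned by the API until the crawl completes
status = requests.get(job["url"], headers=headers).json()
while status.get("status") != "completed":
    time.sleep(5)  # avoid hammering the API while the job runs
    status = requests.get(job["url"], headers=headers).json()

for page in status.get("data", []):
    print(page["metadata"]["sourceURL"])
```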
### Check Crawl Job

@@ -60,7 +68,7 @@ Returns a jobId

Used to check the status of a crawl job and get its result.

```bash
curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \
curl -X GET https://api.firecrawl.dev/v1/crawl/123-456-789 \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY'
```

@@ -68,18 +76,20 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \

```json
{
  "status": "completed",
  "current": 22,
  "total": 22,
  "total": 36,
  "creditsUsed": 36,
  "expiresAt": "2024-00-00T00:00:00.000Z",
  "data": [
    {
      "content": "Raw Content ",
      "markdown": "# Markdown Content",
      "provider": "web-scraper",
      "markdown": "[Firecrawl Docs home page!...",
      "html": "<!DOCTYPE html><html lang=\"en\" class=\"js-focus-visible lg:[--scroll-mt:9.5rem]\" data-js-focus-visible=\"\">...",
      "metadata": {
        "title": "Mendable | AI for CX and Sales",
        "description": "AI for CX and Sales",
        "language": null,
        "sourceURL": "https://www.mendable.ai/"
        "title": "Build a 'Chat with website' using Groq Llama 3 | Firecrawl",
        "language": "en",
        "sourceURL": "https://docs.firecrawl.dev/learn/rag-llama3",
        "description": "Learn how to use Firecrawl, Groq Llama 3, and Langchain to build a 'Chat with your website' bot.",
        "ogLocaleAlternate": [],
        "statusCode": 200
      }
    }
  ]
@@ -88,14 +98,15 @@ curl -X GET https://api.firecrawl.dev/v0/crawl/status/1234-5678-9101 \

### Scraping

Used to scrape a URL and get its content.
Used to scrape a URL and get its content in the specified formats.

```bash
curl -X POST https://api.firecrawl.dev/v0/scrape \
curl -X POST https://api.firecrawl.dev/v1/scrape \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "url": "https://mendable.ai"
    "url": "https://docs.firecrawl.dev",
    "formats" : ["markdown", "html"]
  }'
```

@@ -105,55 +116,83 @@ Response:

{
  "success": true,
  "data": {
    "content": "Raw Content ",
    "markdown": "# Markdown Content",
    "provider": "web-scraper",
    "markdown": "Launch Week I is here! [See our Day 2 Release 🚀](https://www.firecrawl.dev/blog/launch-week-i-day-2-doubled-rate-limits)[💥 Get 2 months free...",
    "html": "<!DOCTYPE html><html lang=\"en\" class=\"light\" style=\"color-scheme: light;\"><body class=\"__variable_36bd41 __variable_d7dc5d font-inter ...",
    "metadata": {
      "title": "Mendable | AI for CX and Sales",
      "description": "AI for CX and Sales",
      "language": null,
      "sourceURL": "https://www.mendable.ai/"
      "title": "Home - Firecrawl",
      "description": "Firecrawl crawls and converts any website into clean markdown.",
      "language": "en",
      "keywords": "Firecrawl,Markdown,Data,Mendable,Langchain",
      "robots": "follow, index",
      "ogTitle": "Firecrawl",
      "ogDescription": "Turn any website into LLM-ready data.",
      "ogUrl": "https://www.firecrawl.dev/",
      "ogImage": "https://www.firecrawl.dev/og.png?123",
      "ogLocaleAlternate": [],
      "ogSiteName": "Firecrawl",
      "sourceURL": "https://firecrawl.dev",
      "statusCode": 200
    }
  }
}
```
### Search (Beta)
### Map (Alpha)

Used to search the web, get the most relevant results, scrape each page and return the markdown.
Used to map a URL and get urls of the website. This returns most links present on the website.

```bash
curl -X POST https://api.firecrawl.dev/v0/search \
```bash cURL
curl -X POST https://api.firecrawl.dev/v1/map \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "query": "firecrawl",
    "pageOptions": {
      "fetchPageContent": true // false for a fast serp api
    }
    "url": "https://firecrawl.dev"
  }'
```

Response:

```json
{
  "success": true,
  "data": [
    {
      "url": "https://mendable.ai",
      "markdown": "# Markdown Content",
      "provider": "web-scraper",
      "metadata": {
        "title": "Mendable | AI for CX and Sales",
        "description": "AI for CX and Sales",
        "language": null,
        "sourceURL": "https://www.mendable.ai/"
      }
    }
  "status": "success",
  "links": [
    "https://firecrawl.dev",
    "https://www.firecrawl.dev/pricing",
    "https://www.firecrawl.dev/blog",
    "https://www.firecrawl.dev/playground",
    "https://www.firecrawl.dev/smart-crawl",
  ]
}
```

### Intelligent Extraction (Beta)
#### Map with search

Map with `search` param allows you to search for specific urls inside a website.

```bash cURL
curl -X POST https://api.firecrawl.dev/v1/map \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "url": "https://firecrawl.dev",
    "search": "docs"
  }'
```

Response will be an ordered list from the most relevant to the least relevant.

```json
{
  "status": "success",
  "links": [
    "https://docs.firecrawl.dev",
    "https://docs.firecrawl.dev/sdks/python",
    "https://docs.firecrawl.dev/learn/rag-llama3",
  ]
}
```
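As a sketch only (not from the original README), both map calls can also be made programmatically; this assumes nothing beyond the `/v1/map` request body and the `links` array shown in the curl examples above.

```python
import requests

API_KEY = "fc-YOUR_API_KEY"  # placeholder token, as in the curl examples


def map_site(url: str, search: str | None = None) -> list[str]:
    """POST to /v1/map and return the discovered links."""
    payload = {"url": url}
    if search is not None:
        payload["search"] = search  # narrows the result to the most relevant URLs
    resp = requests.post(
        "https://api.firecrawl.dev/v1/map",
        headers={
            "Authorization": f"Bearer {API_KEY}",
            "Content-Type": "application/json",
        },
        json=payload,
    )
    resp.raise_for_status()
    return resp.json().get("links", [])


print(map_site("https://firecrawl.dev"))          # all discovered links
print(map_site("https://firecrawl.dev", "docs"))  # only docs-related links
```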
### LLM Extraction (v0) (Beta)

Used to extract structured data from scraped pages.

@@ -220,6 +259,42 @@ curl -X POST https://api.firecrawl.dev/v0/scrape \

}
```

### Search (v0) (Beta)

Used to search the web, get the most relevant results, scrape each page and return the markdown.

```bash
curl -X POST https://api.firecrawl.dev/v0/search \
  -H 'Content-Type: application/json' \
  -H 'Authorization: Bearer YOUR_API_KEY' \
  -d '{
    "query": "firecrawl",
    "pageOptions": {
      "fetchPageContent": true // false for a fast serp api
    }
  }'
```

```json
{
  "success": true,
  "data": [
    {
      "url": "https://mendable.ai",
      "markdown": "# Markdown Content",
      "provider": "web-scraper",
      "metadata": {
        "title": "Mendable | AI for CX and Sales",
        "description": "AI for CX and Sales",
        "language": null,
        "sourceURL": "https://www.mendable.ai/"
      }
    }
  ]
}
```

## Using Python SDK

### Installing Python SDK

@@ -231,24 +306,28 @@ pip install firecrawl-py
### Crawl a website

```python
from firecrawl import FirecrawlApp
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="YOUR_API_KEY")
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")

crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}})
# Scrape a website:
scrape_status = app.scrape_url(
    'https://firecrawl.dev',
    params={'formats': ['markdown', 'html']}
)
print(scrape_status)

# Get the markdown
for result in crawl_result:
    print(result['markdown'])
```

### Scraping a URL

To scrape a single URL, use the `scrape_url` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```python
url = 'https://example.com'
scraped_data = app.scrape_url(url)
# Crawl a website:
crawl_status = app.crawl_url(
    'https://firecrawl.dev',
    params={
        'limit': 100,
        'scrapeOptions': {'formats': ['markdown', 'html']}
    },
    wait_until_done=True,
    poll_interval=30
)
print(crawl_status)
```

### Extracting structured data from a URL

@@ -256,6 +335,11 @@ scraped_data = app.scrape_url(url)

With LLM extraction, you can easily extract structured data from any URL. We support pydantic schemas to make it easier for you too. Here is how to use it:

```python

from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")

class ArticleSchema(BaseModel):
    title: str
    points: int

@@ -277,15 +361,6 @@ data = app.scrape_url('https://news.ycombinator.com', {

print(data["llm_extraction"])
```
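The hunk above is truncated, so only part of the extraction example survives. Below is a self-contained sketch of what such a call can look like: the `extractorOptions`, `extractionSchema`, and `llm-extraction` mode names come from `apps/api/openapi-v0.json` in this same commit, while the `TopArticlesSchema` wrapper and the exact `scrape_url` signature are assumptions to verify against the SDK docs.

```python
from pydantic import BaseModel, Field
from firecrawl.firecrawl import FirecrawlApp

app = FirecrawlApp(api_key="fc-YOUR_API_KEY", version="v0")


class ArticleSchema(BaseModel):
    title: str
    points: int
    author: str = Field(description="Username of the submitter")  # hypothetical extra field


class TopArticlesSchema(BaseModel):  # hypothetical wrapper schema
    top: list[ArticleSchema] = Field(description="Top articles on the page")


data = app.scrape_url(
    "https://news.ycombinator.com",
    {
        "extractorOptions": {
            # JSON Schema derived from the pydantic model
            "extractionSchema": TopArticlesSchema.model_json_schema(),
            "mode": "llm-extraction",
        },
        "pageOptions": {"onlyMainContent": True},
    },
)
print(data["llm_extraction"])
```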
### Search for a query

Performs a web search, retrieves the top results, scrapes each page, and returns their markdown.

```python
query = 'What is Mendable?'
search_result = app.search(query)
```

## Using the Node SDK

### Installation

@@ -301,54 +376,33 @@ npm install @mendable/firecrawl-js

1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.

### Scraping a URL

To scrape a single URL with error handling, use the `scrapeUrl` method. It takes the URL as a parameter and returns the scraped data as a dictionary.

```js
try {
  const url = "https://example.com";
  const scrapedData = await app.scrapeUrl(url);
  console.log(scrapedData);
} catch (error) {
  console.error("Error occurred while scraping:", error.message);
import FirecrawlApp, { CrawlParams, CrawlStatusResponse } from '@mendable/firecrawl-js';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

// Scrape a website
const scrapeResponse = await app.scrapeUrl('https://firecrawl.dev', {
  formats: ['markdown', 'html'],
});

if (scrapeResponse) {
  console.log(scrapeResponse)
}

// Crawl a website
const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
  limit: 100,
  scrapeOptions: {
    formats: ['markdown', 'html'],
  }
} as CrawlParams, true, 30) as CrawlStatusResponse;

if (crawlResponse) {
  console.log(crawlResponse)
}
```

### Crawling a Website

To crawl a website with error handling, use the `crawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.

```js
const crawlUrl = "https://example.com";
const params = {
  crawlerOptions: {
    excludes: ["blog/"],
    includes: [], // leave empty for all pages
    limit: 1000,
  },
  pageOptions: {
    onlyMainContent: true,
  },
};
const waitUntilDone = true;
const timeout = 5;
const crawlResult = await app.crawlUrl(
  crawlUrl,
  params,
  waitUntilDone,
  timeout
);
```

### Checking Crawl Status

To check the status of a crawl job with error handling, use the `checkCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.

```js
const status = await app.checkCrawlStatus(jobId);
console.log(status);
```

### Extracting structured data from a URL

@@ -360,6 +414,7 @@ import { z } from "zod";

const app = new FirecrawlApp({
  apiKey: "fc-YOUR_API_KEY",
  version: "v0"
});

// Define schema to extract contents into

@@ -384,19 +439,6 @@ const scrapeResult = await app.scrapeUrl("https://news.ycombinator.com", {

console.log(scrapeResult.data["llm_extraction"]);
```

### Search for a query

With the `search` method, you can search for a query in a search engine and get the top results along with the page content for each result. The method takes the query as a parameter and returns the search results.

```js
const query = "what is mendable?";
const searchResults = await app.search(query, {
  pageOptions: {
    fetchPageContent: true, // Fetch the page content for each search result
  },
});
```

## Contributing

We love contributions! Please read our [contributing guide](CONTRIBUTING.md) before submitting a pull request.
@@ -65,7 +65,6 @@ BULL_AUTH_KEY=

LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
LLAMAPARSE_API_KEY= #Set if you have a llamaparse key you'd like to use to parse pdfs
SERPER_API_KEY= #Set if you have a serper key you'd like to use as a search api
SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
POSTHOG_HOST= # set if you'd like to send posthog events like job logs

@@ -32,8 +32,6 @@ BULL_AUTH_KEY=

LOGTAIL_KEY=
# set if you have a llamaparse key you'd like to use to parse pdfs
LLAMAPARSE_API_KEY=
# set if you have a serper key you'd like to use as a search api
SERPER_API_KEY=
# set if you'd like to send slack server health status messages
SLACK_WEBHOOK_URL=
# set if you'd like to send posthog events like job logs
2 apps/api/.gitignore vendored

@@ -7,5 +7,5 @@ dump.rdb

/.next/

# Sentry Config File
.rdb
.sentryclirc

924 apps/api/openapi-v0.json Normal file

@@ -0,0 +1,924 @@
{
|
||||
"openapi": "3.0.0",
|
||||
"info": {
|
||||
"title": "Firecrawl API",
|
||||
"version": "0.0.0",
|
||||
"description": "API for interacting with Firecrawl services to perform web scraping and crawling tasks.",
|
||||
"contact": {
|
||||
"name": "Firecrawl Support",
|
||||
"url": "https://firecrawl.dev/support",
|
||||
"email": "support@firecrawl.dev"
|
||||
}
|
||||
},
|
||||
"servers": [
|
||||
{
|
||||
"url": "https://api.firecrawl.dev/v0"
|
||||
}
|
||||
],
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
},
|
||||
"extractorOptions": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/ScrapeResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl": {
|
||||
"post": {
|
||||
"summary": "Crawl multiple URLs based on options",
|
||||
"operationId": "crawlUrls",
|
||||
"tags": ["Crawling"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The base URL to start crawling from"
|
||||
},
|
||||
"crawlerOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"includes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to include"
|
||||
},
|
||||
"excludes": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "URL patterns to exclude"
|
||||
},
|
||||
"generateImgAltText": {
|
||||
"type": "boolean",
|
||||
"description": "Generate alt text for images using LLMs (must have a paid plan)",
|
||||
"default": false
|
||||
},
|
||||
"returnOnlyUrls": {
|
||||
"type": "boolean",
|
||||
"description": "If true, returns only the URLs as a list on the crawl status. Attention: the return response will be a list of URLs inside the data, not a list of documents.",
|
||||
"default": false
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["default", "fast"],
|
||||
"description": "The crawling mode to use. Fast mode crawls 4x faster websites without sitemap, but may not be as accurate and shouldn't be used in heavy js-rendered websites.",
|
||||
"default": "default"
|
||||
},
|
||||
"ignoreSitemap": {
|
||||
"type": "boolean",
|
||||
"description": "Ignore the website sitemap when crawling",
|
||||
"default": false
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of pages to crawl",
|
||||
"default": 10000
|
||||
},
|
||||
"allowBackwardCrawling": {
|
||||
"type": "boolean",
|
||||
"description": "Enables the crawler to navigate from a specific URL to previously linked pages. For instance, from 'example.com/product/123' back to 'example.com/product'",
|
||||
"default": false
|
||||
},
|
||||
"allowExternalContentLinks": {
|
||||
"type": "boolean",
|
||||
"description": "Allows the crawler to follow links to external websites.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/CrawlResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/search": {
|
||||
"post": {
|
||||
"summary": "Search for a keyword in Google, returns top page results with markdown content for each page",
|
||||
"operationId": "searchGoogle",
|
||||
"tags": ["Search"],
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"requestBody": {
|
||||
"required": true,
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"query": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"description": "The query to search for"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"fetchPageContent": {
|
||||
"type": "boolean",
|
||||
"description": "Fetch the content of each page. If false, defaults to a basic fast serp API.",
|
||||
"default": true
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
"searchOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of results. Max is 20 during beta."
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"required": ["query"]
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"$ref": "#/components/schemas/SearchResponse"
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/status/{jobId}": {
|
||||
"get": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Get the status of a crawl job",
|
||||
"operationId": "getCrawlStatus",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Status of the job (completed, active, failed, paused)"
|
||||
},
|
||||
"current": {
|
||||
"type": "integer",
|
||||
"description": "Current page number"
|
||||
},
|
||||
"total": {
|
||||
"type": "integer",
|
||||
"description": "Total number of pages"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Data returned from the job (null when it is in progress)"
|
||||
},
|
||||
"partial_data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"$ref": "#/components/schemas/CrawlStatusResponseObj"
|
||||
},
|
||||
"description": "Partial documents returned as it is being crawled (streaming). **This feature is currently in alpha - expect breaking changes** When a page is ready, it will append to the partial_data array, so there is no need to wait for the entire website to be crawled. When the crawl is done, partial_data will become empty and the result will be available in `data`. There is a max of 50 items in the array response. The oldest item (top of the array) will be removed when the new item is added to the array."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"/crawl/cancel/{jobId}": {
|
||||
"delete": {
|
||||
"tags": ["Crawl"],
|
||||
"summary": "Cancel a crawl job",
|
||||
"operationId": "cancelCrawlJob",
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
],
|
||||
"parameters": [
|
||||
{
|
||||
"name": "jobId",
|
||||
"in": "path",
|
||||
"description": "ID of the crawl job",
|
||||
"required": true,
|
||||
"schema": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
],
|
||||
"responses": {
|
||||
"200": {
|
||||
"description": "Successful response",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"status": {
|
||||
"type": "string",
|
||||
"description": "Returns cancelled."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"402": {
|
||||
"description": "Payment required",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Payment required to access this resource."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"429": {
|
||||
"description": "Too many requests",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "Request rate limit exceeded. Please wait and try again later."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"500": {
|
||||
"description": "Server error",
|
||||
"content": {
|
||||
"application/json": {
|
||||
"schema": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"example": "An unexpected error occurred on the server."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"components": {
|
||||
"securitySchemes": {
|
||||
"bearerAuth": {
|
||||
"type": "http",
|
||||
"scheme": "bearer"
|
||||
}
|
||||
},
|
||||
"schemas": {
|
||||
"ScrapeResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlStatusResponseObj": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"SearchResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"type": "string"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"security": [
|
||||
{
|
||||
"bearerAuth": []
|
||||
}
|
||||
]
|
||||
}
|
@@ -18,8 +18,8 @@
|
||||
"paths": {
|
||||
"/scrape": {
|
||||
"post": {
|
||||
"summary": "Scrape a single URL and optionally extract information using an LLM",
|
||||
"operationId": "scrapeAndExtractFromUrl",
|
||||
"summary": "Scrape a single URL",
|
||||
"operationId": "scrape",
|
||||
"tags": ["Scraping"],
|
||||
"security": [
|
||||
{
|
||||
@@ -38,94 +38,47 @@
|
||||
"format": "uri",
|
||||
"description": "The URL to scrape"
|
||||
},
|
||||
"pageOptions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the HTML version of the content on page. Will output a html key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"includeRawHtml": {
|
||||
"type": "boolean",
|
||||
"description": "Include the raw HTML content of the page. Will output a rawHtml key in the response.",
|
||||
"default": false
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": false
|
||||
},
|
||||
"removeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a screenshot of the top of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"fullPageScreenshot": {
|
||||
"type": "boolean",
|
||||
"description": "Include a full page screenshot of the page that you are scraping.",
|
||||
"default": false
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
}
|
||||
"formats": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "html", "rawHtml", "links", "screenshot", "screenshot@fullPage"]
|
||||
},
|
||||
"description": "Specific formats to return.\n\n - markdown: The page in Markdown format.\n - html: The page's HTML, trimmed to include only meaningful content.\n - rawHtml: The page's original HTML.\n - links: The links on the page.\n - screenshot: A screenshot of the top of the page.\n - screenshot@fullPage: A screenshot of the full page. (overridden by screenshot if present)",
|
||||
"default": ["markdown"]
|
||||
},
|
||||
"extractorOptions": {
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Options for extraction of structured information from the page content. Note: LLM-based extraction is not performed by default and only occurs when explicitly configured. The 'markdown' mode simply returns the scraped markdown and is the default mode for scraping.",
|
||||
"default": {},
|
||||
"properties": {
|
||||
"mode": {
|
||||
"type": "string",
|
||||
"enum": ["markdown", "llm-extraction", "llm-extraction-from-raw-html", "llm-extraction-from-markdown"],
|
||||
"description": "The extraction mode to use. 'markdown': Returns the scraped markdown content, does not perform LLM extraction. 'llm-extraction': Extracts information from the cleaned and parsed content using LLM. 'llm-extraction-from-raw-html': Extracts information directly from the raw HTML using LLM. 'llm-extraction-from-markdown': Extracts information from the markdown content using LLM."
|
||||
},
|
||||
"extractionPrompt": {
|
||||
"type": "string",
|
||||
"description": "A prompt describing what information to extract from the page, applicable for LLM extraction modes."
|
||||
},
|
||||
"extractionSchema": {
|
||||
"type": "object",
|
||||
"additionalProperties": true,
|
||||
"description": "The schema for the data to be extracted, required only for LLM extraction modes.",
|
||||
"required": [
|
||||
"company_mission",
|
||||
"supports_sso",
|
||||
"is_open_source"
|
||||
]
|
||||
}
|
||||
}
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"includeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"excludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyMainContent": {
|
||||
"type": "boolean",
|
||||
"description": "Only return the main content of the page excluding headers, navs, footers, etc.",
|
||||
"default": true
|
||||
},
|
||||
"timeout": {
|
||||
"type": "integer",
|
||||
"description": "Timeout in milliseconds for the request",
|
||||
"default": 30000
|
||||
},
|
||||
"waitFor": {
|
||||
"type": "integer",
|
||||
"description": "Wait x amount of milliseconds for the page to load to fetch content",
|
||||
"default": 0
|
||||
}
|
||||
},
|
||||
"required": ["url"]
|
||||
@@ -741,24 +694,42 @@
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Warning message to let you know of any issues."
|
||||
},
|
||||
"data": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
@@ -780,27 +751,16 @@
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
"type": "object",
|
||||
"description": "Displayed when using LLM Extraction. Extracted data from the page following the schema defined.",
|
||||
"nullable": true
|
||||
},
|
||||
"warning": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Can be displayed when using LLM Extraction. Warning message will let you know any issues with the extraction."
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -810,24 +770,33 @@
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if `includeHtml` is true"
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeRawHtml` is true"
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"index": {
|
||||
"type": "integer",
|
||||
"description": "The number of the page that was crawled. This is useful for `partial_data` so you know which page the data is from."
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
@@ -849,11 +818,11 @@
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"pageError": {
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
@@ -871,34 +840,63 @@
|
||||
"data": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"url": {
|
||||
"type": "string"
|
||||
"markdown": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Markdown content of the page if the `markdown` format was specified (default)"
|
||||
},
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "HTML version of the content on page if the `html` format was specified"
|
||||
},
|
||||
"rawHtml": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if the `rawHtml` format was specified"
|
||||
},
|
||||
"links": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"markdown": {
|
||||
"type": "string"
|
||||
},
|
||||
"content": {
|
||||
"type": "string"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
"nullable": true,
|
||||
"description": "Links on the page if the `links` format was specified"
|
||||
},
|
||||
"screenshot": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "URL of the screenshot of the page if the `screenshot` or `screenshot@fullSize` format was specified"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": {
|
||||
"type": "string"
|
||||
},
|
||||
"language": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"statusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
},
|
||||
"error": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -909,8 +907,15 @@
|
||||
"CrawlResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"jobId": {
|
||||
"success": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"id": {
|
||||
"type": "string"
|
||||
},
|
||||
"url": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@@ -61,6 +61,8 @@
|
||||
"@sentry/node": "^8.26.0",
|
||||
"@sentry/profiling-node": "^8.26.0",
|
||||
"@supabase/supabase-js": "^2.44.2",
|
||||
"@types/express-ws": "^3.0.4",
|
||||
"@types/ws": "^8.5.12",
|
||||
"ajv": "^8.16.0",
|
||||
"async": "^3.2.5",
|
||||
"async-mutex": "^0.5.0",
|
||||
@@ -76,6 +78,7 @@
|
||||
"dotenv": "^16.3.1",
|
||||
"dotenv-cli": "^7.4.2",
|
||||
"express-rate-limit": "^7.3.1",
|
||||
"express-ws": "^5.0.2",
|
||||
"form-data": "^4.0.0",
|
||||
"glob": "^10.4.2",
|
||||
"gpt3-tokenizer": "^1.1.5",
|
||||
@@ -110,8 +113,9 @@
|
||||
"unstructured-client": "^0.11.3",
|
||||
"uuid": "^10.0.0",
|
||||
"wordpos": "^2.1.0",
|
||||
"ws": "^8.18.0",
|
||||
"xml2js": "^0.6.2",
|
||||
"zod": "^3.23.4",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.1"
|
||||
},
|
||||
"nodemonConfig": {
|
||||
|
101 apps/api/pnpm-lock.yaml generated
@@ -47,6 +47,12 @@ importers:
|
||||
'@supabase/supabase-js':
|
||||
specifier: ^2.44.2
|
||||
version: 2.44.2
|
||||
'@types/express-ws':
|
||||
specifier: ^3.0.4
|
||||
version: 3.0.4
|
||||
'@types/ws':
|
||||
specifier: ^8.5.12
|
||||
version: 8.5.12
|
||||
ajv:
|
||||
specifier: ^8.16.0
|
||||
version: 8.16.0
|
||||
@@ -92,6 +98,9 @@ importers:
|
||||
express-rate-limit:
|
||||
specifier: ^7.3.1
|
||||
version: 7.3.1(express@4.19.2)
|
||||
express-ws:
|
||||
specifier: ^5.0.2
|
||||
version: 5.0.2(express@4.19.2)
|
||||
form-data:
|
||||
specifier: ^4.0.0
|
||||
version: 4.0.0
|
||||
@@ -115,7 +124,7 @@ importers:
|
||||
version: 0.0.28
|
||||
langchain:
|
||||
specifier: ^0.2.8
|
||||
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
|
||||
version: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||
languagedetect:
|
||||
specifier: ^2.0.0
|
||||
version: 2.0.0
|
||||
@ -194,11 +203,14 @@ importers:
|
||||
wordpos:
|
||||
specifier: ^2.1.0
|
||||
version: 2.1.0
|
||||
ws:
|
||||
specifier: ^8.18.0
|
||||
version: 8.18.0
|
||||
xml2js:
|
||||
specifier: ^0.6.2
|
||||
version: 0.6.2
|
||||
zod:
|
||||
specifier: ^3.23.4
|
||||
specifier: ^3.23.8
|
||||
version: 3.23.8
|
||||
zod-to-json-schema:
|
||||
specifier: ^3.23.1
|
||||
@ -1637,6 +1649,9 @@ packages:
|
||||
'@types/express-serve-static-core@4.19.3':
|
||||
resolution: {integrity: sha512-KOzM7MhcBFlmnlr/fzISFF5vGWVSvN6fTd4T+ExOt08bA/dA5kpSzY52nMsI1KDFmUREpJelPYyuslLRSjjgCg==}
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
resolution: {integrity: sha512-Yjj18CaivG5KndgcvzttWe8mPFinPCHJC2wvyQqVzA7hqeufM8EtWMj6mpp5omg3s8XALUexhOu8aXAyi/DyJQ==}
|
||||
|
||||
'@types/express@4.17.21':
|
||||
resolution: {integrity: sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==}
|
||||
|
||||
@ -1739,8 +1754,8 @@ packages:
|
||||
'@types/whatwg-url@11.0.5':
|
||||
resolution: {integrity: sha512-coYR071JRaHa+xoEvvYqvnIHaVqaYrLPbsufM9BF63HkwI5Lgmy2QR8Q5K/lYDYo5AK82wOvSOS0UsLTpTG7uQ==}
|
||||
|
||||
'@types/ws@8.5.10':
|
||||
resolution: {integrity: sha512-vmQSUcfalpIq0R9q7uTo2lXs6eGIpt9wtnLdMv9LVpIjCA/+ufZRozlVoVelIYixx1ugCBKDhn89vnsEGOCx9A==}
|
||||
'@types/ws@8.5.12':
|
||||
resolution: {integrity: sha512-3tPRkv1EtkDpzlgyKyI8pGsGZAGPEaXeu0DOj5DI25Ja91bdAYddYHbADRYVrZMRbfW+1l5YwXVDKohDJNQxkQ==}
|
||||
|
||||
'@types/yargs-parser@21.0.3':
|
||||
resolution: {integrity: sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==}
|
||||
@ -2506,6 +2521,12 @@ packages:
|
||||
peerDependencies:
|
||||
express: 4 || 5 || ^5.0.0-beta.1
|
||||
|
||||
express-ws@5.0.2:
|
||||
resolution: {integrity: sha512-0uvmuk61O9HXgLhGl3QhNSEtRsQevtmbL94/eILaliEADZBHZOQUAiHFrGPrgsjikohyrmSG5g+sCfASTt0lkQ==}
|
||||
engines: {node: '>=4.5.0'}
|
||||
peerDependencies:
|
||||
express: ^4.0.0 || ^5.0.0-alpha.1
|
||||
|
||||
express@4.19.2:
|
||||
resolution: {integrity: sha512-5T6nhjsT+EOMzuck8JjBHARTHfMht0POzlA60WV2pMD3gyXw2LZnZ+ueGdNxG+0calOJcWKbpFcuzLZ91YWq9Q==}
|
||||
engines: {node: '>= 0.10.0'}
|
||||
@ -4647,8 +4668,20 @@ packages:
|
||||
resolution: {integrity: sha512-+QU2zd6OTD8XWIJCbffaiQeH9U73qIqafo1x6V1snCWYGJf6cVE0cDR4D8xRzcEnfI21IFrUPzPGtcPf8AC+Rw==}
|
||||
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
|
||||
|
||||
ws@8.17.1:
|
||||
resolution: {integrity: sha512-6XQFvXTkbfUOZOKKILFG1PDK2NDQs4azKQl26T0YS5CxqWLgXajbPZ+h4gZekJyRqFU8pvnbAbbs/3TgRPy+GQ==}
|
||||
ws@7.5.10:
|
||||
resolution: {integrity: sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==}
|
||||
engines: {node: '>=8.3.0'}
|
||||
peerDependencies:
|
||||
bufferutil: ^4.0.1
|
||||
utf-8-validate: ^5.0.2
|
||||
peerDependenciesMeta:
|
||||
bufferutil:
|
||||
optional: true
|
||||
utf-8-validate:
|
||||
optional: true
|
||||
|
||||
ws@8.18.0:
|
||||
resolution: {integrity: sha512-8VbfWfHLbbwu3+N6OKsOMpBdT4kXPDDB9cJk2bJ6mh9ucxdlnNvH1e+roYkKmN9Nxw2yjz7VzeO9oOz2zJ04Pw==}
|
||||
engines: {node: '>=10.0.0'}
|
||||
peerDependencies:
|
||||
bufferutil: ^4.0.1
|
||||
@ -5286,13 +5319,13 @@ snapshots:
|
||||
|
||||
'@js-sdsl/ordered-map@4.4.2': {}
|
||||
|
||||
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
|
||||
'@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
|
||||
dependencies:
|
||||
ansi-styles: 5.2.0
|
||||
camelcase: 6.3.0
|
||||
decamelize: 1.2.0
|
||||
js-tiktoken: 1.0.12
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
ml-distance: 4.0.1
|
||||
mustache: 4.2.0
|
||||
p-queue: 6.6.2
|
||||
@ -5304,9 +5337,9 @@ snapshots:
|
||||
- langchain
|
||||
- openai
|
||||
|
||||
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))':
|
||||
'@langchain/openai@0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))':
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
js-tiktoken: 1.0.12
|
||||
openai: 4.52.2
|
||||
zod: 3.23.8
|
||||
@ -5315,9 +5348,9 @@ snapshots:
|
||||
- encoding
|
||||
- langchain
|
||||
|
||||
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)':
|
||||
'@langchain/textsplitters@0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)':
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
js-tiktoken: 1.0.12
|
||||
transitivePeerDependencies:
|
||||
- langchain
|
||||
@ -6545,8 +6578,8 @@ snapshots:
|
||||
dependencies:
|
||||
'@supabase/node-fetch': 2.6.15
|
||||
'@types/phoenix': 1.6.5
|
||||
'@types/ws': 8.5.10
|
||||
ws: 8.17.1
|
||||
'@types/ws': 8.5.12
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- utf-8-validate
|
||||
@ -6643,6 +6676,12 @@ snapshots:
|
||||
'@types/range-parser': 1.2.7
|
||||
'@types/send': 0.17.4
|
||||
|
||||
'@types/express-ws@3.0.4':
|
||||
dependencies:
|
||||
'@types/express': 4.17.21
|
||||
'@types/express-serve-static-core': 4.19.3
|
||||
'@types/ws': 8.5.12
|
||||
|
||||
'@types/express@4.17.21':
|
||||
dependencies:
|
||||
'@types/body-parser': 1.19.5
|
||||
@ -6766,7 +6805,7 @@ snapshots:
|
||||
dependencies:
|
||||
'@types/webidl-conversions': 7.0.3
|
||||
|
||||
'@types/ws@8.5.10':
|
||||
'@types/ws@8.5.12':
|
||||
dependencies:
|
||||
'@types/node': 20.14.1
|
||||
|
||||
@ -7521,6 +7560,14 @@ snapshots:
|
||||
dependencies:
|
||||
express: 4.19.2
|
||||
|
||||
express-ws@5.0.2(express@4.19.2):
|
||||
dependencies:
|
||||
express: 4.19.2
|
||||
ws: 7.5.10
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- utf-8-validate
|
||||
|
||||
express@4.19.2:
|
||||
dependencies:
|
||||
accepts: 1.3.8
|
||||
@ -8440,17 +8487,17 @@ snapshots:
|
||||
|
||||
kleur@3.0.3: {}
|
||||
|
||||
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1):
|
||||
langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0):
|
||||
dependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))
|
||||
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
'@langchain/openai': 0.2.1(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))
|
||||
'@langchain/textsplitters': 0.0.3(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
binary-extensions: 2.3.0
|
||||
js-tiktoken: 1.0.12
|
||||
js-yaml: 4.1.0
|
||||
jsonpointer: 5.0.1
|
||||
langchainhub: 0.0.11
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langsmith: 0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
ml-distance: 4.0.1
|
||||
openapi-types: 12.1.3
|
||||
p-retry: 4.6.2
|
||||
@ -8470,14 +8517,14 @@ snapshots:
|
||||
pdf-parse: 1.1.1
|
||||
puppeteer: 22.12.1(typescript@5.4.5)
|
||||
redis: 4.6.14
|
||||
ws: 8.17.1
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- openai
|
||||
|
||||
langchainhub@0.0.11: {}
|
||||
|
||||
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2):
|
||||
langsmith@0.1.34(@langchain/core@0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2))(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2):
|
||||
dependencies:
|
||||
'@types/uuid': 9.0.8
|
||||
commander: 10.0.1
|
||||
@ -8486,8 +8533,8 @@ snapshots:
|
||||
p-retry: 4.6.2
|
||||
uuid: 9.0.1
|
||||
optionalDependencies:
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1))(openai@4.52.2)
|
||||
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.17.1)
|
||||
'@langchain/core': 0.2.12(langchain@0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0))(openai@4.52.2)
|
||||
langchain: 0.2.8(@supabase/supabase-js@2.44.2)(axios@1.7.2)(cheerio@1.0.0-rc.12)(handlebars@4.7.8)(html-to-text@9.0.5)(ioredis@5.4.1)(mammoth@1.7.2)(mongodb@6.6.2(socks@2.8.3))(openai@4.52.2)(pdf-parse@1.1.1)(puppeteer@22.12.1(typescript@5.4.5))(redis@4.6.14)(ws@8.18.0)
|
||||
openai: 4.52.2
|
||||
|
||||
languagedetect@2.0.0: {}
|
||||
@ -9195,7 +9242,7 @@ snapshots:
|
||||
chromium-bidi: 0.5.24(devtools-protocol@0.0.1299070)
|
||||
debug: 4.3.5
|
||||
devtools-protocol: 0.0.1299070
|
||||
ws: 8.17.1
|
||||
ws: 8.18.0
|
||||
transitivePeerDependencies:
|
||||
- bufferutil
|
||||
- supports-color
|
||||
@ -9877,7 +9924,9 @@ snapshots:
|
||||
imurmurhash: 0.1.4
|
||||
signal-exit: 4.1.0
|
||||
|
||||
ws@8.17.1: {}
|
||||
ws@7.5.10: {}
|
||||
|
||||
ws@8.18.0: {}
|
||||
|
||||
xml2js@0.6.2:
|
||||
dependencies:
|
||||
|
@@ -1,12 +1,16 @@
### Crawl Website
POST http://localhost:3002/v0/scrape HTTP/1.1
Authorization: Bearer fc
Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673
content-type: application/json

{
    "url":"firecrawl.dev"
    "url":"corterix.com"
}

### Check Job Status
GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1
Authorization: Bearer fc-4e6259caf03b42a4b6c9261e0f96e673


### Check Job Status
GET http://localhost:3002/v0/jobs/active HTTP/1.1
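The updated requests.http now checks status through the v1 endpoint (`GET /v1/crawl/:id`). For reference, the same check from TypeScript against a locally running instance could look like the sketch below; the base URL, API key, and job id are placeholders taken from the snippet above, not live values:

```ts
// Minimal sketch: poll a v1 crawl job's status on a locally running API.
// Base URL, API key, and job id are placeholders.
async function checkCrawlStatus(): Promise<void> {
  const res = await fetch(
    "http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67",
    { headers: { Authorization: "Bearer fc-YOUR_API_KEY" } }
  );
  const body = await res.json();
  // The v1 status payload carries `status` and, once pages are scraped, a `data` array.
  console.log(body.status, Array.isArray(body.data) ? body.data.length : 0);
}

checkCrawlStatus();
```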
@@ -404,7 +404,7 @@ describe("E2E Tests for API Routes", () => {
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .set("x-idempotency-key", uniqueIdempotencyKey)
        .send({ url: 'https://mendable.ai' });
        .send({ url: 'https://docs.firecrawl.dev' });

      expect(firstResponse.statusCode).toBe(200);

@@ -414,7 +414,7 @@ describe("E2E Tests for API Routes", () => {
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .set("x-idempotency-key", uniqueIdempotencyKey)
        .send({ url: 'https://mendable.ai' });
        .send({ url: 'https://docs.firecrawl.dev' });

      expect(secondResponse.statusCode).toBe(409);
      expect(secondResponse.body.error).toBe('Idempotency key already used');
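The idempotency test above now submits docs.firecrawl.dev instead of mendable.ai; the behaviour it asserts is unchanged: reusing an `x-idempotency-key` is rejected with 409. A client-side sketch of that flow follows; the base URL, API key, and crawl route here are placeholders inferred from the test suite, not taken from the repo's configuration:

```ts
// Sketch: reusing an x-idempotency-key; the second submission should be rejected with 409.
// Base URL, API key, and route are placeholders.
import { randomUUID } from "node:crypto";

async function submitCrawl(idempotencyKey: string) {
  return fetch("http://localhost:3002/v0/crawl", {
    method: "POST",
    headers: {
      Authorization: "Bearer fc-YOUR_API_KEY",
      "Content-Type": "application/json",
      "x-idempotency-key": idempotencyKey,
    },
    body: JSON.stringify({ url: "https://docs.firecrawl.dev" }),
  });
}

async function main() {
  const key = randomUUID();
  const first = await submitCrawl(key);  // expected: 200
  const second = await submitCrawl(key); // expected: 409 "Idempotency key already used"
  console.log(first.status, second.status);
}

main();
```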
951 apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts Normal file
@@ -0,0 +1,951 @@
|
||||
import request from "supertest";
|
||||
import dotenv from "dotenv";
|
||||
import {
|
||||
ScrapeRequest,
|
||||
ScrapeResponseRequestTest,
|
||||
} from "../../controllers/v1/types";
|
||||
|
||||
dotenv.config();
|
||||
const TEST_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe("E2E Tests for v1 API Routes", () => {
|
||||
beforeAll(() => {
|
||||
process.env.USE_DB_AUTHENTICATION = "true";
|
||||
});
|
||||
|
||||
afterAll(() => {
|
||||
delete process.env.USE_DB_AUTHENTICATION;
|
||||
});
|
||||
|
||||
describe("GET /is-production", () => {
|
||||
it.concurrent("should return the production status", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).get(
|
||||
"/is-production"
|
||||
);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("isProduction");
|
||||
});
|
||||
});
|
||||
|
||||
describe("POST /v1/scrape", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/scrape"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(403);
|
||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).not.toHaveProperty("content");
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
expect(response.body.data.metadata.title).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.description).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.keywords).toBe(
|
||||
"Roast My Website,Roast,Website,GitHub,Firecrawl"
|
||||
);
|
||||
expect(response.body.data.metadata.robots).toBe("follow, index");
|
||||
expect(response.body.data.metadata.ogTitle).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.ogDescription).toBe(
|
||||
"Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️"
|
||||
);
|
||||
expect(response.body.data.metadata.ogUrl).toBe(
|
||||
"https://www.roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.ogImage).toBe(
|
||||
"https://www.roastmywebsite.ai/og.png"
|
||||
);
|
||||
expect(response.body.data.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||
expect(response.body.data.metadata.ogSiteName).toBe("Roast My Website");
|
||||
expect(response.body.data.metadata.sourceURL).toBe(
|
||||
"https://roastmywebsite.ai"
|
||||
);
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["markdown", "html"],
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("html");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.markdown).toContain("_Roast_");
|
||||
expect(response.body.data.html).toContain("<h1");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
},
|
||||
30000
|
||||
);
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
||||
// formats: ["markdown", "html"],
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send(scrapeRequest);
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a valid scrape with PDF file without explicit .pdf extension', async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://arxiv.org/pdf/astro-ph/9301001"
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send(scrapeRequest);
|
||||
await new Promise((r) => setTimeout(r, 6000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.markdown).toContain('Broad Line Radio Galaxy');
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
}, 60000);
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key with removeTags option", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
onlyMainContent: false // default is true
|
||||
};
|
||||
const responseWithoutRemoveTags: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
expect(responseWithoutRemoveTags.statusCode).toBe(200);
|
||||
expect(responseWithoutRemoveTags.body).toHaveProperty("data");
|
||||
|
||||
if (!("data" in responseWithoutRemoveTags.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("markdown");
|
||||
expect(responseWithoutRemoveTags.body.data).toHaveProperty("metadata");
|
||||
expect(responseWithoutRemoveTags.body.data).not.toHaveProperty("html");
|
||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("[FAQ](/faq/)"); // .nav
|
||||
expect(responseWithoutRemoveTags.body.data.markdown).toContain("Hartley Brody 2023"); // #footer
|
||||
|
||||
const scrapeRequestWithRemoveTags: ScrapeRequest = {
|
||||
url: "https://www.scrapethissite.com/",
|
||||
excludeTags: ['.nav', '#footer', 'strong'],
|
||||
onlyMainContent: false // default is true
|
||||
};
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequestWithRemoveTags);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data.markdown).not.toContain("Hartley Brody 2023");
|
||||
expect(response.body.data.markdown).not.toContain("[FAQ](/faq/)"); //
|
||||
}, 30000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 400 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/400' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(400);
|
||||
}, 60000);
|
||||
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 401 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/401' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(401);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 403 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/403' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(403);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 404 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/404' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(404);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 405 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/405' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(405);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent('should return a successful response for a scrape with 500 page', async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post('/v1/scrape')
|
||||
.set('Authorization', `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set('Content-Type', 'application/json')
|
||||
.send({ url: 'https://httpstat.us/500' });
|
||||
await new Promise((r) => setTimeout(r, 5000));
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty('data');
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty('markdown');
|
||||
expect(response.body.data).toHaveProperty('metadata');
|
||||
expect(response.body.data.metadata.statusCode).toBe(500);
|
||||
}, 60000);
|
||||
|
||||
it.concurrent("should return a timeout error when scraping takes longer than the specified timeout", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev", timeout: 1000 });
|
||||
|
||||
expect(response.statusCode).toBe(408);
|
||||
}, 3000);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and includeHtml set to true",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["html","rawHtml"],
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).not.toHaveProperty("markdown");
|
||||
expect(response.body.data).toHaveProperty("html");
|
||||
expect(response.body.data).toHaveProperty("rawHtml");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.html).toContain("<h1");
|
||||
expect(response.body.data.rawHtml).toContain("<html");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
},
|
||||
30000
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with waitFor",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://ycombinator.com/companies",
|
||||
formats: ["markdown"],
|
||||
waitFor: 5000
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).toHaveProperty("markdown");
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data).not.toHaveProperty("links");
|
||||
expect(response.body.data).not.toHaveProperty("rawHtml");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.markdown).toContain("PagerDuty");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
|
||||
},
|
||||
30000
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid links on page",
|
||||
async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://roastmywebsite.ai",
|
||||
formats: ["links"],
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/scrape")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("data");
|
||||
if (!("data" in response.body)) {
|
||||
throw new Error("Expected response body to have 'data' property");
|
||||
}
|
||||
expect(response.body.data).not.toHaveProperty("html");
|
||||
expect(response.body.data).not.toHaveProperty("rawHtml");
|
||||
expect(response.body.data).toHaveProperty("links");
|
||||
expect(response.body.data).toHaveProperty("metadata");
|
||||
expect(response.body.data.links).toContain("https://firecrawl.dev");
|
||||
expect(response.body.data.metadata.statusCode).toBe(200);
|
||||
expect(response.body.data.metadata.error).toBeUndefined();
|
||||
},
|
||||
30000
|
||||
);
|
||||
|
||||
|
||||
});
|
||||
|
||||
describe("POST /v1/map", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/map"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return an error response with an invalid API key", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://roastmywebsite.ai"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://usemotion.com",
|
||||
search: "pricing"
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("usemotion.com/pricing");
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
});
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: true
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).toContain("docs.firecrawl.dev");
|
||||
}, 10000)
|
||||
|
||||
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
|
||||
const mapRequest = {
|
||||
url: "https://www.firecrawl.dev",
|
||||
search: "docs",
|
||||
includeSubdomains: false
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("links");
|
||||
if (!("links" in response.body)) {
|
||||
throw new Error("Expected response body to have 'links' property");
|
||||
}
|
||||
const links = response.body.links as unknown[];
|
||||
expect(Array.isArray(links)).toBe(true);
|
||||
expect(links.length).toBeGreaterThan(0);
|
||||
expect(links[0]).not.toContain("docs.firecrawl.dev");
|
||||
})
|
||||
|
||||
it.concurrent("should return an error for invalid URL", async () => {
|
||||
const mapRequest = {
|
||||
url: "invalid-url",
|
||||
includeSubdomains: true,
|
||||
search: "test",
|
||||
};
|
||||
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/map")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(mapRequest);
|
||||
|
||||
expect(response.statusCode).toBe(400);
|
||||
expect(response.body).toHaveProperty("success", false);
|
||||
expect(response.body).toHaveProperty("error");
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
describe("POST /v1/crawl", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
|
||||
"/v1/crawl"
|
||||
);
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent("should throw error for blocklisted URL", async () => {
|
||||
const scrapeRequest: ScrapeRequest = {
|
||||
url: "https://facebook.com/fake-test",
|
||||
};
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send(scrapeRequest);
|
||||
|
||||
expect(response.statusCode).toBe(403);
|
||||
expect(response.body.error).toBe("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response: ScrapeResponseRequestTest = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer invalid-api-key`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent("should return a successful response", async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://firecrawl.dev" });
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("id");
|
||||
expect(response.body.id).toMatch(
|
||||
/^[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[1-5][0-9a-fA-F]{3}-[89abAB][0-9a-fA-F]{3}-[0-9a-fA-F]{12}$/
|
||||
);
|
||||
expect(response.body).toHaveProperty("success", true);
|
||||
expect(response.body).toHaveProperty("url");
|
||||
expect(response.body.url).toContain("/v1/crawl/");
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid includes option",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
limit: 10,
|
||||
includePaths: ["blog/*"],
|
||||
});
|
||||
|
||||
let response;
|
||||
let isFinished = false;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(5);
|
||||
urls.forEach((url: string) => {
|
||||
expect(url).toContain("firecrawl.dev/blog");
|
||||
});
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content"); // v0
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
},
|
||||
180000
|
||||
); // 180 seconds
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with a valid API key and valid excludes option",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://firecrawl.dev",
|
||||
limit: 10,
|
||||
excludePaths: ["blog/*"],
|
||||
});
|
||||
|
||||
let isFinished = false;
|
||||
let response;
|
||||
|
||||
while (!isFinished) {
|
||||
response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
isFinished = response.body.status === "completed";
|
||||
|
||||
if (!isFinished) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThan(3);
|
||||
urls.forEach((url: string) => {
|
||||
expect(url.startsWith("https://www.firecrawl.dev/blog/")).toBeFalsy();
|
||||
});
|
||||
},
|
||||
90000
|
||||
); // 90 seconds
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful response with max depth option for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({
|
||||
url: "https://www.scrapethissite.com",
|
||||
maxDepth: 1,
|
||||
});
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
expect(["active", "waiting", "completed", "scraping"]).toContain(response.body.status);
|
||||
// poll until the crawl job is completed
|
||||
let isCompleted = false;
|
||||
while (!isCompleted) {
|
||||
const statusCheckResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(statusCheckResponse.statusCode).toBe(200);
|
||||
isCompleted = statusCheckResponse.body.status === "completed";
|
||||
if (!isCompleted) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
const completedResponse = await request(
|
||||
TEST_URL
|
||||
)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
const urls = completedResponse.body.data.map(
|
||||
(item: any) => item.metadata?.sourceURL
|
||||
);
|
||||
expect(urls.length).toBeGreaterThanOrEqual(1);
|
||||
|
||||
// Check if all URLs have a maximum depth of 1
|
||||
urls.forEach((url: string) => {
|
||||
const pathSplits = new URL(url).pathname.split("/");
|
||||
const depth =
|
||||
pathSplits.length -
|
||||
(pathSplits[0].length === 0 &&
|
||||
pathSplits[pathSplits.length - 1].length === 0
|
||||
? 1
|
||||
: 0);
|
||||
expect(depth).toBeLessThanOrEqual(2);
|
||||
});
|
||||
},
|
||||
180000
|
||||
);
|
||||
})
|
||||
|
||||
describe("GET /v1/crawl/:jobId", () => {
|
||||
it.concurrent("should require authorization", async () => {
|
||||
const response = await request(TEST_URL).get("/v1/crawl/123");
|
||||
expect(response.statusCode).toBe(401);
|
||||
});
|
||||
|
||||
it.concurrent(
|
||||
"should return an error response with an invalid API key",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v1/crawl/123")
|
||||
.set("Authorization", `Bearer invalid-api-key`);
|
||||
expect(response.statusCode).toBe(401);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return Job not found for invalid job ID",
|
||||
async () => {
|
||||
const response = await request(TEST_URL)
|
||||
.get("/v1/crawl/invalidJobId")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(404);
|
||||
}
|
||||
);
|
||||
|
||||
it.concurrent(
|
||||
"should return a successful crawl status response for a valid crawl job",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://docs.mendable.ai" });
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
let isCompleted = false;
|
||||
|
||||
while (!isCompleted) {
|
||||
const response = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(response.statusCode).toBe(200);
|
||||
expect(response.body).toHaveProperty("status");
|
||||
|
||||
if (response.body.status === "completed") {
|
||||
isCompleted = true;
|
||||
} else {
|
||||
await new Promise((r) => setTimeout(r, 1000)); // Wait for 1 second before checking again
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((resolve) => setTimeout(resolve, 1000)); // wait for data to be saved on the database
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("completed");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).not.toHaveProperty("content");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(
|
||||
200
|
||||
);
|
||||
expect(
|
||||
completedResponse.body.data[0].metadata.error
|
||||
).toBeUndefined();
|
||||
|
||||
const childrenLinks = completedResponse.body.data.filter(
|
||||
(doc) =>
|
||||
doc.metadata &&
|
||||
doc.metadata.sourceURL
|
||||
);
|
||||
|
||||
expect(childrenLinks.length).toBe(completedResponse.body.data.length);
|
||||
},
|
||||
180000
|
||||
); // 180 seconds
|
||||
|
||||
it.concurrent(
|
||||
"If someone cancels a crawl job, it should turn into failed status",
|
||||
async () => {
|
||||
const crawlResponse = await request(TEST_URL)
|
||||
.post("/v1/crawl")
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
|
||||
.set("Content-Type", "application/json")
|
||||
.send({ url: "https://docs.tatum.io", limit: 200 });
|
||||
|
||||
expect(crawlResponse.statusCode).toBe(200);
|
||||
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
|
||||
const responseCancel = await request(TEST_URL)
|
||||
.delete(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
expect(responseCancel.statusCode).toBe(200);
|
||||
expect(responseCancel.body).toHaveProperty("status");
|
||||
expect(responseCancel.body.status).toBe("cancelled");
|
||||
|
||||
await new Promise((r) => setTimeout(r, 10000));
|
||||
const completedResponse = await request(TEST_URL)
|
||||
.get(`/v1/crawl/${crawlResponse.body.id}`)
|
||||
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`);
|
||||
|
||||
expect(completedResponse.statusCode).toBe(200);
|
||||
expect(completedResponse.body).toHaveProperty("status");
|
||||
expect(completedResponse.body.status).toBe("cancelled");
|
||||
expect(completedResponse.body).toHaveProperty("data");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("markdown");
|
||||
expect(completedResponse.body.data[0]).toHaveProperty("metadata");
|
||||
expect(completedResponse.body.data[0].metadata.statusCode).toBe(200);
|
||||
expect(completedResponse.body.data[0].metadata.error).toBeUndefined();
|
||||
},
|
||||
60000
|
||||
); // 60 seconds
|
||||
})
|
||||
});
|
File diff suppressed because it is too large
@ -1,4 +1,4 @@
import { crawlController } from '../crawl'
import { crawlController } from '../v0/crawl'
import { Request, Response } from 'express';
import { authenticateUser } from '../auth'; // Ensure this import is correct
import { createIdempotencyKey } from '../../services/idempotency/create';
@ -1,23 +1,36 @@
|
||||
import { parseApi } from "../../src/lib/parseApi";
|
||||
import { getRateLimiter } from "../../src/services/rate-limiter";
|
||||
import { parseApi } from "../lib/parseApi";
|
||||
import { getRateLimiter } from "../services/rate-limiter";
|
||||
import {
|
||||
AuthResponse,
|
||||
NotificationType,
|
||||
PlanType,
|
||||
RateLimiterMode,
|
||||
} from "../../src/types";
|
||||
import { supabase_service } from "../../src/services/supabase";
|
||||
import { withAuth } from "../../src/lib/withAuth";
|
||||
} from "../types";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { withAuth } from "../lib/withAuth";
|
||||
import { RateLimiterRedis } from "rate-limiter-flexible";
|
||||
import { setTraceAttributes } from "@hyperdx/node-opentelemetry";
|
||||
import { sendNotification } from "../services/notification/email_notification";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { redlock } from "../../src/services/redlock";
|
||||
import { getValue } from "../../src/services/redis";
|
||||
import { setValue } from "../../src/services/redis";
|
||||
import { redlock } from "../services/redlock";
|
||||
import { getValue } from "../services/redis";
|
||||
import { setValue } from "../services/redis";
|
||||
import { validate } from "uuid";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
// const { data, error } = await supabase_service
|
||||
// .from('api_keys')
|
||||
// .select(`
|
||||
// key,
|
||||
// team_id,
|
||||
// teams (
|
||||
// subscriptions (
|
||||
// price_id
|
||||
// )
|
||||
// )
|
||||
// `)
|
||||
// .eq('key', normalizedApi)
|
||||
// .limit(1)
|
||||
// .single();
|
||||
function normalizedApiIsUuid(potentialUuid: string): boolean {
|
||||
// Check if the string is a valid UUID
|
||||
return validate(potentialUuid);
|
||||
@ -119,7 +132,11 @@ export async function supaAuthenticateUser(
|
||||
let priceId: string | null = null;
|
||||
|
||||
if (token == "this_is_just_a_preview_token") {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
if (mode == RateLimiterMode.CrawlStatus) {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
} else {
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
|
||||
}
|
||||
teamId = "preview";
|
||||
} else {
|
||||
normalizedApi = parseApi(token);
|
||||
@ -155,7 +172,7 @@ export async function supaAuthenticateUser(
|
||||
await setValue(
|
||||
cacheKey,
|
||||
JSON.stringify({ team_id: teamId, price_id: priceId }),
|
||||
10
|
||||
60
|
||||
);
|
||||
}
|
||||
} catch (error) {
|
||||
@ -234,6 +251,13 @@ export async function supaAuthenticateUser(
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.Map:
|
||||
rateLimiter = getRateLimiter(
|
||||
RateLimiterMode.Map,
|
||||
token,
|
||||
subscriptionData.plan
|
||||
);
|
||||
break;
|
||||
case RateLimiterMode.CrawlStatus:
|
||||
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
|
||||
break;
|
||||
@ -286,6 +310,9 @@ export async function supaAuthenticateUser(
|
||||
token === "this_is_just_a_preview_token" &&
|
||||
(mode === RateLimiterMode.Scrape ||
|
||||
mode === RateLimiterMode.Preview ||
|
||||
mode === RateLimiterMode.Map ||
|
||||
mode === RateLimiterMode.Crawl ||
|
||||
mode === RateLimiterMode.CrawlStatus ||
|
||||
mode === RateLimiterMode.Search)
|
||||
) {
|
||||
return { success: true, team_id: "preview" };
|
||||
|
@ -1,234 +0,0 @@
|
||||
import { ExtractorOptions, PageOptions } from './../lib/entities';
|
||||
import { Request, Response } from "express";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { PlanType, RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { Document } from "../lib/entities";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from '../lib/LLM-extraction/helpers';
|
||||
import { defaultPageOptions, defaultExtractorOptions, defaultTimeout, defaultOrigin } from '../lib/default-values';
|
||||
import { addScrapeJob } from '../services/queue-jobs';
|
||||
import { getScrapeQueue } from '../services/queue-service';
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from '../lib/logger';
|
||||
import { getJobPriority } from '../lib/job-priority';
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions,
|
||||
timeout: number,
|
||||
plan?: PlanType
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: Document;
|
||||
returnCode: number;
|
||||
}> {
|
||||
const url = req.body.url;
|
||||
if (!url) {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
|
||||
}
|
||||
|
||||
const jobPriority = await getJobPriority({plan, team_id, basePriority: 10})
|
||||
|
||||
const job = await addScrapeJob({
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
}, {}, jobId, jobPriority);
|
||||
|
||||
let doc;
|
||||
|
||||
const err = await Sentry.startSpan({ name: "Wait for job to finish", op: "bullmq.wait", attributes: { job: jobId } }, async (span) => {
|
||||
try {
|
||||
doc = (await new Promise((resolve, reject) => {
|
||||
const start = Date.now();
|
||||
const int = setInterval(async () => {
|
||||
if (Date.now() >= start + timeout) {
|
||||
clearInterval(int);
|
||||
reject(new Error("Job wait "));
|
||||
} else {
|
||||
const state = await job.getState();
|
||||
if (state === "completed") {
|
||||
clearInterval(int);
|
||||
resolve((await getScrapeQueue().getJob(job.id)).returnvalue);
|
||||
} else if (state === "failed") {
|
||||
clearInterval(int);
|
||||
reject((await getScrapeQueue().getJob(job.id)).failedReason);
|
||||
}
|
||||
}
|
||||
}, 1000);
|
||||
}))[0]
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
span.setAttribute("timedOut", true);
|
||||
return {
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
returnCode: 408,
|
||||
}
|
||||
} else if (typeof e === "string" && (e.includes("Error generating completions: ") || e.includes("Invalid schema for function") || e.includes("LLM extraction did not match the extraction schema you provided."))) {
|
||||
return {
|
||||
success: false,
|
||||
error: e,
|
||||
returnCode: 500,
|
||||
};
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
span.setAttribute("result", JSON.stringify(doc));
|
||||
return null;
|
||||
});
|
||||
|
||||
if (err !== null) {
|
||||
return err;
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return { success: true, error: "No page found", returnCode: 200, data: doc };
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: doc,
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
if (typeof extractorOptions.extractionSchema !== "object" || extractorOptions.extractionSchema === null) {
|
||||
return res.status(400).json({ error: "extractorOptions.extractionSchema must be an object if llm-extraction mode is specified" });
|
||||
}
|
||||
|
||||
pageOptions.onlyMainContent = true;
|
||||
timeout = req.body.timeout ?? 90000;
|
||||
}
|
||||
|
||||
// checkCredits
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
earlyReturn = true;
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
|
||||
}
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
|
||||
|
||||
if (result.success) {
|
||||
let creditsToBeBilled = 0; // billing for doc done on queue end
|
||||
const creditsPerLLMExtract = 50;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
creditsToBeBilled += creditsPerLLMExtract;
|
||||
}
|
||||
|
||||
let startTimeBilling = new Date().getTime();
|
||||
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
const billingResult = await billTeam(
|
||||
team_id,
|
||||
creditsToBeBilled
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error: "Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
docs: [result.data],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: typeof error === "string" ? error : (error?.message ?? "Internal Server Error") });
|
||||
}
|
||||
}
|
@ -1,11 +1,10 @@
|
||||
import { Request, Response } from "express";
|
||||
|
||||
import { Job } from "bullmq";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { checkAlerts } from "../../services/alerts";
|
||||
import { exec } from "node:child_process";
|
||||
import { sendSlackWebhook } from "../../services/alerts/slack";
|
||||
import { Logger } from "../../../lib/logger";
|
||||
import { getScrapeQueue } from "../../../services/queue-service";
|
||||
import { checkAlerts } from "../../../services/alerts";
|
||||
import { sendSlackWebhook } from "../../../services/alerts/slack";
|
||||
|
||||
export async function cleanBefore24hCompleteJobsController(
|
||||
req: Request,
|
||||
@ -94,26 +93,34 @@ export async function autoscalerController(req: Request, res: Response) {
|
||||
|
||||
const scrapeQueue = getScrapeQueue();
|
||||
|
||||
const [webScraperActive, webScraperWaiting, webScraperPriority] = await Promise.all([
|
||||
scrapeQueue.getActiveCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
scrapeQueue.getPrioritizedCount(),
|
||||
]);
|
||||
const [webScraperActive, webScraperWaiting, webScraperPriority] =
|
||||
await Promise.all([
|
||||
scrapeQueue.getActiveCount(),
|
||||
scrapeQueue.getWaitingCount(),
|
||||
scrapeQueue.getPrioritizedCount(),
|
||||
]);
|
||||
|
||||
let waitingAndPriorityCount = webScraperWaiting + webScraperPriority;
|
||||
|
||||
// get number of machines active
|
||||
const request = await fetch('https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines',
|
||||
const request = await fetch(
|
||||
"https://api.machines.dev/v1/apps/firecrawl-scraper-js/machines",
|
||||
{
|
||||
headers: {
|
||||
'Authorization': `Bearer ${process.env.FLY_API_TOKEN}`
|
||||
}
|
||||
Authorization: `Bearer ${process.env.FLY_API_TOKEN}`,
|
||||
},
|
||||
}
|
||||
)
|
||||
);
|
||||
const machines = await request.json();
|
||||
|
||||
// Only worker machines
|
||||
const activeMachines = machines.filter(machine => (machine.state === 'started' || machine.state === "starting" || machine.state === "replacing") && machine.config.env["FLY_PROCESS_GROUP"] === "worker").length;
|
||||
const activeMachines = machines.filter(
|
||||
(machine) =>
|
||||
(machine.state === "started" ||
|
||||
machine.state === "starting" ||
|
||||
machine.state === "replacing") &&
|
||||
machine.config.env["FLY_PROCESS_GROUP"] === "worker"
|
||||
).length;
|
||||
|
||||
let targetMachineCount = activeMachines;
|
||||
|
||||
@ -123,29 +130,57 @@ export async function autoscalerController(req: Request, res: Response) {
|
||||
|
||||
// Scale up logic
|
||||
if (webScraperActive > 9000 || waitingAndPriorityCount > 2000) {
|
||||
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 3));
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp * 3
|
||||
);
|
||||
} else if (webScraperActive > 5000 || waitingAndPriorityCount > 1000) {
|
||||
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + (baseScaleUp * 2));
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp * 2
|
||||
);
|
||||
} else if (webScraperActive > 1000 || waitingAndPriorityCount > 500) {
|
||||
targetMachineCount = Math.min(maxNumberOfMachines, activeMachines + baseScaleUp);
|
||||
targetMachineCount = Math.min(
|
||||
maxNumberOfMachines,
|
||||
activeMachines + baseScaleUp
|
||||
);
|
||||
}
|
||||
|
||||
// Scale down logic
|
||||
if (webScraperActive < 100 && waitingAndPriorityCount < 50) {
|
||||
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 3));
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown * 3
|
||||
);
|
||||
} else if (webScraperActive < 500 && waitingAndPriorityCount < 200) {
|
||||
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - (baseScaleDown * 2));
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown * 2
|
||||
);
|
||||
} else if (webScraperActive < 1000 && waitingAndPriorityCount < 500) {
|
||||
targetMachineCount = Math.max(minNumberOfMachines, activeMachines - baseScaleDown);
|
||||
targetMachineCount = Math.max(
|
||||
minNumberOfMachines,
|
||||
activeMachines - baseScaleDown
|
||||
);
|
||||
}
|
||||
|
||||
if (targetMachineCount !== activeMachines) {
|
||||
Logger.info(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`);
|
||||
Logger.info(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting`
|
||||
);
|
||||
|
||||
if(targetMachineCount > activeMachines) {
|
||||
sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? "");
|
||||
if (targetMachineCount > activeMachines) {
|
||||
sendSlackWebhook(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||
false,
|
||||
process.env.SLACK_AUTOSCALER ?? ""
|
||||
);
|
||||
} else {
|
||||
sendSlackWebhook(`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`, false, process.env.SLACK_AUTOSCALER ?? "");
|
||||
sendSlackWebhook(
|
||||
`🐂 Scaling from ${activeMachines} to ${targetMachineCount} - ${webScraperActive} active, ${webScraperWaiting} waiting - Current DateTime: ${new Date().toISOString()}`,
|
||||
false,
|
||||
process.env.SLACK_AUTOSCALER ?? ""
|
||||
);
|
||||
}
|
||||
return res.status(200).json({
|
||||
mode: "scale-descale",
|
@ -1,7 +1,7 @@
import { Request, Response } from "express";
import Redis from "ioredis";
import { Logger } from "../../lib/logger";
import { redisRateLimitClient } from "../../services/rate-limiter";
import { Logger } from "../../../lib/logger";
import { redisRateLimitClient } from "../../../services/rate-limiter";

export async function redisHealthController(req: Request, res: Response) {
const retryOperation = async (operation, retries = 3) => {
58
apps/api/src/controllers/v0/crawl-cancel.ts
Normal file
@ -0,0 +1,58 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { supabase_service } from "../../../src/services/supabase";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function crawlCancelController(req: Request, res: Response) {
|
||||
try {
|
||||
const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
|
||||
|
||||
const { success, team_id, error, status } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.CrawlStatus
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ error: "Job not found" });
|
||||
}
|
||||
|
||||
// check if the job belongs to the team
|
||||
if (useDbAuthentication) {
|
||||
const { data, error: supaError } = await supabase_service
|
||||
.from("bulljobs_teams")
|
||||
.select("*")
|
||||
.eq("job_id", req.params.jobId)
|
||||
.eq("team_id", team_id);
|
||||
if (supaError) {
|
||||
return res.status(500).json({ error: supaError.message });
|
||||
}
|
||||
|
||||
if (data.length === 0) {
|
||||
return res.status(403).json({ error: "Unauthorized" });
|
||||
}
|
||||
}
|
||||
|
||||
try {
|
||||
sc.cancelled = true;
|
||||
await saveCrawl(req.params.jobId, sc);
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
}
|
||||
|
||||
res.json({
|
||||
status: "cancelled"
|
||||
});
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res.status(500).json({ error: error.message });
|
||||
}
|
||||
}
|
@ -1,10 +1,10 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { getScrapeQueue } from "../../src/services/queue-service";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobsById } from "../../src/lib/supabase-jobs";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
|
||||
import { supabaseGetJobsById } from "../../../src/lib/supabase-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
@ -1,32 +1,20 @@
|
||||
import { Request, Response } from "express";
|
||||
import { checkTeamCredits } from "../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { addScrapeJob } from "../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../src/services/idempotency/create";
|
||||
import {
|
||||
defaultCrawlPageOptions,
|
||||
defaultCrawlerOptions,
|
||||
defaultOrigin,
|
||||
} from "../../src/lib/default-values";
|
||||
import { checkTeamCredits } from "../../../src/services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { logCrawl } from "../../../src/services/logging/crawl_log";
|
||||
import { validateIdempotencyKey } from "../../../src/services/idempotency/validate";
|
||||
import { createIdempotencyKey } from "../../../src/services/idempotency/create";
|
||||
import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../../src/lib/default-values";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobs,
|
||||
crawlToCrawler,
|
||||
lockURL,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../src/services/queue-service";
|
||||
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
|
||||
import { getJobPriority } from "../../src/lib/job-priority";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../../src/services/queue-service";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function crawlController(req: Request, res: Response) {
|
||||
try {
|
@ -1,12 +1,12 @@
|
||||
import { Request, Response } from "express";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { RateLimiterMode } from "../../src/types";
|
||||
import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { RateLimiterMode } from "../../../src/types";
|
||||
import { isUrlBlocked } from "../../../src/scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../src/lib/logger";
|
||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
|
||||
import { addScrapeJob } from "../../src/services/queue-jobs";
|
||||
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
|
||||
import { Logger } from "../../../src/lib/logger";
|
||||
import { addCrawlJob, crawlToCrawler, lockURL, saveCrawl, StoredCrawl } from "../../../src/lib/crawl-redis";
|
||||
import { addScrapeJob } from "../../../src/services/queue-jobs";
|
||||
import { checkAndUpdateURL } from "../../../src/lib/validateUrl";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
export async function crawlPreviewController(req: Request, res: Response) {
|
@ -1,8 +1,8 @@
import { AuthResponse, RateLimiterMode } from "../types";
import { AuthResponse, RateLimiterMode } from "../../types";

import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { authenticateUser } from "../auth";


export const keyAuthController = async (req: Request, res: Response) => {
288
apps/api/src/controllers/v0/scrape.ts
Normal file
@ -0,0 +1,288 @@
|
||||
import { ExtractorOptions, PageOptions } from "./../../lib/entities";
|
||||
import { Request, Response } from "express";
|
||||
import {
|
||||
billTeam,
|
||||
checkTeamCredits,
|
||||
} from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { Document } from "../../lib/entities";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; // Import the isUrlBlocked function
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import {
|
||||
defaultPageOptions,
|
||||
defaultExtractorOptions,
|
||||
defaultTimeout,
|
||||
defaultOrigin,
|
||||
} from "../../lib/default-values";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function scrapeHelper(
|
||||
jobId: string,
|
||||
req: Request,
|
||||
team_id: string,
|
||||
crawlerOptions: any,
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions: ExtractorOptions,
|
||||
timeout: number,
|
||||
plan?: PlanType
|
||||
): Promise<{
|
||||
success: boolean;
|
||||
error?: string;
|
||||
data?: Document;
|
||||
returnCode: number;
|
||||
}> {
|
||||
const url = req.body.url;
|
||||
if (!url) {
|
||||
return { success: false, error: "Url is required", returnCode: 400 };
|
||||
}
|
||||
|
||||
if (isUrlBlocked(url)) {
|
||||
return {
|
||||
success: false,
|
||||
error:
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
|
||||
returnCode: 403,
|
||||
};
|
||||
}
|
||||
|
||||
const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 });
|
||||
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions,
|
||||
team_id,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
origin: req.body.origin ?? defaultOrigin,
|
||||
},
|
||||
{},
|
||||
jobId,
|
||||
jobPriority
|
||||
);
|
||||
|
||||
let doc;
|
||||
|
||||
const err = await Sentry.startSpan(
|
||||
{
|
||||
name: "Wait for job to finish",
|
||||
op: "bullmq.wait",
|
||||
attributes: { job: jobId },
|
||||
},
|
||||
async (span) => {
|
||||
try {
|
||||
doc = (await waitForJob(job.id, timeout))[0];
|
||||
} catch (e) {
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
span.setAttribute("timedOut", true);
|
||||
return {
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
returnCode: 408,
|
||||
};
|
||||
} else if (
|
||||
typeof e === "string" &&
|
||||
(e.includes("Error generating completions: ") ||
|
||||
e.includes("Invalid schema for function") ||
|
||||
e.includes(
|
||||
"LLM extraction did not match the extraction schema you provided."
|
||||
))
|
||||
) {
|
||||
return {
|
||||
success: false,
|
||||
error: e,
|
||||
returnCode: 500,
|
||||
};
|
||||
} else {
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
span.setAttribute("result", JSON.stringify(doc));
|
||||
return null;
|
||||
}
|
||||
);
|
||||
|
||||
if (err !== null) {
|
||||
return err;
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return {
|
||||
success: true,
|
||||
error: "No page found",
|
||||
returnCode: 200,
|
||||
data: doc,
|
||||
};
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
// Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
|
||||
if (
|
||||
!pageOptions.includeRawHtml &&
|
||||
extractorOptions.mode == "llm-extraction-from-raw-html"
|
||||
) {
|
||||
if (doc.rawHtml) {
|
||||
delete doc.rawHtml;
|
||||
}
|
||||
}
|
||||
|
||||
if (!pageOptions.includeHtml) {
|
||||
if (doc.html) {
|
||||
delete doc.html;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
success: true,
|
||||
data: doc,
|
||||
returnCode: 200,
|
||||
};
|
||||
}
|
||||
|
||||
export async function scrapeController(req: Request, res: Response) {
|
||||
try {
|
||||
let earlyReturn = false;
|
||||
// make sure to authenticate user first, Bearer <token>
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
RateLimiterMode.Scrape
|
||||
);
|
||||
if (!success) {
|
||||
return res.status(status).json({ error });
|
||||
}
|
||||
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
|
||||
const extractorOptions = {
|
||||
...defaultExtractorOptions,
|
||||
...req.body.extractorOptions,
|
||||
};
|
||||
const origin = req.body.origin ?? defaultOrigin;
|
||||
let timeout = req.body.timeout ?? defaultTimeout;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
if (
|
||||
typeof extractorOptions.extractionSchema !== "object" ||
|
||||
extractorOptions.extractionSchema === null
|
||||
) {
|
||||
return res
|
||||
.status(400)
|
||||
.json({
|
||||
error:
|
||||
"extractorOptions.extractionSchema must be an object if llm-extraction mode is specified",
|
||||
});
|
||||
}
|
||||
|
||||
pageOptions.onlyMainContent = true;
|
||||
timeout = req.body.timeout ?? 90000;
|
||||
}
|
||||
|
||||
// checkCredits
|
||||
try {
|
||||
const { success: creditsCheckSuccess, message: creditsCheckMessage } =
|
||||
await checkTeamCredits(team_id, 1);
|
||||
if (!creditsCheckSuccess) {
|
||||
earlyReturn = true;
|
||||
return res.status(402).json({ error: "Insufficient credits" });
|
||||
}
|
||||
} catch (error) {
|
||||
Logger.error(error);
|
||||
earlyReturn = true;
|
||||
return res
|
||||
.status(500)
|
||||
.json({
|
||||
error:
|
||||
"Error checking team credits. Please contact hello@firecrawl.com for help.",
|
||||
});
|
||||
}
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const result = await scrapeHelper(
|
||||
jobId,
|
||||
req,
|
||||
team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
extractorOptions,
|
||||
timeout,
|
||||
plan
|
||||
);
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens =
|
||||
result.data && result.data.markdown
|
||||
? numTokensFromString(result.data.markdown, "gpt-3.5-turbo")
|
||||
: 0;
|
||||
|
||||
if (result.success) {
|
||||
let creditsToBeBilled = 0; // billing for doc done on queue end
|
||||
const creditsPerLLMExtract = 50;
|
||||
|
||||
if (extractorOptions.mode.includes("llm-extraction")) {
|
||||
// creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
|
||||
creditsToBeBilled += creditsPerLLMExtract;
|
||||
}
|
||||
|
||||
let startTimeBilling = new Date().getTime();
|
||||
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
const billingResult = await billTeam(team_id, creditsToBeBilled);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error:
|
||||
"Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: result.success,
|
||||
message: result.error,
|
||||
num_docs: 1,
|
||||
docs: [result.data],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: crawlerOptions,
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: extractorOptions,
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(result.returnCode).json(result);
|
||||
} catch (error) {
|
||||
Sentry.captureException(error);
|
||||
Logger.error(error);
|
||||
return res
|
||||
.status(500)
|
||||
.json({
|
||||
error:
|
||||
typeof error === "string"
|
||||
? error
|
||||
: error?.message ?? "Internal Server Error",
|
||||
});
|
||||
}
|
||||
}
|
@ -1,18 +1,18 @@
|
||||
import { Request, Response } from "express";
|
||||
import { WebScraperDataProvider } from "../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import { authenticateUser } from "./auth";
|
||||
import { PlanType, RateLimiterMode } from "../types";
|
||||
import { logJob } from "../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../lib/entities";
|
||||
import { search } from "../search";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { WebScraperDataProvider } from "../../scraper/WebScraper";
|
||||
import { billTeam, checkTeamCredits } from "../../services/billing/credit_billing";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { PlanType, RateLimiterMode } from "../../types";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { PageOptions, SearchOptions } from "../../lib/entities";
|
||||
import { search } from "../../search";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../lib/logger";
|
||||
import { getJobPriority } from "../lib/job-priority";
|
||||
import { getScrapeQueue } from "../services/queue-service";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { addScrapeJob } from "../services/queue-jobs";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function searchHelper(
|
||||
jobId: string,
|
||||
@ -112,24 +112,7 @@ export async function searchHelper(
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
}
|
||||
|
||||
const docs = (await Promise.all(jobs.map(x => new Promise((resolve, reject) => {
|
||||
const start = Date.now();
|
||||
const int = setInterval(async () => {
|
||||
if (Date.now() >= start + 60000) {
|
||||
clearInterval(int);
|
||||
reject(new Error("Job wait "));
|
||||
} else {
|
||||
const state = await x.getState();
|
||||
if (state === "completed") {
|
||||
clearInterval(int);
|
||||
resolve((await getScrapeQueue().getJob(x.id)).returnvalue);
|
||||
} else if (state === "failed") {
|
||||
clearInterval(int);
|
||||
reject((await getScrapeQueue().getJob(x.id)).failedReason);
|
||||
}
|
||||
}
|
||||
}, 1000);
|
||||
})))).map(x => x[0]);
|
||||
const docs = (await Promise.all(jobs.map(x => waitForJob(x.id, 60000)))).map(x => x[0]);
|
||||
|
||||
if (docs.length === 0) {
|
||||
return { success: true, error: "No search results found", returnCode: 200 };
|
||||
@ -166,17 +149,16 @@ export async function searchController(req: Request, res: Response) {
|
||||
}
|
||||
const crawlerOptions = req.body.crawlerOptions ?? {};
|
||||
const pageOptions = req.body.pageOptions ?? {
|
||||
includeHtml: false,
|
||||
onlyMainContent: true,
|
||||
fetchPageContent: true,
|
||||
removeTags: [],
|
||||
fallback: false,
|
||||
includeHtml: req.body.pageOptions?.includeHtml ?? false,
|
||||
onlyMainContent: req.body.pageOptions?.onlyMainContent ?? false,
|
||||
fetchPageContent: req.body.pageOptions?.fetchPageContent ?? true,
|
||||
removeTags: req.body.pageOptions?.removeTags ?? [],
|
||||
fallback: req.body.pageOptions?.fallback ?? false,
|
||||
};
|
||||
const origin = req.body.origin ?? "api";
|
||||
|
||||
const searchOptions = req.body.searchOptions ?? { limit: 5 };
|
||||
|
||||
|
||||
const jobId = uuidv4();
|
||||
|
||||
try {
|
@ -1,6 +1,6 @@
import { Request, Response } from "express";
import { Logger } from "../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../src/lib/crawl-redis";
import { Logger } from "../../../src/lib/logger";
import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis";
import { getJobs } from "./crawl-status";
import * as Sentry from "@sentry/node";
47
apps/api/src/controllers/v1/__tests__/crawl.test.ts.WIP
Normal file
@ -0,0 +1,47 @@
|
||||
import { crawlController } from '../crawl'
|
||||
import { Request, Response } from 'express';
|
||||
import { authenticateUser } from '../auth'; // Ensure this import is correct
|
||||
import { createIdempotencyKey } from '../../services/idempotency/create';
|
||||
import { validateIdempotencyKey } from '../../services/idempotency/validate';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
|
||||
jest.mock('../auth', () => ({
|
||||
authenticateUser: jest.fn().mockResolvedValue({
|
||||
success: true,
|
||||
team_id: 'team123',
|
||||
error: null,
|
||||
status: 200
|
||||
}),
|
||||
reduce: jest.fn()
|
||||
}));
|
||||
jest.mock('../../services/idempotency/validate');
|
||||
|
||||
describe('crawlController', () => {
|
||||
it('should prevent duplicate requests using the same idempotency key', async () => {
|
||||
const req = {
|
||||
headers: {
|
||||
'x-idempotency-key': await uuidv4(),
|
||||
'Authorization': `Bearer ${process.env.TEST_API_KEY}`
|
||||
},
|
||||
body: {
|
||||
url: 'https://mendable.ai'
|
||||
}
|
||||
} as unknown as Request;
|
||||
const res = {
|
||||
status: jest.fn().mockReturnThis(),
|
||||
json: jest.fn()
|
||||
} as unknown as Response;
|
||||
|
||||
// Mock the idempotency key validation to return false for the second call
|
||||
(validateIdempotencyKey as jest.Mock).mockResolvedValueOnce(true).mockResolvedValueOnce(false);
|
||||
|
||||
// First request should succeed
|
||||
await crawlController(req, res);
|
||||
expect(res.status).not.toHaveBeenCalledWith(409);
|
||||
|
||||
// Second request with the same key should fail
|
||||
await crawlController(req, res);
|
||||
expect(res.status).toHaveBeenCalledWith(409);
|
||||
expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' });
|
||||
});
|
||||
});
|
64
apps/api/src/controllers/v1/__tests__/urlValidation.test.ts
Normal file
@ -0,0 +1,64 @@
|
||||
import { url } from "../types";
|
||||
|
||||
describe("URL Schema Validation", () => {
|
||||
beforeEach(() => {
|
||||
jest.resetAllMocks();
|
||||
});
|
||||
|
||||
it("should prepend http:// to URLs without a protocol", () => {
|
||||
const result = url.parse("example.com");
|
||||
expect(result).toBe("http://example.com");
|
||||
});
|
||||
|
||||
it("should allow valid URLs with http or https", () => {
|
||||
expect(() => url.parse("http://example.com")).not.toThrow();
|
||||
expect(() => url.parse("https://example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should allow valid URLs with http or https", () => {
|
||||
expect(() => url.parse("example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should reject URLs with unsupported protocols", () => {
|
||||
expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL");
|
||||
});
|
||||
|
||||
it("should reject URLs without a valid top-level domain", () => {
|
||||
expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path");
|
||||
});
|
||||
|
||||
it("should reject blocked URLs", () => {
|
||||
expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should handle URLs with subdomains correctly", () => {
|
||||
expect(() => url.parse("http://sub.example.com")).not.toThrow();
|
||||
expect(() => url.parse("https://blog.example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should handle URLs with paths correctly", () => {
|
||||
expect(() => url.parse("http://example.com/path")).not.toThrow();
|
||||
expect(() => url.parse("https://example.com/another/path")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should handle URLs with subdomains that are blocked", () => {
|
||||
expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should handle URLs with paths that are blocked", () => {
|
||||
expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.");
|
||||
});
|
||||
|
||||
it("should reject malformed URLs starting with 'http://http'", () => {
|
||||
expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol.");
|
||||
});
|
||||
|
||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||
expect(() => url.parse("http://example.com/http://example.com")).not.toThrow();
|
||||
});
|
||||
|
||||
it("should reject malformed URLs containing multiple 'http://'", () => {
|
||||
expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL");
|
||||
});
|
||||
})
|
@ -1,9 +1,9 @@
import { Request, Response } from "express";
import { authenticateUser } from "./auth";
import { RateLimiterMode } from "../../src/types";
import { supabase_service } from "../../src/services/supabase";
import { Logger } from "../../src/lib/logger";
import { getCrawl, saveCrawl } from "../../src/lib/crawl-redis";
import { authenticateUser } from "../auth";
import { RateLimiterMode } from "../../types";
import { supabase_service } from "../../services/supabase";
import { Logger } from "../../lib/logger";
import { getCrawl, saveCrawl } from "../../lib/crawl-redis";
import * as Sentry from "@sentry/node";

export async function crawlCancelController(req: Request, res: Response) {
159
apps/api/src/controllers/v1/crawl-status-ws.ts
Normal file
@ -0,0 +1,159 @@
|
||||
import { authMiddleware } from "../../routes/v1";
|
||||
import { RateLimiterMode } from "../../types";
|
||||
import { authenticateUser } from "../auth";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, Document, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { WebSocket } from "ws";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength, isCrawlFinished, isCrawlFinishedLocked } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { getJob, getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
|
||||
type ErrorMessage = {
|
||||
type: "error",
|
||||
error: string,
|
||||
}
|
||||
|
||||
type CatchupMessage = {
|
||||
type: "catchup",
|
||||
data: CrawlStatusResponse,
|
||||
}
|
||||
|
||||
type DocumentMessage = {
|
||||
type: "document",
|
||||
data: Document,
|
||||
}
|
||||
|
||||
type DoneMessage = { type: "done" }
|
||||
|
||||
type Message = ErrorMessage | CatchupMessage | DoneMessage | DocumentMessage;
|
||||
|
||||
function send(ws: WebSocket, msg: Message) {
|
||||
if (ws.readyState === 1) {
|
||||
return new Promise((resolve, reject) => {
|
||||
ws.send(JSON.stringify(msg), (err) => {
|
||||
if (err) reject(err);
|
||||
else resolve(null);
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
function close(ws: WebSocket, code: number, msg: Message) {
|
||||
if (ws.readyState <= 1) {
|
||||
ws.close(code, JSON.stringify(msg));
|
||||
}
|
||||
}
|
||||
|
||||
async function crawlStatusWS(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return close(ws, 1008, { type: "error", error: "Job not found" });
|
||||
}
|
||||
|
||||
if (sc.team_id !== req.auth.team_id) {
|
||||
return close(ws, 3003, { type: "error", error: "Forbidden" });
|
||||
}
|
||||
|
||||
let doneJobIDs = [];
|
||||
let finished = false;
|
||||
|
||||
const loop = async () => {
|
||||
if (finished) return;
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
|
||||
if (jobIDs.length === doneJobIDs.length) {
|
||||
return close(ws, 1000, { type: "done" });
|
||||
}
|
||||
|
||||
const notDoneJobIDs = jobIDs.filter(x => !doneJobIDs.includes(x));
|
||||
const jobStatuses = await Promise.all(notDoneJobIDs.map(async x => [x, await getScrapeQueue().getJobState(x)]));
|
||||
const newlyDoneJobIDs = jobStatuses.filter(x => x[1] === "completed" || x[1] === "failed").map(x => x[0]);
|
||||
|
||||
for (const jobID of newlyDoneJobIDs) {
|
||||
const job = await getJob(jobID);
|
||||
|
||||
if (job.returnvalue) {
|
||||
send(ws, {
|
||||
type: "document",
|
||||
data: legacyDocumentConverter(job.returnvalue),
|
||||
})
|
||||
} else {
|
||||
return close(ws, 3000, { type: "error", error: job.failedReason });
|
||||
}
|
||||
}
|
||||
|
||||
setTimeout(loop, 1000);
|
||||
};
|
||||
|
||||
setTimeout(loop, 1000);
|
||||
|
||||
doneJobIDs = await getDoneJobsOrdered(req.params.jobId);
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
|
||||
const doneJobs = await getJobs(doneJobIDs);
|
||||
const data = doneJobs.map(x => x.returnvalue);
|
||||
|
||||
send(ws, {
|
||||
type: "catchup",
|
||||
data: {
|
||||
status,
|
||||
total: jobIDs.length,
|
||||
completed: doneJobIDs.length,
|
||||
creditsUsed: jobIDs.length,
|
||||
expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
|
||||
data: data.map(x => legacyDocumentConverter(x)),
|
||||
}
|
||||
});
|
||||
|
||||
if (status !== "scraping") {
|
||||
finished = true;
|
||||
return close(ws, 1000, { type: "done" });
|
||||
}
|
||||
}
|
||||
|
||||
// Basically just middleware and error wrapping
|
||||
export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAuth<CrawlStatusParams, undefined, undefined>) {
|
||||
try {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
null,
|
||||
RateLimiterMode.CrawlStatus,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
return close(ws, 3000, {
|
||||
type: "error",
|
||||
error,
|
||||
});
|
||||
}
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
|
||||
await crawlStatusWS(ws, req);
|
||||
} catch (err) {
|
||||
Sentry.captureException(err);
|
||||
|
||||
const id = uuidv4();
|
||||
let verbose = JSON.stringify(err);
|
||||
if (verbose === "{}") {
|
||||
if (err instanceof Error) {
|
||||
verbose = JSON.stringify({
|
||||
message: err.message,
|
||||
name: err.name,
|
||||
stack: err.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
return close(ws, 1011, {
|
||||
type: "error",
|
||||
error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
|
||||
});
|
||||
}
|
||||
}
|
116
apps/api/src/controllers/v1/crawl-status.ts
Normal file
@ -0,0 +1,116 @@
|
||||
import { Response } from "express";
|
||||
import { CrawlStatusParams, CrawlStatusResponse, ErrorResponse, legacyDocumentConverter, RequestWithAuth } from "./types";
|
||||
import { getCrawl, getCrawlExpiry, getCrawlJobs, getDoneJobsOrdered, getDoneJobsOrderedLength } from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { supabaseGetJobById, supabaseGetJobsById } from "../../lib/supabase-jobs";
|
||||
|
||||
export async function getJob(id: string) {
|
||||
const job = await getScrapeQueue().getJob(id);
|
||||
if (!job) return job;
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobById(id);
|
||||
|
||||
if (supabaseData) {
|
||||
job.returnvalue = supabaseData.docs;
|
||||
}
|
||||
}
|
||||
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
|
||||
return job;
|
||||
}
|
||||
|
||||
export async function getJobs(ids: string[]) {
|
||||
const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x);
|
||||
|
||||
if (process.env.USE_DB_AUTHENTICATION === "true") {
|
||||
const supabaseData = await supabaseGetJobsById(ids);
|
||||
|
||||
supabaseData.forEach(x => {
|
||||
const job = jobs.find(y => y.id === x.job_id);
|
||||
if (job) {
|
||||
job.returnvalue = x.docs;
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
jobs.forEach(job => {
|
||||
job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue;
|
||||
});
|
||||
|
||||
return jobs;
|
||||
}
|
||||
|
||||
export async function crawlStatusController(req: RequestWithAuth<CrawlStatusParams, undefined, CrawlStatusResponse>, res: Response<CrawlStatusResponse>) {
|
||||
const sc = await getCrawl(req.params.jobId);
|
||||
if (!sc) {
|
||||
return res.status(404).json({ success: false, error: "Job not found" });
|
||||
}
|
||||
|
||||
if (sc.team_id !== req.auth.team_id) {
|
||||
return res.status(403).json({ success: false, error: "Forbidden" });
|
||||
}
|
||||
|
||||
const start = typeof req.query.skip === "string" ? parseInt(req.query.skip, 10) : 0;
|
||||
const end = typeof req.query.limit === "string" ? (start + parseInt(req.query.limit, 10) - 1) : undefined;
|
||||
|
||||
const jobIDs = await getCrawlJobs(req.params.jobId);
|
||||
const jobStatuses = await Promise.all(jobIDs.map(x => getScrapeQueue().getJobState(x)));
|
||||
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] = sc.cancelled ? "cancelled" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "scraping";
|
||||
const doneJobsLength = await getDoneJobsOrderedLength(req.params.jobId);
|
||||
const doneJobsOrder = await getDoneJobsOrdered(req.params.jobId, start, end ?? -1);
|
||||
|
||||
let doneJobs = [];
|
||||
|
||||
if (end === undefined) { // determine 10 megabyte limit
|
||||
let bytes = 0;
|
||||
const bytesLimit = 10485760; // 10 MiB in bytes
|
||||
const factor = 100; // chunking for faster retrieval
|
||||
|
||||
for (let i = 0; i < doneJobsOrder.length && bytes < bytesLimit; i += factor) {
|
||||
// get current chunk and retrieve jobs
|
||||
const currentIDs = doneJobsOrder.slice(i, i+factor);
|
||||
const jobs = await getJobs(currentIDs);
|
||||
|
||||
// iterate through jobs and add them one by one to the byte counter
|
||||
// both loops will break once we cross the byte limit
      for (let ii = 0; ii < jobs.length && bytes < bytesLimit; ii++) {
        const job = jobs[ii];
        doneJobs.push(job);
        bytes += JSON.stringify(legacyDocumentConverter(job.returnvalue)).length;
      }
    }

    // if we ran over the bytes limit, remove the last document
    if (bytes > bytesLimit) {
      doneJobs.splice(doneJobs.length - 1, 1);
    }
  } else {
    doneJobs = await getJobs(doneJobsOrder);
  }

  const data = doneJobs.map(x => x.returnvalue);

  const nextURL = new URL(`${req.protocol}://${req.get("host")}/v1/crawl/${req.params.jobId}`);

  nextURL.searchParams.set("skip", (start + data.length).toString());

  if (typeof req.query.limit === "string") {
    nextURL.searchParams.set("limit", req.query.limit);
  }

  res.status(200).json({
    status,
    completed: doneJobsLength,
    total: jobIDs.length,
    creditsUsed: jobIDs.length,
    expiresAt: (await getCrawlExpiry(req.params.jobId)).toISOString(),
    next:
      status !== "scraping" && (start + data.length) === doneJobsLength // if there's not gonna be any documents after this
        ? undefined
        : nextURL.href,
    data: data.map(x => legacyDocumentConverter(x)),
  });
}
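For reference, a minimal client-side sketch of consuming the paginated status endpoint above. The base URL and the API key placeholder follow the README examples and are assumptions, not part of this diff; the response fields (`status`, `next`, `data`) come from `CrawlStatusResponse`.

```ts
// Walks GET /v1/crawl/:id, following the `next` URL until no documents remain.
type CrawlStatusPage = {
  status: "scraping" | "completed" | "failed" | "cancelled";
  next?: string;
  data: unknown[];
};

async function fetchAllCrawlDocs(crawlId: string, apiKey: string): Promise<unknown[]> {
  const docs: unknown[] = [];
  let url: string | undefined = `https://api.firecrawl.dev/v1/crawl/${crawlId}`;

  while (url) {
    const res = await fetch(url, { headers: { Authorization: `Bearer ${apiKey}` } });
    const page = (await res.json()) as CrawlStatusPage;
    docs.push(...page.data);

    if (page.status === "scraping") {
      // The controller keeps returning `next` while the crawl is running,
      // so a real client should wait between polls.
      await new Promise((r) => setTimeout(r, 2000));
    }
    url = page.next; // omitted once no further documents will follow
  }

  return docs;
}
```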
157  apps/api/src/controllers/v1/crawl.ts  (new file)
@@ -0,0 +1,157 @@
|
||||
import { Response } from "express";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import {
|
||||
CrawlRequest,
|
||||
crawlRequestSchema,
|
||||
CrawlResponse,
|
||||
legacyCrawlerOptions,
|
||||
legacyScrapeOptions,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
addCrawlJob,
|
||||
addCrawlJobs,
|
||||
crawlToCrawler,
|
||||
lockURL,
|
||||
lockURLs,
|
||||
saveCrawl,
|
||||
StoredCrawl,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { logCrawl } from "../../services/logging/crawl_log";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { addScrapeJob } from "../../services/queue-jobs";
|
||||
import { Logger } from "../../lib/logger";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
|
||||
export async function crawlController(
|
||||
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
|
||||
res: Response<CrawlResponse>
|
||||
) {
|
||||
req.body = crawlRequestSchema.parse(req.body);
|
||||
|
||||
const id = uuidv4();
|
||||
|
||||
await logCrawl(id, req.auth.team_id);
|
||||
|
||||
const { remainingCredits } = req.account;
|
||||
|
||||
const crawlerOptions = legacyCrawlerOptions(req.body);
|
||||
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
|
||||
|
||||
// TODO: @rafa, is this right? copied from v0
|
||||
if (Array.isArray(crawlerOptions.includes)) {
|
||||
for (const x of crawlerOptions.includes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(crawlerOptions.excludes)) {
|
||||
for (const x of crawlerOptions.excludes) {
|
||||
try {
|
||||
new RegExp(x);
|
||||
} catch (e) {
|
||||
return res.status(400).json({ success: false, error: e.message });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
|
||||
|
||||
const sc: StoredCrawl = {
|
||||
originUrl: req.body.url,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
team_id: req.auth.team_id,
|
||||
createdAt: Date.now(),
|
||||
plan: req.auth.plan,
|
||||
};
|
||||
|
||||
const crawler = crawlToCrawler(id, sc);
|
||||
|
||||
try {
|
||||
sc.robots = await crawler.getRobotsTxt();
|
||||
} catch (e) {
|
||||
Logger.debug(
|
||||
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
|
||||
e
|
||||
)}`
|
||||
);
|
||||
}
|
||||
|
||||
await saveCrawl(id, sc);
|
||||
|
||||
const sitemap = sc.crawlerOptions.ignoreSitemap
|
||||
? null
|
||||
: await crawler.tryGetSitemap();
|
||||
|
||||
if (sitemap !== null && sitemap.length > 0) {
|
||||
let jobPriority = 20;
|
||||
// If it is over 1000, we need to get the job priority,
|
||||
// otherwise we can use the default priority of 20
|
||||
if(sitemap.length > 1000){
|
||||
// set base to 21
|
||||
jobPriority = await getJobPriority({plan: req.auth.plan, team_id: req.auth.team_id, basePriority: 21})
|
||||
}
|
||||
const jobs = sitemap.map((x) => {
|
||||
const url = x.url;
|
||||
const uuid = uuidv4();
|
||||
return {
|
||||
name: uuid,
|
||||
data: {
|
||||
url,
|
||||
mode: "single_urls",
|
||||
team_id: req.auth.team_id,
|
||||
crawlerOptions,
|
||||
pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
sitemapped: true,
|
||||
v1: true,
|
||||
},
|
||||
opts: {
|
||||
jobId: uuid,
|
||||
priority: 20,
|
||||
},
|
||||
};
|
||||
});
|
||||
|
||||
await lockURLs(
|
||||
id,
|
||||
jobs.map((x) => x.data.url)
|
||||
);
|
||||
await addCrawlJobs(
|
||||
id,
|
||||
jobs.map((x) => x.opts.jobId)
|
||||
);
|
||||
await getScrapeQueue().addBulk(jobs);
|
||||
} else {
|
||||
await lockURL(id, sc, req.body.url);
|
||||
const job = await addScrapeJob(
|
||||
{
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: crawlerOptions,
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions: pageOptions,
|
||||
origin: "api",
|
||||
crawl_id: id,
|
||||
webhook: req.body.webhook,
|
||||
v1: true,
|
||||
},
|
||||
{
|
||||
priority: 15,
|
||||
}
|
||||
);
|
||||
await addCrawlJob(id, job.id);
|
||||
}
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
id,
|
||||
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
|
||||
});
|
||||
}
|
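A hedged client-side sketch of the crawl controller's response contract: `POST /v1/crawl` returns `{ success, id, url }`, and the `x-idempotency-key` header (checked by `idempotencyMiddleware` in `routes/v1.ts` later in this diff) guards against duplicate submissions. The base URL and key placeholder are assumptions borrowed from the README examples.

```ts
import { randomUUID } from "node:crypto";

async function startCrawl(apiKey: string): Promise<{ id: string; statusUrl: string }> {
  const res = await fetch("https://api.firecrawl.dev/v1/crawl", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
      "x-idempotency-key": randomUUID(), // reusing a key returns 409 "Idempotency key already used"
    },
    body: JSON.stringify({ url: "https://docs.firecrawl.dev", limit: 10 }),
  });
  const body = await res.json();
  if (!body.success) throw new Error(body.error);
  // `url` points at GET /v1/crawl/:id for status polling
  return { id: body.id, statusUrl: body.url };
}
```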
6  apps/api/src/controllers/v1/liveness.ts  (new file)
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function livenessController(req: Request, res: Response) {
  // TODO: add checks if the application is live and healthy like checking the redis connection
  res.status(200).json({ status: "ok" });
}
122  apps/api/src/controllers/v1/map.ts  (new file)
@@ -0,0 +1,122 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import {
  legacyCrawlerOptions,
  mapRequestSchema,
  RequestWithAuth,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
import {
  checkAndUpdateURLForMap,
  isSameDomain,
  isSameSubdomain,
  removeDuplicateUrls,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";

configDotenv();

export async function mapController(
  req: RequestWithAuth<{}, MapResponse, MapRequest>,
  res: Response<MapResponse>
) {
  const startTime = new Date().getTime();

  req.body = mapRequestSchema.parse(req.body);

  const limit = req.body.limit;
  const id = uuidv4();
  let links: string[] = [req.body.url];

  const sc: StoredCrawl = {
    originUrl: req.body.url,
    crawlerOptions: legacyCrawlerOptions(req.body),
    pageOptions: {},
    team_id: req.auth.team_id,
    createdAt: Date.now(),
    plan: req.auth.plan,
  };

  const crawler = crawlToCrawler(id, sc);

  const sitemap = req.body.ignoreSitemap ? null : await crawler.tryGetSitemap();

  if (sitemap !== null) {
    sitemap.map((x) => {
      links.push(x.url);
    });
  }

  let urlWithoutWww = req.body.url.replace("www.", "");

  let mapUrl = req.body.search
    ? `"${req.body.search}" site:${urlWithoutWww}`
    : `site:${req.body.url}`;
  // www. seems to exclude subdomains in some cases
  const mapResults = await fireEngineMap(mapUrl, {
    // limit to 50 results (beta)
    numResults: Math.min(limit, 50),
  });

  if (mapResults.length > 0) {
    if (req.body.search) {
      // Ensure all map results are first, maintaining their order
      links = [
        mapResults[0].url,
        ...mapResults.slice(1).map((x) => x.url),
        ...links,
      ];
    } else {
      mapResults.map((x) => {
        links.push(x.url);
      });
    }
  }

  links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());

  // allows for subdomains to be included
  links = links.filter((x) => isSameDomain(x, req.body.url));

  // if includeSubdomains is false, filter out subdomains
  if (!req.body.includeSubdomains) {
    links = links.filter((x) => isSameSubdomain(x, req.body.url));
  }

  // remove duplicates that could be due to http/https or www
  links = removeDuplicateUrls(links);

  await billTeam(req.auth.team_id, 1);

  const endTime = new Date().getTime();
  const timeTakenInSeconds = (endTime - startTime) / 1000;

  const linksToReturn = links.slice(0, limit);

  logJob({
    job_id: id,
    success: links.length > 0,
    message: "Map completed",
    num_docs: links.length,
    docs: linksToReturn,
    time_taken: timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "map",
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: {},
    origin: req.body.origin,
    extractor_options: { mode: "markdown" },
    num_tokens: 0,
  });

  return res.status(200).json({
    success: true,
    links: linksToReturn,
  });
}
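A hedged sketch of calling the new `/v1/map` endpoint above. The base URL and key placeholder are assumptions; the body fields come from `mapRequestSchema` (note `limit` is capped at 50 during the beta).

```ts
async function mapSite(apiKey: string): Promise<string[]> {
  const res = await fetch("https://api.firecrawl.dev/v1/map", {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify({
      url: "https://firecrawl.dev",
      search: "docs",           // optional: biases results via the fire-engine search
      includeSubdomains: false, // drop links outside the exact subdomain
      limit: 20,
    }),
  });
  const body = await res.json();
  return body.success ? body.links : [];
}
```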
6  apps/api/src/controllers/v1/readiness.ts  (new file)
@@ -0,0 +1,6 @@
import { Request, Response } from "express";

export async function readinessController(req: Request, res: Response) {
  // TODO: add checks when the application is ready to serve traffic
  res.status(200).json({ status: "ok" });
}
108  apps/api/src/controllers/v1/scrape.ts  (new file)
@@ -0,0 +1,108 @@
|
||||
import { Request, Response } from "express";
|
||||
import { Logger } from '../../lib/logger';
|
||||
import { Document, legacyDocumentConverter, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
|
||||
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { getJobPriority } from "../../lib/job-priority";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
export async function scrapeController(req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
|
||||
req.body = scrapeRequestSchema.parse(req.body);
|
||||
let earlyReturn = false;
|
||||
|
||||
const origin = req.body.origin;
|
||||
const timeout = req.body.timeout;
|
||||
const pageOptions = legacyScrapeOptions(req.body);
|
||||
const jobId = uuidv4();
|
||||
|
||||
const startTime = new Date().getTime();
|
||||
const jobPriority = await getJobPriority({plan: req.auth.plan as PlanType, team_id: req.auth.team_id, basePriority: 10})
|
||||
|
||||
const job = await addScrapeJob({
|
||||
url: req.body.url,
|
||||
mode: "single_urls",
|
||||
crawlerOptions: {},
|
||||
team_id: req.auth.team_id,
|
||||
pageOptions,
|
||||
extractorOptions: {},
|
||||
origin: req.body.origin,
|
||||
}, {}, jobId, jobPriority);
|
||||
|
||||
let doc: any | undefined;
|
||||
try {
|
||||
doc = (await waitForJob(job.id, timeout))[0];
|
||||
} catch (e) {
|
||||
Logger.error(`Error in scrapeController: ${e}`);
|
||||
if (e instanceof Error && e.message.startsWith("Job wait")) {
|
||||
return res.status(408).json({
|
||||
success: false,
|
||||
error: "Request timed out",
|
||||
});
|
||||
} else {
|
||||
return res.status(500).json({
|
||||
success: false,
|
||||
error: "Internal server error",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
await job.remove();
|
||||
|
||||
if (!doc) {
|
||||
console.error("!!! PANIC DOC IS", doc, job);
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
warning: "No page found",
|
||||
data: doc
|
||||
});
|
||||
}
|
||||
|
||||
delete doc.index;
|
||||
delete doc.provider;
|
||||
|
||||
const endTime = new Date().getTime();
|
||||
const timeTakenInSeconds = (endTime - startTime) / 1000;
|
||||
const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
|
||||
|
||||
let creditsToBeBilled = 1; // Assuming 1 credit per document
|
||||
if (earlyReturn) {
|
||||
// Don't bill if we're early returning
|
||||
return;
|
||||
}
|
||||
|
||||
const billingResult = await billTeam(
|
||||
req.auth.team_id,
|
||||
creditsToBeBilled
|
||||
);
|
||||
if (!billingResult.success) {
|
||||
return res.status(402).json({
|
||||
success: false,
|
||||
error: "Failed to bill team. Insufficient credits or subscription not found.",
|
||||
});
|
||||
}
|
||||
|
||||
logJob({
|
||||
job_id: jobId,
|
||||
success: true,
|
||||
message: "Scrape completed",
|
||||
num_docs: 1,
|
||||
docs: [doc],
|
||||
time_taken: timeTakenInSeconds,
|
||||
team_id: req.auth.team_id,
|
||||
mode: "scrape",
|
||||
url: req.body.url,
|
||||
crawlerOptions: {},
|
||||
pageOptions: pageOptions,
|
||||
origin: origin,
|
||||
extractor_options: { mode: "markdown" },
|
||||
num_tokens: numTokens,
|
||||
});
|
||||
|
||||
return res.status(200).json({
|
||||
success: true,
|
||||
data: legacyDocumentConverter(doc),
|
||||
});
|
||||
}
|
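A hedged sketch of a `/v1/scrape` request using the options defined in `types.ts` below; the base URL and key placeholder are assumptions taken from the README examples.

```ts
async function scrapePage(apiKey: string) {
  const res = await fetch("https://api.firecrawl.dev/v1/scrape", {
    method: "POST",
    headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiKey}` },
    body: JSON.stringify({
      url: "https://firecrawl.dev",
      formats: ["markdown", "links"],
      onlyMainContent: true,
      timeout: 30000,
    }),
  });
  const body = await res.json();
  if (!body.success) throw new Error(body.error);
  // body.data matches the Document type: markdown, links, metadata.statusCode, ...
  return body.data;
}
```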
321  apps/api/src/controllers/v1/types.ts  (new file)
@@ -0,0 +1,321 @@
|
||||
import { Request, Response } from "express";
|
||||
import { z } from "zod";
|
||||
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
|
||||
import { PageOptions } from "../../lib/entities";
|
||||
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
|
||||
import { PlanType } from "../../types";
|
||||
|
||||
export type Format =
|
||||
| "markdown"
|
||||
| "html"
|
||||
| "rawHtml"
|
||||
| "links"
|
||||
| "screenshot"
|
||||
| "screenshot@fullPage";
|
||||
|
||||
export const url = z.preprocess(
|
||||
(x) => {
|
||||
if (!protocolIncluded(x as string)) {
|
||||
return `http://${x}`;
|
||||
}
|
||||
return x;
|
||||
},
|
||||
z
|
||||
.string()
|
||||
.url()
|
||||
.regex(/^https?:\/\//, "URL uses unsupported protocol")
|
||||
.refine(
|
||||
(x) => /\.[a-z]{2,}(\/|$)/i.test(x),
|
||||
"URL must have a valid top-level domain or be a valid path"
|
||||
)
|
||||
.refine(
|
||||
(x) => checkUrl(x as string),
|
||||
"Invalid URL"
|
||||
)
|
||||
.refine(
|
||||
(x) => !isUrlBlocked(x as string),
|
||||
"Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."
|
||||
)
|
||||
);
|
||||
|
||||
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
|
||||
|
||||
export const scrapeOptions = z.object({
|
||||
formats: z
|
||||
.enum([
|
||||
"markdown",
|
||||
"html",
|
||||
"rawHtml",
|
||||
"links",
|
||||
"screenshot",
|
||||
"screenshot@fullPage",
|
||||
])
|
||||
.array()
|
||||
.optional()
|
||||
.default(["markdown"]),
|
||||
headers: z.record(z.string(), z.string()).optional(),
|
||||
includeTags: z.string().array().optional(),
|
||||
excludeTags: z.string().array().optional(),
|
||||
onlyMainContent: z.boolean().default(true),
|
||||
timeout: z.number().int().positive().finite().safe().default(30000), // default?
|
||||
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
||||
parsePDF: z.boolean().default(true),
|
||||
}).strict(strictMessage);
|
||||
|
||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||
|
||||
export const scrapeRequestSchema = scrapeOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type ScrapeRequest = {
|
||||
// url: string;
|
||||
// formats?: Format[];
|
||||
// headers?: { [K: string]: string };
|
||||
// includeTags?: string[];
|
||||
// excludeTags?: string[];
|
||||
// onlyMainContent?: boolean;
|
||||
// timeout?: number;
|
||||
// waitFor?: number;
|
||||
// }
|
||||
|
||||
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
|
||||
|
||||
const crawlerOptions = z.object({
|
||||
includePaths: z.string().array().default([]),
|
||||
excludePaths: z.string().array().default([]),
|
||||
maxDepth: z.number().default(10), // default?
|
||||
limit: z.number().default(10000), // default?
|
||||
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
|
||||
allowExternalLinks: z.boolean().default(false),
|
||||
ignoreSitemap: z.boolean().default(true),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlerOptions = {
|
||||
// includePaths?: string[];
|
||||
// excludePaths?: string[];
|
||||
// maxDepth?: number;
|
||||
// limit?: number;
|
||||
// allowBackwardLinks?: boolean; // >> TODO: CHANGE THIS NAME???
|
||||
// allowExternalLinks?: boolean;
|
||||
// ignoreSitemap?: boolean;
|
||||
// };
|
||||
|
||||
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
|
||||
|
||||
export const crawlRequestSchema = crawlerOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
|
||||
webhook: z.string().url().optional(),
|
||||
limit: z.number().default(10000),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type CrawlRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// scrapeOptions?: Exclude<ScrapeRequest, "url">;
|
||||
// };
|
||||
|
||||
export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
|
||||
|
||||
export const mapRequestSchema = crawlerOptions.extend({
|
||||
url,
|
||||
origin: z.string().optional().default("api"),
|
||||
includeSubdomains: z.boolean().default(true),
|
||||
search: z.string().optional(),
|
||||
ignoreSitemap: z.boolean().default(false),
|
||||
limit: z.number().min(1).max(50).default(5000).optional(),
|
||||
}).strict(strictMessage);
|
||||
|
||||
// export type MapRequest = {
|
||||
// url: string;
|
||||
// crawlerOptions?: CrawlerOptions;
|
||||
// };
|
||||
|
||||
export type MapRequest = z.infer<typeof mapRequestSchema>;
|
||||
|
||||
export type Document = {
|
||||
markdown?: string;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
metadata: {
|
||||
title?: string;
|
||||
description?: string;
|
||||
language?: string;
|
||||
keywords?: string;
|
||||
robots?: string;
|
||||
ogTitle?: string;
|
||||
ogDescription?: string;
|
||||
ogUrl?: string;
|
||||
ogImage?: string;
|
||||
ogAudio?: string;
|
||||
ogDeterminer?: string;
|
||||
ogLocale?: string;
|
||||
ogLocaleAlternate?: string[];
|
||||
ogSiteName?: string;
|
||||
ogVideo?: string;
|
||||
dcTermsCreated?: string;
|
||||
dcDateCreated?: string;
|
||||
dcDate?: string;
|
||||
dcTermsType?: string;
|
||||
dcType?: string;
|
||||
dcTermsAudience?: string;
|
||||
dcTermsSubject?: string;
|
||||
dcSubject?: string;
|
||||
dcDescription?: string;
|
||||
dcTermsKeywords?: string;
|
||||
modifiedTime?: string;
|
||||
publishedTime?: string;
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
};
|
||||
};
|
||||
|
||||
export type ErrorResponse = {
|
||||
success: false;
|
||||
error: string;
|
||||
details?: any;
|
||||
};
|
||||
|
||||
export type ScrapeResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
warning?: string;
|
||||
data: Document;
|
||||
};
|
||||
|
||||
export interface ScrapeResponseRequestTest {
|
||||
statusCode: number;
|
||||
body: ScrapeResponse;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
export type CrawlResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
id: string;
|
||||
url: string;
|
||||
};
|
||||
|
||||
export type MapResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
success: true;
|
||||
links: string[];
|
||||
};
|
||||
|
||||
export type CrawlStatusParams = {
|
||||
jobId: string;
|
||||
};
|
||||
|
||||
export type CrawlStatusResponse =
|
||||
| ErrorResponse
|
||||
| {
|
||||
status: "scraping" | "completed" | "failed" | "cancelled";
|
||||
completed: number;
|
||||
total: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: string;
|
||||
next?: string;
|
||||
data: Document[];
|
||||
};
|
||||
|
||||
type AuthObject = {
|
||||
team_id: string;
|
||||
plan: PlanType;
|
||||
};
|
||||
|
||||
type Account = {
|
||||
remainingCredits: number;
|
||||
};
|
||||
|
||||
export interface RequestWithMaybeAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth?: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
export interface RequestWithAuth<
|
||||
ReqParams = {},
|
||||
ReqBody = undefined,
|
||||
ResBody = undefined,
|
||||
> extends Request<ReqParams, ReqBody, ResBody> {
|
||||
auth: AuthObject;
|
||||
account?: Account;
|
||||
}
|
||||
|
||||
export interface ResponseWithSentry<
|
||||
ResBody = undefined,
|
||||
> extends Response<ResBody> {
|
||||
sentry?: string,
|
||||
}
|
||||
|
||||
export function legacyCrawlerOptions(x: CrawlerOptions) {
|
||||
return {
|
||||
includes: x.includePaths,
|
||||
excludes: x.excludePaths,
|
||||
maxCrawledLinks: x.limit,
|
||||
maxCrawledDepth: x.maxDepth,
|
||||
limit: x.limit,
|
||||
generateImgAltText: false,
|
||||
allowBackwardCrawling: x.allowBackwardLinks,
|
||||
allowExternalContentLinks: x.allowExternalLinks,
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
|
||||
return {
|
||||
includeMarkdown: x.formats.includes("markdown"),
|
||||
includeHtml: x.formats.includes("html"),
|
||||
includeRawHtml: x.formats.includes("rawHtml"),
|
||||
onlyIncludeTags: x.includeTags,
|
||||
removeTags: x.excludeTags,
|
||||
onlyMainContent: x.onlyMainContent,
|
||||
waitFor: x.waitFor,
|
||||
includeLinks: x.formats.includes("links"),
|
||||
screenshot: x.formats.includes("screenshot"),
|
||||
fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
|
||||
parsePDF: x.parsePDF,
|
||||
};
|
||||
}
|
||||
|
||||
export function legacyDocumentConverter(doc: any): Document {
|
||||
if (doc.metadata) {
|
||||
if (doc.metadata.screenshot) {
|
||||
doc.screenshot = doc.metadata.screenshot;
|
||||
delete doc.metadata.screenshot;
|
||||
}
|
||||
|
||||
if (doc.metadata.fullPageScreenshot) {
|
||||
doc.fullPageScreenshot = doc.metadata.fullPageScreenshot;
|
||||
delete doc.metadata.fullPageScreenshot;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
markdown: doc.markdown,
|
||||
links: doc.linksOnPage,
|
||||
rawHtml: doc.rawHtml,
|
||||
html: doc.html,
|
||||
screenshot: doc.screenshot ?? doc.fullPageScreenshot,
|
||||
metadata: {
|
||||
...doc.metadata,
|
||||
pageError: undefined,
|
||||
pageStatusCode: undefined,
|
||||
error: doc.metadata.pageError,
|
||||
statusCode: doc.metadata.pageStatusCode,
|
||||
},
|
||||
};
|
||||
}
|
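A small sketch of what the zod schemas above do to an incoming body: the URL preprocessor prepends `http://` when no protocol is given, and the schema defaults are filled in. Run from `apps/api/src/controllers/v1`.

```ts
import { scrapeRequestSchema } from "./types";

// Defaults applied by the schema: formats ["markdown"], onlyMainContent true,
// timeout 30000, waitFor 0, parsePDF true, origin "api".
const parsed = scrapeRequestSchema.parse({ url: "docs.firecrawl.dev" });
console.log(parsed.url);     // "http://docs.firecrawl.dev"
console.log(parsed.formats); // ["markdown"]
```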
@ -1,7 +1,7 @@
|
||||
import "dotenv/config";
|
||||
import "./services/sentry"
|
||||
import * as Sentry from "@sentry/node";
|
||||
import express from "express";
|
||||
import express, { NextFunction, Request, Response } from "express";
|
||||
import bodyParser from "body-parser";
|
||||
import cors from "cors";
|
||||
import { getScrapeQueue } from "./services/queue-service";
|
||||
@ -15,8 +15,12 @@ import { ScrapeEvents } from "./lib/scrape-events";
|
||||
import http from 'node:http';
|
||||
import https from 'node:https';
|
||||
import CacheableLookup from 'cacheable-lookup';
|
||||
|
||||
|
||||
import { v1Router } from "./routes/v1";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "./controllers/v1/crawl-status-ws";
|
||||
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
|
||||
import { ZodError } from "zod";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
|
||||
const { createBullBoard } = require("@bull-board/api");
|
||||
const { BullAdapter } = require("@bull-board/api/bullAdapter");
|
||||
@ -49,7 +53,8 @@ if (cluster.isMaster) {
|
||||
}
|
||||
});
|
||||
} else {
|
||||
const app = express();
|
||||
const ws = expressWs(express());
|
||||
const app = ws.app;
|
||||
|
||||
global.isProduction = process.env.IS_PRODUCTION === "true";
|
||||
|
||||
@ -82,6 +87,7 @@ if (cluster.isMaster) {
|
||||
|
||||
// register router
|
||||
app.use(v0Router);
|
||||
app.use("/v1", v1Router);
|
||||
app.use(adminRouter);
|
||||
|
||||
const DEFAULT_PORT = process.env.PORT ?? 3002;
|
||||
@ -186,6 +192,27 @@ if (cluster.isMaster) {
|
||||
|
||||
Sentry.setupExpressErrorHandler(app);
|
||||
|
||||
app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: ResponseWithSentry<ErrorResponse>, next: NextFunction) => {
|
||||
if (err instanceof ZodError) {
|
||||
res.status(400).json({ success: false, error: "Bad Request", details: err.errors });
|
||||
} else {
|
||||
const id = res.sentry ?? uuidv4();
|
||||
let verbose = JSON.stringify(err);
|
||||
if (verbose === "{}") {
|
||||
if (err instanceof Error) {
|
||||
verbose = JSON.stringify({
|
||||
message: err.message,
|
||||
name: err.name,
|
||||
stack: err.stack,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
|
||||
res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
|
||||
}
|
||||
});
|
||||
|
||||
Logger.info(`Worker ${process.pid} started`);
|
||||
}
|
||||
|
||||
|
32  apps/api/src/lib/checkCredits.ts  (new file)
@@ -0,0 +1,32 @@
import { checkTeamCredits } from "../services/billing/credit_billing";
import { Logger } from "./logger";

type checkCreditsResponse = {
  status: number;
  error: string | null;
}

export const checkCredits = async (team_id: string): Promise<checkCreditsResponse> => {
  try {
    const {
      success: creditsCheckSuccess,
      message: creditsCheckMessage
    } = await checkTeamCredits(team_id, 1);
    if (!creditsCheckSuccess) {
      return {
        status: 402,
        error: "Insufficient credits"
      };
    }
  } catch (error) {
    Logger.error(error);
    return {
      status: 500,
      error: "Error checking team credits. Please contact hello@firecrawl.com for help."
    };
  }
  return {
    status: 200,
    error: null
  }
};
@ -27,6 +27,14 @@ export async function getCrawl(id: string): Promise<StoredCrawl | null> {
|
||||
return JSON.parse(x);
|
||||
}
|
||||
|
||||
export async function getCrawlExpiry(id: string): Promise<Date> {
|
||||
const d = new Date();
|
||||
const ttl = await redisConnection.pttl("crawl:" + id);
|
||||
d.setMilliseconds(d.getMilliseconds() + ttl);
|
||||
d.setMilliseconds(0);
|
||||
return d;
|
||||
}
|
||||
|
||||
export async function addCrawlJob(id: string, job_id: string) {
|
||||
await redisConnection.sadd("crawl:" + id + ":jobs", job_id);
|
||||
await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX");
|
||||
@ -39,13 +47,27 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
|
||||
|
||||
export async function addCrawlJobDone(id: string, job_id: string) {
|
||||
await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
|
||||
await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
|
||||
await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
|
||||
await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
|
||||
}
|
||||
|
||||
export async function getDoneJobsOrderedLength(id: string): Promise<number> {
|
||||
return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered");
|
||||
}
|
||||
|
||||
export async function getDoneJobsOrdered(id: string, start = 0, end = -1): Promise<string[]> {
|
||||
return await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", start, end);
|
||||
}
|
||||
|
||||
export async function isCrawlFinished(id: string) {
|
||||
return (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs"));
|
||||
}
|
||||
|
||||
export async function isCrawlFinishedLocked(id: string) {
|
||||
return (await redisConnection.exists("crawl:" + id + ":finish"));
|
||||
}
|
||||
|
||||
export async function finishCrawl(id: string) {
|
||||
if (await isCrawlFinished(id)) {
|
||||
const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes");
|
||||
|
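A standalone illustration of the pattern the crawl-redis additions above rely on: completed job IDs are pushed onto a Redis list and read back in pages with LRANGE, which is what `getDoneJobsOrdered(id, start, end)` does for the skip/limit paging in the status controller. This sketch assumes ioredis and a local Redis URL, which are not confirmed by this diff.

```ts
import Redis from "ioredis";

const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");

async function demo(crawlId: string) {
  // Completed jobs are LPUSHed as they finish.
  await redis.lpush(`crawl:${crawlId}:jobs_done_ordered`, "job-1", "job-2", "job-3");

  // A page of results: indices are inclusive, -1 means "to the end".
  const firstPage = await redis.lrange(`crawl:${crawlId}:jobs_done_ordered`, 0, 1);
  const everything = await redis.lrange(`crawl:${crawlId}:jobs_done_ordered`, 0, -1);
  console.log(firstPage, everything);

  await redis.quit();
}
```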
@ -11,6 +11,7 @@ export interface Progress {
|
||||
}
|
||||
|
||||
export type PageOptions = {
|
||||
includeMarkdown?: boolean;
|
||||
onlyMainContent?: boolean;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
@ -24,6 +25,7 @@ export type PageOptions = {
|
||||
parsePDF?: boolean;
|
||||
removeTags?: string | string[];
|
||||
onlyIncludeTags?: string | string[];
|
||||
includeLinks?: boolean;
|
||||
useFastMode?: boolean; // beta
|
||||
disableJSDom?: boolean; // beta
|
||||
atsv?: boolean; // beta
|
||||
|
159  apps/api/src/lib/validateUrl.test.ts  (new file)
@@ -0,0 +1,159 @@
|
||||
import { isSameDomain, removeDuplicateUrls } from "./validateUrl";
|
||||
import { isSameSubdomain } from "./validateUrl";
|
||||
|
||||
describe("isSameDomain", () => {
|
||||
it("should return true for a subdomain", () => {
|
||||
const result = isSameDomain("http://sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same domain", () => {
|
||||
const result = isSameDomain("http://example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for different domains", () => {
|
||||
const result = isSameDomain("http://example.com", "http://another.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for a subdomain with different protocols", () => {
|
||||
const result = isSameDomain("https://sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for invalid URLs", () => {
|
||||
const result = isSameDomain("invalid-url", "http://example.com");
|
||||
expect(result).toBe(false);
|
||||
const result2 = isSameDomain("http://example.com", "invalid-url");
|
||||
expect(result2).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for a subdomain with www prefix", () => {
|
||||
const result = isSameDomain("http://www.sub.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same domain with www prefix", () => {
|
||||
const result = isSameDomain("http://docs.s.s.example.com", "http://example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
|
||||
|
||||
|
||||
describe("isSameSubdomain", () => {
|
||||
it("should return false for a subdomain", () => {
|
||||
const result = isSameSubdomain("http://example.com", "http://docs.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain", () => {
|
||||
const result = isSameSubdomain("http://docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for different subdomains", () => {
|
||||
const result = isSameSubdomain("http://docs.example.com", "http://blog.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return false for different domains", () => {
|
||||
const result = isSameSubdomain("http://example.com", "http://another.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
|
||||
it("should return false for invalid URLs", () => {
|
||||
const result = isSameSubdomain("invalid-url", "http://example.com");
|
||||
expect(result).toBe(false);
|
||||
const result2 = isSameSubdomain("http://example.com", "invalid-url");
|
||||
expect(result2).toBe(false);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain with different protocols", () => {
|
||||
const result = isSameSubdomain("https://docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return true for the same subdomain with www prefix", () => {
|
||||
const result = isSameSubdomain("http://www.docs.example.com", "http://docs.example.com");
|
||||
expect(result).toBe(true);
|
||||
});
|
||||
|
||||
it("should return false for a subdomain with www prefix and different subdomain", () => {
|
||||
const result = isSameSubdomain("http://www.docs.example.com", "http://blog.example.com");
|
||||
expect(result).toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
describe("removeDuplicateUrls", () => {
|
||||
it("should remove duplicate URLs with different protocols", () => {
|
||||
const urls = [
|
||||
"http://example.com",
|
||||
"https://example.com",
|
||||
"http://www.example.com",
|
||||
"https://www.example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should keep URLs with different paths", () => {
|
||||
const urls = [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page1?param=1",
|
||||
"https://example.com/page1#section1"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual([
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page1?param=1",
|
||||
"https://example.com/page1#section1"
|
||||
]);
|
||||
});
|
||||
|
||||
it("should prefer https over http", () => {
|
||||
const urls = [
|
||||
"http://example.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should prefer non-www over www", () => {
|
||||
const urls = [
|
||||
"https://www.example.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
|
||||
it("should handle empty input", () => {
|
||||
const urls: string[] = [];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual([]);
|
||||
});
|
||||
|
||||
it("should handle URLs with different cases", () => {
|
||||
const urls = [
|
||||
"https://EXAMPLE.com",
|
||||
"https://example.com"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://EXAMPLE.com"]);
|
||||
});
|
||||
|
||||
it("should handle URLs with trailing slashes", () => {
|
||||
const urls = [
|
||||
"https://example.com",
|
||||
"https://example.com/"
|
||||
];
|
||||
const result = removeDuplicateUrls(urls);
|
||||
expect(result).toEqual(["https://example.com"]);
|
||||
});
|
||||
});
|
@ -1,9 +1,8 @@
|
||||
|
||||
const protocolIncluded = (url: string) => {
|
||||
export const protocolIncluded = (url: string) => {
|
||||
// if :// not in the start of the url assume http (maybe https?)
|
||||
// regex checks if :// appears before any .
|
||||
return(/^([^.:]+:\/\/)/.test(url));
|
||||
}
|
||||
return /^([^.:]+:\/\/)/.test(url);
|
||||
};
|
||||
|
||||
const getURLobj = (s: string) => {
|
||||
// new URL() fails if we don't include the protocol, e.g. "google.com"
|
||||
@ -18,7 +17,6 @@ const getURLobj = (s: string) => {
|
||||
};
|
||||
|
||||
export const checkAndUpdateURL = (url: string) => {
|
||||
|
||||
if (!protocolIncluded(url)) {
|
||||
url = `http://${url}`;
|
||||
}
|
||||
@ -30,9 +28,143 @@ export const checkAndUpdateURL = (url: string) => {
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if(typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
return { urlObj: typedUrlObj, url: url };
|
||||
};
|
||||
|
||||
export const checkUrl = (url: string) => {
|
||||
const { error, urlObj } = getURLobj(url);
|
||||
if (error) {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
if ((url.split(".")[0].match(/:/g) || []).length !== 1) {
|
||||
throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
|
||||
}
|
||||
|
||||
return url;
|
||||
};
|
||||
|
||||
/**
|
||||
* Same domain check
|
||||
* It checks if the domain of the url is the same as the base url
|
||||
* It accounts true for subdomains and www.subdomains
|
||||
* @param url
|
||||
* @param baseUrl
|
||||
* @returns
|
||||
*/
|
||||
export function isSameDomain(url: string, baseUrl: string) {
|
||||
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
||||
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
||||
|
||||
if (error1 || error2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const typedUrlObj1 = urlObj1 as URL;
|
||||
const typedUrlObj2 = urlObj2 as URL;
|
||||
|
||||
const cleanHostname = (hostname: string) => {
|
||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
||||
};
|
||||
|
||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
||||
|
||||
return domain1 === domain2;
|
||||
}
|
||||
|
||||
|
||||
export function isSameSubdomain(url: string, baseUrl: string) {
|
||||
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
|
||||
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
|
||||
|
||||
if (error1 || error2) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const typedUrlObj1 = urlObj1 as URL;
|
||||
const typedUrlObj2 = urlObj2 as URL;
|
||||
|
||||
const cleanHostname = (hostname: string) => {
|
||||
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
|
||||
};
|
||||
|
||||
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
|
||||
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
|
||||
|
||||
const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.');
|
||||
const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.');
|
||||
|
||||
// Check if the domains are the same and the subdomains are the same
|
||||
return domain1 === domain2 && subdomain1 === subdomain2;
|
||||
}
|
||||
|
||||
|
||||
export const checkAndUpdateURLForMap = (url: string) => {
|
||||
if (!protocolIncluded(url)) {
|
||||
url = `http://${url}`;
|
||||
}
|
||||
// remove last slash if present
|
||||
if (url.endsWith("/")) {
|
||||
url = url.slice(0, -1);
|
||||
}
|
||||
|
||||
|
||||
const { error, urlObj } = getURLobj(url);
|
||||
if (error) {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
const typedUrlObj = urlObj as URL;
|
||||
|
||||
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
|
||||
throw new Error("Invalid URL");
|
||||
}
|
||||
|
||||
// remove any query params
|
||||
url = url.split("?")[0].trim();
|
||||
|
||||
return { urlObj: typedUrlObj, url: url };
|
||||
};
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
export function removeDuplicateUrls(urls: string[]): string[] {
|
||||
const urlMap = new Map<string, string>();
|
||||
|
||||
for (const url of urls) {
|
||||
const parsedUrl = new URL(url);
|
||||
const protocol = parsedUrl.protocol;
|
||||
const hostname = parsedUrl.hostname.replace(/^www\./, '');
|
||||
const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
|
||||
|
||||
const key = `${hostname}${path}`;
|
||||
|
||||
if (!urlMap.has(key)) {
|
||||
urlMap.set(key, url);
|
||||
} else {
|
||||
const existingUrl = new URL(urlMap.get(key)!);
|
||||
const existingProtocol = existingUrl.protocol;
|
||||
|
||||
if (protocol === 'https:' && existingProtocol === 'http:') {
|
||||
urlMap.set(key, url);
|
||||
} else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
|
||||
urlMap.set(key, url);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [...new Set(Array.from(urlMap.values()))];
|
||||
}
|
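A quick usage sketch of the URL helpers above, mirroring the behaviour pinned down by `validateUrl.test.ts` earlier in this diff: https wins over http, and non-www wins over www for otherwise identical URLs. Run from `apps/api/src/lib`.

```ts
import { removeDuplicateUrls, isSameSubdomain } from "./validateUrl";

console.log(removeDuplicateUrls([
  "http://example.com",
  "https://example.com",
  "http://www.example.com",
  "https://www.example.com",
])); // ["https://example.com"]

console.log(isSameSubdomain("https://docs.example.com", "http://docs.example.com")); // true
console.log(isSameSubdomain("http://docs.example.com", "http://blog.example.com"));  // false
```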
@ -26,7 +26,12 @@ export async function startWebScraperPipeline({
|
||||
mode: job.data.mode,
|
||||
crawlerOptions: job.data.crawlerOptions,
|
||||
extractorOptions: job.data.extractorOptions,
|
||||
pageOptions: job.data.pageOptions,
|
||||
pageOptions: {
|
||||
...job.data.pageOptions,
|
||||
...(job.data.crawl_id ? ({
|
||||
includeRawHtml: true,
|
||||
}): {}),
|
||||
},
|
||||
inProgress: (progress) => {
|
||||
Logger.debug(`🐂 Job in progress ${job.id}`);
|
||||
if (progress.currentDocument) {
|
||||
@ -39,6 +44,9 @@ export async function startWebScraperPipeline({
|
||||
},
|
||||
onSuccess: (result, mode) => {
|
||||
Logger.debug(`🐂 Job completed ${job.id}`);
|
||||
if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
|
||||
delete result[0].rawHtml;
|
||||
}
|
||||
saveJob(job, result, token, mode);
|
||||
},
|
||||
onError: (error) => {
|
||||
|
@ -1,11 +1,11 @@
|
||||
import express from "express";
|
||||
import { redisHealthController } from "../controllers/admin/redis-health";
|
||||
import { redisHealthController } from "../controllers/v0/admin/redis-health";
|
||||
import {
|
||||
autoscalerController,
|
||||
checkQueuesController,
|
||||
cleanBefore24hCompleteJobsController,
|
||||
queuesController,
|
||||
} from "../controllers/admin/queue";
|
||||
} from "../controllers/v0/admin/queue";
|
||||
|
||||
export const adminRouter = express.Router();
|
||||
|
||||
|
@ -1,14 +1,14 @@
|
||||
import express from "express";
|
||||
import { crawlController } from "../../src/controllers/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/status";
|
||||
import { searchController } from "../../src/controllers/search";
|
||||
import { crawlCancelController } from "../../src/controllers/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/keyAuth";
|
||||
import { livenessController } from "../controllers/liveness";
|
||||
import { readinessController } from "../controllers/readiness";
|
||||
import { crawlController } from "../../src/controllers/v0/crawl";
|
||||
import { crawlStatusController } from "../../src/controllers/v0/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v0/scrape";
|
||||
import { crawlPreviewController } from "../../src/controllers/v0/crawlPreview";
|
||||
import { crawlJobStatusPreviewController } from "../../src/controllers/v0/status";
|
||||
import { searchController } from "../../src/controllers/v0/search";
|
||||
import { crawlCancelController } from "../../src/controllers/v0/crawl-cancel";
|
||||
import { keyAuthController } from "../../src/controllers/v0/keyAuth";
|
||||
import { livenessController } from "../controllers/v0/liveness";
|
||||
import { readinessController } from "../controllers/v0/readiness";
|
||||
|
||||
export const v0Router = express.Router();
|
||||
|
||||
|
150  apps/api/src/routes/v1.ts  (new file)
@@ -0,0 +1,150 @@
|
||||
import express, { NextFunction, Request, Response } from "express";
|
||||
import { crawlController } from "../controllers/v1/crawl";
|
||||
// import { crawlStatusController } from "../../src/controllers/v1/crawl-status";
|
||||
import { scrapeController } from "../../src/controllers/v1/scrape";
|
||||
import { crawlStatusController } from "../controllers/v1/crawl-status";
|
||||
import { mapController } from "../controllers/v1/map";
|
||||
import { ErrorResponse, RequestWithAuth, RequestWithMaybeAuth } from "../controllers/v1/types";
|
||||
import { RateLimiterMode } from "../types";
|
||||
import { authenticateUser } from "../controllers/auth";
|
||||
import { createIdempotencyKey } from "../services/idempotency/create";
|
||||
import { validateIdempotencyKey } from "../services/idempotency/validate";
|
||||
import { checkTeamCredits } from "../services/billing/credit_billing";
|
||||
import expressWs from "express-ws";
|
||||
import { crawlStatusWSController } from "../controllers/v1/crawl-status-ws";
|
||||
import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
|
||||
import { crawlCancelController } from "../controllers/v1/crawl-cancel";
|
||||
import { Logger } from "../lib/logger";
|
||||
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
|
||||
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
|
||||
// import { searchController } from "../../src/controllers/v1/search";
|
||||
// import { crawlCancelController } from "../../src/controllers/v1/crawl-cancel";
|
||||
// import { keyAuthController } from "../../src/controllers/v1/keyAuth";
|
||||
// import { livenessController } from "../controllers/v1/liveness";
|
||||
// import { readinessController } from "../controllers/v1/readiness";
|
||||
|
||||
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
if (!minimum && req.body) {
|
||||
minimum = (req.body as any)?.limit ?? 1;
|
||||
}
|
||||
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
|
||||
if (!success) {
|
||||
Logger.error(`Insufficient credits: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`);
|
||||
return res.status(402).json({ success: false, error: "Insufficient credits" });
|
||||
}
|
||||
req.account = { remainingCredits }
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
};
|
||||
}
|
||||
|
||||
export function authMiddleware(rateLimiterMode: RateLimiterMode): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void {
|
||||
return (req, res, next) => {
|
||||
(async () => {
|
||||
const { success, team_id, error, status, plan } = await authenticateUser(
|
||||
req,
|
||||
res,
|
||||
rateLimiterMode,
|
||||
);
|
||||
|
||||
if (!success) {
|
||||
return res.status(status).json({ success: false, error });
|
||||
}
|
||||
|
||||
req.auth = { team_id, plan };
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
}
|
||||
}
|
||||
|
||||
function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) {
|
||||
(async () => {
|
||||
if (req.headers["x-idempotency-key"]) {
|
||||
const isIdempotencyValid = await validateIdempotencyKey(req);
|
||||
if (!isIdempotencyValid) {
|
||||
return res.status(409).json({ success: false, error: "Idempotency key already used" });
|
||||
}
|
||||
createIdempotencyKey(req);
|
||||
}
|
||||
next();
|
||||
})()
|
||||
.catch(err => next(err));
|
||||
}
|
||||
|
||||
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
|
||||
if (req.body.url && isUrlBlocked(req.body.url)) {
|
||||
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
|
||||
}
|
||||
next();
|
||||
}
|
||||
|
||||
function wrap(controller: (req: Request, res: Response) => Promise<any>): (req: Request, res: Response, next: NextFunction) => any {
|
||||
return (req, res, next) => {
|
||||
controller(req, res)
|
||||
.catch(err => next(err))
|
||||
}
|
||||
}
|
||||
|
||||
expressWs(express());
|
||||
|
||||
export const v1Router = express.Router();
|
||||
|
||||
v1Router.post(
|
||||
"/scrape",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Scrape),
|
||||
checkCreditsMiddleware(1),
|
||||
wrap(scrapeController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/crawl",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
idempotencyMiddleware,
|
||||
checkCreditsMiddleware(),
|
||||
wrap(crawlController)
|
||||
);
|
||||
|
||||
v1Router.post(
|
||||
"/map",
|
||||
blocklistMiddleware,
|
||||
authMiddleware(RateLimiterMode.Map),
|
||||
checkCreditsMiddleware(1),
|
||||
wrap(mapController)
|
||||
);
|
||||
|
||||
v1Router.get(
|
||||
"/crawl/:jobId",
|
||||
authMiddleware(RateLimiterMode.CrawlStatus),
|
||||
wrap(crawlStatusController)
|
||||
);
|
||||
|
||||
v1Router.ws(
|
||||
"/crawl/:jobId",
|
||||
crawlStatusWSController
|
||||
);
|
||||
|
||||
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
|
||||
|
||||
|
||||
v1Router.delete(
|
||||
"/crawl/:jobId",
|
||||
authMiddleware(RateLimiterMode.Crawl),
|
||||
crawlCancelController
|
||||
);
|
||||
// v1Router.get("/checkJobStatus/:jobId", crawlJobStatusPreviewController);
|
||||
|
||||
// // Auth route for key based authentication
|
||||
// v1Router.get("/keyAuth", keyAuthController);
|
||||
|
||||
// // Search routes
|
||||
// v0Router.post("/search", searchController);
|
||||
|
||||
// Health/Probe routes
|
||||
// v1Router.get("/health/liveness", livenessController);
|
||||
// v1Router.get("/health/readiness", readinessController);
|
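A standalone sketch of the `wrap()` pattern used in the router above: it forwards a rejected promise from an async controller to Express's error middleware, so the central ZodError/Sentry handler in `index.ts` sees it instead of the request hanging. The route path and messages here are illustrative only.

```ts
import express, { NextFunction, Request, Response } from "express";

function wrap(controller: (req: Request, res: Response) => Promise<any>) {
  return (req: Request, res: Response, next: NextFunction) => {
    controller(req, res).catch((err) => next(err));
  };
}

const app = express();

app.get(
  "/boom",
  wrap(async () => {
    throw new Error("async failure"); // lands in the error handler, not as an unhandled rejection
  })
);

app.use((err: unknown, _req: Request, res: Response, _next: NextFunction) => {
  res.status(500).json({ success: false, error: String(err) });
});
```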
@ -31,7 +31,8 @@ it('should return a list of links on the firecrawl.ai page', async () => {
|
||||
|
||||
// Check if the result contains a list of links
|
||||
expect(result.linksOnPage).toBeDefined();
|
||||
console.log({result});
|
||||
expect(Array.isArray(result.linksOnPage)).toBe(true);
|
||||
expect(result.linksOnPage.length).toBeGreaterThan(0);
|
||||
expect(result.linksOnPage).toContain('https://flutterbricks.com/features')
|
||||
}, 10000);
|
||||
}, 15000);
|
||||
|
@ -309,6 +309,23 @@ export class WebCrawler {
|
||||
return null;
|
||||
}
|
||||
|
||||
public extractLinksFromHTML(html: string, url: string) {
|
||||
let links: string[] = [];
|
||||
|
||||
const $ = load(html);
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
if (href) {
|
||||
const u = this.filterURL(href, url);
|
||||
if (u !== null) {
|
||||
links.push(u);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
return links;
|
||||
}
|
||||
|
||||
async crawl(url: string, pageOptions: PageOptions): Promise<{url: string, html: string, pageStatusCode?: number, pageError?: string}[]> {
|
||||
if (this.visited.has(url) || !this.robots.isAllowed(url, "FireCrawlAgent")) {
|
||||
return [];
|
||||
@ -352,15 +369,7 @@ export class WebCrawler {
|
||||
links.push({ url, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
|
||||
$("a").each((_, element) => {
|
||||
const href = $(element).attr("href");
|
||||
if (href) {
|
||||
const u = this.filterURL(href, url);
|
||||
if (u !== null) {
|
||||
links.push({ url: u, html: content, pageStatusCode, pageError });
|
||||
}
|
||||
}
|
||||
});
|
||||
links.push(...this.extractLinksFromHTML(content, url).map(url => ({ url, html: content, pageStatusCode, pageError })));
|
||||
|
||||
if (this.visited.size === 1) {
|
||||
return links;
|
||||
|
@ -294,7 +294,16 @@ export class WebScraperDataProvider {
|
||||
documents = await this.getSitemapData(this.urls[0], documents);
|
||||
}
|
||||
|
||||
documents = this.applyPathReplacements(documents);
|
||||
if (this.pageOptions.includeMarkdown) {
|
||||
documents = this.applyPathReplacements(documents);
|
||||
}
|
||||
|
||||
if (!this.pageOptions.includeHtml) {
|
||||
for (let document of documents) {
|
||||
delete document.html;
|
||||
}
|
||||
}
|
||||
|
||||
// documents = await this.applyImgAltText(documents);
|
||||
if (
|
||||
(this.extractorOptions.mode === "llm-extraction" ||
|
||||
@ -347,6 +356,7 @@ export class WebScraperDataProvider {
|
||||
});
|
||||
return {
|
||||
content: content,
|
||||
markdown: content,
|
||||
metadata: { sourceURL: pdfLink, pageStatusCode, pageError },
|
||||
provider: "web-scraper",
|
||||
};
|
||||
@ -569,12 +579,20 @@ export class WebScraperDataProvider {
|
||||
this.limit = options.crawlerOptions?.limit ?? 10000;
|
||||
this.generateImgAltText =
|
||||
options.crawlerOptions?.generateImgAltText ?? false;
|
||||
this.pageOptions = options.pageOptions ?? {
|
||||
onlyMainContent: false,
|
||||
includeHtml: false,
|
||||
replaceAllPathsWithAbsolutePaths: false,
|
||||
parsePDF: true,
|
||||
removeTags: [],
|
||||
this.pageOptions = {
|
||||
onlyMainContent: options.pageOptions?.onlyMainContent ?? false,
|
||||
includeHtml: options.pageOptions?.includeHtml ?? false,
|
||||
replaceAllPathsWithAbsolutePaths: options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: options.pageOptions?.parsePDF ?? true,
|
||||
onlyIncludeTags: options.pageOptions?.onlyIncludeTags ?? [],
|
||||
removeTags: options.pageOptions?.removeTags ?? [],
|
||||
includeMarkdown: options.pageOptions?.includeMarkdown ?? true,
|
||||
includeRawHtml: options.pageOptions?.includeRawHtml ?? false,
|
||||
waitFor: options.pageOptions?.waitFor ?? undefined,
|
||||
headers: options.pageOptions?.headers ?? undefined,
|
||||
includeLinks: options.pageOptions?.includeLinks ?? true,
|
||||
fullPageScreenshot: options.pageOptions?.fullPageScreenshot ?? false,
|
||||
screenshot: options.pageOptions?.screenshot ?? false,
|
||||
};
|
||||
this.extractorOptions = options.extractorOptions ?? { mode: "markdown" };
|
||||
this.replaceAllPathsWithAbsolutePaths =
|
||||
|
@ -122,22 +122,38 @@ function getScrapingFallbackOrder(
|
||||
export async function scrapSingleUrl(
|
||||
jobId: string,
|
||||
urlToScrap: string,
|
||||
pageOptions: PageOptions = {
|
||||
onlyMainContent: true,
|
||||
includeHtml: false,
|
||||
includeRawHtml: false,
|
||||
waitFor: 0,
|
||||
screenshot: false,
|
||||
fullPageScreenshot: false,
|
||||
headers: undefined,
|
||||
},
|
||||
extractorOptions: ExtractorOptions = {
|
||||
mode: "llm-extraction-from-markdown",
|
||||
},
|
||||
existingHtml: string = "",
|
||||
pageOptions: PageOptions,
|
||||
extractorOptions?: ExtractorOptions,
|
||||
existingHtml?: string,
|
||||
priority?: number,
|
||||
teamId?: string
|
||||
): Promise<Document> {
|
||||
pageOptions = {
|
||||
includeMarkdown: pageOptions.includeMarkdown ?? true,
|
||||
onlyMainContent: pageOptions.onlyMainContent ?? false,
|
||||
includeHtml: pageOptions.includeHtml ?? false,
|
||||
includeRawHtml: pageOptions.includeRawHtml ?? false,
|
||||
waitFor: pageOptions.waitFor ?? undefined,
|
||||
screenshot: pageOptions.screenshot ?? false,
|
||||
fullPageScreenshot: pageOptions.fullPageScreenshot ?? false,
|
||||
headers: pageOptions.headers ?? undefined,
|
||||
includeLinks: pageOptions.includeLinks ?? true,
|
||||
replaceAllPathsWithAbsolutePaths: pageOptions.replaceAllPathsWithAbsolutePaths ?? true,
|
||||
parsePDF: pageOptions.parsePDF ?? true,
|
||||
removeTags: pageOptions.removeTags ?? [],
|
||||
onlyIncludeTags: pageOptions.onlyIncludeTags ?? [],
|
||||
}
|
||||
|
||||
if (extractorOptions) {
|
||||
extractorOptions = {
|
||||
mode: extractorOptions?.mode ?? "llm-extraction-from-markdown",
|
||||
}
|
||||
}
|
||||
|
||||
if (!existingHtml) {
|
||||
existingHtml = "";
|
||||
}
|
||||
|
||||
urlToScrap = urlToScrap.trim();
|
||||
|
||||
const attemptScraping = async (
|
||||
@ -341,8 +357,8 @@ export async function scrapSingleUrl(
        pageError = undefined;
      }

      if (text && text.trim().length >= 100) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100, breaking`);
      if ((text && text.trim().length >= 100) || (typeof screenshot === "string" && screenshot.length > 0)) {
        Logger.debug(`⛏️ ${scraper}: Successfully scraped ${urlToScrap} with text length >= 100 or screenshot, breaking`);
        break;
      }
      if (pageStatusCode && (pageStatusCode == 404 || pageStatusCode == 500)) {
@ -364,20 +380,22 @@ export async function scrapSingleUrl(

    let linksOnPage: string[] | undefined;

    linksOnPage = extractLinks(rawHtml, urlToScrap);
    if (pageOptions.includeLinks) {
      linksOnPage = extractLinks(rawHtml, urlToScrap);
    }

    let document: Document;
    if (screenshot && screenshot.length > 0) {
      document = {
        content: text,
        markdown: text,
        markdown: pageOptions.includeMarkdown ? text : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
          extractorOptions.mode === "llm-extraction-from-raw-html"
          extractorOptions?.mode === "llm-extraction-from-raw-html"
            ? rawHtml
            : undefined,
        linksOnPage,
        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
        metadata: {
          ...metadata,
          screenshot: screenshot,
@ -389,11 +407,11 @@ export async function scrapSingleUrl(
    } else {
      document = {
        content: text,
        markdown: text,
        markdown: pageOptions.includeMarkdown ? text : undefined,
        html: pageOptions.includeHtml ? html : undefined,
        rawHtml:
          pageOptions.includeRawHtml ||
          extractorOptions.mode === "llm-extraction-from-raw-html"
          extractorOptions?.mode === "llm-extraction-from-raw-html"
            ? rawHtml
            : undefined,
        metadata: {
@ -402,7 +420,7 @@ export async function scrapSingleUrl(
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
        linksOnPage,
        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
      };
    }

@ -416,9 +434,9 @@ export async function scrapSingleUrl(
    });
    return {
      content: "",
      markdown: "",
      markdown: pageOptions.includeMarkdown ? "" : undefined,
      html: "",
      linksOnPage: [],
      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
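
The success check in this hunk is loosened so a scraper attempt that returns only a screenshot no longer falls through to the next engine. A small sketch of the same condition, with illustrative names:

```ts
// Sketch of the loosened break condition: accept the attempt when it produced
// either enough text or a non-empty screenshot string.
function attemptSucceeded(text: string | undefined, screenshot: unknown): boolean {
  const hasText = !!text && text.trim().length >= 100;
  const hasScreenshot = typeof screenshot === "string" && screenshot.length > 0;
  return hasText || hasScreenshot;
}

attemptSucceeded("", "data:image/png;base64,iVBOR...");  // true  - screenshot-only result
attemptSucceeded("too short", undefined);                // false - keep trying other scrapers
```
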
@ -75,9 +75,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
    description = soup('meta[name="description"]').attr("content") || null;

    // Assuming the language is part of the URL as per the regex pattern
    const pattern = /([a-zA-Z]+-[A-Z]{2})/;
    const match = pattern.exec(url);
    language = match ? match[1] : null;
    language = soup('html').attr('lang') || null;

    keywords = soup('meta[name="keywords"]').attr("content") || null;
    robots = soup('meta[name="robots"]').attr("content") || null;
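
Language detection now comes from the document itself rather than a locale pattern in the URL. A quick cheerio sketch of the new lookup:

```ts
import { load } from "cheerio";

// Read the lang attribute straight off the <html> element; falls back to null
// when the page does not declare one.
const $ = load('<html lang="en-US"><head><title>Example</title></head></html>');
const language = $("html").attr("lang") || null; // "en-US"
```
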
apps/api/src/search/fireEngine.ts (new file, 45 lines)
@ -0,0 +1,45 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";

dotenv.config();

export async function fireEngineMap(q: string, options: {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  numResults: number;
  page?: number;
}): Promise<SearchResult[]> {
  let data = JSON.stringify({
    query: q,
    lang: options.lang,
    country: options.country,
    location: options.location,
    tbs: options.tbs,
    numResults: options.numResults,
    page: options.page ?? 1,
  });

  if (!process.env.FIRE_ENGINE_BETA_URL) {
    console.warn("(v1/map Beta) Results might differ from cloud offering currently.");
    return [];
  }

  let config = {
    method: "POST",
    url: `${process.env.FIRE_ENGINE_BETA_URL}/search`,
    headers: {
      "Content-Type": "application/json",
    },
    data: data,
  };
  const response = await axios(config);
  if (response && response) {
    return response.data
  } else {
    return [];
  }
}
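
A hypothetical caller of the new helper might look like the sketch below; `FIRE_ENGINE_BETA_URL` has to point at a reachable fire-engine instance, otherwise the function only warns and returns an empty array. The query string and result count here are made up for illustration.

```ts
import { fireEngineMap } from "./fireEngine";

// Illustrative usage only; the query and numResults are arbitrary.
async function mapDocsSite() {
  const results = await fireEngineMap("site:docs.firecrawl.dev", {
    lang: "en",
    country: "us",
    numResults: 50,
  });
  console.log(`fire-engine returned ${results.length} results`);
}
```
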
@ -52,7 +52,7 @@ async function _req(term: string, results: number, lang: string, country: string

export async function google_search(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
export async function googleSearch(term: string, advanced = false, num_results = 7, tbs = null, filter = null, lang = "en", country = "us", proxy = null, sleep_interval = 0, timeout = 5000, ) :Promise<SearchResult[]> {
  let proxies = null;
  if (proxy) {
    if (proxy.startsWith("https")) {
@ -1,11 +1,9 @@
import { Logger } from "../../src/lib/logger";
import { SearchResult } from "../../src/lib/entities";
import { google_search } from "./googlesearch";
import { googleSearch } from "./googlesearch";
import { fireEngineMap } from "./fireEngine";
import { serper_search } from "./serper";

export async function search({
  query,
  advanced = false,
@ -30,12 +28,20 @@ export async function search({
  proxy?: string;
  sleep_interval?: number;
  timeout?: number;
}) : Promise<SearchResult[]> {
}): Promise<SearchResult[]> {
  try {
    if (process.env.SERPER_API_KEY ) {
      return await serper_search(query, {num_results, tbs, filter, lang, country, location});

    if (process.env.SERPER_API_KEY) {
      return await serper_search(query, {
        num_results,
        tbs,
        filter,
        lang,
        country,
        location,
      });
    }
    return await google_search(
    return await googleSearch(
      query,
      advanced,
      num_results,
@ -49,7 +55,6 @@ export async function search({
    );
  } catch (error) {
    Logger.error(`Error in search function: ${error}`);
    return []
    return [];
  }
  // if process.env.SERPER_API_KEY is set, use serper
}
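
Condensed, the provider selection above amounts to: use Serper whenever `SERPER_API_KEY` is configured, otherwise fall back to the scraped Google search. A rough sketch with the signatures trimmed to the essentials (it assumes the imports shown in the hunk above):

```ts
// Rough sketch of the fallback order; the real calls also pass tbs, filter,
// lang, country and location through to each provider.
async function pickSearchProvider(query: string, numResults: number): Promise<SearchResult[]> {
  if (process.env.SERPER_API_KEY) {
    return serper_search(query, { num_results: numResults });
  }
  return googleSearch(query, false, numResults);
}
```
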
@ -49,5 +49,23 @@ export async function addScrapeJob(
  }
}

export function waitForJob(jobId: string, timeout: number) {
  return new Promise((resolve, reject) => {
    const start = Date.now();
    const int = setInterval(async () => {
      if (Date.now() >= start + timeout) {
        clearInterval(int);
        reject(new Error("Job wait "));
      } else {
        const state = await getScrapeQueue().getJobState(jobId);
        if (state === "completed") {
          clearInterval(int);
          resolve((await getScrapeQueue().getJob(jobId)).returnvalue);
        } else if (state === "failed") {
          clearInterval(int);
          reject((await getScrapeQueue().getJob(jobId)).failedReason);
        }
      }
    }, 1000);
  })
}
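
`waitForJob` polls the scrape queue once a second and settles when the job completes, fails, or the timeout elapses. A sketch of how a caller might use it (the job id and timeout are illustrative):

```ts
// Illustrative only: jobId would come from addScrapeJob in the real flow.
async function scrapeAndWait(jobId: string) {
  try {
    const docs = await waitForJob(jobId, 60 * 1000); // give up after 60 seconds
    console.log("scrape finished", docs);
  } catch (err) {
    console.error("scrape failed or timed out", err);
  }
}
```
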
@ -195,6 +195,14 @@ async function processJob(job: Job, token: string) {
    const end = Date.now();
    const timeTakenInSeconds = (end - start) / 1000;

    const rawHtml = docs[0] ? docs[0].rawHtml : "";

    if (job.data.crawl_id && (!job.data.pageOptions || !job.data.pageOptions.includeRawHtml)) {
      if (docs[0] && docs[0].rawHtml) {
        delete docs[0].rawHtml;
      }
    }

    const data = {
      success,
      result: {
@ -211,7 +219,7 @@ async function processJob(job: Job, token: string) {
    };

    if (job.data.mode === "crawl") {
      await callWebhook(job.data.team_id, job.id as string, data);
      await callWebhook(job.data.team_id, job.id as string, data, job.data.webhook, job.data.v1);
    }

    if (job.data.crawl_id) {
@ -238,15 +246,9 @@ async function processJob(job: Job, token: string) {
      if (!job.data.sitemapped) {
        if (!sc.cancelled) {
          const crawler = crawlToCrawler(job.data.crawl_id, sc);
          let linksOnPage = [];
          try{
            linksOnPage = data.docs[0]?.linksOnPage ?? [];
          }catch(e){
            linksOnPage = []
          }

          const links = crawler.filterLinks(
            linksOnPage.map(href => crawler.filterURL(href.trim(), sc.originUrl))
              .filter(x => x !== null),
            crawler.extractLinksFromHTML(rawHtml ?? "", sc.originUrl),
            Infinity,
            sc.crawlerOptions?.maxDepth ?? 10
          )
@ -271,6 +273,7 @@ async function processJob(job: Job, token: string) {
            pageOptions: sc.pageOptions,
            origin: job.data.origin,
            crawl_id: job.data.crawl_id,
            v1: job.data.v1,
          }, {}, jobId, jobPriority);

          await addCrawlJob(job.data.crawl_id, newJob.id);
@ -340,7 +343,7 @@ async function processJob(job: Job, token: string) {
          docs: fullDocs,
        };

        await callWebhook(job.data.team_id, job.data.crawl_id, data);
        await callWebhook(job.data.team_id, job.data.crawl_id, data, job.data.webhook, job.data.v1);
      }
    }

@ -384,7 +387,7 @@ async function processJob(job: Job, token: string) {
    };

    if (job.data.mode === "crawl" || job.data.crawl_id) {
      await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data);
      await callWebhook(job.data.team_id, job.data.crawl_id ?? job.id as string, data, job.data.webhook, job.data.v1);
    }

    if (job.data.crawl_id) {
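
The worker change above keeps the raw HTML around just long enough to extract links for the crawl, then strips it from the stored document unless the job explicitly asked for it. A compact sketch of that idea:

```ts
// Sketch: capture rawHtml for link extraction, drop it from the persisted doc
// when includeRawHtml was not requested.
function takeRawHtml(doc: { rawHtml?: string }, includeRawHtml?: boolean): string {
  const rawHtml = doc.rawHtml ?? "";
  if (!includeRawHtml) {
    delete doc.rawHtml;
  }
  return rawHtml; // still usable for crawler.extractLinksFromHTML(rawHtml, originUrl)
}
```
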
@ -79,7 +79,7 @@ describe("Rate Limiter Service", () => {
      "test-prefix:someToken",
      "growth"
    );
    expect(limiter4.points).toBe(150);
    expect(limiter4.points).toBe(250);
  });

  it("should return the default rate limiter if plan is not provided", () => {
@ -153,7 +153,7 @@ describe("Rate Limiter Service", () => {
      "crawlStatus" as RateLimiterMode,
      "test-prefix:someToken"
    );
    expect(limiter2.points).toBe(150);
    expect(limiter2.points).toBe(250);
  });

  it("should consume points correctly for 'crawl' mode", async () => {
@ -315,7 +315,7 @@ describe("Rate Limiter Service", () => {
      "crawlStatus" as RateLimiterMode,
      "test-prefix:someToken"
    );
    expect(limiter2.points).toBe(150);
    expect(limiter2.points).toBe(250);
  });

  it("should return the correct rate limiter for 'testSuite' mode", () => {
@ -65,7 +65,7 @@ const RATE_LIMITS = {
  },
  crawlStatus: {
    free: 150,
    default: 150,
    default: 250,
  },
  testSuite: {
    free: 10000,
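
The `crawlStatus` default rises from 150 to 250 points, which is why the test expectations above change as well. A tiny sketch of how the plan lookup resolves under that table:

```ts
// Sketch of the lookup: plan-specific value when present, otherwise the new default.
const RATE_LIMITS_SKETCH = { crawlStatus: { free: 150, default: 250 } };

function crawlStatusPoints(plan?: "free"): number {
  return (plan && RATE_LIMITS_SKETCH.crawlStatus[plan]) ?? RATE_LIMITS_SKETCH.crawlStatus.default;
}

crawlStatusPoints("free"); // 150
crawlStatusPoints();       // 250, matching the updated expectations
```
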
@ -1,15 +1,16 @@
import { legacyDocumentConverter } from "../../src/controllers/v1/types";
import { Logger } from "../../src/lib/logger";
import { supabase_service } from "./supabase";

export const callWebhook = async (teamId: string, jobId: string,data: any) => {
export const callWebhook = async (teamId: string, jobId: string, data: any, specified?: string, v1 = false) => {
  try {
    const selfHostedUrl = process.env.SELF_HOSTED_WEBHOOK_URL?.replace("{{JOB_ID}}", jobId);
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    let webhookUrl = selfHostedUrl;
    let webhookUrl = specified ?? selfHostedUrl;

    // Only fetch the webhook URL from the database if the self-hosted webhook URL is not set
    // Only fetch the webhook URL from the database if the self-hosted webhook URL and specified webhook are not set
    // and the USE_DB_AUTHENTICATION environment variable is set to true
    if (!selfHostedUrl && useDbAuthentication) {
    if (!webhookUrl && useDbAuthentication) {
      const { data: webhooksData, error } = await supabase_service
        .from("webhooks")
        .select("url")
@ -30,11 +31,15 @@ export const callWebhook = async (teamId: string, jobId: string,data: any) => {
    let dataToSend = [];
    if (data.result.links && data.result.links.length !== 0) {
      for (let i = 0; i < data.result.links.length; i++) {
        dataToSend.push({
          content: data.result.links[i].content.content,
          markdown: data.result.links[i].content.markdown,
          metadata: data.result.links[i].content.metadata,
        });
        if (v1) {
          dataToSend.push(legacyDocumentConverter(data.result.links[i].content))
        } else {
          dataToSend.push({
            content: data.result.links[i].content.content,
            markdown: data.result.links[i].content.markdown,
            metadata: data.result.links[i].content.metadata,
          });
        }
      }
    }
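
With the `v1` flag, documents headed to a webhook are shaped differently: v1 consumers get the output of `legacyDocumentConverter`, everyone else keeps the flat content/markdown/metadata object. A sketch of that branch in isolation (the `link` shape mirrors `data.result.links[i]`):

```ts
// Sketch of the per-document branching; `any` is used because the crawl result
// rows are untyped at this point in the pipeline.
function toWebhookDoc(link: { content: any }, v1: boolean) {
  return v1
    ? legacyDocumentConverter(link.content)
    : {
        content: link.content.content,
        markdown: link.content.markdown,
        metadata: link.content.metadata,
      };
}
```
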
@ -30,6 +30,8 @@ export interface WebScraperOptions {
  origin?: string;
  crawl_id?: string;
  sitemapped?: boolean;
  webhook?: string;
  v1?: boolean;
}

export interface RunWebScraperParams {
@ -105,6 +107,1 @@ export enum RateLimiterMode {
  Scrape = "scrape",
  Preview = "preview",
  Search = "search",
  Map = "map",

}

@ -113,6 +116,1 @@ export interface AuthResponse {
  team_id?: string;
  error?: string;
  status?: number;
  api_key?: string;
  plan?: PlanType;
}
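
The two new `WebScraperOptions` fields carry a per-crawl webhook override and the flag that selects the v1 payload shape. A hypothetical job payload using them (all ids and URLs below are made up):

```ts
// Hypothetical payload only; team_id, crawl_id and the webhook URL are invented.
const jobData = {
  url: "https://docs.firecrawl.dev",
  mode: "crawl",
  team_id: "team_123",
  crawl_id: "crawl_456",
  webhook: "https://example.com/hooks/firecrawl", // delivered via callWebhook(..., specified, v1)
  v1: true,
};
```
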
25
apps/go-sdk/examples/.gitignore
vendored
25
apps/go-sdk/examples/.gitignore
vendored
@ -1,25 +0,0 @@
|
||||
# If you prefer the allow list template instead of the deny list, see community template:
|
||||
# https://github.com/github/gitignore/blob/main/community/Golang/Go.AllowList.gitignore
|
||||
#
|
||||
# Binaries for programs and plugins
|
||||
*.exe
|
||||
*.exe~
|
||||
*.dll
|
||||
*.so
|
||||
*.dylib
|
||||
|
||||
# Test binary, built with `go test -c`
|
||||
*.test
|
||||
|
||||
# Output of the go coverage tool, specifically when used with LiteIDE
|
||||
*.out
|
||||
|
||||
# Dependency directories (remove the comment below to include it)
|
||||
# vendor/
|
||||
|
||||
# Go workspace file
|
||||
go.work
|
||||
go.work.sum
|
||||
|
||||
# env file
|
||||
.env
|
@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Mendable
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -1,87 +0,0 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/mendableai/firecrawl-go"
|
||||
)
|
||||
|
||||
func main() {
|
||||
app, err := firecrawl.NewFirecrawlApp("fc-YOUR_API_KEY", "https://api.firecrawl.dev")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to create FirecrawlApp: %v", err)
|
||||
}
|
||||
|
||||
// Scrape a website
|
||||
scrapeResult, err := app.ScrapeURL("firecrawl.dev", nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to scrape URL: %v", err)
|
||||
}
|
||||
fmt.Println(scrapeResult.Markdown)
|
||||
|
||||
// Crawl a website
|
||||
idempotencyKey := uuid.New().String() // optional idempotency key
|
||||
crawlParams := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
},
|
||||
}
|
||||
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to crawl URL: %v", err)
|
||||
}
|
||||
jsonCrawlResult, err := json.MarshalIndent(crawlResult, "", " ")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to marshal crawl result: %v", err)
|
||||
}
|
||||
fmt.Println(string(jsonCrawlResult))
|
||||
|
||||
// LLM Extraction using JSON schema
|
||||
jsonSchema := map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"top": map[string]any{
|
||||
"type": "array",
|
||||
"items": map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"title": map[string]string{"type": "string"},
|
||||
"points": map[string]string{"type": "number"},
|
||||
"by": map[string]string{"type": "string"},
|
||||
"commentsURL": map[string]string{"type": "string"},
|
||||
},
|
||||
"required": []string{"title", "points", "by", "commentsURL"},
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News",
|
||||
},
|
||||
},
|
||||
"required": []string{"top"},
|
||||
}
|
||||
|
||||
llmExtractionParams := map[string]any{
|
||||
"extractorOptions": firecrawl.ExtractorOptions{
|
||||
ExtractionSchema: jsonSchema,
|
||||
Mode: "llm-extraction",
|
||||
},
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
|
||||
llmExtractionResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to perform LLM extraction: %v", err)
|
||||
}
|
||||
|
||||
// Pretty print the LLM extraction result
|
||||
jsonResult, err := json.MarshalIndent(llmExtractionResult.LLMExtraction, "", " ")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to marshal LLM extraction result: %v", err)
|
||||
}
|
||||
fmt.Println(string(jsonResult))
|
||||
}
|
@ -1,9 +0,0 @@
|
||||
module github.com/mendableai/firecrawl-go-examples
|
||||
|
||||
go 1.22.5
|
||||
|
||||
replace github.com/mendableai/firecrawl => ../
|
||||
|
||||
require github.com/google/uuid v1.6.0
|
||||
|
||||
require github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 // indirect
|
@ -1,14 +0,0 @@
|
||||
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
|
||||
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
|
||||
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
|
||||
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
|
||||
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46 h1:461um7fbSQYj2E3ETl8GINuRg5MTY3BdjMnogwUIhBs=
|
||||
github.com/mendableai/firecrawl-go v0.0.0-20240813205613-366e8d8dcf46/go.mod h1:mTGbJ37fy43aaqonp/tdpzCH516jHFw/XVvfFi4QXHo=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
|
||||
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
|
||||
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
|
||||
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
|
@ -1,2 +0,0 @@
|
||||
API_URL=http://localhost:3002
|
||||
TEST_API_KEY=fc-YOUR-API-KEY
|
2
apps/go-sdk/firecrawl/.gitignore
vendored
2
apps/go-sdk/firecrawl/.gitignore
vendored
@ -1,2 +0,0 @@
|
||||
.env
|
||||
vendor
|
@ -1,21 +0,0 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 Sideguide Technologies Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -1,189 +0,0 @@
|
||||
# Firecrawl Go SDK
|
||||
|
||||
The Firecrawl Go SDK is a library that allows you to easily scrape and crawl websites, and output the data in a format ready for use with language models (LLMs). It provides a simple and intuitive interface for interacting with the Firecrawl API.
|
||||
|
||||
## Installation
|
||||
|
||||
To install the Firecrawl Go SDK, you can
|
||||
|
||||
```bash
|
||||
go get github.com/mendableai/firecrawl
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
1. Get an API key from [firecrawl.dev](https://firecrawl.dev)
|
||||
2. Set the API key as an environment variable named `FIRECRAWL_API_KEY` or pass it as a parameter to the `FirecrawlApp` class.
|
||||
|
||||
|
||||
Here's an example of how to use the SDK with error handling:
|
||||
|
||||
```go
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
|
||||
"github.com/mendableai/firecrawl/firecrawl"
|
||||
)
|
||||
|
||||
func main() {
|
||||
// Initialize the FirecrawlApp with your API key
|
||||
app, err := firecrawl.NewFirecrawlApp("YOUR_API_KEY")
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to initialize FirecrawlApp: %v", err)
|
||||
}
|
||||
|
||||
// Scrape a single URL
|
||||
url := "https://mendable.ai"
|
||||
scrapedData, err := app.ScrapeURL(url, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Error occurred while scraping: %v", err)
|
||||
}
|
||||
fmt.Println(scrapedData)
|
||||
|
||||
// Crawl a website
|
||||
crawlUrl := "https://mendable.ai"
|
||||
params := map[string]any{
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
|
||||
crawlResult, err := app.CrawlURL(crawlUrl, params)
|
||||
if err != nil {
|
||||
log.Fatalf("Error occurred while crawling: %v", err)
|
||||
}
|
||||
fmt.Println(crawlResult)
|
||||
}
|
||||
```
|
||||
|
||||
### Scraping a URL
|
||||
|
||||
To scrape a single URL with error handling, use the `ScrapeURL` method. It takes the URL as a parameter and returns the scraped data as a dictionary.
|
||||
|
||||
```go
|
||||
url := "https://mendable.ai"
|
||||
scrapedData, err := app.ScrapeURL(url, nil)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to scrape URL: %v", err)
|
||||
}
|
||||
fmt.Println(scrapedData)
|
||||
```
|
||||
|
||||
### Extracting structured data from a URL
|
||||
|
||||
With LLM extraction, you can easily extract structured data from any URL. Here is how you to use it:
|
||||
|
||||
```go
|
||||
jsonSchema := map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"top": map[string]any{
|
||||
"type": "array",
|
||||
"items": map[string]any{
|
||||
"type": "object",
|
||||
"properties": map[string]any{
|
||||
"title": map[string]string{"type": "string"},
|
||||
"points": map[string]string{"type": "number"},
|
||||
"by": map[string]string{"type": "string"},
|
||||
"commentsURL": map[string]string{"type": "string"},
|
||||
},
|
||||
"required": []string{"title", "points", "by", "commentsURL"},
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News",
|
||||
},
|
||||
},
|
||||
"required": []string{"top"},
|
||||
}
|
||||
|
||||
llmExtractionParams := map[string]any{
|
||||
"extractorOptions": firecrawl.ExtractorOptions{
|
||||
ExtractionSchema: jsonSchema,
|
||||
},
|
||||
}
|
||||
|
||||
scrapeResult, err := app.ScrapeURL("https://news.ycombinator.com", llmExtractionParams)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to perform LLM extraction: %v", err)
|
||||
}
|
||||
fmt.Println(scrapeResult)
|
||||
```
|
||||
|
||||
### Search for a query
|
||||
|
||||
To search the web, get the most relevant results, scrap each page and return the markdown, use the `Search` method. The method takes the query as a parameter and returns the search results.
|
||||
|
||||
|
||||
```go
|
||||
query := "what is mendable?"
|
||||
searchResult, err := app.Search(query)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to search: %v", err)
|
||||
}
|
||||
fmt.Println(searchResult)
|
||||
```
|
||||
|
||||
### Crawling a Website
|
||||
|
||||
To crawl a website, use the `CrawlUrl` method. It takes the starting URL and optional parameters as arguments. The `params` argument allows you to specify additional options for the crawl job, such as the maximum number of pages to crawl, allowed domains, and the output format.
|
||||
|
||||
```go
|
||||
crawlParams := map[string]any{
|
||||
"crawlerOptions": map[string]any{
|
||||
"excludes": []string{"blog/*"},
|
||||
"includes": []string{}, // leave empty for all pages
|
||||
"limit": 1000,
|
||||
},
|
||||
"pageOptions": map[string]any{
|
||||
"onlyMainContent": true,
|
||||
},
|
||||
}
|
||||
crawlResult, err := app.CrawlURL("mendable.ai", crawlParams, true, 2, idempotencyKey)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to crawl URL: %v", err)
|
||||
}
|
||||
fmt.Println(crawlResult)
|
||||
```
|
||||
|
||||
### Checking Crawl Status
|
||||
|
||||
To check the status of a crawl job, use the `CheckCrawlStatus` method. It takes the job ID as a parameter and returns the current status of the crawl job.
|
||||
|
||||
```go
|
||||
status, err := app.CheckCrawlStatus(jobId)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to check crawl status: %v", err)
|
||||
}
|
||||
fmt.Println(status)
|
||||
```
|
||||
|
||||
### Canceling a Crawl Job
|
||||
To cancel a crawl job, use the `CancelCrawlJob` method. It takes the job ID as a parameter and returns the cancellation status of the crawl job.
|
||||
|
||||
```go
|
||||
canceled, err := app.CancelCrawlJob(jobId)
|
||||
if err != nil {
|
||||
log.Fatalf("Failed to cancel crawl job: %v", err)
|
||||
}
|
||||
fmt.Println(canceled)
|
||||
```
|
||||
|
||||
## Error Handling
|
||||
|
||||
The SDK handles errors returned by the Firecrawl API and raises appropriate exceptions. If an error occurs during a request, an exception will be raised with a descriptive error message.
|
||||
|
||||
## Contributing
|
||||
|
||||
Contributions to the Firecrawl Go SDK are welcome! If you find any issues or have suggestions for improvements, please open an issue or submit a pull request on the GitHub repository.
|
||||
|
||||
## License
|
||||
|
||||
The Firecrawl Go SDK is licensed under the MIT License. This means you are free to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the SDK, subject to the following conditions:
|
||||
|
||||
- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
|
||||
Please note that while this SDK is MIT licensed, it is part of a larger project which may be under different licensing terms. Always refer to the license information in the root directory of the main project for overall licensing details.
|
@ -1,584 +0,0 @@
|
||||
// Package firecrawl provides a client for interacting with the Firecrawl API.
|
||||
package firecrawl
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"math"
|
||||
"net/http"
|
||||
"os"
|
||||
"time"
|
||||
)
|
||||
|
||||
// FirecrawlDocumentMetadata represents metadata for a Firecrawl document
|
||||
type FirecrawlDocumentMetadata struct {
|
||||
Title string `json:"title,omitempty"`
|
||||
Description string `json:"description,omitempty"`
|
||||
Language string `json:"language,omitempty"`
|
||||
Keywords string `json:"keywords,omitempty"`
|
||||
Robots string `json:"robots,omitempty"`
|
||||
OGTitle string `json:"ogTitle,omitempty"`
|
||||
OGDescription string `json:"ogDescription,omitempty"`
|
||||
OGURL string `json:"ogUrl,omitempty"`
|
||||
OGImage string `json:"ogImage,omitempty"`
|
||||
OGAudio string `json:"ogAudio,omitempty"`
|
||||
OGDeterminer string `json:"ogDeterminer,omitempty"`
|
||||
OGLocale string `json:"ogLocale,omitempty"`
|
||||
OGLocaleAlternate []string `json:"ogLocaleAlternate,omitempty"`
|
||||
OGSiteName string `json:"ogSiteName,omitempty"`
|
||||
OGVideo string `json:"ogVideo,omitempty"`
|
||||
DCTermsCreated string `json:"dctermsCreated,omitempty"`
|
||||
DCDateCreated string `json:"dcDateCreated,omitempty"`
|
||||
DCDate string `json:"dcDate,omitempty"`
|
||||
DCTermsType string `json:"dctermsType,omitempty"`
|
||||
DCType string `json:"dcType,omitempty"`
|
||||
DCTermsAudience string `json:"dctermsAudience,omitempty"`
|
||||
DCTermsSubject string `json:"dctermsSubject,omitempty"`
|
||||
DCSubject string `json:"dcSubject,omitempty"`
|
||||
DCDescription string `json:"dcDescription,omitempty"`
|
||||
DCTermsKeywords string `json:"dctermsKeywords,omitempty"`
|
||||
ModifiedTime string `json:"modifiedTime,omitempty"`
|
||||
PublishedTime string `json:"publishedTime,omitempty"`
|
||||
ArticleTag string `json:"articleTag,omitempty"`
|
||||
ArticleSection string `json:"articleSection,omitempty"`
|
||||
SourceURL string `json:"sourceURL,omitempty"`
|
||||
PageStatusCode int `json:"pageStatusCode,omitempty"`
|
||||
PageError string `json:"pageError,omitempty"`
|
||||
}
|
||||
|
||||
// FirecrawlDocument represents a document in Firecrawl
|
||||
type FirecrawlDocument struct {
|
||||
ID string `json:"id,omitempty"`
|
||||
URL string `json:"url,omitempty"`
|
||||
Content string `json:"content"`
|
||||
Markdown string `json:"markdown,omitempty"`
|
||||
HTML string `json:"html,omitempty"`
|
||||
LLMExtraction map[string]any `json:"llm_extraction,omitempty"`
|
||||
CreatedAt *time.Time `json:"createdAt,omitempty"`
|
||||
UpdatedAt *time.Time `json:"updatedAt,omitempty"`
|
||||
Type string `json:"type,omitempty"`
|
||||
Metadata *FirecrawlDocumentMetadata `json:"metadata,omitempty"`
|
||||
ChildrenLinks []string `json:"childrenLinks,omitempty"`
|
||||
Provider string `json:"provider,omitempty"`
|
||||
Warning string `json:"warning,omitempty"`
|
||||
Index int `json:"index,omitempty"`
|
||||
}
|
||||
|
||||
// ExtractorOptions represents options for extraction.
|
||||
type ExtractorOptions struct {
|
||||
Mode string `json:"mode,omitempty"`
|
||||
ExtractionPrompt string `json:"extractionPrompt,omitempty"`
|
||||
ExtractionSchema any `json:"extractionSchema,omitempty"`
|
||||
}
|
||||
|
||||
// ScrapeResponse represents the response for scraping operations
|
||||
type ScrapeResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Data *FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// SearchResponse represents the response for searching operations
|
||||
type SearchResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// CrawlResponse represents the response for crawling operations
|
||||
type CrawlResponse struct {
|
||||
Success bool `json:"success"`
|
||||
JobID string `json:"jobId,omitempty"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
}
|
||||
|
||||
// JobStatusResponse represents the response for checking crawl job status
|
||||
type JobStatusResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Status string `json:"status"`
|
||||
Current int `json:"current,omitempty"`
|
||||
CurrentURL string `json:"current_url,omitempty"`
|
||||
CurrentStep string `json:"current_step,omitempty"`
|
||||
Total int `json:"total,omitempty"`
|
||||
JobID string `json:"jobId,omitempty"`
|
||||
Data []*FirecrawlDocument `json:"data,omitempty"`
|
||||
PartialData []*FirecrawlDocument `json:"partial_data,omitempty"`
|
||||
}
|
||||
|
||||
// CancelCrawlJobResponse represents the response for canceling a crawl job
|
||||
type CancelCrawlJobResponse struct {
|
||||
Success bool `json:"success"`
|
||||
Status string `json:"status"`
|
||||
}
|
||||
|
||||
// requestOptions represents options for making requests.
|
||||
type requestOptions struct {
|
||||
retries int
|
||||
backoff int
|
||||
}
|
||||
|
||||
// requestOption is a functional option type for requestOptions.
|
||||
type requestOption func(*requestOptions)
|
||||
|
||||
// newRequestOptions creates a new requestOptions instance with the provided options.
|
||||
//
|
||||
// Parameters:
|
||||
// - opts: Optional request options.
|
||||
//
|
||||
// Returns:
|
||||
// - *requestOptions: A new instance of requestOptions with the provided options.
|
||||
func newRequestOptions(opts ...requestOption) *requestOptions {
|
||||
options := &requestOptions{retries: 1}
|
||||
for _, opt := range opts {
|
||||
opt(options)
|
||||
}
|
||||
return options
|
||||
}
|
||||
|
||||
// withRetries sets the number of retries for a request.
|
||||
//
|
||||
// Parameters:
|
||||
// - retries: The number of retries to be performed.
|
||||
//
|
||||
// Returns:
|
||||
// - requestOption: A functional option that sets the number of retries for a request.
|
||||
func withRetries(retries int) requestOption {
|
||||
return func(opts *requestOptions) {
|
||||
opts.retries = retries
|
||||
}
|
||||
}
|
||||
|
||||
// withBackoff sets the backoff interval for a request.
|
||||
//
|
||||
// Parameters:
|
||||
// - backoff: The backoff interval (in milliseconds) to be used for retries.
|
||||
//
|
||||
// Returns:
|
||||
// - requestOption: A functional option that sets the backoff interval for a request.
|
||||
func withBackoff(backoff int) requestOption {
|
||||
return func(opts *requestOptions) {
|
||||
opts.backoff = backoff
|
||||
}
|
||||
}
|
||||
|
||||
// FirecrawlApp represents a client for the Firecrawl API.
|
||||
type FirecrawlApp struct {
|
||||
APIKey string
|
||||
APIURL string
|
||||
Client *http.Client
|
||||
}
|
||||
|
||||
// NewFirecrawlApp creates a new instance of FirecrawlApp with the provided API key and API URL.
|
||||
// If the API key or API URL is not provided, it attempts to retrieve them from environment variables.
|
||||
// If the API key is still not found, it returns an error.
|
||||
//
|
||||
// Parameters:
|
||||
// - apiKey: The API key for authenticating with the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_KEY environment variable.
|
||||
// - apiURL: The base URL for the Firecrawl API. If empty, it will be retrieved from the FIRECRAWL_API_URL environment variable, defaulting to "https://api.firecrawl.dev".
|
||||
//
|
||||
// Returns:
|
||||
// - *FirecrawlApp: A new instance of FirecrawlApp configured with the provided or retrieved API key and API URL.
|
||||
// - error: An error if the API key is not provided or retrieved.
|
||||
func NewFirecrawlApp(apiKey, apiURL string) (*FirecrawlApp, error) {
|
||||
if apiKey == "" {
|
||||
apiKey = os.Getenv("FIRECRAWL_API_KEY")
|
||||
if apiKey == "" {
|
||||
return nil, fmt.Errorf("no API key provided")
|
||||
}
|
||||
}
|
||||
|
||||
if apiURL == "" {
|
||||
apiURL = os.Getenv("FIRECRAWL_API_URL")
|
||||
if apiURL == "" {
|
||||
apiURL = "https://api.firecrawl.dev"
|
||||
}
|
||||
}
|
||||
|
||||
client := &http.Client{
|
||||
Timeout: 60 * time.Second,
|
||||
}
|
||||
|
||||
return &FirecrawlApp{
|
||||
APIKey: apiKey,
|
||||
APIURL: apiURL,
|
||||
Client: client,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// ScrapeURL scrapes the content of the specified URL using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - url: The URL to be scraped.
|
||||
// - params: Optional parameters for the scrape request, including extractor options for LLM extraction.
|
||||
//
|
||||
// Returns:
|
||||
// - *FirecrawlDocument: The scraped document data.
|
||||
// - error: An error if the scrape request fails.
|
||||
func (app *FirecrawlApp) ScrapeURL(url string, params map[string]any) (*FirecrawlDocument, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
scrapeBody := map[string]any{"url": url}
|
||||
|
||||
if params != nil {
|
||||
if extractorOptions, ok := params["extractorOptions"].(ExtractorOptions); ok {
|
||||
if schema, ok := extractorOptions.ExtractionSchema.(interface{ schema() any }); ok {
|
||||
extractorOptions.ExtractionSchema = schema.schema()
|
||||
}
|
||||
if extractorOptions.Mode == "" {
|
||||
extractorOptions.Mode = "llm-extraction"
|
||||
}
|
||||
scrapeBody["extractorOptions"] = extractorOptions
|
||||
}
|
||||
|
||||
for key, value := range params {
|
||||
if key != "extractorOptions" {
|
||||
scrapeBody[key] = value
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/scrape", app.APIURL),
|
||||
scrapeBody,
|
||||
headers,
|
||||
"scrape URL",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var scrapeResponse ScrapeResponse
|
||||
err = json.Unmarshal(resp, &scrapeResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if scrapeResponse.Success {
|
||||
return scrapeResponse.Data, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("failed to scrape URL")
|
||||
}
|
||||
|
||||
// Search performs a search query using the Firecrawl API and returns the search results.
|
||||
//
|
||||
// Parameters:
|
||||
// - query: The search query string.
|
||||
// - params: Optional parameters for the search request.
|
||||
//
|
||||
// Returns:
|
||||
// - []*FirecrawlDocument: A slice of FirecrawlDocument containing the search results.
|
||||
// - error: An error if the search request fails.
|
||||
func (app *FirecrawlApp) Search(query string, params map[string]any) ([]*FirecrawlDocument, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
searchBody := map[string]any{"query": query}
|
||||
for k, v := range params {
|
||||
searchBody[k] = v
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/search", app.APIURL),
|
||||
searchBody,
|
||||
headers,
|
||||
"search",
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var searchResponse SearchResponse
|
||||
err = json.Unmarshal(resp, &searchResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if searchResponse.Success {
|
||||
return searchResponse.Data, nil
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("failed to search")
|
||||
}
|
||||
|
||||
// CrawlURL starts a crawl job for the specified URL using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - url: The URL to crawl.
|
||||
// - params: Optional parameters for the crawl request.
|
||||
// - waitUntilDone: If true, the method will wait until the crawl job is completed before returning.
|
||||
// - pollInterval: The interval (in seconds) at which to poll the job status if waitUntilDone is true.
|
||||
// - idempotencyKey: An optional idempotency key to ensure the request is idempotent.
|
||||
//
|
||||
// Returns:
|
||||
// - any: The job ID if waitUntilDone is false, or the crawl result if waitUntilDone is true.
|
||||
// - error: An error if the crawl request fails.
|
||||
func (app *FirecrawlApp) CrawlURL(url string, params map[string]any, waitUntilDone bool, pollInterval int, idempotencyKey string) (any, error) {
|
||||
headers := app.prepareHeaders(idempotencyKey)
|
||||
crawlBody := map[string]any{"url": url}
|
||||
for k, v := range params {
|
||||
crawlBody[k] = v
|
||||
}
|
||||
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodPost,
|
||||
fmt.Sprintf("%s/v0/crawl", app.APIURL),
|
||||
crawlBody,
|
||||
headers,
|
||||
"start crawl job",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var crawlResponse CrawlResponse
|
||||
err = json.Unmarshal(resp, &crawlResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
if waitUntilDone {
|
||||
return app.monitorJobStatus(crawlResponse.JobID, headers, pollInterval)
|
||||
}
|
||||
|
||||
if crawlResponse.JobID == "" {
|
||||
return nil, fmt.Errorf("failed to get job ID")
|
||||
}
|
||||
|
||||
return crawlResponse.JobID, nil
|
||||
}
|
||||
|
||||
// CheckCrawlStatus checks the status of a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to check.
|
||||
//
|
||||
// Returns:
|
||||
// - *JobStatusResponse: The status of the crawl job.
|
||||
// - error: An error if the crawl status check request fails.
|
||||
func (app *FirecrawlApp) CheckCrawlStatus(jobID string) (*JobStatusResponse, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodGet,
|
||||
fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"check crawl status",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var jobStatusResponse JobStatusResponse
|
||||
err = json.Unmarshal(resp, &jobStatusResponse)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return &jobStatusResponse, nil
|
||||
}
|
||||
|
||||
// CancelCrawlJob cancels a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to cancel.
|
||||
//
|
||||
// Returns:
|
||||
// - string: The status of the crawl job after cancellation.
|
||||
// - error: An error if the crawl job cancellation request fails.
|
||||
func (app *FirecrawlApp) CancelCrawlJob(jobID string) (string, error) {
|
||||
headers := app.prepareHeaders("")
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodDelete,
|
||||
fmt.Sprintf("%s/v0/crawl/cancel/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"cancel crawl job",
|
||||
)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
var cancelCrawlJobResponse CancelCrawlJobResponse
|
||||
err = json.Unmarshal(resp, &cancelCrawlJobResponse)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return cancelCrawlJobResponse.Status, nil
|
||||
}
|
||||
|
||||
// prepareHeaders prepares the headers for an HTTP request.
|
||||
//
|
||||
// Parameters:
|
||||
// - idempotencyKey: A string representing the idempotency key to be included in the headers.
|
||||
// If the idempotency key is an empty string, it will not be included in the headers.
|
||||
//
|
||||
// Returns:
|
||||
// - map[string]string: A map containing the headers for the HTTP request.
|
||||
func (app *FirecrawlApp) prepareHeaders(idempotencyKey string) map[string]string {
|
||||
headers := map[string]string{
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": fmt.Sprintf("Bearer %s", app.APIKey),
|
||||
}
|
||||
if idempotencyKey != "" {
|
||||
headers["x-idempotency-key"] = idempotencyKey
|
||||
}
|
||||
return headers
|
||||
}
|
||||
|
||||
// makeRequest makes a request to the specified URL with the provided method, data, headers, and options.
|
||||
//
|
||||
// Parameters:
|
||||
// - method: The HTTP method to use for the request (e.g., "GET", "POST", "DELETE").
|
||||
// - url: The URL to send the request to.
|
||||
// - data: The data to be sent in the request body.
|
||||
// - headers: The headers to be included in the request.
|
||||
// - action: A string describing the action being performed.
|
||||
// - opts: Optional request options.
|
||||
//
|
||||
// Returns:
|
||||
// - []byte: The response body from the request.
|
||||
// - error: An error if the request fails.
|
||||
func (app *FirecrawlApp) makeRequest(method, url string, data map[string]any, headers map[string]string, action string, opts ...requestOption) ([]byte, error) {
|
||||
var body []byte
|
||||
var err error
|
||||
if data != nil {
|
||||
body, err = json.Marshal(data)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
req, err := http.NewRequest(method, url, bytes.NewBuffer(body))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
for key, value := range headers {
|
||||
req.Header.Set(key, value)
|
||||
}
|
||||
|
||||
var resp *http.Response
|
||||
options := newRequestOptions(opts...)
|
||||
for i := 0; i < options.retries; i++ {
|
||||
resp, err = app.Client.Do(req)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != 502 {
|
||||
break
|
||||
}
|
||||
|
||||
time.Sleep(time.Duration(math.Pow(2, float64(i))) * time.Duration(options.backoff) * time.Millisecond)
|
||||
}
|
||||
|
||||
respBody, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
statusCode := resp.StatusCode
|
||||
if statusCode != 200 {
|
||||
return nil, app.handleError(statusCode, respBody, action)
|
||||
}
|
||||
|
||||
return respBody, nil
|
||||
}
|
||||
|
||||
// monitorJobStatus monitors the status of a crawl job using the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - jobID: The ID of the crawl job to monitor.
|
||||
// - headers: The headers to be included in the request.
|
||||
// - pollInterval: The interval (in seconds) at which to poll the job status.
|
||||
//
|
||||
// Returns:
|
||||
// - []*FirecrawlDocument: The crawl result if the job is completed.
|
||||
// - error: An error if the crawl status check request fails.
|
||||
func (app *FirecrawlApp) monitorJobStatus(jobID string, headers map[string]string, pollInterval int) ([]*FirecrawlDocument, error) {
|
||||
attempts := 0
|
||||
for {
|
||||
resp, err := app.makeRequest(
|
||||
http.MethodGet,
|
||||
fmt.Sprintf("%s/v0/crawl/status/%s", app.APIURL, jobID),
|
||||
nil,
|
||||
headers,
|
||||
"check crawl status",
|
||||
withRetries(3),
|
||||
withBackoff(500),
|
||||
)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
var statusData JobStatusResponse
|
||||
err = json.Unmarshal(resp, &statusData)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
status := statusData.Status
|
||||
if status == "" {
|
||||
return nil, fmt.Errorf("invalid status in response")
|
||||
}
|
||||
|
||||
if status == "completed" {
|
||||
if statusData.Data != nil {
|
||||
return statusData.Data, nil
|
||||
}
|
||||
attempts++
|
||||
if attempts > 3 {
|
||||
return nil, fmt.Errorf("crawl job completed but no data was returned")
|
||||
}
|
||||
} else if status == "active" || status == "paused" || status == "pending" || status == "queued" || status == "waiting" {
|
||||
pollInterval = max(pollInterval, 2)
|
||||
time.Sleep(time.Duration(pollInterval) * time.Second)
|
||||
} else {
|
||||
return nil, fmt.Errorf("crawl job failed or was stopped. Status: %s", status)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// handleError handles errors returned by the Firecrawl API.
|
||||
//
|
||||
// Parameters:
|
||||
// - resp: The HTTP response object.
|
||||
// - body: The response body from the HTTP response.
|
||||
// - action: A string describing the action being performed.
|
||||
//
|
||||
// Returns:
|
||||
// - error: An error describing the failure reason.
|
||||
func (app *FirecrawlApp) handleError(statusCode int, body []byte, action string) error {
|
||||
var errorData map[string]any
|
||||
err := json.Unmarshal(body, &errorData)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to parse error response: %v", err)
|
||||
}
|
||||
|
||||
errorMessage, _ := errorData["error"].(string)
|
||||
if errorMessage == "" {
|
||||
errorMessage = "No additional error details provided."
|
||||
}
|
||||
|
||||
var message string
|
||||
switch statusCode {
|
||||
case 402:
|
||||
message = fmt.Sprintf("Payment Required: Failed to %s. %s", action, errorMessage)
|
||||
case 408:
|
||||
message = fmt.Sprintf("Request Timeout: Failed to %s as the request timed out. %s", action, errorMessage)
|
||||
case 409:
|
||||
message = fmt.Sprintf("Conflict: Failed to %s due to a conflict. %s", action, errorMessage)
|
||||
case 500:
|
||||
message = fmt.Sprintf("Internal Server Error: Failed to %s. %s", action, errorMessage)
|
||||
default:
|
||||
message = fmt.Sprintf("Unexpected error during %s: Status code %d. %s", action, statusCode, errorMessage)
|
||||
}
|
||||
|
||||
return fmt.Errorf(message)
|
||||
}
|
@ -1,292 +0,0 @@
|
||||
package firecrawl
|
||||
|
||||
import (
|
||||
"log"
|
||||
"os"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/joho/godotenv"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
var API_URL string
|
||||
var TEST_API_KEY string
|
||||
|
||||
func init() {
|
||||
err := godotenv.Load("../.env")
|
||||
if err != nil {
|
||||
log.Fatalf("Error loading .env file: %v", err)
|
||||
}
|
||||
API_URL = os.Getenv("API_URL")
|
||||
TEST_API_KEY = os.Getenv("TEST_API_KEY")
|
||||
}
|
||||
|
||||
func TestNoAPIKey(t *testing.T) {
|
||||
_, err := NewFirecrawlApp("", API_URL)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "no API key provided")
|
||||
}
|
||||
|
||||
func TestScrapeURLInvalidAPIKey(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("invalid_api_key", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.ScrapeURL("https://firecrawl.dev", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token")
|
||||
}
|
||||
|
||||
func TestBlocklistedURL(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = app.ScrapeURL("https://facebook.com/fake-test", nil)
|
||||
assert.Error(t, err)
|
||||
assert.Contains(t, err.Error(), "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseWithValidPreviewToken(t *testing.T) {
|
||||
app, err := NewFirecrawlApp("this_is_just_a_preview_token", API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
}
|
||||
|
||||
func TestScrapeURLE2E(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
assert.NotEqual(t, response.Markdown, "")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
assert.Equal(t, response.HTML, "")
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseWithValidAPIKeyAndIncludeHTML(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
params := map[string]any{
|
||||
"pageOptions": map[string]any{
|
||||
"includeHtml": true,
|
||||
},
|
||||
}
|
||||
response, err := app.ScrapeURL("https://roastmywebsite.ai", params)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "_Roast_")
|
||||
assert.Contains(t, response.Markdown, "_Roast_")
|
||||
assert.Contains(t, response.HTML, "<h1")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseForValidScrapeWithPDFFile(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001.pdf", nil)
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
|
||||
assert.NotNil(t, response.Metadata)
|
||||
}
|
||||
|
||||
func TestSuccessfulResponseForValidScrapeWithPDFFileWithoutExplicitExtension(t *testing.T) {
|
||||
app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
|
||||
require.NoError(t, err)
|
||||
|
||||
response, err := app.ScrapeURL("https://arxiv.org/pdf/astro-ph/9301001", nil)
|
||||
time.Sleep(6 * time.Second) // wait for 6 seconds
|
||||
require.NoError(t, err)
|
||||
assert.NotNil(t, response)
|
||||
|
||||
	assert.Contains(t, response.Content, "We present spectrophotometric observations of the Broad Line Radio Galaxy")
	assert.NotNil(t, response.Metadata)
}

func TestCrawlURLInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token")
}

func TestShouldReturnErrorForBlocklistedURL(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	_, err = app.CrawlURL("https://twitter.com/fake-test", nil, false, 2, "")
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions.")
}

func TestCrawlURLWaitForCompletionE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}
	response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, "")
	require.NoError(t, err)
	assert.NotNil(t, response)

	data, ok := response.([]*FirecrawlDocument)
	assert.True(t, ok)
	assert.Greater(t, len(data), 0)
	assert.Contains(t, data[0].Content, "_Roast_")
}

func TestCrawlURLWithIdempotencyKeyE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	uniqueIdempotencyKey := uuid.New().String()
	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}
	response, err := app.CrawlURL("https://roastmywebsite.ai", params, true, 2, uniqueIdempotencyKey)
	require.NoError(t, err)
	assert.NotNil(t, response)

	data, ok := response.([]*FirecrawlDocument)
	assert.True(t, ok)
	assert.Greater(t, len(data), 0)
	assert.Contains(t, data[0].Content, "_Roast_")

	_, err = app.CrawlURL("https://firecrawl.dev", params, true, 2, uniqueIdempotencyKey)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used")
}

func TestCheckCrawlStatusE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	params := map[string]any{
		"crawlerOptions": map[string]any{
			"excludes": []string{"blog/*"},
		},
	}
	response, err := app.CrawlURL("https://firecrawl.dev", params, false, 2, "")
	require.NoError(t, err)
	assert.NotNil(t, response)

	jobID, ok := response.(string)
	assert.True(t, ok)
	assert.NotEqual(t, "", jobID)

	time.Sleep(30 * time.Second) // wait for 30 seconds

	statusResponse, err := app.CheckCrawlStatus(jobID)
	require.NoError(t, err)
	assert.NotNil(t, statusResponse)

	assert.Equal(t, "completed", statusResponse.Status)
	assert.Greater(t, len(statusResponse.Data), 0)
}

func TestSearchE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	response, err := app.Search("test query", nil)
	require.NoError(t, err)
	assert.NotNil(t, response)

	assert.Greater(t, len(response), 2)
	assert.NotEqual(t, response[0].Content, "")
}

func TestSearchInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = app.Search("test query", nil)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during search: Status code 401. Unauthorized: Invalid token")
}

func TestLLMExtraction(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	params := map[string]any{
		"extractorOptions": ExtractorOptions{
			Mode:             "llm-extraction",
			ExtractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
			ExtractionSchema: map[string]any{
				"type": "object",
				"properties": map[string]any{
					"company_mission": map[string]string{"type": "string"},
					"supports_sso":    map[string]string{"type": "boolean"},
					"is_open_source":  map[string]string{"type": "boolean"},
				},
				"required": []string{"company_mission", "supports_sso", "is_open_source"},
			},
		},
	}

	response, err := app.ScrapeURL("https://mendable.ai", params)
	require.NoError(t, err)
	assert.NotNil(t, response)

	assert.Contains(t, response.LLMExtraction, "company_mission")
	assert.IsType(t, true, response.LLMExtraction["supports_sso"])
	assert.IsType(t, true, response.LLMExtraction["is_open_source"])
}

func TestCancelCrawlJobInvalidAPIKey(t *testing.T) {
	app, err := NewFirecrawlApp("invalid_api_key", API_URL)
	require.NoError(t, err)

	_, err = app.CancelCrawlJob("test query")
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Unexpected error during cancel crawl job: Status code 401. Unauthorized: Invalid token")
}

func TestCancelNonExistingCrawlJob(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	jobID := uuid.New().String()
	_, err = app.CancelCrawlJob(jobID)
	assert.Error(t, err)
	assert.Contains(t, err.Error(), "Job not found")
}

func TestCancelCrawlJobE2E(t *testing.T) {
	app, err := NewFirecrawlApp(TEST_API_KEY, API_URL)
	require.NoError(t, err)

	response, err := app.CrawlURL("https://firecrawl.dev", nil, false, 2, "")
	require.NoError(t, err)
	assert.NotNil(t, response)

	jobID, ok := response.(string)
	assert.True(t, ok)
	assert.NotEqual(t, "", jobID)

	status, err := app.CancelCrawlJob(jobID)
	require.NoError(t, err)
	assert.Equal(t, "cancelled", status)
}
@ -1,15 +0,0 @@
module github.com/mendableai/firecrawl-go

go 1.22.5

require (
	github.com/google/uuid v1.6.0
	github.com/joho/godotenv v1.5.1
	github.com/stretchr/testify v1.9.0
)

require (
	github.com/davecgh/go-spew v1.1.1 // indirect
	github.com/pmezard/go-difflib v1.0.0 // indirect
	gopkg.in/yaml.v3 v3.0.1 // indirect
)
@ -1,14 +0,0 @@
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0=
github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/testify v1.9.0 h1:HtqpIVDClZ4nwg75+f6Lvsy/wHu+3BoSGCbBAcpTsTg=
github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
@ -1,16 +1,16 @@
import { v4 as uuidv4 } from 'uuid';
import FirecrawlApp from '@mendable/firecrawl-js';
import { z } from "zod";
import FirecrawlApp from './firecrawl/src/index'; //'@mendable/firecrawl-js';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

// Scrape a website:
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
console.log(scrapeResult.data.content)

if (scrapeResult.data) {
  console.log(scrapeResult.data.markdown)
}

// Crawl a website:
const idempotencyKey = uuidv4(); // optional
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
console.log(crawlResult)

const jobId = await crawlResult['jobId'];
@ -19,67 +19,15 @@ console.log(jobId);
let job;
while (true) {
  job = await app.checkCrawlStatus(jobId);
  if (job.status == 'completed') {
  if (job.status === 'completed') {
    break;
  }
  await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
}

console.log(job.data[0].content);

// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query)
console.log(searchResult)

// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5)
    .describe("Top 5 stories on Hacker News"),
});

let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: zodSchema },
});

console.log(llmExtractionResult.data.llm_extraction);

// Define schema to extract contents into using json schema
const jsonSchema = {
  "type": "object",
  "properties": {
    "top": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "title": {"type": "string"},
          "points": {"type": "number"},
          "by": {"type": "string"},
          "commentsURL": {"type": "string"}
        },
        "required": ["title", "points", "by", "commentsURL"]
      },
      "minItems": 5,
      "maxItems": 5,
      "description": "Top 5 stories on Hacker News"
    }
  },
  "required": ["top"]
if (job.data) {
  console.log(job.data[0].markdown);
}

llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: jsonSchema },
});

console.log(llmExtractionResult.data.llm_extraction);
const mapResult = await app.map('https://firecrawl.dev');
console.log(mapResult)
@ -1,25 +1,25 @@
import FirecrawlApp, { JobStatusResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js';
import { z } from "zod";
import FirecrawlApp, { ScrapeResponse } from './firecrawl/src/index' //'@mendable/firecrawl-js';
import { CrawlStatusResponse } from './firecrawl/src/index';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

// Scrape a website:
const scrapeResult = await app.scrapeUrl('firecrawl.dev');

if (scrapeResult.data) {
  console.log(scrapeResult.data.content)
if (scrapeResult) {
  console.log(scrapeResult.markdown)
}

// Crawl a website:
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludePaths: ['blog/*'], limit: 5}}, false);
console.log(crawlResult)

const jobId: string = await crawlResult['jobId'];
console.log(jobId);

let job: JobStatusResponse;
let job: CrawlStatusResponse;
while (true) {
  job = await app.checkCrawlStatus(jobId);
  job = await app.checkCrawlStatus(jobId) as CrawlStatusResponse;
  if (job.status === 'completed') {
    break;
  }
@ -27,66 +27,8 @@ while (true) {
}

if (job.data) {
  console.log(job.data[0].content);
}

// Search for a query:
const query = 'what is mendable?'
const searchResult = await app.search(query)

// LLM Extraction:
// Define schema to extract contents into using zod schema
const zodSchema = z.object({
  top: z
    .array(
      z.object({
        title: z.string(),
        points: z.number(),
        by: z.string(),
        commentsURL: z.string(),
      })
    )
    .length(5)
    .describe("Top 5 stories on Hacker News"),
});

let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: zodSchema },
});

if (llmExtractionResult.data) {
  console.log(llmExtractionResult.data.llm_extraction);
}

// Define schema to extract contents into using json schema
const jsonSchema = {
  "type": "object",
  "properties": {
    "top": {
      "type": "array",
      "items": {
        "type": "object",
        "properties": {
          "title": {"type": "string"},
          "points": {"type": "number"},
          "by": {"type": "string"},
          "commentsURL": {"type": "string"}
        },
        "required": ["title", "points", "by", "commentsURL"]
      },
      "minItems": 5,
      "maxItems": 5,
      "description": "Top 5 stories on Hacker News"
    }
  },
  "required": ["top"]
}

llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
  extractorOptions: { extractionSchema: jsonSchema },
});

if (llmExtractionResult.data) {
  console.log(llmExtractionResult.data.llm_extraction);
  console.log(job.data[0].markdown);
}

const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
85
apps/js-sdk/exampleV0.js
Normal file
85
apps/js-sdk/exampleV0.js
Normal file
@ -0,0 +1,85 @@
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import FirecrawlApp from '@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
console.log(scrapeResult.data.content)
|
||||
|
||||
// Crawl a website:
|
||||
const idempotencyKey = uuidv4(); // optional
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false, 2, idempotencyKey);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId);
|
||||
if (job.status == 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
console.log(job.data[0].content);
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query)
|
||||
console.log(searchResult)
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: zodSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
console.log(llmExtractionResult.data.llm_extraction);
|
93
apps/js-sdk/exampleV0.ts
Normal file
93
apps/js-sdk/exampleV0.ts
Normal file
@ -0,0 +1,93 @@
|
||||
import FirecrawlApp, { ScrapeResponseV0, CrawlStatusResponseV0, SearchResponseV0 } from './firecrawl/src/index' //'@mendable/firecrawl-js';
|
||||
import { z } from "zod";
|
||||
|
||||
const app = new FirecrawlApp<"v0">({apiKey: "fc-YOUR_API_KEY", version: "v0"})
|
||||
|
||||
// Scrape a website:
|
||||
const scrapeResult = await app.scrapeUrl('firecrawl.dev');
|
||||
|
||||
if (scrapeResult.data) {
|
||||
console.log(scrapeResult.data.content)
|
||||
}
|
||||
|
||||
// Crawl a website:
|
||||
const crawlResult = await app.crawlUrl('mendable.ai', {crawlerOptions: {excludes: ['blog/*'], limit: 5}}, false);
|
||||
console.log(crawlResult)
|
||||
|
||||
const jobId: string = await crawlResult['jobId'];
|
||||
console.log(jobId);
|
||||
|
||||
let job: CrawlStatusResponseV0;
|
||||
while (true) {
|
||||
job = await app.checkCrawlStatus(jobId) as CrawlStatusResponseV0;
|
||||
if (job.status === 'completed') {
|
||||
break;
|
||||
}
|
||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||
}
|
||||
|
||||
if (job.data) {
|
||||
console.log(job.data[0].content);
|
||||
}
|
||||
|
||||
// Search for a query:
|
||||
const query = 'what is mendable?'
|
||||
const searchResult = await app.search(query) as SearchResponseV0;
|
||||
if (searchResult.data) {
|
||||
console.log(searchResult.data[0].content)
|
||||
}
|
||||
|
||||
// LLM Extraction:
|
||||
// Define schema to extract contents into using zod schema
|
||||
const zodSchema = z.object({
|
||||
top: z
|
||||
.array(
|
||||
z.object({
|
||||
title: z.string(),
|
||||
points: z.number(),
|
||||
by: z.string(),
|
||||
commentsURL: z.string(),
|
||||
})
|
||||
)
|
||||
.length(5)
|
||||
.describe("Top 5 stories on Hacker News"),
|
||||
});
|
||||
|
||||
let llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com");
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data[0].llm_extraction);
|
||||
}
|
||||
|
||||
// Define schema to extract contents into using json schema
|
||||
const jsonSchema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llmExtractionResult = await app.scrapeUrl("https://news.ycombinator.com", {
|
||||
extractorOptions: { extractionSchema: jsonSchema },
|
||||
});
|
||||
|
||||
if (llmExtractionResult.data) {
|
||||
console.log(llmExtractionResult.data[0].llm_extraction);
|
||||
}
|
||||
|
@ -1,6 +1,6 @@
{
  "name": "@mendable/firecrawl-js",
  "version": "0.0.36",
  "version": "1.0.0",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "build/cjs/index.js",
  "types": "types/index.d.ts",
@ -19,7 +19,7 @@
    "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json",
    "build-and-publish": "npm run build && npm publish --access public",
    "publish-beta": "npm run build && npm publish --access public --tag beta",
    "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts"
    "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/v1/**/*.test.ts"
  },
  "repository": {
    "type": "git",
@ -1,160 +1,331 @@
|
||||
import FirecrawlApp from '../../index';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import dotenv from 'dotenv';
|
||||
import { describe, test, expect } from '@jest/globals';
|
||||
import FirecrawlApp, {
|
||||
CrawlResponseV0,
|
||||
CrawlStatusResponse,
|
||||
CrawlStatusResponseV0,
|
||||
FirecrawlDocumentV0,
|
||||
ScrapeResponseV0,
|
||||
SearchResponseV0,
|
||||
} from "../../index";
|
||||
import { v4 as uuidv4 } from "uuid";
|
||||
import dotenv from "dotenv";
|
||||
import { describe, test, expect } from "@jest/globals";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const TEST_API_KEY = process.env.TEST_API_KEY;
|
||||
const API_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe('FirecrawlApp E2E Tests', () => {
|
||||
test.concurrent('should throw error for no API key', async () => {
|
||||
describe('FirecrawlApp<"v0"> E2E Tests', () => {
|
||||
test.concurrent("should throw error for no API key", async () => {
|
||||
expect(() => {
|
||||
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
|
||||
new FirecrawlApp<"v0">({ apiKey: null, apiUrl: API_URL, version: "v0" });
|
||||
}).toThrow("No API key provided");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for invalid API key on scrape', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data).toHaveProperty('markdown');
|
||||
expect(response.data).toHaveProperty('metadata');
|
||||
expect(response.data).not.toHaveProperty('html');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response with valid API key and include HTML', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai', { pageOptions: { includeHtml: true } });
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data?.markdown).toContain("_Roast_");
|
||||
expect(response.data?.html).toContain("<h1");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001');
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on crawl', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 30);
|
||||
expect(response).not.toBeNull();
|
||||
expect(response[0].content).toContain("_Roast_");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should handle idempotency key for crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false, 2, uniqueIdempotencyKey);
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
await expect(app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
||||
});
|
||||
|
||||
test.concurrent('should check crawl status', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', { crawlerOptions: { excludes: ['blog/*'] } }, false);
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
let statusResponse = await app.checkCrawlStatus(response.jobId);
|
||||
const maxChecks = 15;
|
||||
let checks = 0;
|
||||
|
||||
while (statusResponse.status === 'active' && checks < maxChecks) {
|
||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||
expect(statusResponse.partial_data).not.toBeNull();
|
||||
expect(statusResponse.current).toBeGreaterThanOrEqual(1);
|
||||
statusResponse = await app.checkCrawlStatus(response.jobId);
|
||||
checks++;
|
||||
test.concurrent(
|
||||
"should throw error for invalid API key on scrape",
|
||||
async () => {
|
||||
const invalidApp = new FirecrawlApp<"v0">({
|
||||
apiKey: "invalid_api_key",
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
await expect(
|
||||
invalidApp.scrapeUrl("https://roastmywebsite.ai")
|
||||
).rejects.toThrow("Request failed with status code 401");
|
||||
}
|
||||
);
|
||||
|
||||
expect(statusResponse).not.toBeNull();
|
||||
expect(statusResponse.success).toBe(true);
|
||||
expect(statusResponse.status).toBe('completed');
|
||||
expect(statusResponse.total).toEqual(statusResponse.current);
|
||||
expect(statusResponse.current_step).not.toBeNull();
|
||||
expect(statusResponse?.data?.length).toBeGreaterThan(0);
|
||||
}, 35000); // 35 seconds timeout
|
||||
test.concurrent(
|
||||
"should throw error for blocklisted URL on scrape",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow(
|
||||
"Request failed with status code 403"
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
test.concurrent('should return successful response for search', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.search("test query");
|
||||
test.concurrent(
|
||||
"should return successful response with valid preview token",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: "this_is_just_a_preview_token",
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.scrapeUrl(
|
||||
"https://roastmywebsite.ai"
|
||||
)) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
test.concurrent(
|
||||
"should return successful response for valid scrape",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.scrapeUrl(
|
||||
"https://roastmywebsite.ai"
|
||||
)) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data).toHaveProperty("markdown");
|
||||
expect(response.data).toHaveProperty("metadata");
|
||||
expect(response.data).not.toHaveProperty("html");
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
test.concurrent(
|
||||
"should return successful response with valid API key and include HTML",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.scrapeUrl("https://roastmywebsite.ai", {
|
||||
pageOptions: { includeHtml: true },
|
||||
})) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain("_Roast_");
|
||||
expect(response.data?.markdown).toContain("_Roast_");
|
||||
expect(response.data?.html).toContain("<h1");
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
test.concurrent(
|
||||
"should return successful response for valid scrape with PDF file",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.scrapeUrl(
|
||||
"https://arxiv.org/pdf/astro-ph/9301001.pdf"
|
||||
)) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain(
|
||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
||||
);
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
test.concurrent(
|
||||
"should return successful response for valid scrape with PDF file without explicit extension",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.scrapeUrl(
|
||||
"https://arxiv.org/pdf/astro-ph/9301001"
|
||||
)) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.content).toContain(
|
||||
"We present spectrophotometric observations of the Broad Line Radio Galaxy"
|
||||
);
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
test.concurrent(
|
||||
"should throw error for invalid API key on crawl",
|
||||
async () => {
|
||||
const invalidApp = new FirecrawlApp<"v0">({
|
||||
apiKey: "invalid_api_key",
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
await expect(
|
||||
invalidApp.crawlUrl("https://roastmywebsite.ai")
|
||||
).rejects.toThrow("Request failed with status code 401");
|
||||
}
|
||||
);
|
||||
|
||||
test.concurrent(
|
||||
"should throw error for blocklisted URL on crawl",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow(
|
||||
"Request failed with status code 403"
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
test.concurrent(
|
||||
"should return successful response for crawl and wait for completion",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.crawlUrl(
|
||||
"https://roastmywebsite.ai",
|
||||
{ crawlerOptions: { excludes: ["blog/*"] } },
|
||||
true,
|
||||
10
|
||||
)) as FirecrawlDocumentV0[];
|
||||
expect(response).not.toBeNull();
|
||||
console.log({ response });
|
||||
expect(response[0].content).toContain("_Roast_");
|
||||
},
|
||||
60000
|
||||
); // 60 seconds timeout
|
||||
|
||||
test.concurrent("should handle idempotency key for crawl", async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
const response = (await app.crawlUrl(
|
||||
"https://roastmywebsite.ai",
|
||||
{ crawlerOptions: { excludes: ["blog/*"] } },
|
||||
false,
|
||||
2,
|
||||
uniqueIdempotencyKey
|
||||
)) as CrawlResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.data?.[0]?.content).toBeDefined();
|
||||
expect(response?.data?.length).toBeGreaterThan(2);
|
||||
}, 30000); // 30 seconds timeout
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
test.concurrent('should throw error for invalid API key on search', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.search("test query")).rejects.toThrow("Request failed with status code 401");
|
||||
await expect(
|
||||
app.crawlUrl(
|
||||
"https://roastmywebsite.ai",
|
||||
{ crawlerOptions: { excludes: ["blog/*"] } },
|
||||
true,
|
||||
2,
|
||||
uniqueIdempotencyKey
|
||||
)
|
||||
).rejects.toThrow("Request failed with status code 409");
|
||||
});
|
||||
|
||||
test.concurrent('should perform LLM extraction', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl("https://mendable.ai", {
|
||||
extractorOptions: {
|
||||
mode: 'llm-extraction',
|
||||
extractionPrompt: "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
extractionSchema: {
|
||||
type: 'object',
|
||||
properties: {
|
||||
company_mission: { type: 'string' },
|
||||
supports_sso: { type: 'boolean' },
|
||||
is_open_source: { type: 'boolean' }
|
||||
},
|
||||
required: ['company_mission', 'supports_sso', 'is_open_source']
|
||||
}
|
||||
test.concurrent(
|
||||
"should check crawl status",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response: any = (await app.crawlUrl(
|
||||
"https://roastmywebsite.ai",
|
||||
{ crawlerOptions: { excludes: ["blog/*"] } },
|
||||
false
|
||||
)) as CrawlResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.jobId).toBeDefined();
|
||||
|
||||
let statusResponse = await app.checkCrawlStatus(response.jobId);
|
||||
const maxChecks = 15;
|
||||
let checks = 0;
|
||||
|
||||
while (statusResponse.status === "active" && checks < maxChecks) {
|
||||
await new Promise((resolve) => setTimeout(resolve, 5000));
|
||||
expect(statusResponse.partial_data).not.toBeNull();
|
||||
// expect(statusResponse.current).toBeGreaterThanOrEqual(1);
|
||||
statusResponse = (await app.checkCrawlStatus(
|
||||
response.jobId
|
||||
)) as CrawlStatusResponseV0;
|
||||
checks++;
|
||||
}
|
||||
});
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.llm_extraction).toBeDefined();
|
||||
const llmExtraction = response.data?.llm_extraction;
|
||||
expect(llmExtraction?.company_mission).toBeDefined();
|
||||
expect(typeof llmExtraction?.supports_sso).toBe('boolean');
|
||||
expect(typeof llmExtraction?.is_open_source).toBe('boolean');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
expect(statusResponse).not.toBeNull();
|
||||
expect(statusResponse.success).toBe(true);
|
||||
expect(statusResponse.status).toBe("completed");
|
||||
expect(statusResponse.total).toEqual(statusResponse.current);
|
||||
expect(statusResponse.current_step).not.toBeNull();
|
||||
expect(statusResponse.current).toBeGreaterThanOrEqual(1);
|
||||
|
||||
expect(statusResponse?.data?.length).toBeGreaterThan(0);
|
||||
},
|
||||
35000
|
||||
); // 35 seconds timeout
|
||||
|
||||
test.concurrent(
|
||||
"should return successful response for search",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.search("test query")) as SearchResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.data?.[0]?.content).toBeDefined();
|
||||
expect(response?.data?.length).toBeGreaterThan(2);
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
|
||||
test.concurrent(
|
||||
"should throw error for invalid API key on search",
|
||||
async () => {
|
||||
const invalidApp = new FirecrawlApp<"v0">({
|
||||
apiKey: "invalid_api_key",
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
await expect(invalidApp.search("test query")).rejects.toThrow(
|
||||
"Request failed with status code 401"
|
||||
);
|
||||
}
|
||||
);
|
||||
|
||||
test.concurrent(
|
||||
"should perform LLM extraction",
|
||||
async () => {
|
||||
const app = new FirecrawlApp<"v0">({
|
||||
apiKey: TEST_API_KEY,
|
||||
apiUrl: API_URL,
|
||||
version: "v0",
|
||||
});
|
||||
const response = (await app.scrapeUrl("https://mendable.ai", {
|
||||
extractorOptions: {
|
||||
mode: "llm-extraction",
|
||||
extractionPrompt:
|
||||
"Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
extractionSchema: {
|
||||
type: "object",
|
||||
properties: {
|
||||
company_mission: { type: "string" },
|
||||
supports_sso: { type: "boolean" },
|
||||
is_open_source: { type: "boolean" },
|
||||
},
|
||||
required: ["company_mission", "supports_sso", "is_open_source"],
|
||||
},
|
||||
},
|
||||
})) as ScrapeResponseV0;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.data?.llm_extraction).toBeDefined();
|
||||
const llmExtraction = response.data?.llm_extraction;
|
||||
expect(llmExtraction?.company_mission).toBeDefined();
|
||||
expect(typeof llmExtraction?.supports_sso).toBe("boolean");
|
||||
expect(typeof llmExtraction?.is_open_source).toBe("boolean");
|
||||
},
|
||||
30000
|
||||
); // 30 seconds timeout
|
||||
});
|
||||
|
@ -31,7 +31,7 @@ describe('the firecrawl JS SDK', () => {
  });

  const apiKey = 'YOUR_API_KEY'
  const app = new FirecrawlApp({ apiKey });
  const app = new FirecrawlApp<"v0">({ apiKey });
  // Scrape a single URL
  const url = 'https://mendable.ai';
  const scrapedData = await app.scrapeUrl(url);
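For context on the `FirecrawlApp<"v0">` generic used above, a minimal TypeScript sketch of how the two client flavours differ, assuming the `ScrapeResponse` and `ScrapeResponseV0` types exported by the updated SDK as shown in these tests:

```ts
import FirecrawlApp, { ScrapeResponse, ScrapeResponseV0 } from '@mendable/firecrawl-js';

// Default client targets v1: scrape results expose `markdown`, `html`, `metadata`, ...
const v1App = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });
const v1Result = await v1App.scrapeUrl('https://firecrawl.dev') as ScrapeResponse;
console.log(v1Result.markdown);

// Opting into v0 via the generic plus `version` keeps the old shape:
// results are wrapped in `data` and expose `content`.
const v0App = new FirecrawlApp<"v0">({ apiKey: "fc-YOUR_API_KEY", version: "v0" });
const v0Result = await v0App.scrapeUrl('https://firecrawl.dev') as ScrapeResponseV0;
console.log(v0Result.data?.content);
```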
@ -0,0 +1,312 @@
|
||||
import FirecrawlApp, { CrawlParams, CrawlResponse, CrawlStatusResponse, MapResponse, ScrapeParams, ScrapeResponse } from '../../../index';
|
||||
import { v4 as uuidv4 } from 'uuid';
|
||||
import dotenv from 'dotenv';
|
||||
import { describe, test, expect } from '@jest/globals';
|
||||
|
||||
dotenv.config();
|
||||
|
||||
const TEST_API_KEY = process.env.TEST_API_KEY;
|
||||
const API_URL = "http://127.0.0.1:3002";
|
||||
|
||||
describe('FirecrawlApp E2E Tests', () => {
|
||||
test.concurrent('should throw error for no API key', async () => {
|
||||
expect(() => {
|
||||
new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
|
||||
}).toThrow("No API key provided");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for invalid API key on scrape', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.markdown).toContain("_Roast_");
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://roastmywebsite.ai') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).not.toHaveProperty('content'); // v0
|
||||
expect(response).not.toHaveProperty('html');
|
||||
expect(response).not.toHaveProperty('rawHtml');
|
||||
expect(response).not.toHaveProperty('screenshot');
|
||||
expect(response).not.toHaveProperty('links');
|
||||
|
||||
expect(response).toHaveProperty('markdown');
|
||||
expect(response).toHaveProperty('metadata');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response with valid API key and options', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl(
|
||||
'https://roastmywebsite.ai', {
|
||||
formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
|
||||
headers: { "x-key": "test" },
|
||||
includeTags: ['h1'],
|
||||
excludeTags: ['h2'],
|
||||
onlyMainContent: true,
|
||||
timeout: 30000,
|
||||
waitFor: 1000
|
||||
}) as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).not.toHaveProperty('content'); // v0
|
||||
expect(response.markdown).toContain("_Roast_");
|
||||
expect(response.html).toContain("<h1");
|
||||
expect(response.rawHtml).toContain("<h1");
|
||||
expect(response.screenshot).not.toBeUndefined();
|
||||
expect(response.screenshot).not.toBeNull();
|
||||
expect(response.screenshot).toContain("https://");
|
||||
expect(response.links).not.toBeNull();
|
||||
expect(response.links?.length).toBeGreaterThan(0);
|
||||
expect(response.links?.[0]).toContain("https://");
|
||||
expect(response.metadata).not.toBeNull();
|
||||
expect(response.metadata).toHaveProperty("title");
|
||||
expect(response.metadata).toHaveProperty("description");
|
||||
expect(response.metadata).toHaveProperty("keywords");
|
||||
expect(response.metadata).toHaveProperty("robots");
|
||||
expect(response.metadata).toHaveProperty("ogTitle");
|
||||
expect(response.metadata).toHaveProperty("ogDescription");
|
||||
expect(response.metadata).toHaveProperty("ogUrl");
|
||||
expect(response.metadata).toHaveProperty("ogImage");
|
||||
expect(response.metadata).toHaveProperty("ogLocaleAlternate");
|
||||
expect(response.metadata).toHaveProperty("ogSiteName");
|
||||
expect(response.metadata).toHaveProperty("sourceURL");
|
||||
expect(response.metadata).not.toHaveProperty("pageStatusCode");
|
||||
expect(response.metadata).toHaveProperty("statusCode");
|
||||
expect(response.metadata).not.toHaveProperty("pageError");
|
||||
expect(response.metadata.error).toBeUndefined();
|
||||
expect(response.metadata.title).toBe("Roast My Website");
|
||||
expect(response.metadata.description).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
||||
expect(response.metadata.keywords).toBe("Roast My Website,Roast,Website,GitHub,Firecrawl");
|
||||
expect(response.metadata.robots).toBe("follow, index");
|
||||
expect(response.metadata.ogTitle).toBe("Roast My Website");
|
||||
expect(response.metadata.ogDescription).toBe("Welcome to Roast My Website, the ultimate tool for putting your website through the wringer! This repository harnesses the power of Firecrawl to scrape and capture screenshots of websites, and then unleashes the latest LLM vision models to mercilessly roast them. 🌶️");
|
||||
expect(response.metadata.ogUrl).toBe("https://www.roastmywebsite.ai");
|
||||
expect(response.metadata.ogImage).toBe("https://www.roastmywebsite.ai/og.png");
|
||||
expect(response.metadata.ogLocaleAlternate).toStrictEqual([]);
|
||||
expect(response.metadata.ogSiteName).toBe("Roast My Website");
|
||||
expect(response.metadata.sourceURL).toBe("https://roastmywebsite.ai");
|
||||
expect(response.metadata.statusCode).toBe(200);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001.pdf') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid scrape with PDF file without explicit extension', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.scrapeUrl('https://arxiv.org/pdf/astro-ph/9301001') as ScrapeResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response?.markdown).toContain('We present spectrophotometric observations of the Broad Line Radio Galaxy');
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on crawl', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://twitter.com/fake-test";
|
||||
await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response for crawl and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, true, 30) as CrawlStatusResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).toHaveProperty("total");
|
||||
expect(response.total).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("creditsUsed");
|
||||
expect(response.creditsUsed).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("expiresAt");
|
||||
expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now());
|
||||
expect(response).toHaveProperty("status");
|
||||
expect(response.status).toBe("completed");
|
||||
expect(response).not.toHaveProperty("next"); // wait until done
|
||||
expect(response.data?.length).toBeGreaterThan(0);
|
||||
expect(response.data?.[0]).toHaveProperty("markdown");
|
||||
expect(response.data?.[0].markdown).toContain("_Roast_");
|
||||
expect(response.data?.[0]).not.toHaveProperty('content'); // v0
|
||||
expect(response.data?.[0]).not.toHaveProperty("html");
|
||||
expect(response.data?.[0]).not.toHaveProperty("rawHtml");
|
||||
expect(response.data?.[0]).not.toHaveProperty("screenshot");
|
||||
expect(response.data?.[0]).not.toHaveProperty("links");
|
||||
expect(response.data?.[0]).toHaveProperty("metadata");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("title");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("description");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("language");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
|
||||
expect(response.data?.[0].metadata).not.toHaveProperty("error");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for crawl with options and wait for completion', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', {
|
||||
excludePaths: ['blog/*'],
|
||||
includePaths: ['/'],
|
||||
maxDepth: 2,
|
||||
ignoreSitemap: true,
|
||||
limit: 10,
|
||||
allowBackwardLinks: true,
|
||||
allowExternalLinks: true,
|
||||
scrapeOptions: {
|
||||
formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'],
|
||||
headers: { "x-key": "test" },
|
||||
includeTags: ['h1'],
|
||||
excludeTags: ['h2'],
|
||||
onlyMainContent: true,
|
||||
waitFor: 1000
|
||||
}
|
||||
} as CrawlParams, true, 30) as CrawlStatusResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response).toHaveProperty("total");
|
||||
expect(response.total).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("creditsUsed");
|
||||
expect(response.creditsUsed).toBeGreaterThan(0);
|
||||
expect(response).toHaveProperty("expiresAt");
|
||||
expect(new Date(response.expiresAt).getTime()).toBeGreaterThan(Date.now());
|
||||
expect(response).toHaveProperty("status");
|
||||
expect(response.status).toBe("completed");
|
||||
expect(response).not.toHaveProperty("next");
|
||||
expect(response.data?.length).toBeGreaterThan(0);
|
||||
expect(response.data?.[0]).toHaveProperty("markdown");
|
||||
expect(response.data?.[0].markdown).toContain("_Roast_");
|
||||
expect(response.data?.[0]).not.toHaveProperty('content'); // v0
|
||||
expect(response.data?.[0]).toHaveProperty("html");
|
||||
expect(response.data?.[0].html).toContain("<h1");
|
||||
expect(response.data?.[0]).toHaveProperty("rawHtml");
|
||||
expect(response.data?.[0].rawHtml).toContain("<h1");
|
||||
expect(response.data?.[0]).toHaveProperty("screenshot");
|
||||
expect(response.data?.[0].screenshot).toContain("https://");
|
||||
expect(response.data?.[0]).toHaveProperty("links");
|
||||
expect(response.data?.[0].links).not.toBeNull();
|
||||
expect(response.data?.[0].links?.length).toBeGreaterThan(0);
|
||||
expect(response.data?.[0]).toHaveProperty("metadata");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("title");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("description");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("language");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("sourceURL");
|
||||
expect(response.data?.[0].metadata).toHaveProperty("statusCode");
|
||||
expect(response.data?.[0].metadata).not.toHaveProperty("error");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should handle idempotency key for crawl', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const uniqueIdempotencyKey = uuidv4();
|
||||
const response = await app.crawlUrl('https://roastmywebsite.ai', {}, false, 2, uniqueIdempotencyKey) as CrawlResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.id).toBeDefined();
|
||||
|
||||
await expect(app.crawlUrl('https://roastmywebsite.ai', {}, true, 2, uniqueIdempotencyKey)).rejects.toThrow("Request failed with status code 409");
|
||||
});
|
||||
|
||||
test.concurrent('should check crawl status', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links']}} as CrawlParams, false) as CrawlResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.id).toBeDefined();
|
||||
|
||||
let statusResponse = await app.checkCrawlStatus(response.id);
|
||||
const maxChecks = 15;
|
||||
let checks = 0;
|
||||
|
||||
while (statusResponse.status === 'scraping' && checks < maxChecks) {
|
||||
await new Promise(resolve => setTimeout(resolve, 5000));
|
||||
expect(statusResponse).not.toHaveProperty("partial_data"); // v0
|
||||
expect(statusResponse).not.toHaveProperty("current"); // v0
|
||||
expect(statusResponse).toHaveProperty("data");
|
||||
expect(statusResponse).toHaveProperty("total");
|
||||
expect(statusResponse).toHaveProperty("creditsUsed");
|
||||
expect(statusResponse).toHaveProperty("expiresAt");
|
||||
expect(statusResponse).toHaveProperty("status");
|
||||
expect(statusResponse).toHaveProperty("next");
|
||||
expect(statusResponse.total).toBeGreaterThan(0);
|
||||
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
||||
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
||||
expect(statusResponse.status).toBe("scraping");
|
||||
expect(statusResponse.next).toContain("/v1/crawl/");
|
||||
statusResponse = await app.checkCrawlStatus(response.id) as CrawlStatusResponse;
|
||||
checks++;
|
||||
}
|
||||
|
||||
expect(statusResponse).not.toBeNull();
|
||||
expect(statusResponse).toHaveProperty("total");
|
||||
expect(statusResponse.total).toBeGreaterThan(0);
|
||||
expect(statusResponse).toHaveProperty("creditsUsed");
|
||||
expect(statusResponse.creditsUsed).toBeGreaterThan(0);
|
||||
expect(statusResponse).toHaveProperty("expiresAt");
|
||||
expect(statusResponse.expiresAt.getTime()).toBeGreaterThan(Date.now());
|
||||
expect(statusResponse).toHaveProperty("status");
|
||||
expect(statusResponse.status).toBe("completed");
|
||||
expect(statusResponse.data?.length).toBeGreaterThan(0);
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("markdown");
|
||||
expect(statusResponse.data?.[0].markdown?.length).toBeGreaterThan(10);
|
||||
expect(statusResponse.data?.[0]).not.toHaveProperty('content'); // v0
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("html");
|
||||
expect(statusResponse.data?.[0].html).toContain("<div");
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("rawHtml");
|
||||
expect(statusResponse.data?.[0].rawHtml).toContain("<div");
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("screenshot");
|
||||
expect(statusResponse.data?.[0].screenshot).toContain("https://");
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("links");
|
||||
expect(statusResponse.data?.[0].links).not.toBeNull();
|
||||
expect(statusResponse.data?.[0].links?.length).toBeGreaterThan(0);
|
||||
expect(statusResponse.data?.[0]).toHaveProperty("metadata");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("title");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("description");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("language");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("sourceURL");
|
||||
expect(statusResponse.data?.[0].metadata).toHaveProperty("statusCode");
|
||||
expect(statusResponse.data?.[0].metadata).not.toHaveProperty("error");
|
||||
}, 60000); // 60 seconds timeout
|
||||
|
||||
test.concurrent('should throw error for invalid API key on map', async () => {
|
||||
const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
|
||||
await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
|
||||
});
|
||||
|
||||
test.concurrent('should throw error for blocklisted URL on map', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const blocklistedUrl = "https://facebook.com/fake-test";
|
||||
await expect(app.mapUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403");
|
||||
});
|
||||
|
||||
test.concurrent('should return successful response with valid preview token', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: "this_is_just_a_preview_token", apiUrl: API_URL });
|
||||
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
|
||||
expect(response).not.toBeNull();
|
||||
expect(response.links?.length).toBeGreaterThan(0);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test.concurrent('should return successful response for valid map', async () => {
|
||||
const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
|
||||
const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
|
||||
expect(response).not.toBeNull();
|
||||
|
||||
expect(response.links?.length).toBeGreaterThan(0);
|
||||
expect(response.links?.[0]).toContain("https://");
|
||||
const filteredLinks = response.links?.filter((link: string) => link.includes("roastmywebsite.ai"));
|
||||
expect(filteredLinks?.length).toBeGreaterThan(0);
|
||||
}, 30000); // 30 seconds timeout
|
||||
|
||||
test('should throw NotImplementedError for search on v1', async () => {
|
||||
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
|
||||
await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
|
||||
});
|
||||
});
|
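The v1 tests above exercise the async crawl-and-poll flow; here is a minimal sketch of that flow, assuming the `CrawlResponse` and `CrawlStatusResponse` types from the SDK changes below:

```ts
import FirecrawlApp, { CrawlResponse, CrawlStatusResponse } from '@mendable/firecrawl-js';

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

// Start an async crawl (waitUntilDone = false) and poll until it finishes.
const started = await app.crawlUrl('https://firecrawl.dev', { limit: 10 }, false) as CrawlResponse;

let status = await app.checkCrawlStatus(started.id) as CrawlStatusResponse;
while (status.status === 'scraping') {
  await new Promise((resolve) => setTimeout(resolve, 5000)); // poll every 5 seconds
  status = await app.checkCrawlStatus(started.id) as CrawlStatusResponse;
}

if (status.status === 'completed') {
  console.log(status.total, status.creditsUsed);
  console.log(status.data?.[0]?.markdown);
}
```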
@ -1,16 +1,22 @@
|
||||
import axios, { AxiosResponse, AxiosRequestHeaders } from "axios";
|
||||
import { z } from "zod";
|
||||
import { zodToJsonSchema } from "zod-to-json-schema";
|
||||
|
||||
/**
|
||||
* Configuration interface for FirecrawlApp.
|
||||
* @param apiKey - Optional API key for authentication.
|
||||
* @param apiUrl - Optional base URL of the API; defaults to 'https://api.firecrawl.dev'.
|
||||
* @param version - API version, either 'v0' or 'v1'.
|
||||
*/
|
||||
export interface FirecrawlAppConfig {
|
||||
apiKey?: string | null;
|
||||
apiUrl?: string | null;
|
||||
version?: "v0" | "v1";
|
||||
}
|
||||
|
||||
/**
|
||||
* Metadata for a Firecrawl document.
|
||||
* Includes various optional properties for document metadata.
|
||||
*/
|
||||
export interface FirecrawlDocumentMetadata {
|
||||
title?: string;
|
||||
@ -43,6 +49,17 @@ export interface FirecrawlDocumentMetadata {
|
||||
articleTag?: string;
|
||||
articleSection?: string;
|
||||
sourceURL?: string;
|
||||
statusCode?: number;
|
||||
error?: string;
|
||||
[key: string]: any; // Allows for additional metadata properties not explicitly defined.
|
||||
}
|
||||
|
||||
/**
|
||||
* Metadata for a Firecrawl document on v0.
|
||||
* Similar to FirecrawlDocumentMetadata but includes properties specific to API version v0.
|
||||
*/
|
||||
export interface FirecrawlDocumentMetadataV0 {
|
||||
// Similar properties as FirecrawlDocumentMetadata with additional v0 specific adjustments
|
||||
pageStatusCode?: number;
|
||||
pageError?: string;
|
||||
[key: string]: any;
|
||||
@ -50,8 +67,23 @@ export interface FirecrawlDocumentMetadata {
|
||||
|
||||
/**
|
||||
* Document interface for Firecrawl.
|
||||
* Represents a document retrieved or processed by Firecrawl.
|
||||
*/
|
||||
export interface FirecrawlDocument {
|
||||
url?: string;
|
||||
markdown?: string;
|
||||
html?: string;
|
||||
rawHtml?: string;
|
||||
links?: string[];
|
||||
screenshot?: string;
|
||||
metadata: FirecrawlDocumentMetadata;
|
||||
}
|
||||
|
||||
/**
|
||||
* Document interface for Firecrawl on v0.
|
||||
* Represents a document specifically for API version v0 with additional properties.
|
||||
*/
|
||||
export interface FirecrawlDocumentV0 {
|
||||
id?: string;
|
||||
url?: string;
|
||||
content: string;
|
||||
@ -61,79 +93,242 @@ export interface FirecrawlDocument {
|
||||
createdAt?: Date;
|
||||
updatedAt?: Date;
|
||||
type?: string;
|
||||
metadata: FirecrawlDocumentMetadata;
|
||||
metadata: FirecrawlDocumentMetadataV0;
|
||||
childrenLinks?: string[];
|
||||
provider?: string;
|
||||
warning?: string;
|
||||
|
||||
index?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for scraping operations.
|
||||
* Parameters for scraping operations.
|
||||
* Defines the options and configurations available for scraping web content.
|
||||
*/
|
||||
export interface ScrapeResponse {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocument;
|
||||
error?: string;
|
||||
export interface ScrapeParams {
|
||||
formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot")[];
|
||||
headers?: Record<string, string>;
|
||||
includeTags?: string[];
|
||||
excludeTags?: string[];
|
||||
onlyMainContent?: boolean;
|
||||
screenshotMode?: "desktop" | "full-desktop" | "mobile" | "full-mobile";
|
||||
waitFor?: number;
|
||||
timeout?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for searching operations.
|
||||
* Parameters for scraping operations on v0.
|
||||
* Includes page and extractor options specific to API version v0.
|
||||
*/
|
||||
export interface SearchResponse {
|
||||
export interface ScrapeParamsV0 {
|
||||
pageOptions?: {
|
||||
headers?: Record<string, string>;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
onlyIncludeTags?: string[];
|
||||
onlyMainContent?: boolean;
|
||||
removeTags?: string[];
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
waitFor?: number;
|
||||
};
|
||||
extractorOptions?: {
|
||||
mode?: "markdown" | "llm-extraction" | "llm-extraction-from-raw-html" | "llm-extraction-from-markdown";
|
||||
extractionPrompt?: string;
|
||||
extractionSchema?: Record<string, any> | z.ZodSchema | any;
|
||||
};
|
||||
timeout?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for scraping operations.
|
||||
* Defines the structure of the response received after a scraping operation.
|
||||
*/
|
||||
export interface ScrapeResponse extends FirecrawlDocument {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocument[];
|
||||
warning?: string;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for scraping operations on v0.
|
||||
* Similar to ScrapeResponse but tailored for responses from API version v0.
|
||||
*/
|
||||
export interface ScrapeResponseV0 {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocumentV0;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parameters for crawling operations.
|
||||
* Includes options for both scraping and mapping during a crawl.
|
||||
*/
|
||||
export interface CrawlParams {
|
||||
scrapeOptions?: ScrapeParams;
|
||||
crawlerOptions?: {
|
||||
includePaths?: string[]
|
||||
excludePaths?: string[]
|
||||
maxDepth?: number
|
||||
limit?: number
|
||||
allowBackwardLinks?: boolean
|
||||
allowExternalLinks?: boolean
|
||||
ignoreSitemap?: boolean
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Parameters for crawling operations on v0.
|
||||
* Tailored for API version v0, includes specific options for crawling.
|
||||
*/
|
||||
export interface CrawlParamsV0 {
|
||||
crawlerOptions?: {
|
||||
includes?: string[];
|
||||
excludes?: string[];
|
||||
generateImgAltText?: boolean;
|
||||
returnOnlyUrls?: boolean;
|
||||
maxDepth?: number;
|
||||
mode?: "default" | "fast";
|
||||
ignoreSitemap?: boolean;
|
||||
limit?: number;
|
||||
allowBackwardCrawling?: boolean;
|
||||
allowExternalContentLinks?: boolean;
|
||||
};
|
||||
pageOptions?: {
|
||||
headers?: Record<string, string>;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
onlyIncludeTags?: string[];
|
||||
onlyMainContent?: boolean;
|
||||
removeTags?: string[];
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
screenshot?: boolean;
|
||||
fullPageScreenshot?: boolean;
|
||||
waitFor?: number;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for crawling operations.
|
||||
* Defines the structure of the response received after initiating a crawl.
|
||||
*/
|
||||
export interface CrawlResponse {
|
||||
id?: string;
|
||||
url?: string;
|
||||
success: boolean;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for crawling operations on v0.
|
||||
* Similar to CrawlResponse but tailored for responses from API version v0.
|
||||
*/
|
||||
export interface CrawlResponseV0 {
|
||||
jobId?: string;
|
||||
success: boolean;
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for job status checks.
|
||||
* Provides detailed status of a crawl job including progress and results.
|
||||
*/
|
||||
export interface CrawlStatusResponse {
|
||||
success: boolean;
|
||||
total: number;
|
||||
completed: number;
|
||||
creditsUsed: number;
|
||||
expiresAt: Date;
|
||||
status: "scraping" | "completed" | "failed";
|
||||
next: string;
|
||||
data?: FirecrawlDocument[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for job status checks.
|
||||
* Response interface for job status checks on v0.
|
||||
* Tailored for API version v0, provides status and partial data of a crawl job.
|
||||
*/
|
||||
export interface JobStatusResponse {
|
||||
export interface CrawlStatusResponseV0 {
|
||||
success: boolean;
|
||||
status: string;
|
||||
current?: number;
|
||||
current_url?: string;
|
||||
current_step?: string;
|
||||
total?: number;
|
||||
jobId?: string;
|
||||
data?: FirecrawlDocument[];
|
||||
partial_data?: FirecrawlDocument[];
|
||||
data?: FirecrawlDocumentV0[];
|
||||
partial_data?: FirecrawlDocumentV0[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Generic parameter interface.
|
||||
* Parameters for mapping operations.
|
||||
* Defines options for mapping URLs during a crawl.
|
||||
*/
|
||||
export interface Params {
|
||||
[key: string]: any;
|
||||
extractorOptions?: {
|
||||
extractionSchema: z.ZodSchema | any;
|
||||
mode?: "llm-extraction";
|
||||
extractionPrompt?: string;
|
||||
export interface MapParams {
|
||||
includePaths?: string[]
|
||||
excludePaths?: string[]
|
||||
maxDepth?: number
|
||||
limit?: number
|
||||
allowBackwardLinks?: boolean
|
||||
allowExternalLinks?: boolean
|
||||
ignoreSitemap?: boolean
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for mapping operations.
|
||||
* Defines the structure of the response received after a mapping operation.
|
||||
*/
|
||||
export interface MapResponse {
|
||||
success: boolean;
|
||||
links?: string[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Parameters for searching operations on v0.
|
||||
* Tailored for API version v0, includes specific options for searching content.
|
||||
*/
|
||||
export interface SearchParamsV0 {
|
||||
pageOptions?: {
|
||||
onlyMainContent?: boolean;
|
||||
fetchPageContent?: boolean;
|
||||
includeHtml?: boolean;
|
||||
includeRawHtml?: boolean;
|
||||
};
|
||||
searchOptions?: {
|
||||
limit?: number;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Response interface for searching operations on v0.
|
||||
* Defines the structure of the response received after a search operation on v0.
|
||||
*/
|
||||
export interface SearchResponseV0 {
|
||||
success: boolean;
|
||||
data?: FirecrawlDocumentV0[];
|
||||
error?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Main class for interacting with the Firecrawl API.
|
||||
* Provides methods for scraping, searching, crawling, and mapping web content.
|
||||
*/
|
||||
export default class FirecrawlApp {
|
||||
export default class FirecrawlApp<T extends "v0" | "v1"> {
|
||||
private apiKey: string;
|
||||
private apiUrl: string;
|
||||
public version: T;
|
||||
|
||||
/**
|
||||
* Initializes a new instance of the FirecrawlApp class.
|
||||
* @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
|
||||
* @param config - Configuration options for the FirecrawlApp instance.
|
||||
*/
|
||||
constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
|
||||
constructor({ apiKey = null, apiUrl = null, version = "v1" }: FirecrawlAppConfig) {
|
||||
this.apiKey = apiKey || "";
|
||||
this.apiUrl = apiUrl || "https://api.firecrawl.dev";
|
||||
this.version = version as T;
|
||||
if (!this.apiKey) {
|
||||
throw new Error("No API key provided");
|
||||
}
|
||||
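For orientation, a minimal usage sketch of the versioned constructor above — assuming the SDK is consumed as `@mendable/firecrawl-js` (the package name used in the example package-lock later in this diff) and that `fc-YOUR_API_KEY` is a placeholder key:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

// Defaults to the v1 endpoints; the type parameter mirrors the `version` option.
const v1App = new FirecrawlApp<"v1">({ apiKey: "fc-YOUR_API_KEY" });

// Legacy v0 client, e.g. if search() is still needed.
const v0App = new FirecrawlApp<"v0">({ apiKey: "fc-YOUR_API_KEY", version: "v0" });
```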
@ -141,21 +336,21 @@ export default class FirecrawlApp {
|
||||
|
||||
/**
|
||||
* Scrapes a URL using the Firecrawl API.
|
||||
* @param {string} url - The URL to scrape.
|
||||
* @param {Params | null} params - Additional parameters for the scrape request.
|
||||
* @returns {Promise<ScrapeResponse>} The response from the scrape operation.
|
||||
* @param url - The URL to scrape.
|
||||
* @param params - Additional parameters for the scrape request.
|
||||
* @returns The response from the scrape operation.
|
||||
*/
|
||||
async scrapeUrl(
|
||||
url: string,
|
||||
params: Params | null = null
|
||||
): Promise<ScrapeResponse> {
|
||||
params?: ScrapeParams | ScrapeParamsV0
|
||||
): Promise<this['version'] extends 'v0' ? ScrapeResponseV0 : ScrapeResponse> {
|
||||
const headers: AxiosRequestHeaders = {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
} as AxiosRequestHeaders;
|
||||
let jsonData: Params = { url, ...params };
|
||||
if (params?.extractorOptions?.extractionSchema) {
|
||||
let schema = params.extractorOptions.extractionSchema;
|
||||
let jsonData: any = { url, ...params };
|
||||
if (jsonData?.extractorOptions?.extractionSchema) {
|
||||
let schema = jsonData.extractorOptions.extractionSchema;
|
||||
// Check if schema is an instance of ZodSchema to correctly identify Zod schemas
|
||||
if (schema instanceof z.ZodSchema) {
|
||||
schema = zodToJsonSchema(schema);
|
||||
@ -163,22 +358,27 @@ export default class FirecrawlApp {
|
||||
jsonData = {
|
||||
...jsonData,
|
||||
extractorOptions: {
|
||||
...params.extractorOptions,
|
||||
...jsonData.extractorOptions,
|
||||
extractionSchema: schema,
|
||||
mode: params.extractorOptions.mode || "llm-extraction",
|
||||
mode: jsonData.extractorOptions.mode || "llm-extraction",
|
||||
},
|
||||
};
|
||||
}
|
||||
try {
|
||||
const response: AxiosResponse = await axios.post(
|
||||
this.apiUrl + "/v0/scrape",
|
||||
this.apiUrl + `/${this.version}/scrape`,
|
||||
jsonData,
|
||||
{ headers }
|
||||
);
|
||||
if (response.status === 200) {
|
||||
const responseData = response.data;
|
||||
if (responseData.success) {
|
||||
return responseData;
|
||||
return (this.version === 'v0' ? responseData as ScrapeResponseV0 : {
|
||||
success: true,
|
||||
warning: responseData.warning,
|
||||
error: responseData.error,
|
||||
...responseData.data
|
||||
}) as ScrapeResponse;
|
||||
} else {
|
||||
throw new Error(`Failed to scrape URL. Error: ${responseData.error}`);
|
||||
}
|
||||
@ -188,24 +388,28 @@ export default class FirecrawlApp {
|
||||
} catch (error: any) {
|
||||
throw new Error(error.message);
|
||||
}
|
||||
return { success: false, error: "Internal server error." };
|
||||
return { success: false, error: "Internal server error." } as this['version'] extends 'v0' ? ScrapeResponseV0 : ScrapeResponse;
|
||||
}
|
||||
|
||||
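A rough sketch of calling the v1 `scrapeUrl` shown above, using only the `ScrapeParams` fields defined earlier in this file; the target URL is illustrative, and the named `ScrapeResponse` import is assumed to be re-exported by the package:

```ts
import FirecrawlApp, { ScrapeResponse } from "@mendable/firecrawl-js";

const app = new FirecrawlApp<"v1">({ apiKey: "fc-YOUR_API_KEY" });

async function scrapeExample(): Promise<void> {
  // Request markdown plus the outgoing links of the page.
  const doc = (await app.scrapeUrl("https://docs.firecrawl.dev", {
    formats: ["markdown", "links"],
  })) as ScrapeResponse;

  if (!doc.success) throw new Error(doc.error);
  console.log(doc.metadata.title, doc.links?.length);
}
```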
/**
|
||||
* Searches for a query using the Firecrawl API.
|
||||
* @param {string} query - The query to search for.
|
||||
* @param {Params | null} params - Additional parameters for the search request.
|
||||
* @returns {Promise<SearchResponse>} The response from the search operation.
|
||||
* @param query - The query to search for.
|
||||
* @param params - Additional parameters for the search request.
|
||||
* @returns The response from the search operation.
|
||||
*/
|
||||
async search(
|
||||
query: string,
|
||||
params: Params | null = null
|
||||
): Promise<SearchResponse> {
|
||||
params?: SearchParamsV0
|
||||
): Promise<SearchResponseV0> {
|
||||
if (this.version === "v1") {
|
||||
throw new Error("Search is not supported in v1, please update FirecrawlApp() initialization to use v0.");
|
||||
}
|
||||
|
||||
const headers: AxiosRequestHeaders = {
|
||||
"Content-Type": "application/json",
|
||||
Authorization: `Bearer ${this.apiKey}`,
|
||||
} as AxiosRequestHeaders;
|
||||
let jsonData: Params = { query };
|
||||
let jsonData: any = { query };
|
||||
if (params) {
|
||||
jsonData = { ...jsonData, ...params };
|
||||
}
|
||||
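Since `search` now refuses to run on v1, a hedged sketch of how it would still be called on a v0 client; the query, limit, and key are placeholders, and the `SearchResponseV0` import is assumed to be exported by the package:

```ts
import FirecrawlApp, { SearchResponseV0 } from "@mendable/firecrawl-js";

const v0App = new FirecrawlApp<"v0">({ apiKey: "fc-YOUR_API_KEY", version: "v0" });

async function searchExample(): Promise<void> {
  // Search is only available on the legacy v0 endpoints.
  const results: SearchResponseV0 = await v0App.search("firecrawl web scraping", {
    searchOptions: { limit: 5 },
  });
  if (!results.success) throw new Error(results.error);
  console.log(results.data?.map((doc) => doc.url));
}
```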
@ -233,93 +437,160 @@ export default class FirecrawlApp {
|
||||
|
||||
/**
|
||||
* Initiates a crawl job for a URL using the Firecrawl API.
|
||||
* @param {string} url - The URL to crawl.
|
||||
* @param {Params | null} params - Additional parameters for the crawl request.
|
||||
* @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
|
||||
* @param {number} pollInterval - Time in seconds for job status checks.
|
||||
* @param {string} idempotencyKey - Optional idempotency key for the request.
|
||||
* @returns {Promise<CrawlResponse | any>} The response from the crawl operation.
|
||||
* @param url - The URL to crawl.
|
||||
* @param params - Additional parameters for the crawl request.
|
||||
* @param waitUntilDone - Whether to wait for the crawl job to complete.
|
||||
* @param pollInterval - Time in seconds for job status checks.
|
||||
* @param idempotencyKey - Optional idempotency key for the request.
|
||||
* @returns The response from the crawl operation.
|
||||
*/
|
||||
async crawlUrl(
|
||||
url: string,
|
||||
params: Params | null = null,
|
||||
params?: this['version'] extends 'v0' ? CrawlParamsV0 : CrawlParams,
|
||||
waitUntilDone: boolean = true,
|
||||
pollInterval: number = 2,
|
||||
idempotencyKey?: string
|
||||
): Promise<CrawlResponse | any> {
|
||||
): Promise<
|
||||
this['version'] extends 'v0'
|
||||
? CrawlResponseV0 | CrawlStatusResponseV0 | FirecrawlDocumentV0[]
|
||||
: CrawlResponse | CrawlStatusResponse
|
||||
> {
|
||||
const headers = this.prepareHeaders(idempotencyKey);
|
||||
let jsonData: Params = { url };
|
||||
if (params) {
|
||||
jsonData = { ...jsonData, ...params };
|
||||
}
|
||||
let jsonData: any = { url, ...params };
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
this.apiUrl + "/v0/crawl",
|
||||
this.apiUrl + `/${this.version}/crawl`,
|
||||
jsonData,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
const jobId: string = response.data.jobId;
|
||||
const id: string = this.version === 'v0' ? response.data.jobId : response.data.id;
|
||||
let checkUrl: string | undefined = undefined;
|
||||
if (waitUntilDone) {
|
||||
return this.monitorJobStatus(jobId, headers, pollInterval);
|
||||
if (this.version === 'v1') { checkUrl = response.data.url }
|
||||
return this.monitorJobStatus(id, headers, pollInterval, checkUrl);
|
||||
} else {
|
||||
return { success: true, jobId };
|
||||
if (this.version === 'v0') {
|
||||
return {
|
||||
success: true,
|
||||
jobId: id
|
||||
} as CrawlResponseV0;
|
||||
} else {
|
||||
return {
|
||||
success: true,
|
||||
id: id
|
||||
} as CrawlResponse;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
this.handleError(response, "start crawl job");
|
||||
}
|
||||
} catch (error: any) {
|
||||
console.log(error);
|
||||
throw new Error(error.message);
|
||||
if (error.response?.data?.error) {
|
||||
throw new Error(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`);
|
||||
} else {
|
||||
throw new Error(error.message);
|
||||
}
|
||||
}
|
||||
return { success: false, error: "Internal server error." };
|
||||
return { success: false, error: "Internal server error." } as this['version'] extends 'v0' ? CrawlResponseV0 : CrawlResponse;
|
||||
}
|
||||
|
||||
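A sketch of starting a crawl without waiting and then polling it by id, following the v1 branches above; the URL, limit, and poll delay are illustrative, and the named type imports are assumed to be re-exported by the package:

```ts
import FirecrawlApp, {
  CrawlResponse,
  CrawlStatusResponse,
} from "@mendable/firecrawl-js";

const app = new FirecrawlApp<"v1">({ apiKey: "fc-YOUR_API_KEY" });

async function crawlExample(): Promise<void> {
  // Kick off the job; waitUntilDone=false returns { success, id } immediately.
  const started = (await app.crawlUrl(
    "https://docs.firecrawl.dev",
    { crawlerOptions: { limit: 10, excludePaths: ["blog/*"] } },
    false
  )) as CrawlResponse;
  if (!started.success || !started.id) throw new Error(started.error);

  // Poll until the job leaves the "scraping" state.
  let status = (await app.checkCrawlStatus(started.id)) as CrawlStatusResponse;
  while (status.status === "scraping") {
    await new Promise((resolve) => setTimeout(resolve, 2000));
    status = (await app.checkCrawlStatus(started.id)) as CrawlStatusResponse;
  }
  console.log(`Crawled ${status.completed}/${status.total} pages`);
}
```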
/**
|
||||
* Checks the status of a crawl job using the Firecrawl API.
|
||||
* @param {string} jobId - The job ID of the crawl operation.
|
||||
* @returns {Promise<JobStatusResponse>} The response containing the job status.
|
||||
* @param id - The ID of the crawl operation.
|
||||
* @returns The response containing the job status.
|
||||
*/
|
||||
async checkCrawlStatus(jobId: string): Promise<JobStatusResponse> {
|
||||
async checkCrawlStatus(id?: string): Promise<this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse> {
|
||||
if (!id) {
|
||||
throw new Error("No crawl ID provided");
|
||||
}
|
||||
|
||||
const headers: AxiosRequestHeaders = this.prepareHeaders();
|
||||
try {
|
||||
const response: AxiosResponse = await this.getRequest(
|
||||
this.apiUrl + `/v0/crawl/status/${jobId}`,
|
||||
this.version === 'v1' ?
|
||||
`${this.apiUrl}/${this.version}/crawl/${id}` :
|
||||
`${this.apiUrl}/${this.version}/crawl/status/${id}`,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
return {
|
||||
success: true,
|
||||
status: response.data.status,
|
||||
current: response.data.current,
|
||||
current_url: response.data.current_url,
|
||||
current_step: response.data.current_step,
|
||||
total: response.data.total,
|
||||
data: response.data.data,
|
||||
partial_data: !response.data.data
|
||||
? response.data.partial_data
|
||||
: undefined,
|
||||
};
|
||||
if (this.version === 'v0') {
|
||||
return ({
|
||||
success: true,
|
||||
status: response.data.status,
|
||||
current: response.data.current,
|
||||
current_url: response.data.current_url,
|
||||
current_step: response.data.current_step,
|
||||
total: response.data.total,
|
||||
data: response.data.data,
|
||||
partial_data: !response.data.data
|
||||
? response.data.partial_data
|
||||
: undefined,
|
||||
} as CrawlStatusResponseV0) as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse;
|
||||
} else {
|
||||
return ({
|
||||
success: true,
|
||||
status: response.data.status,
|
||||
total: response.data.total,
|
||||
completed: response.data.completed,
|
||||
creditsUsed: response.data.creditsUsed,
|
||||
expiresAt: new Date(response.data.expiresAt),
|
||||
next: response.data.next,
|
||||
data: response.data.data,
|
||||
error: response.data.error
|
||||
} as CrawlStatusResponse) as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse;
|
||||
}
|
||||
} else {
|
||||
this.handleError(response, "check crawl status");
|
||||
}
|
||||
} catch (error: any) {
|
||||
throw new Error(error.message);
|
||||
}
|
||||
return {
|
||||
success: false,
|
||||
status: "unknown",
|
||||
current: 0,
|
||||
current_url: "",
|
||||
current_step: "",
|
||||
total: 0,
|
||||
error: "Internal server error.",
|
||||
};
|
||||
|
||||
return this.version === 'v0' ?
|
||||
({
|
||||
success: false,
|
||||
status: "unknown",
|
||||
current: 0,
|
||||
current_url: "",
|
||||
current_step: "",
|
||||
total: 0,
|
||||
error: "Internal server error.",
|
||||
} as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse) :
|
||||
({
|
||||
success: false,
|
||||
error: "Internal server error.",
|
||||
} as this['version'] extends 'v0' ? CrawlStatusResponseV0 : CrawlStatusResponse);
|
||||
}
|
||||
|
||||
async mapUrl(url: string, params?: MapParams): Promise<MapResponse> {
|
||||
if (this.version == 'v0') {
|
||||
throw new Error("Map is not supported in v0");
|
||||
}
|
||||
const headers = this.prepareHeaders();
|
||||
let jsonData: { url: string } & MapParams = { url, ...params };
|
||||
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
this.apiUrl + `/${this.version}/map`,
|
||||
jsonData,
|
||||
headers
|
||||
);
|
||||
if (response.status === 200) {
|
||||
return response.data as MapResponse;
|
||||
} else {
|
||||
this.handleError(response, "map");
|
||||
}
|
||||
} catch (error: any) {
|
||||
throw new Error(error.message);
|
||||
}
|
||||
return { success: false, error: "Internal server error." } as MapResponse;
|
||||
}
|
||||
|
||||
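And a minimal sketch of `mapUrl`, which is only available on the v1 client; the limit value is illustrative:

```ts
import FirecrawlApp, { MapResponse } from "@mendable/firecrawl-js";

const app = new FirecrawlApp<"v1">({ apiKey: "fc-YOUR_API_KEY" });

async function mapExample(): Promise<void> {
  // Discover up to 100 URLs on the site without scraping their contents.
  const res: MapResponse = await app.mapUrl("https://docs.firecrawl.dev", { limit: 100 });
  if (!res.success) throw new Error(res.error);
  console.log(`Found ${res.links?.length ?? 0} links`);
}
```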
/**
|
||||
* Prepares the headers for an API request.
|
||||
* @returns {AxiosRequestHeaders} The prepared headers.
|
||||
* @param idempotencyKey - Optional key to ensure idempotency.
|
||||
* @returns The prepared headers.
|
||||
*/
|
||||
prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders {
|
||||
return {
|
||||
@ -331,14 +602,14 @@ export default class FirecrawlApp {
|
||||
|
||||
/**
|
||||
* Sends a POST request to the specified URL.
|
||||
* @param {string} url - The URL to send the request to.
|
||||
* @param {Params} data - The data to send in the request.
|
||||
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||
* @returns {Promise<AxiosResponse>} The response from the POST request.
|
||||
* @param url - The URL to send the request to.
|
||||
* @param data - The data to send in the request.
|
||||
* @param headers - The headers for the request.
|
||||
* @returns The response from the POST request.
|
||||
*/
|
||||
postRequest(
|
||||
url: string,
|
||||
data: Params,
|
||||
data: any,
|
||||
headers: AxiosRequestHeaders
|
||||
): Promise<AxiosResponse> {
|
||||
return axios.post(url, data, { headers });
|
||||
@ -346,9 +617,9 @@ export default class FirecrawlApp {
|
||||
|
||||
/**
|
||||
* Sends a GET request to the specified URL.
|
||||
* @param {string} url - The URL to send the request to.
|
||||
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||
* @returns {Promise<AxiosResponse>} The response from the GET request.
|
||||
* @param url - The URL to send the request to.
|
||||
* @param headers - The headers for the request.
|
||||
* @returns The response from the GET request.
|
||||
*/
|
||||
getRequest(
|
||||
url: string,
|
||||
@ -359,38 +630,44 @@ export default class FirecrawlApp {
|
||||
|
||||
/**
|
||||
* Monitors the status of a crawl job until completion or failure.
|
||||
* @param {string} jobId - The job ID of the crawl operation.
|
||||
* @param {AxiosRequestHeaders} headers - The headers for the request.
|
||||
* @param {number} timeout - Timeout in seconds for job status checks.
|
||||
* @returns {Promise<any>} The final job status or data.
|
||||
* @param id - The ID of the crawl operation.
|
||||
* @param headers - The headers for the request.
|
||||
* @param checkInterval - Interval in seconds for job status checks.
|
||||
* @param checkUrl - Optional URL to check the status (used for v1 API)
|
||||
* @returns The final job status or data.
|
||||
*/
|
||||
async monitorJobStatus(
|
||||
jobId: string,
|
||||
id: string,
|
||||
headers: AxiosRequestHeaders,
|
||||
checkInterval: number
|
||||
): Promise<any> {
|
||||
checkInterval: number,
|
||||
checkUrl?: string
|
||||
): Promise<this['version'] extends 'v0' ? CrawlStatusResponseV0 | FirecrawlDocumentV0[] : CrawlStatusResponse> {
|
||||
let apiUrl: string = '';
|
||||
while (true) {
|
||||
if (this.version === 'v1') {
|
||||
apiUrl = checkUrl ?? `${this.apiUrl}/v1/crawl/${id}`;
|
||||
} else if (this.version === 'v0') {
|
||||
apiUrl = `${this.apiUrl}/v0/crawl/status/${id}`;
|
||||
}
|
||||
const statusResponse: AxiosResponse = await this.getRequest(
|
||||
this.apiUrl + `/v0/crawl/status/${jobId}`,
|
||||
apiUrl,
|
||||
headers
|
||||
);
|
||||
if (statusResponse.status === 200) {
|
||||
const statusData = statusResponse.data;
|
||||
if (statusData.status === "completed") {
|
||||
if ("data" in statusData) {
|
||||
return statusData.data;
|
||||
return this.version === 'v0' ? statusData.data : statusData;
|
||||
} else {
|
||||
throw new Error("Crawl job completed but no data was returned");
|
||||
}
|
||||
} else if (
|
||||
["active", "paused", "pending", "queued"].includes(statusData.status)
|
||||
["active", "paused", "pending", "queued", "scraping"].includes(statusData.status)
|
||||
) {
|
||||
if (checkInterval < 2) {
|
||||
checkInterval = 2;
|
||||
}
|
||||
checkInterval = Math.max(checkInterval, 2);
|
||||
await new Promise((resolve) =>
|
||||
setTimeout(resolve, checkInterval * 1000)
|
||||
); // Wait for the specified timeout before checking again
|
||||
);
|
||||
} else {
|
||||
throw new Error(
|
||||
`Crawl job failed or was stopped. Status: ${statusData.status}`
|
||||
|
@ -11,7 +11,7 @@
|
||||
// "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */
|
||||
|
||||
/* Language and Environment */
|
||||
"target": "es2016", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||
"target": "es2020", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */
|
||||
// "lib": [], /* Specify a set of bundled library declaration files that describe the target runtime environment. */
|
||||
// "jsx": "preserve", /* Specify what JSX code is generated. */
|
||||
// "experimentalDecorators": true, /* Enable experimental support for legacy experimental decorators. */
|
||||
@ -25,9 +25,9 @@
|
||||
// "moduleDetection": "auto", /* Control what method is used to detect module-format JS files. */
|
||||
|
||||
/* Modules */
|
||||
"module": "NodeNext", /* Specify what module code is generated. */
|
||||
"module": "commonjs", /* Specify what module code is generated. */
|
||||
"rootDir": "./src", /* Specify the root folder within your source files. */
|
||||
"moduleResolution": "nodenext", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||
"moduleResolution": "node", /* Specify how TypeScript looks up a file from a given module specifier. */
|
||||
// "baseUrl": "./", /* Specify the base directory to resolve non-relative module names. */
|
||||
// "paths": {}, /* Specify a set of entries that re-map imports to additional lookup locations. */
|
||||
// "rootDirs": [], /* Allow multiple folders to be treated as one when resolving modules. */
|
||||
|
33
apps/js-sdk/package-lock.json
generated
@ -9,7 +9,7 @@
|
||||
"version": "1.0.0",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^0.0.19",
|
||||
"@mendable/firecrawl-js": "^0.0.36",
|
||||
"axios": "^1.6.8",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.4.5",
|
||||
@ -422,15 +422,29 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@mendable/firecrawl-js": {
|
||||
"version": "0.0.19",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.19.tgz",
|
||||
"integrity": "sha512-u9BDVIN/bftDztxLlE2cf02Nz0si3+Vmy9cANDFHj/iriT3guzI8ITBk4uC81CyRmPzNyXrW6hSAG90g9ol4cA==",
|
||||
"version": "0.0.36",
|
||||
"resolved": "https://registry.npmjs.org/@mendable/firecrawl-js/-/firecrawl-js-0.0.36.tgz",
|
||||
"integrity": "sha512-5zQMWUD49r6Q7cxj+QBthQ964Bm9fMooW4E8E4nIca3BMXCeEuQFVf5C3OEWwZf0SjJvR+5Yx2wUbXJWd1wCOA==",
|
||||
"dependencies": {
|
||||
"axios": "^1.6.8",
|
||||
"dotenv": "^16.4.5",
|
||||
"uuid": "^9.0.1",
|
||||
"zod": "^3.23.8",
|
||||
"zod-to-json-schema": "^3.23.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@mendable/firecrawl-js/node_modules/uuid": {
|
||||
"version": "9.0.1",
|
||||
"resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz",
|
||||
"integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==",
|
||||
"funding": [
|
||||
"https://github.com/sponsors/broofa",
|
||||
"https://github.com/sponsors/ctavan"
|
||||
],
|
||||
"bin": {
|
||||
"uuid": "dist/bin/uuid"
|
||||
}
|
||||
},
|
||||
"node_modules/@tsconfig/node10": {
|
||||
"version": "1.0.11",
|
||||
"resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz",
|
||||
@ -531,6 +545,17 @@
|
||||
"node": ">=0.3.1"
|
||||
}
|
||||
},
|
||||
"node_modules/dotenv": {
|
||||
"version": "16.4.5",
|
||||
"resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.5.tgz",
|
||||
"integrity": "sha512-ZmdL2rui+eB2YwhsWzjInR8LldtZHGDoQ1ugH85ppHKwpUHL7j7rN0Ti9NCnGiQbhaZ11FpR+7ao1dNsmduNUg==",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://dotenvx.com"
|
||||
}
|
||||
},
|
||||
"node_modules/esbuild": {
|
||||
"version": "0.20.2",
|
||||
"resolved": "https://registry.npmjs.org/esbuild/-/esbuild-0.20.2.tgz",
|
||||
|
@ -11,7 +11,7 @@
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"@mendable/firecrawl-js": "^0.0.19",
|
||||
"@mendable/firecrawl-js": "^0.0.36",
|
||||
"axios": "^1.6.8",
|
||||
"ts-node": "^10.9.2",
|
||||
"typescript": "^5.4.5",
|
||||
|
@ -1 +0,0 @@
|
||||
from .firecrawl import FirecrawlApp
|
@ -1,299 +0,0 @@
|
||||
"""
|
||||
FirecrawlApp Module
|
||||
|
||||
This module provides a class `FirecrawlApp` for interacting with the Firecrawl API.
|
||||
It includes methods to scrape URLs, perform searches, initiate and monitor crawl jobs,
|
||||
and check the status of these jobs. The module uses requests for HTTP communication
|
||||
and handles retries for certain HTTP status codes.
|
||||
|
||||
Classes:
|
||||
- FirecrawlApp: Main class for interacting with the Firecrawl API.
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
|
||||
|
||||
class FirecrawlApp:
|
||||
"""
|
||||
Initialize the FirecrawlApp instance.
|
||||
|
||||
Args:
|
||||
api_key (Optional[str]): API key for authenticating with the Firecrawl API.
|
||||
api_url (Optional[str]): Base URL for the Firecrawl API.
|
||||
"""
|
||||
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
|
||||
self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
|
||||
if self.api_key is None:
|
||||
raise ValueError('No API key provided')
|
||||
self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
|
||||
def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
|
||||
"""
|
||||
Scrape the specified URL using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
url (str): The URL to scrape.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the scrape request.
|
||||
|
||||
Returns:
|
||||
Any: The scraped data if the request is successful.
|
||||
|
||||
Raises:
|
||||
Exception: If the scrape request fails.
|
||||
"""
|
||||
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
}
|
||||
# Prepare the base scrape parameters with the URL
|
||||
scrape_params = {'url': url}
|
||||
|
||||
# If there are additional params, process them
|
||||
if params:
|
||||
# Initialize extractorOptions if present
|
||||
extractor_options = params.get('extractorOptions', {})
|
||||
# Check and convert the extractionSchema if it's a Pydantic model
|
||||
if 'extractionSchema' in extractor_options:
|
||||
if hasattr(extractor_options['extractionSchema'], 'schema'):
|
||||
extractor_options['extractionSchema'] = extractor_options['extractionSchema'].schema()
|
||||
# Ensure 'mode' is set, defaulting to 'llm-extraction' if not explicitly provided
|
||||
extractor_options['mode'] = extractor_options.get('mode', 'llm-extraction')
|
||||
# Update the scrape_params with the processed extractorOptions
|
||||
scrape_params['extractorOptions'] = extractor_options
|
||||
|
||||
# Include any other params directly at the top level of scrape_params
|
||||
for key, value in params.items():
|
||||
if key != 'extractorOptions':
|
||||
scrape_params[key] = value
|
||||
# Make the POST request with the prepared headers and JSON data
|
||||
response = requests.post(
|
||||
f'{self.api_url}/v0/scrape',
|
||||
headers=headers,
|
||||
json=scrape_params,
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
|
||||
elif response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
|
||||
|
||||
def search(self, query, params=None):
|
||||
"""
|
||||
Perform a search using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
query (str): The search query.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the search request.
|
||||
|
||||
Returns:
|
||||
Any: The search results if the request is successful.
|
||||
|
||||
Raises:
|
||||
Exception: If the search request fails.
|
||||
"""
|
||||
headers = {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}'
|
||||
}
|
||||
json_data = {'query': query}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = requests.post(
|
||||
f'{self.api_url}/v0/search',
|
||||
headers=headers,
|
||||
json=json_data
|
||||
)
|
||||
if response.status_code == 200:
|
||||
response = response.json()
|
||||
|
||||
if response['success'] and 'data' in response:
|
||||
return response['data']
|
||||
else:
|
||||
raise Exception(f'Failed to search. Error: {response["error"]}')
|
||||
|
||||
elif response.status_code in [402, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to search. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
raise Exception(f'Failed to search. Status code: {response.status_code}')
|
||||
|
||||
def crawl_url(self, url, params=None, wait_until_done=True, timeout=2, idempotency_key=None):
|
||||
"""
|
||||
Initiate a crawl job for the specified URL using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
url (str): The URL to crawl.
|
||||
params (Optional[Dict[str, Any]]): Additional parameters for the crawl request.
|
||||
wait_until_done (bool): Whether to wait until the crawl job is completed.
|
||||
timeout (int): Timeout between status checks when waiting for job completion.
|
||||
idempotency_key (Optional[str]): A unique uuid key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Any: The crawl job ID or the crawl results if waiting until completion.
|
||||
|
||||
Raises:
|
||||
Exception: If the crawl job initiation or monitoring fails.
|
||||
"""
|
||||
headers = self._prepare_headers(idempotency_key)
|
||||
json_data = {'url': url}
|
||||
if params:
|
||||
json_data.update(params)
|
||||
response = self._post_request(f'{self.api_url}/v0/crawl', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
job_id = response.json().get('jobId')
|
||||
if wait_until_done:
|
||||
return self._monitor_job_status(job_id, headers, timeout)
|
||||
else:
|
||||
return {'jobId': job_id}
|
||||
else:
|
||||
self._handle_error(response, 'start crawl job')
|
||||
|
||||
def check_crawl_status(self, job_id):
|
||||
"""
|
||||
Check the status of a crawl job using the Firecrawl API.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the crawl job.
|
||||
|
||||
Returns:
|
||||
Any: The status of the crawl job.
|
||||
|
||||
Raises:
|
||||
Exception: If the status check request fails.
|
||||
"""
|
||||
headers = self._prepare_headers()
|
||||
response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
if response.status_code == 200:
|
||||
return response.json()
|
||||
else:
|
||||
self._handle_error(response, 'check crawl status')
|
||||
|
||||
def _prepare_headers(self, idempotency_key=None):
|
||||
"""
|
||||
Prepare the headers for API requests.
|
||||
|
||||
Args:
|
||||
idempotency_key (Optional[str]): A unique key to ensure idempotency of requests.
|
||||
|
||||
Returns:
|
||||
Dict[str, str]: The headers including content type, authorization, and optionally idempotency key.
|
||||
"""
|
||||
if idempotency_key:
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}',
|
||||
'x-idempotency-key': idempotency_key
|
||||
}
|
||||
|
||||
return {
|
||||
'Content-Type': 'application/json',
|
||||
'Authorization': f'Bearer {self.api_key}',
|
||||
}
|
||||
|
||||
def _post_request(self, url, data, headers, retries=3, backoff_factor=0.5):
|
||||
"""
|
||||
Make a POST request with retries.
|
||||
|
||||
Args:
|
||||
url (str): The URL to send the POST request to.
|
||||
data (Dict[str, Any]): The JSON data to include in the POST request.
|
||||
headers (Dict[str, str]): The headers to include in the POST request.
|
||||
retries (int): Number of retries for the request.
|
||||
backoff_factor (float): Backoff factor for retries.
|
||||
|
||||
Returns:
|
||||
requests.Response: The response from the POST request.
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If the request fails after the specified retries.
|
||||
"""
|
||||
for attempt in range(retries):
|
||||
response = requests.post(url, headers=headers, json=data)
|
||||
if response.status_code == 502:
|
||||
time.sleep(backoff_factor * (2 ** attempt))
|
||||
else:
|
||||
return response
|
||||
return response
|
||||
|
||||
def _get_request(self, url, headers, retries=3, backoff_factor=0.5):
|
||||
"""
|
||||
Make a GET request with retries.
|
||||
|
||||
Args:
|
||||
url (str): The URL to send the GET request to.
|
||||
headers (Dict[str, str]): The headers to include in the GET request.
|
||||
retries (int): Number of retries for the request.
|
||||
backoff_factor (float): Backoff factor for retries.
|
||||
|
||||
Returns:
|
||||
requests.Response: The response from the GET request.
|
||||
|
||||
Raises:
|
||||
requests.RequestException: If the request fails after the specified retries.
|
||||
"""
|
||||
for attempt in range(retries):
|
||||
response = requests.get(url, headers=headers)
|
||||
if response.status_code == 502:
|
||||
time.sleep(backoff_factor * (2 ** attempt))
|
||||
else:
|
||||
return response
|
||||
return response
|
||||
|
||||
def _monitor_job_status(self, job_id, headers, timeout):
|
||||
"""
|
||||
Monitor the status of a crawl job until completion.
|
||||
|
||||
Args:
|
||||
job_id (str): The ID of the crawl job.
|
||||
headers (Dict[str, str]): The headers to include in the status check requests.
|
||||
timeout (int): Timeout between status checks.
|
||||
|
||||
Returns:
|
||||
Any: The crawl results if the job is completed successfully.
|
||||
|
||||
Raises:
|
||||
Exception: If the job fails or an error occurs during status checks.
|
||||
"""
|
||||
while True:
|
||||
status_response = self._get_request(f'{self.api_url}/v0/crawl/status/{job_id}', headers)
|
||||
if status_response.status_code == 200:
|
||||
status_data = status_response.json()
|
||||
if status_data['status'] == 'completed':
|
||||
if 'data' in status_data:
|
||||
return status_data['data']
|
||||
else:
|
||||
raise Exception('Crawl job completed but no data was returned')
|
||||
elif status_data['status'] in ['active', 'paused', 'pending', 'queued', 'waiting']:
|
||||
timeout=max(timeout,2)
|
||||
time.sleep(timeout) # Wait for the specified timeout before checking again
|
||||
else:
|
||||
raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
|
||||
else:
|
||||
self._handle_error(status_response, 'check crawl status')
|
||||
|
||||
def _handle_error(self, response, action):
|
||||
"""
|
||||
Handle errors from API responses.
|
||||
|
||||
Args:
|
||||
response (requests.Response): The response object from the API request.
|
||||
action (str): Description of the action that was being performed.
|
||||
|
||||
Raises:
|
||||
Exception: An exception with a message containing the status code and error details from the response.
|
||||
"""
|
||||
if response.status_code in [402, 408, 409, 500]:
|
||||
error_message = response.json().get('error', 'Unknown error occurred')
|
||||
raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
|
||||
else:
|
||||
raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
|
BIN
apps/python-sdk/dist/firecrawl-py-0.0.12.tar.gz
vendored
Binary file not shown.
75
apps/python-sdk/examplev0.py
Normal file
@ -0,0 +1,75 @@
|
||||
import uuid
|
||||
from firecrawl.firecrawl import FirecrawlApp
|
||||
|
||||
app = FirecrawlApp(api_key="fc-YOUR_API_KEY")
|
||||
|
||||
# Scrape a website:
|
||||
scrape_result = app.scrape_url('firecrawl.dev')
|
||||
print(scrape_result['markdown'])
|
||||
|
||||
# Crawl a website:
|
||||
idempotency_key = str(uuid.uuid4()) # optional idempotency key
|
||||
crawl_result = app.crawl_url('mendable.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, idempotency_key)
|
||||
print(crawl_result)
|
||||
|
||||
# LLM Extraction:
|
||||
# Define schema to extract contents into using pydantic
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
|
||||
class ArticleSchema(BaseModel):
|
||||
title: str
|
||||
points: int
|
||||
by: str
|
||||
commentsURL: str
|
||||
|
||||
class TopArticlesSchema(BaseModel):
|
||||
top: List[ArticleSchema] = Field(..., max_items=5, description="Top 5 stories")
|
||||
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': TopArticlesSchema.model_json_schema(),
|
||||
'mode': 'llm-extraction'
|
||||
},
|
||||
'pageOptions':{
|
||||
'onlyMainContent': True
|
||||
}
|
||||
})
|
||||
|
||||
print(llm_extraction_result['llm_extraction'])
|
||||
|
||||
# Define schema to extract contents into using json schema
|
||||
json_schema = {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"top": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"points": {"type": "number"},
|
||||
"by": {"type": "string"},
|
||||
"commentsURL": {"type": "string"}
|
||||
},
|
||||
"required": ["title", "points", "by", "commentsURL"]
|
||||
},
|
||||
"minItems": 5,
|
||||
"maxItems": 5,
|
||||
"description": "Top 5 stories on Hacker News"
|
||||
}
|
||||
},
|
||||
"required": ["top"]
|
||||
}
|
||||
|
||||
llm_extraction_result = app.scrape_url('https://news.ycombinator.com', {
|
||||
'extractorOptions': {
|
||||
'extractionSchema': json_schema,
|
||||
'mode': 'llm-extraction'
|
||||
},
|
||||
'pageOptions':{
|
||||
'onlyMainContent': True
|
||||
}
|
||||
})
|
||||
|
||||
print(llm_extraction_result['llm_extraction'])
|
@ -13,7 +13,7 @@ import os
|
||||
|
||||
from .firecrawl import FirecrawlApp
|
||||
|
||||
__version__ = "0.0.16"
|
||||
__version__ = "1.0.0"
|
||||
|
||||
# Define the logger for the Firecrawl project
|
||||
logger: logging.Logger = logging.getLogger("firecrawl")
|
||||
|
@ -7,7 +7,7 @@ from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
API_URL = "http://127.0.0.1:3002";
|
||||
API_URL = "http://127.0.0.1:3002"
|
||||
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
|
||||
TEST_API_KEY = os.getenv('TEST_API_KEY')
|
||||
|
||||
@ -20,32 +20,34 @@ FirecrawlApp = firecrawl.FirecrawlApp
|
||||
|
||||
def test_no_api_key():
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app = FirecrawlApp(api_url=API_URL)
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, version='v0')
|
||||
assert "No API key provided" in str(excinfo.value)
|
||||
|
||||
def test_scrape_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.scrape_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during scrape URL: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_blocklisted_url():
|
||||
blocklisted_url = "https://facebook.com/fake-test"
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.scrape_url(blocklisted_url)
|
||||
assert "Unexpected error during scrape URL: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_successful_response_with_valid_preview_token():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token")
|
||||
app = FirecrawlApp(api_url=API_URL, api_key="this_is_just_a_preview_token", version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_scrape_url_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai')
|
||||
print(response)
|
||||
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
assert 'markdown' in response
|
||||
@ -54,7 +56,7 @@ def test_scrape_url_e2e():
|
||||
assert "_Roast_" in response['content']
|
||||
|
||||
def test_successful_response_with_valid_api_key_and_include_html():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://roastmywebsite.ai', {'pageOptions': {'includeHtml': True}})
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -66,7 +68,7 @@ def test_successful_response_with_valid_api_key_and_include_html():
|
||||
assert "<h1" in response['html']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001.pdf')
|
||||
assert response is not None
|
||||
assert 'content' in response
|
||||
@ -74,7 +76,7 @@ def test_successful_response_for_valid_scrape_with_pdf_file():
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_extension():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url('https://arxiv.org/pdf/astro-ph/9301001')
|
||||
time.sleep(6) # wait for 6 seconds
|
||||
assert response is not None
|
||||
@ -83,20 +85,20 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
|
||||
assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['content']
|
||||
|
||||
def test_crawl_url_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.crawl_url('https://firecrawl.dev')
|
||||
assert "Unexpected error during start crawl job: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_should_return_error_for_blocklisted_url():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
blocklisted_url = "https://twitter.com/fake-test"
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
app.crawl_url(blocklisted_url)
|
||||
assert "Unexpected error during start crawl job: Status code 403. Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it." in str(excinfo.value)
|
||||
|
||||
def test_crawl_url_wait_for_completion_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True)
|
||||
assert response is not None
|
||||
assert len(response) > 0
|
||||
@ -104,7 +106,7 @@ def test_crawl_url_wait_for_completion_e2e():
|
||||
assert "_Roast_" in response[0]['content']
|
||||
|
||||
def test_crawl_url_with_idempotency_key_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
uniqueIdempotencyKey = str(uuid4())
|
||||
response = app.crawl_url('https://roastmywebsite.ai', {'crawlerOptions': {'excludes': ['blog/*']}}, True, 2, uniqueIdempotencyKey)
|
||||
assert response is not None
|
||||
@ -117,7 +119,7 @@ def test_crawl_url_with_idempotency_key_e2e():
|
||||
assert "Conflict: Failed to start crawl job due to a conflict. Idempotency key already used" in str(excinfo.value)
|
||||
|
||||
def test_check_crawl_status_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.crawl_url('https://firecrawl.dev', {'crawlerOptions': {'excludes': ['blog/*']}}, False)
|
||||
assert response is not None
|
||||
assert 'jobId' in response
|
||||
@ -131,21 +133,21 @@ def test_check_crawl_status_e2e():
|
||||
assert len(status_response['data']) > 0
|
||||
|
||||
def test_search_e2e():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.search("test query")
|
||||
assert response is not None
|
||||
assert 'content' in response[0]
|
||||
assert len(response) > 2
|
||||
|
||||
def test_search_invalid_api_key():
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
|
||||
invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key", version='v0')
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
invalid_app.search("test query")
|
||||
assert "Unexpected error during search: Status code 401. Unauthorized: Invalid token" in str(excinfo.value)
|
||||
|
||||
def test_llm_extraction():
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
|
||||
response = app.scrape_url("https://mendable.ai", {
|
||||
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY, version='v0')
|
||||
response = app.scrape_url("https://firecrawl.dev", {
|
||||
'extractorOptions': {
|
||||
'mode': 'llm-extraction',
|
||||
'extractionPrompt': "Based on the information on the page, find what the company's mission is and whether it supports SSO, and whether it is open source",
|
||||
|
@ -0,0 +1,3 @@
API_URL=http://localhost:3002
ABSOLUTE_FIRECRAWL_PATH=/Users/user/firecrawl/apps/python-sdk/firecrawl/firecrawl.py
TEST_API_KEY=fc-YOUR_API_KEY
Some files were not shown because too many files have changed in this diff.