Merge branch 'main' into nsc/geo-to-location
commit 07e76f2ba5

README.md (16 changed lines)
@@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and get an API key.
 - **Media parsing**: pdfs, docx, images.
 - **Reliability first**: designed to get the data you need - no matter how hard it is.
 - **Actions**: click, scroll, input, wait and more before extracting data
+- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
 
 You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
 
@@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     }'
 ```
 
+### Batch Scraping Multiple URLs (New)
+
+You can now batch scrape multiple URLs at the same time. It is very similar to how the /crawl endpoint works. It submits a batch scrape job and returns a job ID to check the status of the batch scrape.
+
+```bash
+curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -d '{
+      "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
+      "formats" : ["markdown", "html"]
+    }'
+```
+
 ### Search (v0) (Beta)
 
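The batch scrape section added above says the endpoint returns a job ID that can be polled for status. Below is a minimal sketch of that polling step, assuming the status route mirrors the crawl API pattern (GET /v1/batch/scrape/{id}); the exact route and response shape are not shown in this diff.

```ts
// Hypothetical status poll for a batch scrape job ID returned by POST /v1/batch/scrape.
// Assumes a GET /v1/batch/scrape/{id} route analogous to the crawl status endpoint.
async function getBatchScrapeStatus(jobId: string): Promise<unknown> {
  const res = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
    headers: { Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}` },
  });
  if (!res.ok) {
    throw new Error(`Status check failed with HTTP ${res.status}`);
  }
  // Expected to report the job status and, once finished, the scraped documents.
  return res.json();
}
```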
@@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
   scrapeOptions: {
     formats: ['markdown', 'html'],
   }
-} as CrawlParams, true, 30) as CrawlStatusResponse;
+} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;
 
 if (crawlResponse) {
   console.log(crawlResponse)
@@ -1,5 +1,5 @@
 # ===== Required ENVS ======
 NUM_WORKERS_PER_QUEUE=8
 PORT=3002
 HOST=0.0.0.0
 REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true
 
 # ===== Optional ENVS ======
 
+# SearchApi key. Head to https://searchapi.com/ to get your API key
+SEARCHAPI_API_KEY=
+# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
+SEARCHAPI_ENGINE=
+
 # Supabase Setup (used to support DB authentication, advanced logging, etc.)
 SUPABASE_ANON_TOKEN=
 SUPABASE_URL=
 SUPABASE_SERVICE_TOKEN=
 
 # Other Optionals
@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
 BULL_AUTH_KEY=
 LOGTAIL_KEY=
 PLAYWRIGHT_MICROSERVICE_URL=
+SEARCHAPI_API_KEY=
@@ -4,6 +4,7 @@ import {
   BatchScrapeRequest,
   batchScrapeRequestSchema,
   CrawlResponse,
+  legacyExtractorOptions,
   legacyScrapeOptions,
   RequestWithAuth,
 } from "./types";
@@ -34,6 +35,8 @@ export async function batchScrapeController(
   }
 
   const pageOptions = legacyScrapeOptions(req.body);
+  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
+
 
   const sc: StoredCrawl = {
     crawlerOptions: null,
@@ -65,6 +68,7 @@ export async function batchScrapeController(
       plan: req.auth.plan,
       crawlerOptions: null,
       pageOptions,
+      extractorOptions,
       origin: "api",
       crawl_id: id,
       sitemapped: true,
@@ -121,8 +121,13 @@ export async function runWebScraper({
       : docs;
 
   if(is_scrape === false) {
-    billTeam(team_id, undefined, filteredDocs.length).catch(error => {
-      Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
+    let creditsToBeBilled = 1; // Assuming 1 credit per document
+    if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
+      creditsToBeBilled = 5;
+    }
+
+    billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
+      Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
       // Optionally, you could notify an admin or add to a retry queue here
     });
   }
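The hunk above changes billing from a flat one credit per document to a per-document rate that depends on the extractor mode. A small sketch of the same arithmetic follows; the helper name is illustrative and not part of the codebase.

```ts
// Illustrative helper mirroring the billing logic added in the hunk above:
// 1 credit per document, or 5 credits per document when an LLM extraction mode is used.
function creditsForBatch(docCount: number, extractorMode?: string): number {
  const creditsPerDoc =
    extractorMode === "llm-extraction" || extractorMode === "extract" ? 5 : 1;
  return creditsPerDoc * docCount;
}

// Example: 10 documents with "llm-extraction" -> 50 credits; a plain scrape of 10 documents -> 10 credits.
```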
@@ -209,14 +209,15 @@ export async function scrapSingleUrl(
       if (action.type === "click" || action.type === "write" || action.type === "press") {
         const result: Action[] = [];
         // Don't add a wait if the previous action is a wait
-        if (index === 0 || array[index - 1].type !== "wait") {
-          result.push({ type: "wait", milliseconds: 1200 } as Action);
-        }
+        // if (index === 0 || array[index - 1].type !== "wait") {
+        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
+        // }
+        // Fire-engine now handles wait times automatically, leaving the code here for now
         result.push(action);
         // Don't add a wait if the next action is a wait
-        if (index === array.length - 1 || array[index + 1].type !== "wait") {
-          result.push({ type: "wait", milliseconds: 1200 } as Action);
-        }
+        // if (index === array.length - 1 || array[index + 1].type !== "wait") {
+        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
+        // }
         return result;
       }
       return [action as Action];
@@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
 import { googleSearch } from "./googlesearch";
 import { fireEngineMap } from "./fireEngine";
+import { searchapi_search } from "./searchapi";
 import { serper_search } from "./serper";
 
 export async function search({
@@ -30,7 +31,16 @@ export async function search({
   timeout?: number;
 }): Promise<SearchResult[]> {
   try {
+    if (process.env.SEARCHAPI_API_KEY) {
+      return await searchapi_search(query, {
+        num_results,
+        tbs,
+        filter,
+        lang,
+        country,
+        location
+      });
+    }
     if (process.env.SERPER_API_KEY) {
       return await serper_search(query, {
         num_results,
apps/api/src/search/searchapi.ts (new file, 60 lines)
@@ -0,0 +1,60 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";

dotenv.config();

interface SearchOptions {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  num_results: number;
  page?: number;
}

export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
  const params = {
    q: q,
    hl: options.lang,
    gl: options.country,
    location: options.location,
    num: options.num_results,
    page: options.page ?? 1,
    engine: process.env.SEARCHAPI_ENGINE || "google",
  };

  const url = `https://www.searchapi.io/api/v1/search`;

  try {
    const response = await axios.get(url, {
      headers: {
        "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
        "Content-Type": "application/json",
        "X-SearchApi-Source": "Firecrawl",
      },
      params: params,
    });

    if (response.status === 401) {
      throw new Error("Unauthorized. Please check your API key.");
    }

    const data = response.data;

    if (data && Array.isArray(data.organic_results)) {
      return data.organic_results.map((a: any) => ({
        url: a.link,
        title: a.title,
        description: a.snippet,
      }));
    } else {
      return [];
    }
  } catch (error) {
    console.error(`There was an error searching for content: ${error.message}`);
    return [];
  }
}
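As a usage sketch of the new module (the query and option values here are illustrative), searchapi_search takes a query string plus the SearchOptions defined above and resolves to a list of SearchResult objects, or an empty array on failure:

```ts
import { searchapi_search } from "./searchapi";

// Requires SEARCHAPI_API_KEY (and optionally SEARCHAPI_ENGINE) to be set in the environment.
async function demo(): Promise<void> {
  const results = await searchapi_search("firecrawl", {
    num_results: 5,
    lang: "en",
    country: "us",
  });
  for (const r of results) {
    console.log(r.title, r.url, r.description);
  }
}

demo();
```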
@@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp
 import json
 from dotenv import load_dotenv
 import anthropic
+import agentops
 
 # ANSI color codes
 class Colors:
@@ -161,4 +162,5 @@ def main():
         print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
 
 if __name__ == "__main__":
+    agentops.init(os.getenv("AGENTOPS_API_KEY"))
     main()
@@ -98,7 +98,7 @@
    "source": [
     "# Create a cache with a 5 minute TTL\n",
     "cache = caching.CachedContent.create(\n",
-    "    model=\"models/gemini-1.5-pro-001\",\n",
+    "    model=\"models/gemini-1.5-pro-002\",\n",
     "    display_name=\"website crawl testing again\", # used to identify the cache\n",
     "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
     "    contents=[text_file],\n",
New file: Jupyter notebook (166 lines)
@@ -0,0 +1,166 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import datetime\n",
    "import time\n",
    "import google.generativeai as genai\n",
    "from google.generativeai import caching\n",
    "from dotenv import load_dotenv\n",
    "from firecrawl import FirecrawlApp\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Retrieve API keys from environment variables\n",
    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Configure the Google Generative AI module with the API key\n",
    "genai.configure(api_key=google_api_key)\n",
    "\n",
    "# Initialize the FirecrawlApp with your API key\n",
    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No data returned from crawl.\n"
     ]
    }
   ],
   "source": [
    "# Crawl a website\n",
    "crawl_url = 'https://dify.ai/'\n",
    "params = {\n",
    "  \n",
    "  'crawlOptions': {\n",
    "    'limit': 100\n",
    "  }\n",
    "}\n",
    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "\n",
    "if crawl_result is not None:\n",
    "    # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
    "    cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
    "\n",
    "    # Save the modified results as a text file containing JSON data\n",
    "    with open('crawl_result.txt', 'w') as file:\n",
    "        file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
    "else:\n",
    "    print(\"No data returned from crawl.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the video using the Files API\n",
    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
    "\n",
    "# Wait for the file to finish processing\n",
    "while text_file.state.name == \"PROCESSING\":\n",
    "    print('Waiting for file to be processed.')\n",
    "    time.sleep(2)\n",
    "    text_file = genai.get_file(text_file.name)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a cache with a 5 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-flash-002\",\n",
    "    display_name=\"website crawl testing again\", # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
    "    ttl=datetime.timedelta(minutes=15),\n",
    ")\n",
    "# Construct a GenerativeModel which uses the created cache.\n",
    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
      "\n",
      "Here's how Firecrawl helps:\n",
      "\n",
      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
      "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
      "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
      "\n",
      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
      "\n",
      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the model\n",
    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
    "response_dict = response.to_dict()\n",
    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
    "print(response_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}