diff --git a/README.md b/README.md index dd8a740a..3050cbe0 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge - **Media parsing**: pdfs, docx, images. - **Reliability first**: designed to get the data you need - no matter how hard it is. - **Actions**: click, scroll, input, wait and more before extracting data +- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev) @@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \ }' ``` +### Batch Scraping Multiple URLs (New) + +You can now batch scrape multiple URLs at the same time. It is very similar to how the /crawl endpoint works. It submits a batch scrape job and returns a job ID to check the status of the batch scrape. + +```bash +curl -X POST https://api.firecrawl.dev/v1/batch/scrape \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer YOUR_API_KEY' \ + -d '{ + "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"], + "formats" : ["markdown", "html"] + }' +``` ### Search (v0) (Beta) @@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html'], } -} as CrawlParams, true, 30) as CrawlStatusResponse; +} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse; if (crawlResponse) { console.log(crawlResponse) diff --git a/apps/api/.env.example b/apps/api/.env.example index f3c1dc1b..6ba49daa 100644 --- a/apps/api/.env.example +++ b/apps/api/.env.example @@ -1,5 +1,5 @@ # ===== Required ENVS ====== -NUM_WORKERS_PER_QUEUE=8 +NUM_WORKERS_PER_QUEUE=8 PORT=3002 HOST=0.0.0.0 REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. 
For running locally, use redis://localhost:6379 @@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true # ===== Optional ENVS ====== +# SearchApi key. Head to https://www.searchapi.io/ to get your API key +SEARCHAPI_API_KEY= +# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://www.searchapi.io/ to explore more engines +SEARCHAPI_ENGINE= + # Supabase Setup (used to support DB authentication, advanced logging, etc.) -SUPABASE_ANON_TOKEN= -SUPABASE_URL= +SUPABASE_ANON_TOKEN= +SUPABASE_URL= SUPABASE_SERVICE_TOKEN= # Other Optionals diff --git a/apps/api/.env.local b/apps/api/.env.local index 17f85935..9fa41498 100644 --- a/apps/api/.env.local +++ b/apps/api/.env.local @@ -12,4 +12,4 @@ ANTHROPIC_API_KEY= BULL_AUTH_KEY= LOGTAIL_KEY= PLAYWRIGHT_MICROSERVICE_URL= - +SEARCHAPI_API_KEY= diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 7c68341b..cde4bd76 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -4,6 +4,7 @@ import { BatchScrapeRequest, batchScrapeRequestSchema, CrawlResponse, + legacyExtractorOptions, legacyScrapeOptions, RequestWithAuth, } from "./types"; @@ -34,6 +35,8 @@ export async function batchScrapeController( } const pageOptions = legacyScrapeOptions(req.body); + const extractorOptions = req.body.extract ? 
legacyExtractorOptions(req.body.extract) : undefined; + const sc: StoredCrawl = { crawlerOptions: null, @@ -65,6 +68,7 @@ export async function batchScrapeController( plan: req.auth.plan, crawlerOptions: null, pageOptions, + extractorOptions, origin: "api", crawl_id: id, sitemapped: true, diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 8eb679e7..8bd0c12c 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -121,8 +121,13 @@ export async function runWebScraper({ : docs; if(is_scrape === false) { - billTeam(team_id, undefined, filteredDocs.length).catch(error => { - Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`); + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) { + creditsToBeBilled = 5; + } + + billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`); // Optionally, you could notify an admin or add to a retry queue here }); } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index cd76793c..c7185b79 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -209,14 +209,15 @@ export async function scrapSingleUrl( if (action.type === "click" || action.type === "write" || action.type === "press") { const result: Action[] = []; // Don't add a wait if the previous action is a wait - if (index === 0 || array[index - 1].type !== "wait") { - result.push({ type: "wait", milliseconds: 1200 } as Action); - } + // if (index === 0 || array[index - 1].type !== "wait") { + // result.push({ type: "wait", milliseconds: 1200 } as Action); + // } + // Fire-engine now handles wait times 
automatically, leaving the code here for now result.push(action); // Don't add a wait if the next action is a wait - if (index === array.length - 1 || array[index + 1].type !== "wait") { - result.push({ type: "wait", milliseconds: 1200 } as Action); - } + // if (index === array.length - 1 || array[index + 1].type !== "wait") { + // result.push({ type: "wait", milliseconds: 1200 } as Action); + // } return result; } return [action as Action]; diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index f4c5b6d0..3bcb85d2 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger"; import { SearchResult } from "../../src/lib/entities"; import { googleSearch } from "./googlesearch"; import { fireEngineMap } from "./fireEngine"; +import { searchapi_search } from "./searchapi"; import { serper_search } from "./serper"; export async function search({ @@ -30,7 +31,16 @@ export async function search({ timeout?: number; }): Promise { try { - + if (process.env.SEARCHAPI_API_KEY) { + return await searchapi_search(query, { + num_results, + tbs, + filter, + lang, + country, + location + }); + } if (process.env.SERPER_API_KEY) { return await serper_search(query, { num_results, diff --git a/apps/api/src/search/searchapi.ts b/apps/api/src/search/searchapi.ts new file mode 100644 index 00000000..24778a77 --- /dev/null +++ b/apps/api/src/search/searchapi.ts @@ -0,0 +1,60 @@ +import axios from "axios"; +import dotenv from "dotenv"; +import { SearchResult } from "../../src/lib/entities"; + +dotenv.config(); + +interface SearchOptions { + tbs?: string; + filter?: string; + lang?: string; + country?: string; + location?: string; + num_results: number; + page?: number; +} + +export async function searchapi_search(q: string, options: SearchOptions): Promise { + const params = { + q: q, + hl: options.lang, + gl: options.country, + location: options.location, + num: options.num_results, + page: 
options.page ?? 1, + engine: process.env.SEARCHAPI_ENGINE || "google", + }; + + const url = `https://www.searchapi.io/api/v1/search`; + + try { + const response = await axios.get(url, { + headers: { + "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`, + "Content-Type": "application/json", + "X-SearchApi-Source": "Firecrawl", + }, + params: params, + }); + + + if (response.status === 401) { + throw new Error("Unauthorized. Please check your API key."); + } + + const data = response.data; + + if (data && Array.isArray(data.organic_results)) { + return data.organic_results.map((a: any) => ({ + url: a.link, + title: a.title, + description: a.snippet, + })); + } else { + return []; + } + } catch (error) { + console.error(`There was an error searching for content: ${error.message}`); + return []; + } +} diff --git a/examples/claude_web_crawler/claude_web_crawler.py b/examples/claude_web_crawler/claude_web_crawler.py index 55168f30..6ca29f14 100644 --- a/examples/claude_web_crawler/claude_web_crawler.py +++ b/examples/claude_web_crawler/claude_web_crawler.py @@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp import json from dotenv import load_dotenv import anthropic +import agentops # ANSI color codes class Colors: @@ -161,4 +162,5 @@ def main(): print(f"{Colors.RED}No relevant pages identified. 
Consider refining the search parameters or trying a different website.{Colors.RESET}") if __name__ == "__main__": + agentops.init(os.getenv("AGENTOPS_API_KEY")) main() diff --git a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb index 0876affa..9a2244a1 100644 --- a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb +++ b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb @@ -98,7 +98,7 @@ "source": [ "# Create a cache with a 5 minute TTL\n", "cache = caching.CachedContent.create(\n", - " model=\"models/gemini-1.5-pro-001\",\n", + " model=\"models/gemini-1.5-pro-002\",\n", " display_name=\"website crawl testing again\", # used to identify the cache\n", " system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n", " contents=[text_file],\n", diff --git a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb new file mode 100644 index 00000000..19d72c9d --- /dev/null +++ b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "import datetime\n", + "import time\n", + "import google.generativeai as genai\n", + "from google.generativeai import caching\n", + "from dotenv import load_dotenv\n", + "from firecrawl import FirecrawlApp\n", + "import json\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Retrieve API keys from environment variables\n", + "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Configure the Google Generative AI module with the API key\n", + "genai.configure(api_key=google_api_key)\n", + "\n", + "# Initialize the FirecrawlApp with your API key\n", + "app = FirecrawlApp(api_key=firecrawl_api_key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No data returned from crawl.\n" + ] + } + ], + "source": [ + "# Crawl a website\n", + "crawl_url = 'https://dify.ai/'\n", + "params = {\n", + " \n", + " 'crawlOptions': {\n", + " 'limit': 100\n", + " }\n", + "}\n", + "crawl_result = app.crawl_url(crawl_url, params=params)\n", + "\n", + "if crawl_result is not None:\n", + " # Convert crawl results to JSON format, excluding 'content' field from each entry\n", + " cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n", + "\n", + " # Save the modified results as a text file containing JSON data\n", + " with open('crawl_result.txt', 'w') as file:\n", + " file.write(json.dumps(cleaned_crawl_result, indent=4))\n", + "else:\n", + " print(\"No data returned from crawl.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Upload the video using the Files API\n", + "text_file = 
genai.upload_file(path=\"crawl_result.txt\")\n", + "\n", + "# Wait for the file to finish processing\n", + "while text_file.state.name == \"PROCESSING\":\n", + " print('Waiting for file to be processed.')\n", + " time.sleep(2)\n", + " text_file = genai.get_file(text_file.name)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a cache with a 5 minute TTL\n", + "cache = caching.CachedContent.create(\n", + " model=\"models/gemini-1.5-flash-002\",\n", + " display_name=\"website crawl testing again\", # used to identify the cache\n", + " system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n", + " contents=[text_file],\n", + " ttl=datetime.timedelta(minutes=15),\n", + ")\n", + "# Construct a GenerativeModel which uses the created cache.\n", + "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. 
\n", + "\n", + "Here's how Firecrawl helps:\n", + "\n", + "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n", + "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n", + "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n", + "\n", + "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n", + "\n", + "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n", + "\n" + ] + } + ], + "source": [ + "# Query the model\n", + "response = model.generate_content([\"What powers website scraping with Dify?\"])\n", + "response_dict = response.to_dict()\n", + "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n", + "print(response_text)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}