From 503e83e83edcf578ab7acec11275ad5695bc3c41 Mon Sep 17 00:00:00 2001
From: Sebastjan Prachovskij
Date: Tue, 3 Sep 2024 18:26:11 +0300
Subject: [PATCH 1/9] Add SearchApi to search

Add support for engines, improve status code error
Remove changes in package, add engine to env params
Improve description in env example
Remove unnecessary empty line
Improve text
---
 apps/api/.env.example            | 11 ++++--
 apps/api/.env.local              |  2 +-
 apps/api/src/search/index.ts     | 12 ++++++-
 apps/api/src/search/searchapi.ts | 60 ++++++++++++++++++++++++++++++++
 4 files changed, 80 insertions(+), 5 deletions(-)
 create mode 100644 apps/api/src/search/searchapi.ts

diff --git a/apps/api/.env.example b/apps/api/.env.example
index f3c1dc1b..6ba49daa 100644
--- a/apps/api/.env.example
+++ b/apps/api/.env.example
@@ -1,5 +1,5 @@
 # ===== Required ENVS ======
-NUM_WORKERS_PER_QUEUE=8 
+NUM_WORKERS_PER_QUEUE=8
 PORT=3002
 HOST=0.0.0.0
 REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true
 
 # ===== Optional ENVS ======
 
+# SearchApi key. Head to https://www.searchapi.io/ to get your API key
+SEARCHAPI_API_KEY=
+# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://www.searchapi.io/ to explore more engines
+SEARCHAPI_ENGINE=
+
 # Supabase Setup (used to support DB authentication, advanced logging, etc.)
-SUPABASE_ANON_TOKEN= 
-SUPABASE_URL= 
+SUPABASE_ANON_TOKEN=
+SUPABASE_URL=
 SUPABASE_SERVICE_TOKEN=
 
 # Other Optionals
diff --git a/apps/api/.env.local b/apps/api/.env.local
index 17f85935..9fa41498 100644
--- a/apps/api/.env.local
+++ b/apps/api/.env.local
@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
 BULL_AUTH_KEY=
 LOGTAIL_KEY=
 PLAYWRIGHT_MICROSERVICE_URL=
-
+SEARCHAPI_API_KEY=
diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts
index f4c5b6d0..3bcb85d2 100644
--- a/apps/api/src/search/index.ts
+++ b/apps/api/src/search/index.ts
@@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
 import { googleSearch } from "./googlesearch";
 import { fireEngineMap } from "./fireEngine";
+import { searchapi_search } from "./searchapi";
 import { serper_search } from "./serper";
 
 export async function search({
@@ -30,7 +31,16 @@ export async function search({
   timeout?: number;
 }): Promise<SearchResult[]> {
   try {
-
+    if (process.env.SEARCHAPI_API_KEY) {
+      return await searchapi_search(query, {
+        num_results,
+        tbs,
+        filter,
+        lang,
+        country,
+        location
+      });
+    }
     if (process.env.SERPER_API_KEY) {
       return await serper_search(query, {
         num_results,
diff --git a/apps/api/src/search/searchapi.ts b/apps/api/src/search/searchapi.ts
new file mode 100644
index 00000000..24778a77
--- /dev/null
+++ b/apps/api/src/search/searchapi.ts
@@ -0,0 +1,60 @@
+import axios from "axios";
+import dotenv from "dotenv";
+import { SearchResult } from "../../src/lib/entities";
+
+dotenv.config();
+
+interface SearchOptions {
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  num_results: number;
+  page?: number;
+}
+
+export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
+  const params = {
+    q: q,
+    hl: options.lang,
+    gl: options.country,
+    location: options.location,
+    num: options.num_results,
+    page: options.page ?? 
1, + engine: process.env.SEARCHAPI_ENGINE || "google", + }; + + const url = `https://www.searchapi.io/api/v1/search`; + + try { + const response = await axios.get(url, { + headers: { + "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`, + "Content-Type": "application/json", + "X-SearchApi-Source": "Firecrawl", + }, + params: params, + }); + + + if (response.status === 401) { + throw new Error("Unauthorized. Please check your API key."); + } + + const data = response.data; + + if (data && Array.isArray(data.organic_results)) { + return data.organic_results.map((a: any) => ({ + url: a.link, + title: a.title, + description: a.snippet, + })); + } else { + return []; + } + } catch (error) { + console.error(`There was an error searching for content: ${error.message}`); + return []; + } +} From 1c02187054ad1847dcee77728255018ec743f6d5 Mon Sep 17 00:00:00 2001 From: Stijn Smits <167638923+s-smits@users.noreply.github.com> Date: Sun, 6 Oct 2024 13:25:23 +0200 Subject: [PATCH 2/9] Update website_qa_with_gemini_caching.ipynb --- .../website_qa_with_gemini_caching.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb index 0876affa..9a2244a1 100644 --- a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb +++ b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_caching.ipynb @@ -98,7 +98,7 @@ "source": [ "# Create a cache with a 5 minute TTL\n", "cache = caching.CachedContent.create(\n", - " model=\"models/gemini-1.5-pro-001\",\n", + " model=\"models/gemini-1.5-pro-002\",\n", " display_name=\"website crawl testing again\", # used to identify the cache\n", " system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n", " contents=[text_file],\n", From 460f5581fef1d800e2a96fbf10f30b242921ac60 Mon Sep 17 00:00:00 2001 From: Stijn Smits <167638923+s-smits@users.noreply.github.com> Date: Mon, 7 Oct 2024 12:17:47 +0200 Subject: [PATCH 3/9] Add files via upload --- ...website_qa_with_gemini_flash_caching.ipynb | 166 ++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb diff --git a/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb new file mode 100644 index 00000000..19d72c9d --- /dev/null +++ b/examples/website_qa_with_gemini_caching/website_qa_with_gemini_flash_caching.ipynb @@ -0,0 +1,166 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "import os\n", + "import datetime\n", + "import time\n", + "import google.generativeai as genai\n", + "from google.generativeai import caching\n", + "from dotenv import load_dotenv\n", + "from firecrawl import FirecrawlApp\n", + "import json\n", + "\n", + "# Load environment variables\n", + "load_dotenv()\n", + "\n", + "# Retrieve API keys from environment variables\n", + "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n", + "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", + "\n", + "# Configure the Google Generative AI module with the API key\n", + "genai.configure(api_key=google_api_key)\n", + "\n", + "# Initialize the FirecrawlApp with your API key\n", + "app = FirecrawlApp(api_key=firecrawl_api_key)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No data returned from crawl.\n" + ] + } + ], + "source": [ + "# Crawl a website\n", + "crawl_url = 'https://dify.ai/'\n", + "params = {\n", + " \n", + " 'crawlOptions': {\n", + " 'limit': 100\n", + " }\n", + "}\n", + "crawl_result = app.crawl_url(crawl_url, params=params)\n", + "\n", + "if crawl_result is not None:\n", + " # Convert crawl results to JSON format, excluding 'content' field from each entry\n", + " cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n", + "\n", + " # Save the modified results as a text file containing JSON data\n", + " with open('crawl_result.txt', 'w') as file:\n", + " file.write(json.dumps(cleaned_crawl_result, indent=4))\n", + "else:\n", + " print(\"No data returned from crawl.\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Upload the video using the Files API\n", + "text_file = genai.upload_file(path=\"crawl_result.txt\")\n", + "\n", + "# Wait for the file to finish processing\n", + "while text_file.state.name == \"PROCESSING\":\n", + " print('Waiting for file to be processed.')\n", + " time.sleep(2)\n", + " text_file = genai.get_file(text_file.name)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Create a cache with a 5 minute TTL\n", + "cache = caching.CachedContent.create(\n", + " model=\"models/gemini-1.5-flash-002\",\n", + " display_name=\"website crawl testing again\", # used to identify the cache\n", + " system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n", + " contents=[text_file],\n", + " ttl=datetime.timedelta(minutes=15),\n", + ")\n", + "# Construct a GenerativeModel which uses the created cache.\n", + "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. 
\n", + "\n", + "Here's how Firecrawl helps:\n", + "\n", + "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n", + "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n", + "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n", + "\n", + "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n", + "\n", + "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n", + "\n" + ] + } + ], + "source": [ + "# Query the model\n", + "response = model.generate_content([\"What powers website scraping with Dify?\"])\n", + "response_dict = response.to_dict()\n", + "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n", + "print(response_text)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From b48eed5716d7bad5f19702bce839b2998fe8aaf6 Mon Sep 17 00:00:00 2001 From: Twilight <46562212+twlite@users.noreply.github.com> Date: Mon, 28 Oct 2024 09:40:35 +0545 Subject: [PATCH 4/9] chore(README.md): use `satisfies` instead of `as` for ts example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd8a740a..e7c48fa5 100644 --- a/README.md +++ b/README.md @@ -483,7 +483,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', { scrapeOptions: { formats: ['markdown', 'html'], } -} as CrawlParams, true, 30) as CrawlStatusResponse; +} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse; if (crawlResponse) { console.log(crawlResponse) From e3e8375c7de0c64df66a56f0b3f1d9ddc4fd2c9c Mon Sep 17 00:00:00 2001 From: Eric Ciarla Date: Mon, 28 Oct 2024 11:13:33 -0400 Subject: [PATCH 5/9] Add AgentOps Monitoring --- examples/claude_web_crawler/claude_web_crawler.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/claude_web_crawler/claude_web_crawler.py b/examples/claude_web_crawler/claude_web_crawler.py index 55168f30..6ca29f14 100644 --- a/examples/claude_web_crawler/claude_web_crawler.py +++ b/examples/claude_web_crawler/claude_web_crawler.py @@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp import json from dotenv import load_dotenv import anthropic +import agentops # ANSI color codes class Colors: @@ -161,4 +162,5 @@ def main(): print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}") if __name__ == "__main__": + agentops.init(os.getenv("AGENTOPS_API_KEY")) main() From 007e3edfc5c785da858f11a94f14ea7a5bd28ff0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 12:40:04 -0300 Subject: [PATCH 6/9] Update README.md --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index dd8a740a..d8e5bdcb 100644 --- a/README.md +++ b/README.md @@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and ge - **Media parsing**: pdfs, docx, images. 
- **Reliability first**: designed to get the data you need - no matter how hard it is. - **Actions**: click, scroll, input, wait and more before extracting data +- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev) @@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \ }' ``` +### Batch Scraping Multiple URLs (New) + +You can now batch scrape multiple URLs at the same time. It is very similar to how the /crawl endpoint works. It submits a batch scrape job and returns a job ID to check the status of the batch scrape. + +```bash +curl -X POST https://api.firecrawl.dev/v1/batch/scrape \ + -H 'Content-Type: application/json' \ + -H 'Authorization: Bearer YOUR_API_KEY' \ + -d '{ + "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"], + "formats" : ["markdown", "html"] + }' +``` ### Search (v0) (Beta) From fa8875d64d4246f0d0b7eecbfcffbbccce473033 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 15:09:50 -0300 Subject: [PATCH 7/9] Update single_url.ts --- apps/api/src/scraper/WebScraper/single_url.ts | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index cd76793c..c7185b79 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -209,14 +209,15 @@ export async function scrapSingleUrl( if (action.type === "click" || action.type === "write" || action.type === "press") { const result: Action[] = []; // Don't add a wait if the previous action is a wait - if (index === 0 || array[index - 1].type !== "wait") { - result.push({ type: "wait", milliseconds: 1200 } as Action); - } + // if (index === 0 || array[index - 1].type !== "wait") { + // result.push({ type: "wait", milliseconds: 1200 } as Action); + // } + // Fire-engine now handles wait times automatically, leaving the code here for now result.push(action); // Don't add a wait if the next action is a wait - if (index === array.length - 1 || array[index + 1].type !== "wait") { - result.push({ type: "wait", milliseconds: 1200 } as Action); - } + // if (index === array.length - 1 || array[index + 1].type !== "wait") { + // result.push({ type: "wait", milliseconds: 1200 } as Action); + // } return result; } return [action as Action]; From 726430c2e641666626b400ca62f32062a31978f4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 16:51:49 -0300 Subject: [PATCH 8/9] Nick: llm extract in batch scrape --- apps/api/src/controllers/v1/batch-scrape.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 7c68341b..cde4bd76 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -4,6 +4,7 @@ import { BatchScrapeRequest, batchScrapeRequestSchema, CrawlResponse, + legacyExtractorOptions, legacyScrapeOptions, RequestWithAuth, } from "./types"; @@ -34,6 +35,8 @@ export async function batchScrapeController( } const pageOptions = legacyScrapeOptions(req.body); + const extractorOptions = req.body.extract ? 
legacyExtractorOptions(req.body.extract) : undefined; + const sc: StoredCrawl = { crawlerOptions: null, @@ -65,6 +68,7 @@ export async function batchScrapeController( plan: req.auth.plan, crawlerOptions: null, pageOptions, + extractorOptions, origin: "api", crawl_id: id, sitemapped: true, From 0bad436061191fd304dd66ea201eccc97f0e7474 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 28 Oct 2024 17:04:42 -0300 Subject: [PATCH 9/9] Nick: fixed the batch scrape + llm extract billing --- apps/api/src/main/runWebScraper.ts | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 8eb679e7..8bd0c12c 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -121,8 +121,13 @@ export async function runWebScraper({ : docs; if(is_scrape === false) { - billTeam(team_id, undefined, filteredDocs.length).catch(error => { - Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`); + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) { + creditsToBeBilled = 5; + } + + billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => { + Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`); // Optionally, you could notify an admin or add to a retry queue here }); }
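
A note for reviewers trying the combined changes from patches 6, 8 and 9 (batch scrape docs, LLM extract in batch scrape, and the extract billing fix): the sketch below is only an illustration of how a client might exercise that path end to end. It assumes the batch scrape request body accepts the same `extract` object that the controller passes through `legacyExtractorOptions`, and that job status is polled at `GET /v1/batch/scrape/{id}` the same way as `/v1/crawl/{id}`; neither detail is spelled out in this series beyond the controller reading `req.body.extract`, so treat the payload and endpoint shapes as assumptions rather than the documented API.

```ts
// Minimal sketch (Node 18+, global fetch): submit a batch scrape job that
// requests an LLM extract, then poll the job until it settles.
// Endpoint and payload shapes are assumptions, not confirmed by this series.
const API_URL = "https://api.firecrawl.dev";
const API_KEY = process.env.FIRECRAWL_API_KEY ?? "";

async function batchScrapeWithExtract(urls: string[]): Promise<void> {
  const headers = {
    "Content-Type": "application/json",
    Authorization: `Bearer ${API_KEY}`,
  };

  // Submit the job; like /v1/crawl, the endpoint is expected to return an id.
  const submitRes = await fetch(`${API_URL}/v1/batch/scrape`, {
    method: "POST",
    headers,
    body: JSON.stringify({
      urls,
      formats: ["markdown", "extract"],
      // Assumed extract shape; with patch 9 this should bill 5 credits per document.
      extract: { prompt: "Summarize the page in one sentence." },
    }),
  });
  const { id } = await submitRes.json();

  // Poll the assumed status endpoint until the job completes or fails.
  while (true) {
    const statusRes = await fetch(`${API_URL}/v1/batch/scrape/${id}`, { headers });
    const job = await statusRes.json();
    if (job.status === "completed" || job.status === "failed") {
      console.log(`Job ${id} ${job.status} with ${job.data?.length ?? 0} documents`);
      return;
    }
    await new Promise((resolve) => setTimeout(resolve, 2000));
  }
}

batchScrapeWithExtract([
  "https://docs.firecrawl.dev",
  "https://docs.firecrawl.dev/sdks/overview",
]).catch(console.error);
```

If the assumed shapes hold, this also gives a quick way to confirm the billing change in patch 9: a two-URL batch that requests an `extract` format should be billed as 2 × 5 credits rather than 2 × 1.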