Merge branch 'main' into nsc/geo-to-location
commit 07e76f2ba5

README.md (16 changed lines)
@@ -80,6 +80,7 @@ To use the API, you need to sign up on [Firecrawl](https://firecrawl.dev) and get an API key.
 - **Media parsing**: pdfs, docx, images.
 - **Reliability first**: designed to get the data you need - no matter how hard it is.
 - **Actions**: click, scroll, input, wait and more before extracting data
+- **Batching (New)**: scrape thousands of URLs at the same time with a new async endpoint
 
 You can find all of Firecrawl's capabilities and how to use them in our [documentation](https://docs.firecrawl.dev)
 
@@ -350,6 +351,19 @@ curl -X POST https://api.firecrawl.dev/v1/scrape \
     }'
 ```
 
+### Batch Scraping Multiple URLs (New)
+
+You can now batch scrape multiple URLs at the same time. It is very similar to how the /crawl endpoint works. It submits a batch scrape job and returns a job ID to check the status of the batch scrape.
+
+```bash
+curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
+    -H 'Content-Type: application/json' \
+    -H 'Authorization: Bearer YOUR_API_KEY' \
+    -d '{
+      "urls": ["https://docs.firecrawl.dev", "https://docs.firecrawl.dev/sdks/overview"],
+      "formats" : ["markdown", "html"]
+    }'
+```
+
 ### Search (v0) (Beta)
 
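The batch scrape section added above says the endpoint returns a job ID that can be polled for status. Below is a minimal sketch of that polling step, assuming the status route mirrors the crawl API pattern (GET /v1/batch/scrape/{id}); the exact route and response shape are not shown in this diff.

```ts
// Hypothetical status poll for a batch scrape job ID returned by POST /v1/batch/scrape.
// Assumes a GET /v1/batch/scrape/{id} route analogous to the crawl status endpoint.
async function getBatchScrapeStatus(jobId: string): Promise<unknown> {
  const res = await fetch(`https://api.firecrawl.dev/v1/batch/scrape/${jobId}`, {
    headers: { Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}` },
  });
  if (!res.ok) {
    throw new Error(`Status check failed with HTTP ${res.status}`);
  }
  // Expected to report the job status and, once finished, the scraped documents.
  return res.json();
}
```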
@@ -483,7 +497,7 @@ const crawlResponse = await app.crawlUrl('https://firecrawl.dev', {
   scrapeOptions: {
     formats: ['markdown', 'html'],
   }
-} as CrawlParams, true, 30) as CrawlStatusResponse;
+} satisfies CrawlParams, true, 30) satisfies CrawlStatusResponse;
 
 if (crawlResponse) {
   console.log(crawlResponse)
@@ -1,5 +1,5 @@
 # ===== Required ENVS ======
 NUM_WORKERS_PER_QUEUE=8
 PORT=3002
 HOST=0.0.0.0
 REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379
@@ -11,9 +11,14 @@ USE_DB_AUTHENTICATION=true
 
 # ===== Optional ENVS ======
 
+# SearchApi key. Head to https://searchapi.com/ to get your API key
+SEARCHAPI_API_KEY=
+# SearchApi engine, defaults to google. Available options: google, bing, baidu, google_news, etc. Head to https://searchapi.com/ to explore more engines
+SEARCHAPI_ENGINE=
+
 # Supabase Setup (used to support DB authentication, advanced logging, etc.)
 SUPABASE_ANON_TOKEN=
 SUPABASE_URL=
 SUPABASE_SERVICE_TOKEN=
 
 # Other Optionals
@@ -12,4 +12,4 @@ ANTHROPIC_API_KEY=
 BULL_AUTH_KEY=
 LOGTAIL_KEY=
 PLAYWRIGHT_MICROSERVICE_URL=
+SEARCHAPI_API_KEY=
@@ -4,6 +4,7 @@ import {
   BatchScrapeRequest,
   batchScrapeRequestSchema,
   CrawlResponse,
+  legacyExtractorOptions,
   legacyScrapeOptions,
   RequestWithAuth,
 } from "./types";
@@ -34,6 +35,8 @@ export async function batchScrapeController(
   }
 
   const pageOptions = legacyScrapeOptions(req.body);
+  const extractorOptions = req.body.extract ? legacyExtractorOptions(req.body.extract) : undefined;
+
 
   const sc: StoredCrawl = {
     crawlerOptions: null,
@@ -65,6 +68,7 @@ export async function batchScrapeController(
       plan: req.auth.plan,
       crawlerOptions: null,
       pageOptions,
+      extractorOptions,
       origin: "api",
       crawl_id: id,
       sitemapped: true,
@@ -121,8 +121,13 @@ export async function runWebScraper({
       : docs;
 
   if(is_scrape === false) {
-    billTeam(team_id, undefined, filteredDocs.length).catch(error => {
-      Logger.error(`Failed to bill team ${team_id} for ${filteredDocs.length} credits: ${error}`);
+    let creditsToBeBilled = 1; // Assuming 1 credit per document
+    if (extractorOptions && (extractorOptions.mode === "llm-extraction" || extractorOptions.mode === "extract")) {
+      creditsToBeBilled = 5;
+    }
+
+    billTeam(team_id, undefined, creditsToBeBilled * filteredDocs.length).catch(error => {
+      Logger.error(`Failed to bill team ${team_id} for ${creditsToBeBilled * filteredDocs.length} credits: ${error}`);
       // Optionally, you could notify an admin or add to a retry queue here
     });
   }
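The hunk above changes billing from a flat one credit per document to a per-document rate that depends on the extractor mode. A small sketch of the same arithmetic follows; the helper name is illustrative and not part of the codebase.

```ts
// Illustrative helper mirroring the billing logic added in the hunk above:
// 1 credit per document, or 5 credits per document when an LLM extraction mode is used.
function creditsForBatch(docCount: number, extractorMode?: string): number {
  const creditsPerDoc =
    extractorMode === "llm-extraction" || extractorMode === "extract" ? 5 : 1;
  return creditsPerDoc * docCount;
}

// Example: 10 documents with "llm-extraction" -> 50 credits; a plain scrape of 10 documents -> 10 credits.
```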
@@ -209,14 +209,15 @@ export async function scrapSingleUrl(
       if (action.type === "click" || action.type === "write" || action.type === "press") {
         const result: Action[] = [];
         // Don't add a wait if the previous action is a wait
-        if (index === 0 || array[index - 1].type !== "wait") {
-          result.push({ type: "wait", milliseconds: 1200 } as Action);
-        }
+        // if (index === 0 || array[index - 1].type !== "wait") {
+        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
+        // }
+        // Fire-engine now handles wait times automatically, leaving the code here for now
         result.push(action);
         // Don't add a wait if the next action is a wait
-        if (index === array.length - 1 || array[index + 1].type !== "wait") {
-          result.push({ type: "wait", milliseconds: 1200 } as Action);
-        }
+        // if (index === array.length - 1 || array[index + 1].type !== "wait") {
+        //   result.push({ type: "wait", milliseconds: 1200 } as Action);
+        // }
         return result;
       }
       return [action as Action];
@@ -2,6 +2,7 @@ import { Logger } from "../../src/lib/logger";
 import { SearchResult } from "../../src/lib/entities";
 import { googleSearch } from "./googlesearch";
 import { fireEngineMap } from "./fireEngine";
+import { searchapi_search } from "./searchapi";
 import { serper_search } from "./serper";
 
 export async function search({
@@ -30,7 +31,16 @@ export async function search({
   timeout?: number;
 }): Promise<SearchResult[]> {
   try {
+    if (process.env.SEARCHAPI_API_KEY) {
+      return await searchapi_search(query, {
+        num_results,
+        tbs,
+        filter,
+        lang,
+        country,
+        location
+      });
+    }
     if (process.env.SERPER_API_KEY) {
       return await serper_search(query, {
         num_results,
apps/api/src/search/searchapi.ts (new file, 60 lines)
@@ -0,0 +1,60 @@
import axios from "axios";
import dotenv from "dotenv";
import { SearchResult } from "../../src/lib/entities";

dotenv.config();

interface SearchOptions {
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  num_results: number;
  page?: number;
}

export async function searchapi_search(q: string, options: SearchOptions): Promise<SearchResult[]> {
  const params = {
    q: q,
    hl: options.lang,
    gl: options.country,
    location: options.location,
    num: options.num_results,
    page: options.page ?? 1,
    engine: process.env.SEARCHAPI_ENGINE || "google",
  };

  const url = `https://www.searchapi.io/api/v1/search`;

  try {
    const response = await axios.get(url, {
      headers: {
        "Authorization": `Bearer ${process.env.SEARCHAPI_API_KEY}`,
        "Content-Type": "application/json",
        "X-SearchApi-Source": "Firecrawl",
      },
      params: params,
    });

    if (response.status === 401) {
      throw new Error("Unauthorized. Please check your API key.");
    }

    const data = response.data;

    if (data && Array.isArray(data.organic_results)) {
      return data.organic_results.map((a: any) => ({
        url: a.link,
        title: a.title,
        description: a.snippet,
      }));
    } else {
      return [];
    }
  } catch (error) {
    console.error(`There was an error searching for content: ${error.message}`);
    return [];
  }
}
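As a usage sketch of the new module (the query and option values here are illustrative), searchapi_search takes a query string plus the SearchOptions defined above and resolves to a list of SearchResult objects, or an empty array on failure:

```ts
import { searchapi_search } from "./searchapi";

// Requires SEARCHAPI_API_KEY (and optionally SEARCHAPI_ENGINE) to be set in the environment.
async function demo(): Promise<void> {
  const results = await searchapi_search("firecrawl", {
    num_results: 5,
    lang: "en",
    country: "us",
  });
  for (const r of results) {
    console.log(r.title, r.url, r.description);
  }
}

demo();
```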
@@ -3,6 +3,7 @@ from firecrawl import FirecrawlApp
 import json
 from dotenv import load_dotenv
 import anthropic
+import agentops
 
 # ANSI color codes
 class Colors:
@@ -161,4 +162,5 @@ def main():
         print(f"{Colors.RED}No relevant pages identified. Consider refining the search parameters or trying a different website.{Colors.RESET}")
 
 if __name__ == "__main__":
+    agentops.init(os.getenv("AGENTOPS_API_KEY"))
     main()
@@ -98,7 +98,7 @@
    "source": [
     "# Create a cache with a 5 minute TTL\n",
     "cache = caching.CachedContent.create(\n",
-    "    model=\"models/gemini-1.5-pro-001\",\n",
+    "    model=\"models/gemini-1.5-pro-002\",\n",
     "    display_name=\"website crawl testing again\", # used to identify the cache\n",
     "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
     "    contents=[text_file],\n",
New file: Jupyter notebook (166 lines)
@@ -0,0 +1,166 @@
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/ericciarla/projects/python_projects/agents_testing/.conda/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import datetime\n",
    "import time\n",
    "import google.generativeai as genai\n",
    "from google.generativeai import caching\n",
    "from dotenv import load_dotenv\n",
    "from firecrawl import FirecrawlApp\n",
    "import json\n",
    "\n",
    "# Load environment variables\n",
    "load_dotenv()\n",
    "\n",
    "# Retrieve API keys from environment variables\n",
    "google_api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
    "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n",
    "\n",
    "# Configure the Google Generative AI module with the API key\n",
    "genai.configure(api_key=google_api_key)\n",
    "\n",
    "# Initialize the FirecrawlApp with your API key\n",
    "app = FirecrawlApp(api_key=firecrawl_api_key)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No data returned from crawl.\n"
     ]
    }
   ],
   "source": [
    "# Crawl a website\n",
    "crawl_url = 'https://dify.ai/'\n",
    "params = {\n",
    "  \n",
    "  'crawlOptions': {\n",
    "    'limit': 100\n",
    "  }\n",
    "}\n",
    "crawl_result = app.crawl_url(crawl_url, params=params)\n",
    "\n",
    "if crawl_result is not None:\n",
    "    # Convert crawl results to JSON format, excluding 'content' field from each entry\n",
    "    cleaned_crawl_result = [{k: v for k, v in entry.items() if k != 'content'} for entry in crawl_result]\n",
    "\n",
    "    # Save the modified results as a text file containing JSON data\n",
    "    with open('crawl_result.txt', 'w') as file:\n",
    "        file.write(json.dumps(cleaned_crawl_result, indent=4))\n",
    "else:\n",
    "    print(\"No data returned from crawl.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Upload the video using the Files API\n",
    "text_file = genai.upload_file(path=\"crawl_result.txt\")\n",
    "\n",
    "# Wait for the file to finish processing\n",
    "while text_file.state.name == \"PROCESSING\":\n",
    "    print('Waiting for file to be processed.')\n",
    "    time.sleep(2)\n",
    "    text_file = genai.get_file(text_file.name)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create a cache with a 5 minute TTL\n",
    "cache = caching.CachedContent.create(\n",
    "    model=\"models/gemini-1.5-flash-002\",\n",
    "    display_name=\"website crawl testing again\", # used to identify the cache\n",
    "    system_instruction=\"You are an expert at this website, and your job is to answer user's query based on the website you have access to.\",\n",
    "    contents=[text_file],\n",
    "    ttl=datetime.timedelta(minutes=15),\n",
    ")\n",
    "# Construct a GenerativeModel which uses the created cache.\n",
    "model = genai.GenerativeModel.from_cached_content(cached_content=cache)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dify.AI utilizes the **Firecrawl** service for website scraping. This service can crawl and convert any website into clean markdown or structured data that's ready for use in building RAG applications. \n",
      "\n",
      "Here's how Firecrawl helps:\n",
      "\n",
      "* **Crawling and Conversion:** Firecrawl crawls the website and converts the content into a format that is easily understood by LLMs, such as markdown or structured data.\n",
      "* **Clean Output:** Firecrawl ensures the data is clean and free of errors, making it easier to use in Dify's RAG engine.\n",
      "* **Parallel Crawling:** Firecrawl efficiently crawls web pages in parallel, delivering results quickly.\n",
      "\n",
      "You can find Firecrawl on their website: [https://www.firecrawl.dev/](https://www.firecrawl.dev/)\n",
      "\n",
      "Firecrawl offers both a cloud service and an open-source software (OSS) edition. \n",
      "\n"
     ]
    }
   ],
   "source": [
    "# Query the model\n",
    "response = model.generate_content([\"What powers website scraping with Dify?\"])\n",
    "response_dict = response.to_dict()\n",
    "response_text = response_dict['candidates'][0]['content']['parts'][0]['text']\n",
    "print(response_text)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.13"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}