From c53df8650aa7728d9711e37dc34c7dd1e2045757 Mon Sep 17 00:00:00 2001 From: Loris Date: Thu, 13 Mar 2025 17:21:44 +0100 Subject: [PATCH 01/28] Update searxng.ts (#1319) remove default categories. otherwise we cannot use engines --- apps/api/src/search/searxng.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/search/searxng.ts b/apps/api/src/search/searxng.ts index a0e711bd..2645fc1d 100644 --- a/apps/api/src/search/searxng.ts +++ b/apps/api/src/search/searxng.ts @@ -26,7 +26,7 @@ export async function searxng_search( // location: options.location, //not possible with SearXNG // num: options.num_results, //not possible with SearXNG engines: process.env.SEARXNG_ENGINES || "", - categories: process.env.SEARXNG_CATEGORIES || "general", + categories: process.env.SEARXNG_CATEGORIES || "", pageno: options.page ?? 1, format: "json" }; From f87e11712c5c5ad937c4ca1abd29a2e8594ff1c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 13 Mar 2025 17:30:37 +0100 Subject: [PATCH 02/28] fix: don't log bull secret --- apps/api/src/index.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index bc6925e5..50fce459 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -89,9 +89,6 @@ const HOST = process.env.HOST ?? "localhost"; function startServer(port = DEFAULT_PORT) { const server = app.listen(Number(port), HOST, () => { logger.info(`Worker ${process.pid} listening on port ${port}`); - logger.info( - `For the Queue UI, open: http://${HOST}:${port}/admin/${process.env.BULL_AUTH_KEY}/queues`, - ); }); const exitHandler = () => { From 134de67a3b7dec2942e4578f4dd0b8bc19f781b1 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 13 Mar 2025 12:48:56 -0400 Subject: [PATCH 03/28] (fix/map) Map failed to filter by path if indexed (#1333) * Nick: * Update map.ts * Update map.ts --- apps/api/src/controllers/v1/map.ts | 26 ++++++++++++++++++++++++++ apps/api/src/controllers/v1/types.ts | 1 + 2 files changed, 27 insertions(+) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 30245aa5..ebb0b324 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -56,6 +56,7 @@ export async function getMapResults({ allowExternalLinks, abort = new AbortController().signal, // noop mock, + filterByPath = true, }: { url: string; search?: string; @@ -70,6 +71,7 @@ export async function getMapResults({ allowExternalLinks?: boolean; abort?: AbortSignal; mock?: string; + filterByPath?: boolean; }): Promise { const id = uuidv4(); let links: string[] = [url]; @@ -247,6 +249,29 @@ export async function getMapResults({ links = links.filter((x) => isSameSubdomain(x, url)); } + // Filter by path if enabled + if (filterByPath && !allowExternalLinks) { + try { + const urlObj = new URL(url); + const urlPath = urlObj.pathname; + // Only apply path filtering if the URL has a significant path (not just '/' or empty) + // This means we only filter by path if the user has not selected a root domain + if (urlPath && urlPath !== '/' && urlPath.length > 1) { + links = links.filter(link => { + try { + const linkObj = new URL(link); + return linkObj.pathname.startsWith(urlPath); + } catch (e) { + return false; + } + }); + } + } catch (e) { + // If URL parsing fails, continue without path filtering + logger.warn(`Failed to parse URL for path filtering: ${url}`, { error: e }); + } + } + // remove duplicates that could be due to http/https or www links = 
removeDuplicateUrls(links); } @@ -300,6 +325,7 @@ export async function mapController( plan: req.auth.plan, abort: abort.signal, mock: req.body.useMock, + filterByPath: req.body.filterByPath !== false, }), ...(req.body.timeout !== undefined ? [ new Promise((resolve, reject) => setTimeout(() => { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 5a71da9c..1e462549 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -506,6 +506,7 @@ export const mapRequestSchema = crawlerOptions limit: z.number().min(1).max(30000).default(5000), timeout: z.number().positive().finite().optional(), useMock: z.string().optional(), + filterByPath: z.boolean().default(true), }) .strict(strictMessage); From 7ec278a908ba994f350c9aeecad4057eadfca7c5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 13 Mar 2025 13:05:12 -0400 Subject: [PATCH 04/28] Nick: fixes --- apps/api/src/lib/extract/team-id-sync.ts | 6 +++++- apps/api/src/services/queue-jobs.ts | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/apps/api/src/lib/extract/team-id-sync.ts b/apps/api/src/lib/extract/team-id-sync.ts index 90a32651..e477583a 100644 --- a/apps/api/src/lib/extract/team-id-sync.ts +++ b/apps/api/src/lib/extract/team-id-sync.ts @@ -1,7 +1,9 @@ import { supabase_rr_service, supabase_service } from "../../services/supabase"; import { logger } from "../logger"; -export async function getTeamIdSyncB(teamId: string) { +import { withAuth } from "../withAuth"; + +async function getTeamIdSyncBOriginal(teamId: string) { try { const { data, error } = await supabase_rr_service .from("eb-sync") @@ -17,3 +19,5 @@ export async function getTeamIdSyncB(teamId: string) { return null; } } + +export const getTeamIdSyncB = withAuth(getTeamIdSyncBOriginal, null); diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index 7e2a6f03..bbb17d6b 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -79,7 +79,7 @@ async function addScrapeJobRaw( // If above by 2x, send them an email // No need to 2x as if there are more than the max concurrency in the concurrency queue, it is already 2x if(concurrencyQueueJobs > maxConcurrency) { - logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency); + logger.info("Concurrency limited 2x (single) - ", "Concurrency queue jobs: ", concurrencyQueueJobs, "Max concurrency: ", maxConcurrency, "Team ID: ", webScraperOptions.team_id); // sendNotificationWithCustomDays(webScraperOptions.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 10, false).catch((error) => { // logger.error("Error sending notification (concurrency limit reached): ", error); // }); @@ -168,7 +168,7 @@ export async function addScrapeJobs( // equals 2x the max concurrency if(addToCQ.length > maxConcurrency) { - logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency); + logger.info("Concurrency limited 2x (multiple) - ", "Concurrency queue jobs: ", addToCQ.length, "Max concurrency: ", maxConcurrency, "Team ID: ", jobs[0].data.team_id); // sendNotificationWithCustomDays(jobs[0].data.team_id, NotificationType.CONCURRENCY_LIMIT_REACHED, 10, false).catch((error) => { // logger.error("Error sending notification (concurrency limit reached): ", error); // }); From c6cad942ab81f5bb95849fad541e00177d8e5a4a Mon Sep 17 00:00:00 2001 From: 
Nicolas Date: Thu, 13 Mar 2025 13:07:51 -0400 Subject: [PATCH 05/28] Nick: errors -> warn --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- apps/api/src/scraper/scrapeURL/lib/extractLinks.ts | 2 +- apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts | 2 +- apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 5f26f817..ba4793d8 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -456,7 +456,7 @@ export class WebCrawler { } }).filter(x => x !== null) as string[])]; } catch (error) { - this.logger.error("Failed to call html-transformer! Falling back to cheerio...", { + this.logger.warn("Failed to call html-transformer! Falling back to cheerio...", { error, module: "scrapeURL", method: "extractMetadata" }); diff --git a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts index ef784a71..48117ae7 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts @@ -41,7 +41,7 @@ export async function extractLinks(html: string, baseUrl: string): Promise Date: Thu, 13 Mar 2025 11:05:09 -0700 Subject: [PATCH 06/28] Meaningful log message for high resource usage errors --- apps/api/src/services/indexing/index-worker.ts | 3 ++- apps/api/src/services/queue-worker.ts | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/indexing/index-worker.ts b/apps/api/src/services/indexing/index-worker.ts index 0b38b90a..30285978 100644 --- a/apps/api/src/services/indexing/index-worker.ts +++ b/apps/api/src/services/indexing/index-worker.ts @@ -156,7 +156,8 @@ const workerFun = async (queue: Queue, jobProcessor: (token: string, job: Job) = const canAcceptConnection = await monitor.acceptConnection(); if (!canAcceptConnection) { - logger.info("Cant accept connection"); + console.log("Can't accept connection due to RAM/CPU load"); + logger.info("Can't accept connection due to RAM/CPU load"); cantAcceptConnectionCount++; if (cantAcceptConnectionCount >= 25) { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 5493da73..96de8d72 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -567,7 +567,8 @@ const workerFun = async ( const token = uuidv4(); const canAcceptConnection = await monitor.acceptConnection(); if (!canAcceptConnection) { - console.log("Cant accept connection"); + console.log("Can't accept connection due to RAM/CPU load"); + logger.info("Can't accept connection due to RAM/CPU load"); cantAcceptConnectionCount++; if (cantAcceptConnectionCount >= 25) { From c7ae50d2d0be9aef9478b74c766a32d3edfb568b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 13 Mar 2025 19:31:05 +0100 Subject: [PATCH 07/28] fix(crawler): sitemaps poisoning crawls with unrelated links (#1334) --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index ba4793d8..29a35e3b 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -271,7 +271,7 @@ export class WebCrawler { return urlsHandler(urls); } else { let filteredLinks = this.filterLinks( - [...new 
Set(urls)], + [...new Set(urls)].filter(x => this.filterURL(x, this.initialUrl) !== null), leftOfLimit, this.maxCrawledDepth, fromMap, From 387dd3aa380c711734e686b0d4fb2370a0ab5f3a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 13 Mar 2025 20:07:52 +0100 Subject: [PATCH 08/28] fix(tests/snips/billing): don't wait 40s for nothing when self hosted --- apps/api/src/__tests__/snips/billing.test.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/billing.test.ts b/apps/api/src/__tests__/snips/billing.test.ts index b3639fcb..a7d7bccf 100644 --- a/apps/api/src/__tests__/snips/billing.test.ts +++ b/apps/api/src/__tests__/snips/billing.test.ts @@ -5,7 +5,9 @@ const sleepForBatchBilling = () => sleep(20000); beforeAll(async () => { // Wait for previous test runs to stop billing processing - await sleep(40000); + if (!process.env.TEST_SUITE_SELF_HOSTED) { + await sleep(40000); + } }, 50000); describe("Billing tests", () => { From c3ebfafba7fa0c9e7a4649d06c01974012b6d401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 13 Mar 2025 20:10:39 +0100 Subject: [PATCH 09/28] fix(llmExtract): remove unsupported JSON schema properties (#1335) --- apps/api/src/__tests__/snips/extract.test.ts | 22 +++++++++++++ .../scrapeURL/transformers/llmExtract.ts | 32 ++++++++++++++++++- 2 files changed, 53 insertions(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/extract.test.ts b/apps/api/src/__tests__/snips/extract.test.ts index 50fecde3..acba310d 100644 --- a/apps/api/src/__tests__/snips/extract.test.ts +++ b/apps/api/src/__tests__/snips/extract.test.ts @@ -29,6 +29,28 @@ describe("Extract tests", () => { expect(typeof res.data.is_open_source).toBe("boolean"); expect(res.data.is_open_source).toBe(true); }, 60000); + + it.concurrent("works with unsupported JSON schema parameters", async () => { + const res = await extract({ + urls: ["https://firecrawl.dev"], + schema: { + "type": "object", + "properties": { + "company_name": { + "type": "string", + "pattern": "^[a-zA-Z0-9]+$" + }, + }, + "required": [ + "company_name" + ] + }, + origin: "api-sdk", + }); + + expect(res.data).toHaveProperty("company_name"); + expect(typeof res.data.company_name).toBe("string") + }, 60000); } else { it.concurrent("dummy test", () => { expect(true).toBe(true); diff --git a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts index 256b24d9..e3c76362 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/llmExtract.ts @@ -365,7 +365,37 @@ export async function performLLMExtract( export function removeDefaultProperty(schema: any): any { if (typeof schema !== "object" || schema === null) return schema; - const { default: _, ...rest } = schema; + const rest = { ...schema }; + + // unsupported global keys + delete rest.default; + + // unsupported object keys + delete rest.patternProperties; + delete rest.unevaluatedProperties; + delete rest.propertyNames; + delete rest.minProperties; + delete rest.maxProperties; + + // unsupported string keys + delete rest.minLength; + delete rest.maxLength; + delete rest.pattern; + delete rest.format; + + // unsupported number keys + delete rest.minimum; + delete rest.maximum; + delete rest.multipleOf; + + // unsupported array keys + delete rest.unevaluatedItems; + delete rest.contains; + delete rest.minContains; + delete rest.maxContains; + delete 
rest.minItems; + delete rest.maxItems; + delete rest.uniqueItems; for (const key in rest) { if (Array.isArray(rest[key])) { From db3faf85f49888f3c8197a975db9625ac7c32805 Mon Sep 17 00:00:00 2001 From: Aparup Ganguly Date: Fri, 14 Mar 2025 01:15:20 +0530 Subject: [PATCH 10/28] Claude 3.7 implementation --- .../claude-3.7-stock-analyzer.py | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 examples/claude-3.7-stock-analyzer/claude-3.7-stock-analyzer.py diff --git a/examples/claude-3.7-stock-analyzer/claude-3.7-stock-analyzer.py b/examples/claude-3.7-stock-analyzer/claude-3.7-stock-analyzer.py new file mode 100644 index 00000000..a37d9dc7 --- /dev/null +++ b/examples/claude-3.7-stock-analyzer/claude-3.7-stock-analyzer.py @@ -0,0 +1,180 @@ +import os +from firecrawl import FirecrawlApp +import json +from dotenv import load_dotenv +import anthropic +from e2b_code_interpreter import Sandbox +import base64 + +# ANSI color codes +class Colors: + CYAN = '\033[96m' + YELLOW = '\033[93m' + GREEN = '\033[92m' + RED = '\033[91m' + MAGENTA = '\033[95m' + BLUE = '\033[94m' + RESET = '\033[0m' + +# Load environment variables +load_dotenv() + +# Retrieve API keys from environment variables +firecrawl_api_key = os.getenv("FIRECRAWL_API_KEY") +anthropic_api_key = os.getenv("ANTHROPIC_API_KEY") +e2b_api_key = os.getenv("E2B_API_KEY") + +# Initialize the FirecrawlApp and Anthropic client +app = FirecrawlApp(api_key=firecrawl_api_key) +client = anthropic.Anthropic(api_key=anthropic_api_key) +sandbox = Sandbox(api_key=e2b_api_key) + +# Find the relevant stock pages via map +def find_relevant_page_via_map(stock_search_term, url, app): + try: + print(f"{Colors.CYAN}Searching for stock: {stock_search_term}{Colors.RESET}") + print(f"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}") + + map_search_parameter = stock_search_term + + print(f"{Colors.GREEN}Search parameter: {map_search_parameter}{Colors.RESET}") + + print(f"{Colors.YELLOW}Mapping website using the identified search parameter...{Colors.RESET}") + map_website = app.map_url(url, params={"search": map_search_parameter}) + print(f"{Colors.GREEN}Website mapping completed successfully.{Colors.RESET}") + print(f"{Colors.GREEN}Located {len(map_website['links'])} relevant links.{Colors.RESET}") + return map_website['links'] + except Exception as e: + print(f"{Colors.RED}Error encountered during relevant page identification: {str(e)}{Colors.RESET}") + return None + +# Function to plot the scores using e2b +def plot_scores(stock_names, stock_scores): + print(f"{Colors.YELLOW}Plotting scores...{Colors.RESET}") + code_to_run = f""" +import matplotlib.pyplot as plt + +stock_names = {stock_names} +stock_scores = {stock_scores} + +plt.figure(figsize=(10, 5)) +plt.bar(stock_names, stock_scores, color='blue') +plt.xlabel('Stock Names') +plt.ylabel('Scores') +plt.title('Stock Investment Scores') +plt.xticks(rotation=45) +plt.tight_layout() +plt.savefig('chart.png') +plt.show() +""" + # Run the code inside the sandbox + execution = sandbox.run_code(code_to_run) + + # Check if there are any results + if execution.results and execution.results[0].png: + first_result = execution.results[0] + + # Get the directory where the current python file is located + current_dir = os.path.dirname(os.path.abspath(__file__)) + # Save the png to a file in the examples directory. The png is in base64 format. 
+ with open(os.path.join(current_dir, 'chart.png'), 'wb') as f: + f.write(base64.b64decode(first_result.png)) + print('Chart saved as examples/chart.png') + else: + print(f"{Colors.RED}No results returned from the sandbox execution.{Colors.RESET}") + +# Analyze the top stocks and provide investment recommendation +def analyze_top_stocks(map_website, app, client): + try: + # Get top 5 links from the map result + top_links = map_website[:10] + print(f"{Colors.CYAN}Proceeding to analyze top {len(top_links)} links: {top_links}{Colors.RESET}") + + # Scrape the pages in batch + batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']}) + print(f"{Colors.GREEN}Batch page scraping completed successfully.{Colors.RESET}") + + # Prepare content for LLM + stock_contents = [] + for scrape_result in batch_scrape_result['data']: + stock_contents.append({ + 'content': scrape_result['markdown'] + }) + + # Pass all the content to the LLM to analyze and decide which stock to invest in + analyze_prompt = f""" +Based on the following information about different stocks from their Robinhood pages, analyze and determine which stock is the best investment opportunity. DO NOT include any other text, just the JSON. + +Return the result in the following JSON format. Only return the JSON, nothing else. Do not include backticks or any other formatting, just the JSON. +{{ + "scores": [ + {{ + "stock_name": "", + "score": + }}, + ... + ] +}} + +Stock Information: +""" + + for stock in stock_contents: + analyze_prompt += f"Content:\n{stock['content']}\n" + + print(f"{Colors.YELLOW}Analyzing stock information with LLM...{Colors.RESET}") + analyze_prompt += f"\n\nStart JSON:\n" + completion = client.messages.create( + model="claude-3-7-sonnet-20250219", + max_tokens=1000, + temperature=0, + system="You are a financial analyst. Only return the JSON, nothing else.", + messages=[ + { + "role": "user", + "content": analyze_prompt + } + ] + ) + + result = completion.content[0].text + print(f"{Colors.GREEN}Analysis completed. Here is the recommendation:{Colors.RESET}") + print(f"{Colors.MAGENTA}{result}{Colors.RESET}") + + # Plot the scores using e2b + try: + result_json = json.loads(result) + scores = result_json['scores'] + stock_names = [score['stock_name'] for score in scores] + stock_scores = [score['score'] for score in scores] + + plot_scores(stock_names, stock_scores) + except json.JSONDecodeError as json_err: + print(f"{Colors.RED}Error decoding JSON response: {str(json_err)}{Colors.RESET}") + + except Exception as e: + print(f"{Colors.RED}Error encountered during stock analysis: {str(e)}{Colors.RESET}") + +# Main function to execute the process +def main(): + # Get user input + stock_search_term = input(f"{Colors.BLUE}Enter the stock you're interested in: {Colors.RESET}") + if not stock_search_term.strip(): + print(f"{Colors.RED}No stock entered. Exiting.{Colors.RESET}") + return + + url = "https://robinhood.com/stocks" + + print(f"{Colors.YELLOW}Initiating stock analysis process...{Colors.RESET}") + # Find the relevant pages + map_website = find_relevant_page_via_map(stock_search_term, url, app) + + if map_website: + print(f"{Colors.GREEN}Relevant stock pages identified. Proceeding with detailed analysis...{Colors.RESET}") + # Analyze top stocks + analyze_top_stocks(map_website, app, client) + else: + print(f"{Colors.RED}No relevant stock pages identified. 
Consider refining the search term or trying a different stock.{Colors.RESET}") + +if __name__ == "__main__": + main() From ca93ba6c6d87f52837ddf500558d9ee50efd8207 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sat, 15 Mar 2025 12:42:53 +0100 Subject: [PATCH 11/28] fix(js-sdk/crawl,batch-scrape): retry status call if it returns an error up to 3 times (#1343) --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 32fa0b41..07fdf64c 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.19.1", + "version": "1.19.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index ab09432e..3175615d 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1332,12 +1332,14 @@ export default class FirecrawlApp { checkInterval: number ): Promise { try { + let failedTries = 0; while (true) { let statusResponse: AxiosResponse = await this.getRequest( `${this.apiUrl}/v1/crawl/${id}`, headers ); if (statusResponse.status === 200) { + failedTries = 0; let statusData = statusResponse.data; if (statusData.status === "completed") { if ("data" in statusData) { @@ -1369,7 +1371,10 @@ export default class FirecrawlApp { ); } } else { - this.handleError(statusResponse, "check crawl status"); + failedTries++; + if (failedTries >= 3) { + this.handleError(statusResponse, "check crawl status"); + } } } } catch (error: any) { From f1206e487003ca7fae93273720c803056df5b55d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 15 Mar 2025 22:50:19 -0400 Subject: [PATCH 12/28] Nick: urls optional on extract --- apps/api/src/controllers/v1/types.ts | 9 +++- apps/api/src/lib/extract/build-prompts.ts | 7 ++++ .../api/src/lib/extract/extraction-service.ts | 42 +++++++++++++++---- apps/js-sdk/firecrawl/src/index.ts | 6 +-- apps/python-sdk/firecrawl/firecrawl.py | 7 +++- 5 files changed, 58 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 1e462549..d52c55c5 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -314,7 +314,8 @@ export const extractV1Options = z .object({ urls: url .array() - .max(10, "Maximum of 10 URLs allowed per request while in beta."), + .max(10, "Maximum of 10 URLs allowed per request while in beta.") + .optional(), prompt: z.string().max(10000).optional(), systemPrompt: z.string().max(10000).optional(), schema: z @@ -354,6 +355,12 @@ export const extractV1Options = z .optional(), }) .strict(strictMessage) + .refine( + (obj) => obj.urls || obj.prompt, + { + message: "Either 'urls' or 'prompt' must be provided.", + }, + ) .transform((obj) => ({ ...obj, allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch, diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 24710660..f7dd4e32 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -105,3 +105,10 @@ export function buildBatchExtractSystemPrompt( export function buildBatchExtractPrompt(prompt: string): string { return `Today is: ${new 
Date().toISOString()}\n${prompt}`; } + + +export function buildRephraseToSerpPrompt(prompt: string): string { + return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. Make sure the rephrased prompt is concise and focused on retrieving relevant search results: + +Original Prompt: "${prompt}"`; +} diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 45d18fe6..acd9f8b0 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -6,7 +6,7 @@ import { } from "../../controllers/v1/types"; import { PlanType } from "../../types"; import { logger as _logger } from "../logger"; -import { processUrl } from "./url-processor"; +import { generateBasicCompletion, processUrl } from "./url-processor"; import { scrapeDocument } from "./document-scraper"; import { generateCompletions, @@ -38,6 +38,8 @@ import { singleAnswerCompletion } from "./completions/singleAnswer"; import { SourceTracker } from "./helpers/source-tracker"; import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs"; import { normalizeUrl } from "../canonical-url"; +import { search } from "../../search"; +import { buildRephraseToSerpPrompt } from "./build-prompts"; interface ExtractServiceOptions { request: ExtractRequest; @@ -84,16 +86,43 @@ export async function performExtraction( let totalUrlsScraped = 0; let sources: Record = {}; + const logger = _logger.child({ module: "extract", method: "performExtraction", extractId, }); - if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) { + // If no URLs are provided, generate URLs from the prompt + if ((!request.urls || request.urls.length === 0) && request.prompt) { + logger.debug("Generating URLs from prompt...", { + prompt: request.prompt, + }); + const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt)); + const searchResults = await search({ + query: rephrasedPrompt.replace('"', "").replace("'", ""), + num_results: 10, + }); + + request.urls = searchResults.map(result => result.url) as string[]; + } + if (request.urls && request.urls.length === 0) { + logger.error("No search results found", { + query: request.prompt, + }); + return { + success: false, + error: "No search results found", + extractId, + }; + } + + const urls = request.urls || ([] as string[]); + + if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) { logger.debug("Loading cached docs..."); try { - const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey); + const cache = await getCachedDocs(urls, request.__experimental_cacheKey); for (const doc of cache) { if (doc.metadata.url) { docsMap.set(normalizeUrl(doc.metadata.url), doc); @@ -122,11 +151,10 @@ export async function performExtraction( let startMap = Date.now(); let aggMapLinks: string[] = []; logger.debug("Processing URLs...", { - urlCount: request.urls.length, + urlCount: request.urls?.length || 0, }); - // Process URLs - const urlPromises = request.urls.map((url) => + const urlPromises = urls.map((url) => processUrl( { url, @@ -746,7 +774,7 @@ export async function performExtraction( time_taken: (new Date().getTime() - Date.now()) / 1000, team_id: teamId, mode: "extract", - url: request.urls.join(", "), + url: request.urls?.join(", ") || "", scrapeOptions: request, origin: request.origin ?? 
"api", num_tokens: totalTokensUsed, diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3175615d..98fdf696 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1119,14 +1119,14 @@ export default class FirecrawlApp { /** * Extracts information from URLs using the Firecrawl API. * Currently in Beta. Expect breaking changes on future minor versions. - * @param url - The URL to extract information from. + * @param urls - The URLs to extract information from. Optional if using other methods for data extraction. * @param params - Additional parameters for the extract request. * @returns The response from the extract operation. */ - async extract(urls: string[], params?: ExtractParams): Promise> | ErrorResponse> { + async extract(urls?: string[], params?: ExtractParams): Promise> | ErrorResponse> { const headers = this.prepareHeaders(); - let jsonData: { urls: string[] } & ExtractParams = { urls, ...params }; + let jsonData: { urls?: string[] } & ExtractParams = { urls: urls || [], ...params }; let jsonSchema: any; try { if (!params?.schema) { diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d79b174c..4108868e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -646,12 +646,12 @@ class FirecrawlApp: else: self._handle_error(response, "check batch scrape errors") - def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any: + def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any: """ Extracts information from a URL using the Firecrawl API. Args: - urls (List[str]): The URLs to extract information from. + urls (Optional[List[str]]): The URLs to extract information from. params (Optional[ExtractParams]): Additional parameters for the extract request. 
Returns: @@ -662,6 +662,9 @@ class FirecrawlApp: if not params or (not params.get('prompt') and not params.get('schema')): raise ValueError("Either prompt or schema is required") + if not urls and not params.get('prompt'): + raise ValueError("Either urls or prompt is required") + schema = params.get('schema') if schema: if hasattr(schema, 'model_json_schema'): From 611c2d9c83046f700d451419fcc61c1523856a28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 16 Mar 2025 16:02:53 +0100 Subject: [PATCH 13/28] feat(v1/scrape): add further logging to document scrape bugs better --- apps/api/src/controllers/v1/scrape.ts | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index bd01a31e..2647038c 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -19,12 +19,21 @@ export async function scrapeController( req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, res: Response, ) { + const jobId = uuidv4(); + const preNormalizedBody = { ...req.body }; + + logger.debug("Scrape " + jobId + " starting", { + request: req.body, + originalRequest: preNormalizedBody, + teamId: req.auth.team_id, + account: req.account, + }); + req.body = scrapeRequestSchema.parse(req.body); let earlyReturn = false; const origin = req.body.origin; const timeout = req.body.timeout; - const jobId = uuidv4(); const startTime = new Date().getTime(); const jobPriority = await getJobPriority({ From bad822421f063fee741e4ac2438fbee6a07800e2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 16 Mar 2025 16:04:25 +0100 Subject: [PATCH 14/28] fix(v1/scrape): make log show up in queries --- apps/api/src/controllers/v1/scrape.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/controllers/v1/scrape.ts b/apps/api/src/controllers/v1/scrape.ts index 2647038c..ec11e2cb 100644 --- a/apps/api/src/controllers/v1/scrape.ts +++ b/apps/api/src/controllers/v1/scrape.ts @@ -23,6 +23,7 @@ export async function scrapeController( const preNormalizedBody = { ...req.body }; logger.debug("Scrape " + jobId + " starting", { + scrapeId: jobId, request: req.body, originalRequest: preNormalizedBody, teamId: req.auth.team_id, From 180770f1a3802fe35343e63aaa3826830b673a49 Mon Sep 17 00:00:00 2001 From: Eric Ciarla <43451761+ericciarla@users.noreply.github.com> Date: Sun, 16 Mar 2025 08:16:35 -0700 Subject: [PATCH 15/28] init and final (#1349) --- apps/api/src/controllers/v1/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 1e462549..c21c02e8 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -1002,7 +1002,7 @@ export const generateLLMsTextRequestSchema = z.object({ maxUrls: z .number() .min(1) - .max(100) + .max(5000) .default(10) .describe("Maximum number of URLs to process"), showFullText: z From 87ad53e727a7877405a63d452fdaa37f9c21e176 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 16 Mar 2025 16:37:08 +0100 Subject: [PATCH 16/28] fix(api/tests/snips/billing): bump timeout --- apps/api/src/__tests__/snips/billing.test.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/billing.test.ts b/apps/api/src/__tests__/snips/billing.test.ts index a7d7bccf..3a4898ef 100644 --- a/apps/api/src/__tests__/snips/billing.test.ts +++ 
b/apps/api/src/__tests__/snips/billing.test.ts @@ -139,7 +139,7 @@ describe("Billing tests", () => { if (crawl1.success && crawl2.success) { expect(rc1 - rc2).toBe(crawl1.completed + crawl2.completed * 5); } - }, 300000); + }, 600000); it("bills map correctly", async () => { const rc1 = (await creditUsage()).remaining_credits; From 670ca84ae922bfd7478d1ef71f85216550784e94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 16 Mar 2025 19:57:08 +0100 Subject: [PATCH 17/28] fix(v1/checkCredits): snap crawl limit to remaining credits if over without erroring out (#1350) --- apps/api/src/routes/v1.ts | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 4fee4a1e..f6a46381 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -52,7 +52,18 @@ function checkCreditsMiddleware( if (chunk) { req.acuc = chunk; } + req.account = { remainingCredits }; if (!success) { + if (!minimum && req.body && (req.body as any).limit !== undefined && remainingCredits > 0) { + logger.warn("Adjusting limit to remaining credits", { + teamId: req.auth.team_id, + remainingCredits, + request: req.body, + }); + (req.body as any).limit = remainingCredits; + return next(); + } + const currencyName = req.acuc.is_extract ? "tokens" : "credits" logger.error( `Insufficient ${currencyName}: ${JSON.stringify({ team_id: req.auth.team_id, minimum, remainingCredits })}`, @@ -72,7 +83,6 @@ function checkCreditsMiddleware( }); } } - req.account = { remainingCredits }; next(); })().catch((err) => next(err)); }; From 200de9e7e703227969db184f724e893b34730f42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Sun, 16 Mar 2025 19:57:27 +0100 Subject: [PATCH 18/28] feat(scrape): add warning to document if it was concurrency limited (#1348) * feat(scrape): add warning to document if it was concurrency limited * phrasing + test fix --- apps/api/src/__tests__/snips/billing.test.ts | 2 +- apps/api/src/services/queue-jobs.ts | 2 ++ apps/api/src/services/queue-worker.ts | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/apps/api/src/__tests__/snips/billing.test.ts b/apps/api/src/__tests__/snips/billing.test.ts index 3a4898ef..8afa5a44 100644 --- a/apps/api/src/__tests__/snips/billing.test.ts +++ b/apps/api/src/__tests__/snips/billing.test.ts @@ -97,7 +97,7 @@ describe("Billing tests", () => { const rc2 = (await creditUsage()).remaining_credits; expect(rc1 - rc2).toBe(12); - }, 300000); + }, 600000); it("bills crawl correctly", async () => { const rc1 = (await creditUsage()).remaining_credits; diff --git a/apps/api/src/services/queue-jobs.ts b/apps/api/src/services/queue-jobs.ts index bbb17d6b..1ce2211c 100644 --- a/apps/api/src/services/queue-jobs.ts +++ b/apps/api/src/services/queue-jobs.ts @@ -84,6 +84,8 @@ async function addScrapeJobRaw( // logger.error("Error sending notification (concurrency limit reached): ", error); // }); } + + webScraperOptions.concurrencyLimited = true; await _addScrapeJobToConcurrencyQueue( webScraperOptions, diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 96de8d72..52f1f55a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -923,6 +923,10 @@ async function processJob(job: Job & { id: string }, token: string) { delete doc.rawHtml; } + if (job.data.concurrencyLimited) { + doc.warning = "This scrape job was throttled at your current concurrency limit. 
If you'd like to scrape faster, you can upgrade your plan." + (doc.warning ? " " + doc.warning : ""); + } + const data = { success: true, result: { From 0fb9c1f32e9bb5f9cfc43190def468ebfe9f47d2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 16 Mar 2025 22:28:47 -0400 Subject: [PATCH 19/28] Update index.ts --- apps/js-sdk/firecrawl/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 98fdf696..54bb6f4f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1126,7 +1126,7 @@ export default class FirecrawlApp { async extract(urls?: string[], params?: ExtractParams): Promise> | ErrorResponse> { const headers = this.prepareHeaders(); - let jsonData: { urls?: string[] } & ExtractParams = { urls: urls || [], ...params }; + let jsonData: { urls?: string[] } & ExtractParams = { urls: urls, ...params }; let jsonSchema: any; try { if (!params?.schema) { From 20c93db43f4a3332d0345a20c77884cdfadfddb4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 16 Mar 2025 22:29:25 -0400 Subject: [PATCH 20/28] (feat/extract) URLs can now be optional in /extract (#1346) * Nick: urls optional on extract * Update index.ts --- apps/api/src/controllers/v1/types.ts | 9 +++- apps/api/src/lib/extract/build-prompts.ts | 7 ++++ .../api/src/lib/extract/extraction-service.ts | 42 +++++++++++++++---- apps/js-sdk/firecrawl/src/index.ts | 6 +-- apps/python-sdk/firecrawl/firecrawl.py | 7 +++- 5 files changed, 58 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index c21c02e8..08988f08 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -314,7 +314,8 @@ export const extractV1Options = z .object({ urls: url .array() - .max(10, "Maximum of 10 URLs allowed per request while in beta."), + .max(10, "Maximum of 10 URLs allowed per request while in beta.") + .optional(), prompt: z.string().max(10000).optional(), systemPrompt: z.string().max(10000).optional(), schema: z @@ -354,6 +355,12 @@ export const extractV1Options = z .optional(), }) .strict(strictMessage) + .refine( + (obj) => obj.urls || obj.prompt, + { + message: "Either 'urls' or 'prompt' must be provided.", + }, + ) .transform((obj) => ({ ...obj, allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch, diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 24710660..f7dd4e32 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -105,3 +105,10 @@ export function buildBatchExtractSystemPrompt( export function buildBatchExtractPrompt(prompt: string): string { return `Today is: ${new Date().toISOString()}\n${prompt}`; } + + +export function buildRephraseToSerpPrompt(prompt: string): string { + return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. 
Make sure the rephrased prompt is concise and focused on retrieving relevant search results: + +Original Prompt: "${prompt}"`; +} diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 45d18fe6..acd9f8b0 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -6,7 +6,7 @@ import { } from "../../controllers/v1/types"; import { PlanType } from "../../types"; import { logger as _logger } from "../logger"; -import { processUrl } from "./url-processor"; +import { generateBasicCompletion, processUrl } from "./url-processor"; import { scrapeDocument } from "./document-scraper"; import { generateCompletions, @@ -38,6 +38,8 @@ import { singleAnswerCompletion } from "./completions/singleAnswer"; import { SourceTracker } from "./helpers/source-tracker"; import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs"; import { normalizeUrl } from "../canonical-url"; +import { search } from "../../search"; +import { buildRephraseToSerpPrompt } from "./build-prompts"; interface ExtractServiceOptions { request: ExtractRequest; @@ -84,16 +86,43 @@ export async function performExtraction( let totalUrlsScraped = 0; let sources: Record = {}; + const logger = _logger.child({ module: "extract", method: "performExtraction", extractId, }); - if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) { + // If no URLs are provided, generate URLs from the prompt + if ((!request.urls || request.urls.length === 0) && request.prompt) { + logger.debug("Generating URLs from prompt...", { + prompt: request.prompt, + }); + const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt)); + const searchResults = await search({ + query: rephrasedPrompt.replace('"', "").replace("'", ""), + num_results: 10, + }); + + request.urls = searchResults.map(result => result.url) as string[]; + } + if (request.urls && request.urls.length === 0) { + logger.error("No search results found", { + query: request.prompt, + }); + return { + success: false, + error: "No search results found", + extractId, + }; + } + + const urls = request.urls || ([] as string[]); + + if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) { logger.debug("Loading cached docs..."); try { - const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey); + const cache = await getCachedDocs(urls, request.__experimental_cacheKey); for (const doc of cache) { if (doc.metadata.url) { docsMap.set(normalizeUrl(doc.metadata.url), doc); @@ -122,11 +151,10 @@ export async function performExtraction( let startMap = Date.now(); let aggMapLinks: string[] = []; logger.debug("Processing URLs...", { - urlCount: request.urls.length, + urlCount: request.urls?.length || 0, }); - // Process URLs - const urlPromises = request.urls.map((url) => + const urlPromises = urls.map((url) => processUrl( { url, @@ -746,7 +774,7 @@ export async function performExtraction( time_taken: (new Date().getTime() - Date.now()) / 1000, team_id: teamId, mode: "extract", - url: request.urls.join(", "), + url: request.urls?.join(", ") || "", scrapeOptions: request, origin: request.origin ?? 
"api", num_tokens: totalTokensUsed, diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3175615d..54bb6f4f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1119,14 +1119,14 @@ export default class FirecrawlApp { /** * Extracts information from URLs using the Firecrawl API. * Currently in Beta. Expect breaking changes on future minor versions. - * @param url - The URL to extract information from. + * @param urls - The URLs to extract information from. Optional if using other methods for data extraction. * @param params - Additional parameters for the extract request. * @returns The response from the extract operation. */ - async extract(urls: string[], params?: ExtractParams): Promise> | ErrorResponse> { + async extract(urls?: string[], params?: ExtractParams): Promise> | ErrorResponse> { const headers = this.prepareHeaders(); - let jsonData: { urls: string[] } & ExtractParams = { urls, ...params }; + let jsonData: { urls?: string[] } & ExtractParams = { urls: urls, ...params }; let jsonSchema: any; try { if (!params?.schema) { diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d79b174c..4108868e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -646,12 +646,12 @@ class FirecrawlApp: else: self._handle_error(response, "check batch scrape errors") - def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any: + def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any: """ Extracts information from a URL using the Firecrawl API. Args: - urls (List[str]): The URLs to extract information from. + urls (Optional[List[str]]): The URLs to extract information from. params (Optional[ExtractParams]): Additional parameters for the extract request. 
Returns: @@ -662,6 +662,9 @@ class FirecrawlApp: if not params or (not params.get('prompt') and not params.get('schema')): raise ValueError("Either prompt or schema is required") + if not urls and not params.get('prompt'): + raise ValueError("Either urls or prompt is required") + schema = params.get('schema') if schema: if hasattr(schema, 'model_json_schema'): From 6d250360c29d368d5f2610c5976a92c68faf02a0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 16 Mar 2025 22:45:16 -0400 Subject: [PATCH 21/28] Nick: bump --- apps/js-sdk/firecrawl/package.json | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 07fdf64c..3ff8c921 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.19.2", + "version": "1.20.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 273fc1a6..5dd9b2c7 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.13.5" +__version__ = "1.14.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From 7e7b7e10fe9e20a9889b61954a7a8b0af153ec74 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 16 Mar 2025 23:02:16 -0400 Subject: [PATCH 22/28] Nick: fixes py sdk --- apps/python-sdk/firecrawl/__init__.py | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 5dd9b2c7..726a34d0 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.14.0" +__version__ = "1.14.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 4108868e..f4ddb91e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -681,6 +681,8 @@ class FirecrawlApp: 'origin': 'api-sdk' } + if not request_data['urls']: + request_data['urls'] = [] # Only add prompt and systemPrompt if they exist if params.get('prompt'): request_data['prompt'] = params['prompt'] From 010c8750d424f8171975f8b1a3d785b680aff22d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 16 Mar 2025 23:14:44 -0400 Subject: [PATCH 23/28] Nick: let user format the analysis --- apps/api/src/controllers/v1/deep-research.ts | 1 + apps/api/src/lib/deep-research/deep-research-service.ts | 2 ++ apps/api/src/lib/deep-research/research-manager.ts | 5 ++++- apps/api/src/services/queue-worker.ts | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/deep-research.ts b/apps/api/src/controllers/v1/deep-research.ts index df3f49c8..7e454c3d 100644 --- a/apps/api/src/controllers/v1/deep-research.ts +++ b/apps/api/src/controllers/v1/deep-research.ts @@ -10,6 +10,7 @@ export const deepResearchRequestSchema = z.object({ maxDepth: z.number().min(1).max(12).default(7).describe('Maximum depth of research iterations'), maxUrls: 
z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'), timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'), + analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(), // @deprecated Use query instead topic: z.string().describe('The topic or question to research').optional(), }).refine(data => data.query || data.topic, { diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts index d801ab3c..8a404d10 100644 --- a/apps/api/src/lib/deep-research/deep-research-service.ts +++ b/apps/api/src/lib/deep-research/deep-research-service.ts @@ -14,6 +14,7 @@ interface DeepResearchServiceOptions { maxDepth: number; maxUrls: number; timeLimit: number; + analysisPrompt: string; subId?: string; } @@ -262,6 +263,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) { options.query, state.getFindings(), state.getSummaries(), + options.analysisPrompt, ); await state.addActivity({ diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts index 8a5bf839..d5f4fdd9 100644 --- a/apps/api/src/lib/deep-research/research-manager.ts +++ b/apps/api/src/lib/deep-research/research-manager.ts @@ -253,6 +253,7 @@ export class ResearchLLMService { topic: string, findings: DeepResearchFinding[], summaries: string[], + analysisPrompt: string, ): Promise { const { extract } = await generateCompletions({ logger: this.logger.child({ @@ -265,7 +266,9 @@ export class ResearchLLMService { "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " + new Date().toISOString().split("T")[0], prompt: trimToTokenLimit( - `Create a comprehensive research report on "${topic}" based on the collected findings and analysis. + analysisPrompt + ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}` + : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis. 
Research data: ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")} diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 52f1f55a..66931fce 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -412,6 +412,7 @@ const processDeepResearchJobInternal = async ( timeLimit: job.data.request.timeLimit, subId: job.data.subId, maxUrls: job.data.request.maxUrls, + analysisPrompt: job.data.request.analysisPrompt, }); if(result.success) { From d12feaea52d804b6a53f26ee6fdc5721dd1cebe8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Mon, 17 Mar 2025 18:04:05 +0100 Subject: [PATCH 24/28] fix(crawl): allow execution time longer than 24h --- apps/api/src/lib/crawl-redis.ts | 23 +++++++++++----------- apps/api/src/scraper/WebScraper/crawler.ts | 1 - 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index b31605c7..b741e615 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -27,7 +27,7 @@ export async function saveCrawl(id: string, crawl: StoredCrawl) { plan: crawl.plan, }); await redisConnection.set("crawl:" + id, JSON.stringify(crawl)); - await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id, 24 * 60 * 60); } export async function getCrawl(id: string): Promise { @@ -37,6 +37,7 @@ export async function getCrawl(id: string): Promise { return null; } + await redisConnection.expire("crawl:" + id, 24 * 60 * 60); return JSON.parse(x); } @@ -56,7 +57,7 @@ export async function addCrawlJob(id: string, job_id: string) { crawlId: id, }); await redisConnection.sadd("crawl:" + id + ":jobs", job_id); - await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60); } export async function addCrawlJobs(id: string, job_ids: string[]) { @@ -69,7 +70,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) { crawlId: id, }); await redisConnection.sadd("crawl:" + id + ":jobs", ...job_ids); - await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60); } export async function addCrawlJobDone( @@ -87,7 +88,6 @@ export async function addCrawlJobDone( await redisConnection.expire( "crawl:" + id + ":jobs_done", 24 * 60 * 60, - "NX", ); if (success) { @@ -104,11 +104,11 @@ export async function addCrawlJobDone( await redisConnection.expire( "crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, - "NX", ); } export async function getDoneJobsOrderedLength(id: string): Promise { + await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60); return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); } @@ -117,6 +117,7 @@ export async function getDoneJobsOrdered( start = 0, end = -1, ): Promise { + await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60); return await redisConnection.lrange( "crawl:" + id + ":jobs_done_ordered", start, @@ -125,6 +126,7 @@ export async function getDoneJobsOrdered( } export async function isCrawlFinished(id: string) { + await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60); return ( (await redisConnection.scard("crawl:" + id + ":jobs_done")) === (await redisConnection.scard("crawl:" + id + ":jobs")) && @@ -133,6 +135,7 @@ export async function 
isCrawlFinished(id: string) { } export async function isCrawlKickoffFinished(id: string) { + await redisConnection.expire("crawl:" + id + ":kickoff:finish", 24 * 60 * 60); return ( (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null ); @@ -159,9 +162,7 @@ export async function finishCrawl(id: string) { crawlId: id, }); const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); - if (set === 1) { - await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); - } + await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); return set === 1; } else { _logger.debug("Crawl can not be finished yet, not marking as finished.", { @@ -294,14 +295,13 @@ export async function lockURL( res = x === permutations.length; } - await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60); if (res) { await redisConnection.sadd("crawl:" + id + ":visited_unique", url); await redisConnection.expire( "crawl:" + id + ":visited_unique", 24 * 60 * 60, - "NX", ); } @@ -334,7 +334,6 @@ export async function lockURLs( await redisConnection.expire( "crawl:" + id + ":visited_unique", 24 * 60 * 60, - "NX", ); let res: boolean; @@ -353,7 +352,7 @@ export async function lockURLs( res = x === allPermutations.length; } - await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); + await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60); logger.debug("lockURLs final result: " + res, { res }); return res; diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 29a35e3b..2962af3e 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -384,7 +384,6 @@ export class WebCrawler { await redisConnection.expire( "crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, - "NX", ); })(); } From 6d3c639f585a1d80b1374d459b629bb53424df38 Mon Sep 17 00:00:00 2001 From: Rafael Miller <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 17 Mar 2025 14:06:29 -0300 Subject: [PATCH 25/28] added 403s to sdk error handlers (#1357) --- apps/js-sdk/firecrawl/src/index.ts | 2 +- apps/python-sdk/firecrawl/firecrawl.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 54bb6f4f..bd63d494 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1388,7 +1388,7 @@ export default class FirecrawlApp { * @param {string} action - The action being performed when the error occurred. */ handleError(response: AxiosResponse, action: string): void { - if ([400, 402, 408, 409, 500].includes(response.status)) { + if ([400, 402, 403, 408, 409, 500].includes(response.status)) { const errorMessage: string = response.data.error || "Unknown error occurred"; const details = response.data.details ? ` - ${JSON.stringify(response.data.details)}` : ''; diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index f4ddb91e..a221e74c 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -1098,6 +1098,8 @@ class FirecrawlApp: if response.status_code == 402: message = f"Payment Required: Failed to {action}. {error_message} - {error_details}" + elif response.status_code == 403: + message = f"Website Not Supported: Failed to {action}. 
From d0b468ee7b7d8687cc7869e84875f14f86e0268f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Mon, 17 Mar 2025 20:47:17 +0100
Subject: [PATCH 26/28] feat(scrape/actions/click): add all parameter (FIR-1443) (#1342)

* feat(scrape/actions/click): add all parameter
* bump sdk
---
 apps/api/src/controllers/v1/types.ts | 1 +
 apps/api/src/lib/entities.ts | 1 +
 apps/js-sdk/firecrawl/package.json | 2 +-
 apps/js-sdk/firecrawl/src/index.ts | 1 +
 4 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 08988f08..9d9109fb 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -114,6 +114,7 @@ export const actionsSchema = z
 z.object({
 type: z.literal("click"),
 selector: z.string(),
+ all: z.boolean().default(false),
 }),
 z.object({
 type: z.literal("screenshot"),
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 93911485..42df545d 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -21,6 +21,7 @@ export type Action =
 | {
 type: "click";
 selector: string;
+ all?: boolean;
 }
 | {
 type: "screenshot";
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 3ff8c921..0aca8907 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
 "name": "@mendable/firecrawl-js",
- "version": "1.20.0",
+ "version": "1.20.1",
 "description": "JavaScript SDK for Firecrawl API",
 "main": "dist/index.js",
 "types": "dist/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index bd63d494..8114e4c3 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -104,6 +104,7 @@ export type Action = {
 } | {
 type: "click",
 selector: string,
+ all?: boolean,
 } | {
 type: "screenshot",
 fullPage?: boolean,
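The new "all" flag on click actions is presumably meant to apply the click to every element matching the selector rather than only the first match, which is useful for expanding repeated "show more" toggles before a page is captured. A sketch of how a caller might pass it through the JS SDK; the URL and selector are placeholders:

    import FirecrawlApp from "@mendable/firecrawl-js";

    const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

    const result = await app.scrapeUrl("https://example.com/faq", {
      formats: ["markdown"],
      actions: [
        // all: true asks the browser to click every matching element; the
        // default (false) keeps the previous single-click behaviour.
        { type: "click", selector: "button.show-more", all: true },
        { type: "wait", milliseconds: 500 },
      ],
    });

Because "all" defaults to false in the API schema (z.boolean().default(false)) and is optional in the SDK types, existing click actions keep their current behaviour.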
From e97a279efef506cdc3f371f9aa946cb501ded555 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 18 Mar 2025 15:51:40 -0400
Subject: [PATCH 27/28] Nick: let user format the analysis (#1351)

---
 apps/api/src/controllers/v1/deep-research.ts | 1 +
 apps/api/src/lib/deep-research/deep-research-service.ts | 2 ++
 apps/api/src/lib/deep-research/research-manager.ts | 5 ++++-
 apps/api/src/services/queue-worker.ts | 1 +
 4 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/apps/api/src/controllers/v1/deep-research.ts b/apps/api/src/controllers/v1/deep-research.ts
index df3f49c8..7e454c3d 100644
--- a/apps/api/src/controllers/v1/deep-research.ts
+++ b/apps/api/src/controllers/v1/deep-research.ts
@@ -10,6 +10,7 @@ export const deepResearchRequestSchema = z.object({
 maxDepth: z.number().min(1).max(12).default(7).describe('Maximum depth of research iterations'),
 maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
 timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
+ analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
 // @deprecated Use query instead
 topic: z.string().describe('The topic or question to research').optional(),
 }).refine(data => data.query || data.topic, {
diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts
index d801ab3c..8a404d10 100644
--- a/apps/api/src/lib/deep-research/deep-research-service.ts
+++ b/apps/api/src/lib/deep-research/deep-research-service.ts
@@ -14,6 +14,7 @@ interface DeepResearchServiceOptions {
 maxDepth: number;
 maxUrls: number;
 timeLimit: number;
+ analysisPrompt: string;
 subId?: string;
 }
@@ -262,6 +263,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
 options.query,
 state.getFindings(),
 state.getSummaries(),
+ options.analysisPrompt,
 );
 await state.addActivity({
diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts
index 8a5bf839..d5f4fdd9 100644
--- a/apps/api/src/lib/deep-research/research-manager.ts
+++ b/apps/api/src/lib/deep-research/research-manager.ts
@@ -253,6 +253,7 @@ export class ResearchLLMService {
 topic: string,
 findings: DeepResearchFinding[],
 summaries: string[],
+ analysisPrompt: string,
 ): Promise {
 const { extract } = await generateCompletions({
 logger: this.logger.child({
@@ -265,7 +266,9 @@ export class ResearchLLMService {
 "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
 new Date().toISOString().split("T")[0],
 prompt: trimToTokenLimit(
- `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
+ analysisPrompt
+ ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
+ : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
 Research data:
 ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 52f1f55a..66931fce 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -412,6 +412,7 @@ const processDeepResearchJobInternal = async (
 timeLimit: job.data.request.timeLimit,
 subId: job.data.subId,
 maxUrls: job.data.request.maxUrls,
+ analysisPrompt: job.data.request.analysisPrompt,
 });
 if(result.success) {

From 4fc5e6f6ca0c1a0fc4e0f887360eeb2cfddf3d44 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Tue, 18 Mar 2025 15:52:55 -0400
Subject: [PATCH 28/28] Nick: added analysis prompt to the sdks

---
 apps/js-sdk/firecrawl/src/index.ts | 4 ++++
 apps/python-sdk/firecrawl/firecrawl.py | 1 +
 2 files changed, 5 insertions(+)

diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 54bb6f4f..9f8d00d3 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -371,6 +371,10 @@ export interface DeepResearchParams {
 * @default 20
 */
 maxUrls?: number;
+ /**
+ * The prompt to use for the final analysis
+ */
+ analysisPrompt?: string;
 /**
 * Experimental flag for streaming steps
 */
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index f4ddb91e..c315eb20 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -48,6 +48,7 @@ class DeepResearchParams(pydantic.BaseModel):
 maxDepth: Optional[int] = 7
 timeLimit: Optional[int] = 270
 maxUrls: Optional[int] = 20
+ analysisPrompt: Optional[str] = None
 __experimental_streamSteps: Optional[bool] = None

 class DeepResearchResponse(pydantic.BaseModel):
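Taken together, patches 27 and 28 let a caller steer the wording of the final deep-research report: when analysisPrompt is provided it replaces the default "comprehensive research report" instruction, and the collected findings are still appended as research data. A sketch of passing it straight to the API; the endpoint path and auth header are assumptions for illustration, while the body fields mirror deepResearchRequestSchema above:

    // Body fields follow deepResearchRequestSchema; only analysisPrompt is new.
    const response = await fetch("https://api.firecrawl.dev/v1/deep-research", {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      },
      body: JSON.stringify({
        query: "How do sliding-window rate limiters behave under bursty traffic?",
        maxDepth: 5,
        maxUrls: 15,
        timeLimit: 180,
        analysisPrompt:
          "Write the final analysis as a short engineering memo with a TL;DR, trade-offs, and cited sources.",
      }),
    });
    const job = await response.json();

The same field is exposed as analysisPrompt in DeepResearchParams for both the JS and Python SDKs, so SDK callers can pass it without touching the HTTP layer.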