From f1206e487003ca7fae93273720c803056df5b55d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sat, 15 Mar 2025 22:50:19 -0400 Subject: [PATCH 1/3] Nick: urls optional on extract --- apps/api/src/controllers/v1/types.ts | 9 +++- apps/api/src/lib/extract/build-prompts.ts | 7 ++++ .../api/src/lib/extract/extraction-service.ts | 42 +++++++++++++++---- apps/js-sdk/firecrawl/src/index.ts | 6 +-- apps/python-sdk/firecrawl/firecrawl.py | 7 +++- 5 files changed, 58 insertions(+), 13 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 1e462549..d52c55c5 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -314,7 +314,8 @@ export const extractV1Options = z .object({ urls: url .array() - .max(10, "Maximum of 10 URLs allowed per request while in beta."), + .max(10, "Maximum of 10 URLs allowed per request while in beta.") + .optional(), prompt: z.string().max(10000).optional(), systemPrompt: z.string().max(10000).optional(), schema: z @@ -354,6 +355,12 @@ export const extractV1Options = z .optional(), }) .strict(strictMessage) + .refine( + (obj) => obj.urls || obj.prompt, + { + message: "Either 'urls' or 'prompt' must be provided.", + }, + ) .transform((obj) => ({ ...obj, allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch, diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 24710660..f7dd4e32 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -105,3 +105,10 @@ export function buildBatchExtractSystemPrompt( export function buildBatchExtractPrompt(prompt: string): string { return `Today is: ${new Date().toISOString()}\n${prompt}`; } + + +export function buildRephraseToSerpPrompt(prompt: string): string { + return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. Make sure the rephrased prompt is concise and focused on retrieving relevant search results: + +Original Prompt: "${prompt}"`; +} diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index 45d18fe6..acd9f8b0 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -6,7 +6,7 @@ import { } from "../../controllers/v1/types"; import { PlanType } from "../../types"; import { logger as _logger } from "../logger"; -import { processUrl } from "./url-processor"; +import { generateBasicCompletion, processUrl } from "./url-processor"; import { scrapeDocument } from "./document-scraper"; import { generateCompletions, @@ -38,6 +38,8 @@ import { singleAnswerCompletion } from "./completions/singleAnswer"; import { SourceTracker } from "./helpers/source-tracker"; import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs"; import { normalizeUrl } from "../canonical-url"; +import { search } from "../../search"; +import { buildRephraseToSerpPrompt } from "./build-prompts"; interface ExtractServiceOptions { request: ExtractRequest; @@ -84,16 +86,43 @@ export async function performExtraction( let totalUrlsScraped = 0; let sources: Record = {}; + const logger = _logger.child({ module: "extract", method: "performExtraction", extractId, }); - if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) { + // If no URLs are provided, generate URLs from the prompt + if ((!request.urls || request.urls.length === 0) && request.prompt) { + logger.debug("Generating URLs from prompt...", { + prompt: request.prompt, + }); + const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt)); + const searchResults = await search({ + query: rephrasedPrompt.replace('"', "").replace("'", ""), + num_results: 10, + }); + + request.urls = searchResults.map(result => result.url) as string[]; + } + if (request.urls && request.urls.length === 0) { + logger.error("No search results found", { + query: request.prompt, + }); + return { + success: false, + error: "No search results found", + extractId, + }; + } + + const urls = request.urls || ([] as string[]); + + if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) { logger.debug("Loading cached docs..."); try { - const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey); + const cache = await getCachedDocs(urls, request.__experimental_cacheKey); for (const doc of cache) { if (doc.metadata.url) { docsMap.set(normalizeUrl(doc.metadata.url), doc); @@ -122,11 +151,10 @@ export async function performExtraction( let startMap = Date.now(); let aggMapLinks: string[] = []; logger.debug("Processing URLs...", { - urlCount: request.urls.length, + urlCount: request.urls?.length || 0, }); - // Process URLs - const urlPromises = request.urls.map((url) => + const urlPromises = urls.map((url) => processUrl( { url, @@ -746,7 +774,7 @@ export async function performExtraction( time_taken: (new Date().getTime() - Date.now()) / 1000, team_id: teamId, mode: "extract", - url: request.urls.join(", "), + url: request.urls?.join(", ") || "", scrapeOptions: request, origin: request.origin ?? "api", num_tokens: totalTokensUsed, diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 3175615d..98fdf696 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1119,14 +1119,14 @@ export default class FirecrawlApp { /** * Extracts information from URLs using the Firecrawl API. * Currently in Beta. Expect breaking changes on future minor versions. - * @param url - The URL to extract information from. + * @param urls - The URLs to extract information from. Optional if using other methods for data extraction. * @param params - Additional parameters for the extract request. * @returns The response from the extract operation. */ - async extract(urls: string[], params?: ExtractParams): Promise> | ErrorResponse> { + async extract(urls?: string[], params?: ExtractParams): Promise> | ErrorResponse> { const headers = this.prepareHeaders(); - let jsonData: { urls: string[] } & ExtractParams = { urls, ...params }; + let jsonData: { urls?: string[] } & ExtractParams = { urls: urls || [], ...params }; let jsonSchema: any; try { if (!params?.schema) { diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index d79b174c..4108868e 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -646,12 +646,12 @@ class FirecrawlApp: else: self._handle_error(response, "check batch scrape errors") - def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any: + def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any: """ Extracts information from a URL using the Firecrawl API. Args: - urls (List[str]): The URLs to extract information from. + urls (Optional[List[str]]): The URLs to extract information from. params (Optional[ExtractParams]): Additional parameters for the extract request. Returns: @@ -662,6 +662,9 @@ class FirecrawlApp: if not params or (not params.get('prompt') and not params.get('schema')): raise ValueError("Either prompt or schema is required") + if not urls and not params.get('prompt'): + raise ValueError("Either urls or prompt is required") + schema = params.get('schema') if schema: if hasattr(schema, 'model_json_schema'): From 0fb9c1f32e9bb5f9cfc43190def468ebfe9f47d2 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 16 Mar 2025 22:28:47 -0400 Subject: [PATCH 2/3] Update index.ts --- apps/js-sdk/firecrawl/src/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 98fdf696..54bb6f4f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1126,7 +1126,7 @@ export default class FirecrawlApp { async extract(urls?: string[], params?: ExtractParams): Promise> | ErrorResponse> { const headers = this.prepareHeaders(); - let jsonData: { urls?: string[] } & ExtractParams = { urls: urls || [], ...params }; + let jsonData: { urls?: string[] } & ExtractParams = { urls: urls, ...params }; let jsonSchema: any; try { if (!params?.schema) { From 6d250360c29d368d5f2610c5976a92c68faf02a0 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 16 Mar 2025 22:45:16 -0400 Subject: [PATCH 3/3] Nick: bump --- apps/js-sdk/firecrawl/package.json | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 07fdf64c..3ff8c921 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.19.2", + "version": "1.20.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index 273fc1a6..5dd9b2c7 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.13.5" +__version__ = "1.14.0" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl")