(feat/extract) URLs can now be optional in /extract (#1346)

* Nick: urls optional on extract

* Update index.ts
Nicolas 2025-03-16 22:29:25 -04:00 (committed by GitHub)
parent 200de9e7e7
commit 20c93db43f
GPG Key ID: B5690EEEBB952194 (no known key found for this signature in database)
5 changed files with 58 additions and 13 deletions

File: controllers/v1/types.ts

@@ -314,7 +314,8 @@ export const extractV1Options = z
   .object({
     urls: url
       .array()
-      .max(10, "Maximum of 10 URLs allowed per request while in beta."),
+      .max(10, "Maximum of 10 URLs allowed per request while in beta.")
+      .optional(),
     prompt: z.string().max(10000).optional(),
     systemPrompt: z.string().max(10000).optional(),
     schema: z
@@ -354,6 +355,12 @@ export const extractV1Options = z
       .optional(),
   })
   .strict(strictMessage)
+  .refine(
+    (obj) => obj.urls || obj.prompt,
+    {
+      message: "Either 'urls' or 'prompt' must be provided.",
+    },
+  )
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
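With this change, urls becomes optional in the v1 extract schema and the .refine guard rejects requests that provide neither urls nor prompt. A minimal standalone sketch of the same pattern (assumptions: a simplified field set, and z.string().url() standing in for the codebase's custom url validator):

import { z } from "zod";

// Simplified stand-in for extractV1Options; the real schema has many more fields.
const extractOptions = z
  .object({
    urls: z
      .string()
      .url()
      .array()
      .max(10, "Maximum of 10 URLs allowed per request while in beta.")
      .optional(),
    prompt: z.string().max(10000).optional(),
  })
  .strict()
  .refine((obj) => obj.urls || obj.prompt, {
    message: "Either 'urls' or 'prompt' must be provided.",
  });

extractOptions.parse({ prompt: "Find Firecrawl's pricing tiers" }); // ok
extractOptions.parse({ urls: ["https://firecrawl.dev"] });          // ok
// extractOptions.parse({}) throws: "Either 'urls' or 'prompt' must be provided."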

File: build-prompts.ts

@@ -105,3 +105,10 @@ export function buildBatchExtractSystemPrompt(
 export function buildBatchExtractPrompt(prompt: string): string {
   return `Today is: ${new Date().toISOString()}\n${prompt}`;
 }
+
+export function buildRephraseToSerpPrompt(prompt: string): string {
+  return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. Make sure the rephrased prompt is concise and focused on retrieving relevant search results:
+
+Original Prompt: "${prompt}"`;
+}
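For reference, the new helper simply wraps the user's prompt in a rephrasing instruction; an illustrative call and its result:

const serp = buildRephraseToSerpPrompt("Who are Firecrawl's competitors?");
// serp contains:
//   Rephrase the following prompt to be suitable for a search engine results page (SERP) query. ...
//   Original Prompt: "Who are Firecrawl's competitors?"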

File: extraction-service.ts (extract service)

@@ -6,7 +6,7 @@ import {
 } from "../../controllers/v1/types";
 import { PlanType } from "../../types";
 import { logger as _logger } from "../logger";
-import { processUrl } from "./url-processor";
+import { generateBasicCompletion, processUrl } from "./url-processor";
 import { scrapeDocument } from "./document-scraper";
 import {
   generateCompletions,
@@ -38,6 +38,8 @@ import { singleAnswerCompletion } from "./completions/singleAnswer";
 import { SourceTracker } from "./helpers/source-tracker";
 import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
 import { normalizeUrl } from "../canonical-url";
+import { search } from "../../search";
+import { buildRephraseToSerpPrompt } from "./build-prompts";
 
 interface ExtractServiceOptions {
   request: ExtractRequest;
@@ -84,16 +86,43 @@ export async function performExtraction(
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};
 
   const logger = _logger.child({
     module: "extract",
     method: "performExtraction",
     extractId,
   });
 
-  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) {
+  // If no URLs are provided, generate URLs from the prompt
+  if ((!request.urls || request.urls.length === 0) && request.prompt) {
+    logger.debug("Generating URLs from prompt...", {
+      prompt: request.prompt,
+    });
+    const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt));
+    const searchResults = await search({
+      query: rephrasedPrompt.replace('"', "").replace("'", ""),
+      num_results: 10,
+    });
+
+    request.urls = searchResults.map(result => result.url) as string[];
+  }
+
+  if (request.urls && request.urls.length === 0) {
+    logger.error("No search results found", {
+      query: request.prompt,
+    });
+    return {
+      success: false,
+      error: "No search results found",
+      extractId,
+    };
+  }
+
+  const urls = request.urls || ([] as string[]);
+
+  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) {
     logger.debug("Loading cached docs...");
     try {
-      const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey);
+      const cache = await getCachedDocs(urls, request.__experimental_cacheKey);
       for (const doc of cache) {
         if (doc.metadata.url) {
           docsMap.set(normalizeUrl(doc.metadata.url), doc);
@@ -122,11 +151,10 @@ export async function performExtraction(
   let startMap = Date.now();
   let aggMapLinks: string[] = [];
   logger.debug("Processing URLs...", {
-    urlCount: request.urls.length,
+    urlCount: request.urls?.length || 0,
   });
 
-  // Process URLs
-  const urlPromises = request.urls.map((url) =>
+  const urlPromises = urls.map((url) =>
     processUrl(
       {
         url,
@@ -746,7 +774,7 @@ export async function performExtraction(
     time_taken: (new Date().getTime() - Date.now()) / 1000,
     team_id: teamId,
     mode: "extract",
-    url: request.urls.join(", "),
+    url: request.urls?.join(", ") || "",
     scrapeOptions: request,
     origin: request.origin ?? "api",
     num_tokens: totalTokensUsed,
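Condensed, the new fallback that now runs ahead of the existing pipeline looks like the sketch below. Assumptions: generateBasicCompletion resolves to the model's text, and search resolves to objects carrying a url field. Note that the commit's replace('"', "") strips only the first quote character of each kind, while the regex here strips all of them:

// Sketch of the prompt-to-URLs fallback added above (not verbatim from the commit).
async function resolveUrlsFromPrompt(prompt: string): Promise<string[]> {
  // 1. Ask the LLM to rephrase the free-form prompt into a concise SERP query.
  const serpQuery = await generateBasicCompletion(buildRephraseToSerpPrompt(prompt));

  // 2. Run the query through the search provider and keep the result URLs.
  const results = await search({
    query: serpQuery.replace(/["']/g, ""),
    num_results: 10,
  });
  return results.map((r) => r.url);
}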

File: index.ts (JS SDK)

@@ -1119,14 +1119,14 @@ export default class FirecrawlApp {
   /**
    * Extracts information from URLs using the Firecrawl API.
    * Currently in Beta. Expect breaking changes on future minor versions.
-   * @param url - The URL to extract information from.
+   * @param urls - The URLs to extract information from. Optional if using other methods for data extraction.
    * @param params - Additional parameters for the extract request.
    * @returns The response from the extract operation.
    */
-  async extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
+  async extract<T extends zt.ZodSchema = any>(urls?: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
     const headers = this.prepareHeaders();
-    let jsonData: { urls: string[] } & ExtractParams<T> = { urls, ...params };
+    let jsonData: { urls?: string[] } & ExtractParams<T> = { urls: urls, ...params };
     let jsonSchema: any;
     try {
       if (!params?.schema) {
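With the relaxed signature, SDK callers may now omit urls entirely (illustrative usage; "fc-..." is a placeholder API key):

const app = new FirecrawlApp({ apiKey: "fc-..." });

// Prompt-only extraction: the API derives candidate URLs via web search.
const byPrompt = await app.extract(undefined, {
  prompt: "Extract the pricing tiers offered by firecrawl.dev",
});

// URL-based extraction works unchanged.
const byUrls = await app.extract(["https://firecrawl.dev/pricing"], {
  prompt: "Extract the pricing tiers",
});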

File: firecrawl.py (Python SDK)

@@ -646,12 +646,12 @@ class FirecrawlApp:
         else:
             self._handle_error(response, "check batch scrape errors")
 
-    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
+    def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any:
         """
         Extracts information from a URL using the Firecrawl API.
 
         Args:
-            urls (List[str]): The URLs to extract information from.
+            urls (Optional[List[str]]): The URLs to extract information from.
             params (Optional[ExtractParams]): Additional parameters for the extract request.
 
         Returns:
@@ -662,6 +662,9 @@ class FirecrawlApp:
         if not params or (not params.get('prompt') and not params.get('schema')):
             raise ValueError("Either prompt or schema is required")
 
+        if not urls and not params.get('prompt'):
+            raise ValueError("Either urls or prompt is required")
+
         schema = params.get('schema')
         if schema:
             if hasattr(schema, 'model_json_schema'):