Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 03:35:56 +08:00)
(feat/extract) URLs can now be optional in /extract (#1346)
* Nick: urls optional on extract
* Update index.ts

Parent: 200de9e7e7
Commit: 20c93db43f
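In practice, an /extract request can now omit "urls" and supply only a "prompt"; the API discovers URLs itself via search (see the extraction-service hunks below). A rough sketch of such a request follows; the base URL, path, and auth header are assumed from the usual Firecrawl v1 conventions and are not part of this diff.

// Hypothetical request: no "urls", only a "prompt" (allowed after this change).
// Endpoint, base URL, and auth header are assumptions, not taken from this diff.
const res = await fetch("https://api.firecrawl.dev/v1/extract", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
  },
  body: JSON.stringify({
    prompt: "Find the pricing pages of popular open-source web scraping tools",
    schema: {
      type: "object",
      properties: { tools: { type: "array", items: { type: "string" } } },
    },
  }),
});
const data = await res.json();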
@@ -314,7 +314,8 @@ export const extractV1Options = z
   .object({
     urls: url
       .array()
-      .max(10, "Maximum of 10 URLs allowed per request while in beta."),
+      .max(10, "Maximum of 10 URLs allowed per request while in beta.")
+      .optional(),
     prompt: z.string().max(10000).optional(),
     systemPrompt: z.string().max(10000).optional(),
     schema: z
@@ -354,6 +355,12 @@ export const extractV1Options = z
     .optional(),
   })
   .strict(strictMessage)
+  .refine(
+    (obj) => obj.urls || obj.prompt,
+    {
+      message: "Either 'urls' or 'prompt' must be provided.",
+    },
+  )
   .transform((obj) => ({
     ...obj,
     allowExternalLinks: obj.allowExternalLinks || obj.enableWebSearch,
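Taken together, the two hunks above make "urls" optional while still rejecting requests that provide neither "urls" nor "prompt". A minimal standalone zod sketch of that pattern (simplified field set, not the full extractV1Options):

import { z } from "zod";

// Simplified stand-in for extractV1Options: urls is optional, but the refine
// step still requires at least one of urls/prompt to be present.
const extractOptions = z
  .object({
    urls: z.string().url().array().max(10).optional(),
    prompt: z.string().max(10000).optional(),
  })
  .refine((obj) => obj.urls || obj.prompt, {
    message: "Either 'urls' or 'prompt' must be provided.",
  });

extractOptions.parse({ prompt: "Find pricing pages" });   // ok: prompt only
extractOptions.parse({ urls: ["https://example.com"] });  // ok: urls only
// extractOptions.parse({}); // throws: "Either 'urls' or 'prompt' must be provided."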
@@ -105,3 +105,10 @@ export function buildBatchExtractSystemPrompt(
 export function buildBatchExtractPrompt(prompt: string): string {
   return `Today is: ${new Date().toISOString()}\n${prompt}`;
 }
+
+
+export function buildRephraseToSerpPrompt(prompt: string): string {
+  return `Rephrase the following prompt to be suitable for a search engine results page (SERP) query. Make sure the rephrased prompt is concise and focused on retrieving relevant search results:
+
+Original Prompt: "${prompt}"`;
+}
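Illustrative usage of the new helper; the call shape matches how the extraction service uses it in the hunks below, while the example prompt itself is made up.

// Example only: rephrase a free-form extract prompt into a SERP-style query.
const userPrompt =
  "What do the pricing pages of popular scraping APIs say about rate limits?";
const serpQuery = await generateBasicCompletion(
  buildRephraseToSerpPrompt(userPrompt),
);
// serpQuery is then used as the search() query when no URLs were supplied.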
@@ -6,7 +6,7 @@ import {
 } from "../../controllers/v1/types";
 import { PlanType } from "../../types";
 import { logger as _logger } from "../logger";
-import { processUrl } from "./url-processor";
+import { generateBasicCompletion, processUrl } from "./url-processor";
 import { scrapeDocument } from "./document-scraper";
 import {
   generateCompletions,
@@ -38,6 +38,8 @@ import { singleAnswerCompletion } from "./completions/singleAnswer";
 import { SourceTracker } from "./helpers/source-tracker";
 import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
 import { normalizeUrl } from "../canonical-url";
+import { search } from "../../search";
+import { buildRephraseToSerpPrompt } from "./build-prompts";
 
 interface ExtractServiceOptions {
   request: ExtractRequest;
@@ -84,16 +86,43 @@ export async function performExtraction(
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};
 
   const logger = _logger.child({
     module: "extract",
     method: "performExtraction",
     extractId,
   });
 
-  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey) {
+  // If no URLs are provided, generate URLs from the prompt
+  if ((!request.urls || request.urls.length === 0) && request.prompt) {
+    logger.debug("Generating URLs from prompt...", {
+      prompt: request.prompt,
+    });
+    const rephrasedPrompt = await generateBasicCompletion(buildRephraseToSerpPrompt(request.prompt));
+    const searchResults = await search({
+      query: rephrasedPrompt.replace('"', "").replace("'", ""),
+      num_results: 10,
+    });
+
+    request.urls = searchResults.map(result => result.url) as string[];
+  }
+
+  if (request.urls && request.urls.length === 0) {
+    logger.error("No search results found", {
+      query: request.prompt,
+    });
+    return {
+      success: false,
+      error: "No search results found",
+      extractId,
+    };
+  }
+
+  const urls = request.urls || ([] as string[]);
+
+  if (request.__experimental_cacheMode == "load" && request.__experimental_cacheKey && urls) {
     logger.debug("Loading cached docs...");
     try {
-      const cache = await getCachedDocs(request.urls, request.__experimental_cacheKey);
+      const cache = await getCachedDocs(urls, request.__experimental_cacheKey);
       for (const doc of cache) {
         if (doc.metadata.url) {
           docsMap.set(normalizeUrl(doc.metadata.url), doc);
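The hunk above inlines the prompt-to-URLs fallback directly in performExtraction. A hypothetical helper isolating that flow is sketched below: generateBasicCompletion, buildRephraseToSerpPrompt, and search are the same functions this commit imports, while the helper name and signature are illustrative only. Note that the real code instead writes the discovered URLs back onto request.urls and returns a "No search results found" error when the list comes back empty.

// Illustrative helper, not part of the commit.
async function urlsFromPrompt(prompt: string): Promise<string[]> {
  // 1. Rephrase the free-form prompt into a concise SERP-style query.
  const rephrased = await generateBasicCompletion(buildRephraseToSerpPrompt(prompt));
  // 2. Run a web search and keep the result URLs (capped at 10, matching the schema limit).
  const results = await search({
    query: rephrased.replace('"', "").replace("'", ""),
    num_results: 10,
  });
  return results
    .map((r) => r.url)
    .filter((u): u is string => typeof u === "string");
}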
@@ -122,11 +151,10 @@ export async function performExtraction(
   let startMap = Date.now();
   let aggMapLinks: string[] = [];
   logger.debug("Processing URLs...", {
-    urlCount: request.urls.length,
+    urlCount: request.urls?.length || 0,
   });
 
-  // Process URLs
-  const urlPromises = request.urls.map((url) =>
+  const urlPromises = urls.map((url) =>
     processUrl(
       {
         url,
@@ -746,7 +774,7 @@ export async function performExtraction(
     time_taken: (new Date().getTime() - Date.now()) / 1000,
     team_id: teamId,
     mode: "extract",
-    url: request.urls.join(", "),
+    url: request.urls?.join(", ") || "",
     scrapeOptions: request,
     origin: request.origin ?? "api",
     num_tokens: totalTokensUsed,
@@ -1119,14 +1119,14 @@ export default class FirecrawlApp {
   /**
    * Extracts information from URLs using the Firecrawl API.
    * Currently in Beta. Expect breaking changes on future minor versions.
-   * @param url - The URL to extract information from.
+   * @param urls - The URLs to extract information from. Optional if using other methods for data extraction.
    * @param params - Additional parameters for the extract request.
    * @returns The response from the extract operation.
    */
-  async extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
+  async extract<T extends zt.ZodSchema = any>(urls?: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
     const headers = this.prepareHeaders();
 
-    let jsonData: { urls: string[] } & ExtractParams<T> = { urls, ...params };
+    let jsonData: { urls?: string[] } & ExtractParams<T> = { urls: urls, ...params };
     let jsonSchema: any;
     try {
       if (!params?.schema) {
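With the signature above, the JS SDK can be called without URLs. A sketch of the new call shape follows; the package name, constructor options, zod-schema support, and the success/data fields on the response are assumed from the SDK's usual public API rather than shown in this diff.

import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

// Assumed constructor; API key read from the environment.
const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

// urls omitted: prompt-only extraction, enabled by this commit.
const result = await app.extract(undefined, {
  prompt: "Extract the names and pricing of popular web scraping APIs",
  schema: z.object({
    products: z.array(z.object({ name: z.string(), pricing: z.string() })),
  }),
});

if (result.success) {
  console.log(result.data); // shape follows the zod schema above (field names assumed)
}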
@@ -646,12 +646,12 @@ class FirecrawlApp:
         else:
             self._handle_error(response, "check batch scrape errors")
 
-    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
+    def extract(self, urls: Optional[List[str]] = None, params: Optional[ExtractParams] = None) -> Any:
         """
         Extracts information from a URL using the Firecrawl API.
 
         Args:
-            urls (List[str]): The URLs to extract information from.
+            urls (Optional[List[str]]): The URLs to extract information from.
             params (Optional[ExtractParams]): Additional parameters for the extract request.
 
         Returns:
@@ -662,6 +662,9 @@ class FirecrawlApp:
         if not params or (not params.get('prompt') and not params.get('schema')):
             raise ValueError("Either prompt or schema is required")
 
+        if not urls and not params.get('prompt'):
+            raise ValueError("Either urls or prompt is required")
+
         schema = params.get('schema')
         if schema:
             if hasattr(schema, 'model_json_schema'):