mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 19:29:01 +08:00
Nick: v1 search
This commit is contained in:
parent c822e34d37
commit d2742bec4d
apps/api/src/controllers/v1/search.ts (new file, 229 lines)
@ -0,0 +1,229 @@
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
  Document,
  RequestWithAuth,
  SearchRequest,
  SearchResponse,
  searchRequestSchema,
  ScrapeOptions,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType, Mode } from "../../types";
import { getScrapeQueue } from "../../services/queue-service";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import * as Sentry from "@sentry/node";

async function scrapeSearchResult(
  searchResult: { url: string; title: string; description: string },
  options: {
    teamId: string;
    plan: PlanType | undefined;
    origin: string;
    timeout: number;
    scrapeOptions: ScrapeOptions;
  },
): Promise<Document> {
  const jobId = uuidv4();
  const jobPriority = await getJobPriority({
    plan: options.plan as PlanType,
    team_id: options.teamId,
    basePriority: 10,
  });

  try {
    await addScrapeJob(
      {
        url: searchResult.url,
        mode: "single_urls" as Mode,
        team_id: options.teamId,
        scrapeOptions: options.scrapeOptions,
        internalOptions: {},
        plan: options.plan || "free",
        origin: options.origin,
        is_scrape: true,
      },
      {},
      jobId,
      jobPriority,
    );

    const doc = await waitForJob<Document>(jobId, options.timeout);
    await getScrapeQueue().remove(jobId);

    // Move SERP results to top level
    return {
      title: searchResult.title,
      description: searchResult.description,
      url: searchResult.url,
      ...doc,
    };
  } catch (error) {
    logger.error(`Error in scrapeSearchResult: ${error}`, {
      url: searchResult.url,
      teamId: options.teamId,
    });

    // Return a minimal document with SERP results at top level
    return {
      metadata: {
        title: searchResult.title,
        description: searchResult.description,
        sourceURL: searchResult.url,
        statusCode: 0,
        error: error.message,
      },
      title: searchResult.title,
      description: searchResult.description,
      url: searchResult.url,
    };
  }
}

export async function searchController(
  req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
  res: Response<SearchResponse>,
) {
  try {
    req.body = searchRequestSchema.parse(req.body);

    const jobId = uuidv4();
    const startTime = new Date().getTime();

    let limit = req.body.limit;
    if (req.auth.team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
      limit = 1;
    }

    // Buffer results by 50% to account for filtered URLs
    const num_results_buffer = Math.floor(limit * 1.5);

    let searchResults = await search({
      query: req.body.query,
      advanced: false,
      num_results: num_results_buffer,
      tbs: req.body.tbs,
      filter: req.body.filter,
      lang: req.body.lang,
      country: req.body.country,
      location: req.body.location,
    });

    // Filter blocked URLs early to avoid unnecessary billing
    searchResults = searchResults.filter((r) => !isUrlBlocked(r.url));
    if (searchResults.length > limit) {
      searchResults = searchResults.slice(0, limit);
    }

    if (searchResults.length === 0) {
      return res.status(200).json({
        success: true,
        data: [],
        warning: "No search results found",
      });
    }

    if (
      !req.body.scrapeOptions.formats ||
      req.body.scrapeOptions.formats.length === 0
    ) {
      billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch(
        (error) => {
          logger.error(
            `Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`,
          );
        },
      );
      return res.status(200).json({
        success: true,
        data: searchResults.map((r) => ({
          url: r.url,
          title: r.title,
          description: r.description,
          metadata: {
            title: r.title,
            description: r.description,
            sourceURL: r.url,
            statusCode: 0,
          },
        })) as Document[],
      });
    }

    // Scrape each result, handling timeouts individually
    const scrapePromises = searchResults.map((result) =>
      scrapeSearchResult(result, {
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        timeout: req.body.timeout,
        scrapeOptions: req.body.scrapeOptions,
      }),
    );

    const docs = await Promise.all(scrapePromises);

    // Bill for successful scrapes only
    billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
      logger.error(
        `Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`,
      );
    });

    // Filter out empty content but keep docs with SERP results
    const filteredDocs = docs.filter(
      (doc) =>
        doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
    );

    if (filteredDocs.length === 0) {
      return res.status(200).json({
        success: true,
        data: docs,
        warning: "No content found in search results",
      });
    }

    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;

    logJob({
      job_id: jobId,
      success: true,
      num_docs: filteredDocs.length,
      docs: filteredDocs,
      time_taken: timeTakenInSeconds,
      team_id: req.auth.team_id,
      mode: "search",
      url: req.body.query,
      origin: req.body.origin,
    });

    return res.status(200).json({
      success: true,
      data: filteredDocs,
    });
  } catch (error) {
    if (
      error instanceof Error &&
      (error.message.startsWith("Job wait") || error.message === "timeout")
    ) {
      return res.status(408).json({
        success: false,
        error: "Request timed out",
      });
    }

    Sentry.captureException(error);
    logger.error("Unhandled error occurred in search", { error });
    return res.status(500).json({
      success: false,
      error: error.message,
    });
  }
}
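The controller has two paths: when scrapeOptions.formats is empty it returns the SERP entries as-is and bills one credit per result; otherwise it enqueues one scrape job per result, waits for them in parallel, and bills per returned document. A rough sketch of the two response shapes, with illustrative values rather than real output:

// Illustrative only: approximate bodies returned by POST /v1/search.

// scrapeOptions.formats omitted or []: SERP entries only.
const serpOnlyResponse = {
  success: true,
  data: [
    {
      url: "https://example.com/",
      title: "Example Domain",
      description: "Search-engine snippet for the result",
      metadata: {
        title: "Example Domain",
        description: "Search-engine snippet for the result",
        sourceURL: "https://example.com/",
        statusCode: 0,
      },
    },
  ],
};

// scrapeOptions.formats = ["markdown"]: each result is also scraped.
const scrapedResponse = {
  success: true,
  data: [
    {
      url: "https://example.com/",
      title: "Example Domain",
      description: "Search-engine snippet for the result",
      markdown: "# Example Domain\n...",
      metadata: { sourceURL: "https://example.com/", statusCode: 200 },
    },
  ],
};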
apps/api/src/controllers/v1/types.ts
@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions
export type MapRequest = z.infer<typeof mapRequestSchema>;

export type Document = {
  title?: string;
  description?: string;
  url?: string;
  markdown?: string;
  html?: string;
  rawHtml?: string;
@ -426,6 +429,11 @@ export type Document = {
    error?: string;
    [key: string]: string | string[] | number | undefined;
  };
  serpResults?: {
    title: string;
    description: string;
    url: string;
  };
}

export type ErrorResponse = {
@ -757,3 +765,36 @@ export function toLegacyDocument(
    warning: document.warning,
  };
}

export const searchRequestSchema = z.object({
  query: z.string(),
  limit: z.number().int().positive().finite().safe().optional().default(5),
  tbs: z.string().optional(),
  filter: z.string().optional(),
  lang: z.string().optional().default("en"),
  country: z.string().optional().default("us"),
  location: z.string().optional(),
  origin: z.string().optional().default("api"),
  timeout: z.number().int().positive().finite().safe().default(60000),
  scrapeOptions: scrapeOptions.extend({
    formats: z.array(z.enum([
      "markdown",
      "html",
      "rawHtml",
      "links",
      "screenshot",
      "screenshot@fullPage",
      "extract"
    ])).default([])
  }).default({}),
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");

export type SearchRequest = z.infer<typeof searchRequestSchema>;

export type SearchResponse =
  | ErrorResponse
  | {
      success: true;
      warning?: string;
      data: Document[];
    };
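As a quick sanity check on the schema, a minimal sketch of parsing a hypothetical request body with it, imported the same way the controller imports it; the query, limit, and formats values are made up for illustration:

// Illustrative only: the schema fills in defaults for omitted fields.
import { searchRequestSchema } from "./types";

const parsed = searchRequestSchema.parse({
  query: "firecrawl",
  limit: 3,
  scrapeOptions: { formats: ["markdown"] },
});
// parsed.lang === "en", parsed.country === "us",
// parsed.origin === "api", parsed.timeout === 60000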
apps/api/src/routes/v1.ts
@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract";
// import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";

function checkCreditsMiddleware(
  minimum?: number,
@ -169,6 +170,13 @@ v1Router.post(
  wrap(batchScrapeController),
);

v1Router.post(
  "/search",
  authMiddleware(RateLimiterMode.Search),
  checkCreditsMiddleware(),
  wrap(searchController),
);

v1Router.post(
  "/map",
  authMiddleware(RateLimiterMode.Map),
@ -231,3 +239,6 @@ v1Router.get(
  authMiddleware(RateLimiterMode.CrawlStatus),
  wrap(creditUsageController),
);
apps/api/src/search/index.ts
@ -8,7 +8,7 @@ import { serper_search } from "./serper";
export async function search({
  query,
  advanced = false,
-  num_results = 7,
+  num_results = 5,
  tbs = undefined,
  filter = undefined,
  lang = "en",
apps/api/src/types.ts
@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL";
type Mode = "crawl" | "single_urls" | "sitemap";

export { Mode };

export interface CrawlResult {
  source: string;
  content: string;
apps/js-sdk/firecrawl/src/index.ts
@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
  screenshot?: string;
  metadata?: FirecrawlDocumentMetadata;
  actions: ActionsSchema;
  // v1 search only
  title?: string;
  description?: string;
}

/**
@ -282,6 +285,34 @@ export class FirecrawlError extends Error {
  }
}

/**
 * Parameters for search operations.
 * Defines options for searching and scraping search results.
 */
export interface SearchParams {
  query: string;
  limit?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  origin?: string;
  timeout?: number;
  scrapeOptions?: ScrapeParams;
}

/**
 * Response interface for search operations.
 * Defines the structure of the response received after a search operation.
 */
export interface SearchResponse {
  success: boolean;
  data: FirecrawlDocument<undefined>[];
  warning?: string;
  error?: string;
}

/**
 * Main class for interacting with the Firecrawl API.
 * Provides methods for scraping, searching, crawling, and mapping web content.
@ -369,16 +400,79 @@ export default class FirecrawlApp {
  }

  /**
-   * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
-   * @param query - The search query string.
-   * @param params - Additional parameters for the search.
-   * @returns Throws an error advising to use version 0 of the API.
+   * Searches using the Firecrawl API and optionally scrapes the results.
+   * @param params - Parameters for the search request.
+   * @returns The response from the search operation.
   */
-  async search(
-    query: string,
-    params?: any
-  ): Promise<any> {
-    throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400);
-  }
+  async search(params: SearchParams): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query: params.query,
+      limit: params.limit ?? 5,
+      tbs: params.tbs,
+      filter: params.filter,
+      lang: params.lang ?? "en",
+      country: params.country ?? "us",
+      location: params.location,
+      origin: params.origin ?? "api",
+      timeout: params.timeout ?? 60000,
+      scrapeOptions: params.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data,
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
+  }

  /**
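Finally, a minimal usage sketch for the new SDK method. It assumes the published package name and apiKey constructor from earlier SDK versions; the key and query are placeholders. Only the search(params) signature, the SearchParams fields, and the SearchResponse shape come from the diff above.

// Illustrative only.
import FirecrawlApp from "@mendable/firecrawl-js";

async function main() {
  // Placeholder key and query; replace with real values.
  const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

  const results = await app.search({
    query: "open source web scraping",
    limit: 3,
    // Omit formats (or pass []) for SERP results only;
    // request "markdown" to also scrape each hit.
    scrapeOptions: { formats: ["markdown"] },
  });

  if (results.success) {
    for (const doc of results.data) {
      console.log(doc.url, doc.title, doc.markdown?.length);
    }
  }
}

main();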