Merge pull request #1032 from mendableai/nsc/v1-search

(feat/v1) Search
Nicolas 2025-01-02 19:58:44 -03:00 committed by GitHub
commit b61a1ccfd3
9 changed files with 533 additions and 21 deletions

View File

@@ -0,0 +1,226 @@
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
Document,
RequestWithAuth,
SearchRequest,
SearchResponse,
searchRequestSchema,
ScrapeOptions,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType, Mode } from "../../types";
import { getScrapeQueue } from "../../services/queue-service";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import * as Sentry from "@sentry/node";
async function scrapeSearchResult(
searchResult: { url: string; title: string; description: string },
options: {
teamId: string;
plan: PlanType | undefined;
origin: string;
timeout: number;
scrapeOptions: ScrapeOptions;
},
): Promise<Document> {
const jobId = uuidv4();
const jobPriority = await getJobPriority({
plan: options.plan as PlanType,
team_id: options.teamId,
basePriority: 10,
});
try {
await addScrapeJob(
{
url: searchResult.url,
mode: "single_urls" as Mode,
team_id: options.teamId,
scrapeOptions: options.scrapeOptions,
internalOptions: {},
plan: options.plan || "free",
origin: options.origin,
is_scrape: true,
},
{},
jobId,
jobPriority,
);
const doc = await waitForJob<Document>(jobId, options.timeout);
await getScrapeQueue().remove(jobId);
// Move SERP results to top level
return {
title: searchResult.title,
description: searchResult.description,
url: searchResult.url,
...doc,
};
} catch (error) {
logger.error(`Error in scrapeSearchResult: ${error}`, {
url: searchResult.url,
teamId: options.teamId,
});
// Return a minimal document with SERP results at top level
return {
title: searchResult.title,
description: searchResult.description,
url: searchResult.url,
metadata: {
title: searchResult.title,
description: searchResult.description,
sourceURL: searchResult.url,
statusCode: 0,
error: error.message,
},
};
}
}
export async function searchController(
req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
res: Response<SearchResponse>,
) {
try {
req.body = searchRequestSchema.parse(req.body);
const jobId = uuidv4();
const startTime = new Date().getTime();
let limit = req.body.limit;
// Buffer results by 50% to account for filtered URLs
const num_results_buffer = Math.floor(limit * 1.5);
let searchResults = await search({
query: req.body.query,
advanced: false,
num_results: num_results_buffer,
tbs: req.body.tbs,
filter: req.body.filter,
lang: req.body.lang,
country: req.body.country,
location: req.body.location,
});
// Filter blocked URLs early to avoid unnecessary billing
searchResults = searchResults.filter((r) => !isUrlBlocked(r.url));
if (searchResults.length > limit) {
searchResults = searchResults.slice(0, limit);
}
if (searchResults.length === 0) {
return res.status(200).json({
success: true,
data: [],
warning: "No search results found",
});
}
if (
!req.body.scrapeOptions.formats ||
req.body.scrapeOptions.formats.length === 0
) {
billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch(
(error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`,
);
},
);
return res.status(200).json({
success: true,
data: searchResults.map((r) => ({
url: r.url,
title: r.title,
description: r.description,
metadata: {
title: r.title,
description: r.description,
sourceURL: r.url,
statusCode: 0,
},
})) as Document[],
});
}
// Scrape each result, handling timeouts individually
const scrapePromises = searchResults.map((result) =>
scrapeSearchResult(result, {
teamId: req.auth.team_id,
plan: req.auth.plan,
origin: req.body.origin,
timeout: req.body.timeout,
scrapeOptions: req.body.scrapeOptions,
}),
);
const docs = await Promise.all(scrapePromises);
// Bill for successful scrapes only
billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`,
);
});
// Filter out empty content but keep docs with SERP results
const filteredDocs = docs.filter(
(doc) =>
doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
);
if (filteredDocs.length === 0) {
return res.status(200).json({
success: true,
data: docs,
warning: "No content found in search results",
});
}
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({
job_id: jobId,
success: true,
num_docs: filteredDocs.length,
docs: filteredDocs,
time_taken: timeTakenInSeconds,
team_id: req.auth.team_id,
mode: "search",
url: req.body.query,
origin: req.body.origin,
});
return res.status(200).json({
success: true,
data: filteredDocs,
});
} catch (error) {
if (
error instanceof Error &&
(error.message.startsWith("Job wait") || error.message === "timeout")
) {
return res.status(408).json({
success: false,
error: "Request timed out",
});
}
Sentry.captureException(error);
logger.error("Unhandled error occurred in search", { error });
return res.status(500).json({
success: false,
error: error.message,
});
}
}
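
For orientation, here is a minimal sketch of what a request to this new controller looks like from a client's perspective. The host, API key, and query values are illustrative assumptions, not part of this commit; the body shape follows searchRequestSchema, and leaving scrapeOptions.formats empty returns SERP-only results, while requesting formats triggers per-result scraping.

// Hypothetical client call against the new POST /v1/search route.
async function exampleSearch(): Promise<void> {
  const res = await fetch("https://api.firecrawl.dev/v1/search", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR-API-KEY", // placeholder key
    },
    body: JSON.stringify({
      query: "firecrawl",
      limit: 3,
      scrapeOptions: { formats: ["markdown"] }, // omit formats for SERP-only results
    }),
  });
  const body = await res.json();
  if (res.status === 200 && body.success) {
    for (const doc of body.data) {
      console.log(doc.url, doc.title);
    }
  } else {
    console.error(body.error ?? `Unexpected status ${res.status}`);
  }
}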

View File

@@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions
export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
title?: string;
description?: string;
url?: string;
markdown?: string;
html?: string;
rawHtml?: string;
@@ -426,6 +429,11 @@ export type Document = {
error?: string;
[key: string]: string | string[] | number | undefined;
};
serpResults?: {
title: string;
description: string;
url: string;
};
}
export type ErrorResponse = {
@@ -757,3 +765,36 @@ export function toLegacyDocument(
warning: document.warning,
};
}
export const searchRequestSchema = z.object({
query: z.string(),
limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
tbs: z.string().optional(),
filter: z.string().optional(),
lang: z.string().optional().default("en"),
country: z.string().optional().default("us"),
location: z.string().optional(),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: scrapeOptions.extend({
formats: z.array(z.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
"extract"
])).default([])
}).default({}),
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
export type SearchRequest = z.infer<typeof searchRequestSchema>;
export type SearchResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: Document[];
};
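
To make the defaults above concrete, a small sketch of parsing a minimal body with this schema; the import path is an assumption for illustration.

import { searchRequestSchema } from "./types"; // assumed path to this types file

const parsed = searchRequestSchema.parse({ query: "firecrawl" });
// Defaults filled in by the schema:
//   parsed.limit === 5
//   parsed.lang === "en", parsed.country === "us"
//   parsed.origin === "api", parsed.timeout === 60000
//   parsed.scrapeOptions.formats is [] (plus the usual scrapeOptions defaults)
// .strict() rejects unknown keys with the message above:
//   searchRequestSchema.parse({ query: "x", foo: 1 }) throws a ZodError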

View File

@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract";
// import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
function checkCreditsMiddleware(
minimum?: number,
@@ -169,6 +170,13 @@ v1Router.post(
wrap(batchScrapeController),
);
v1Router.post(
"/search",
authMiddleware(RateLimiterMode.Search),
checkCreditsMiddleware(),
wrap(searchController),
);
v1Router.post(
"/map",
authMiddleware(RateLimiterMode.Map),
@@ -231,3 +239,6 @@ v1Router.get(
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(creditUsageController),
);

View File

@@ -8,7 +8,7 @@ import { serper_search } from "./serper";
export async function search({
query,
advanced = false,
-num_results = 7,
num_results = 5,
tbs = undefined,
filter = undefined,
lang = "en",

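Worth noting how this default interacts with the new controller: with the controller's default limit of 5 and its 50% result buffer, the underlying search is still asked for the old default of 7 results.

// In the controller above, with limit = 5 (the schema default):
const num_results_buffer = Math.floor(5 * 1.5); // 7, the previous default shown here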
View File

@@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL";
type Mode = "crawl" | "single_urls" | "sitemap";
export { Mode };
export interface CrawlResult {
source: string;
content: string;

View File

@@ -381,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => {
expect(filteredLinks?.length).toBeGreaterThan(0);
}, 30000); // 30 seconds timeout
-test('should throw NotImplementedError for search on v1', async () => {
test('should search with string query', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
-await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
const response = await app.search("firecrawl");
expect(response.success).toBe(true);
console.log(response.data);
expect(response.data?.length).toBeGreaterThan(0);
expect(response.data?.[0]?.markdown).toBeDefined();
expect(response.data?.[0]?.metadata).toBeDefined();
expect(response.data?.[0]?.metadata?.title).toBeDefined();
expect(response.data?.[0]?.metadata?.description).toBeDefined();
});
test('should search with params object', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
const response = await app.search("firecrawl", {
limit: 3,
lang: 'en',
country: 'us',
scrapeOptions: {
formats: ['markdown', 'html', 'links'],
onlyMainContent: true
}
});
expect(response.success).toBe(true);
expect(response.data.length).toBeLessThanOrEqual(3);
for (const doc of response.data) {
expect(doc.markdown).toBeDefined();
expect(doc.html).toBeDefined();
expect(doc.links).toBeDefined();
expect(doc.metadata).toBeDefined();
expect(doc.metadata?.title).toBeDefined();
expect(doc.metadata?.description).toBeDefined();
}
});
test('should handle invalid API key for search', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" });
await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404");
});
});

View File

@@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
screenshot?: string;
metadata?: FirecrawlDocumentMetadata;
actions: ActionsSchema;
// v1 search only
title?: string;
description?: string;
}
/**
@@ -282,6 +285,33 @@ export class FirecrawlError extends Error {
}
}
/**
* Parameters for search operations.
* Defines options for searching and scraping search results.
*/
export interface SearchParams {
limit?: number;
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
origin?: string;
timeout?: number;
scrapeOptions?: ScrapeParams;
}
/**
* Response interface for search operations.
* Defines the structure of the response received after a search operation.
*/
export interface SearchResponse {
success: boolean;
data: FirecrawlDocument<undefined>[];
warning?: string;
error?: string;
}
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
@@ -369,16 +399,80 @@ export default class FirecrawlApp {
}
/**
-* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
* Searches using the Firecrawl API and optionally scrapes the results.
* @param query - The search query string.
-* @param params - Additional parameters for the search.
-* @returns Throws an error advising to use version 0 of the API.
* @param params - Optional parameters for the search request.
* @returns The response from the search operation.
*/
-async search(
-query: string,
-params?: any
-): Promise<any> {
-throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400);
async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
const headers: AxiosRequestHeaders = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders;
let jsonData: any = {
query,
limit: params?.limit ?? 5,
tbs: params?.tbs,
filter: params?.filter,
lang: params?.lang ?? "en",
country: params?.country ?? "us",
location: params?.location,
origin: params?.origin ?? "api",
timeout: params?.timeout ?? 60000,
scrapeOptions: params?.scrapeOptions ?? { formats: [] },
};
if (jsonData?.scrapeOptions?.extract?.schema) {
let schema = jsonData.scrapeOptions.extract.schema;
// Try parsing the schema as a Zod schema
try {
schema = zodToJsonSchema(schema);
} catch (error) {
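// Conversion failed (e.g. the schema is already plain JSON Schema); pass it through unchanged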
}
jsonData = {
...jsonData,
scrapeOptions: {
...jsonData.scrapeOptions,
extract: {
...jsonData.scrapeOptions.extract,
schema: schema,
},
},
};
}
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/search`,
jsonData,
headers
);
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return {
success: true,
data: responseData.data as FirecrawlDocument<any>[],
warning: responseData.warning,
};
} else {
throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
}
} else {
this.handleError(response, "search");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error.", data: [] };
}
/**

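For completeness, a brief usage sketch of the new SDK method. It assumes the published package name @mendable/firecrawl-js; the key and option values are placeholders, and error handling leans on the FirecrawlError class defined earlier in this file.

import FirecrawlApp, { FirecrawlError } from "@mendable/firecrawl-js";

async function runSearch(): Promise<void> {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key
  try {
    // SERP-only results: no formats requested, so nothing is scraped
    const serpOnly = await app.search("firecrawl");
    console.log(serpOnly.data.map((d) => d.url));

    // Scrape the top 3 results as markdown
    const scraped = await app.search("firecrawl", {
      limit: 3,
      scrapeOptions: { formats: ["markdown"] },
    });
    for (const doc of scraped.data) {
      console.log(doc.metadata?.title, doc.markdown?.slice(0, 80));
    }
  } catch (err) {
    if (err instanceof FirecrawlError) {
      console.error(`Search failed: ${err.message}`);
    } else {
      throw err;
    }
  }
}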
View File

@@ -371,4 +371,70 @@ def test_search_e2e():
# assert isinstance(llm_extraction['supports_sso'], bool)
# assert isinstance(llm_extraction['is_open_source'], bool)
def test_search_with_string_query():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.search("firecrawl")
assert response["success"] is True
assert len(response["data"]) > 0
assert response["data"][0]["markdown"] is not None
assert response["data"][0]["metadata"] is not None
assert response["data"][0]["metadata"]["title"] is not None
assert response["data"][0]["metadata"]["description"] is not None
def test_search_with_params_dict():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.search("firecrawl", {
"limit": 3,
"lang": "en",
"country": "us",
"scrapeOptions": {
"formats": ["markdown", "html", "links"],
"onlyMainContent": True
}
})
assert response["success"] is True
assert len(response["data"]) <= 3
for doc in response["data"]:
assert doc["markdown"] is not None
assert doc["html"] is not None
assert doc["links"] is not None
assert doc["metadata"] is not None
assert doc["metadata"]["title"] is not None
assert doc["metadata"]["description"] is not None
def test_search_with_params_object():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
params = SearchParams(
query="firecrawl",
limit=3,
lang="en",
country="us",
scrapeOptions={
"formats": ["markdown", "html", "links"],
"onlyMainContent": True
}
)
response = app.search(params.query, params)
assert response["success"] is True
assert len(response["data"]) <= 3
for doc in response["data"]:
assert doc["markdown"] is not None
assert doc["html"] is not None
assert doc["links"] is not None
assert doc["metadata"] is not None
assert doc["metadata"]["title"] is not None
assert doc["metadata"]["description"] is not None
def test_search_invalid_api_key():
app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as e:
app.search("test query")
assert "404" in str(e.value)
def test_search_with_invalid_params():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as e:
app.search("test query", {"invalid_param": "value"})
assert "ValidationError" in str(e.value)

View File

@@ -21,7 +21,28 @@ import websockets
logger : logging.Logger = logging.getLogger("firecrawl")
class SearchParams(pydantic.BaseModel):
query: str
limit: Optional[int] = 5
tbs: Optional[str] = None
filter: Optional[str] = None
lang: Optional[str] = "en"
country: Optional[str] = "us"
location: Optional[str] = None
origin: Optional[str] = "api"
timeout: Optional[int] = 60000
scrapeOptions: Optional[Dict[str, Any]] = None
class FirecrawlApp:
class SearchResponse(pydantic.BaseModel):
"""
Response from the search operation.
"""
success: bool
data: List[Dict[str, Any]]
warning: Optional[str] = None
error: Optional[str] = None
class ExtractParams(pydantic.BaseModel):
"""
Parameters for the extract operation.
@@ -109,22 +130,36 @@ class FirecrawlApp:
else:
self._handle_error(response, 'scrape URL')
-def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
"""
-Perform a search using the Firecrawl API.
Search for content using the Firecrawl API.
Args:
-query (str): The search query.
-params (Optional[Dict[str, Any]]): Additional parameters for the search request.
query (str): The search query string.
params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
Returns:
-Any: The search results if the request is successful.
-Raises:
-NotImplementedError: If the search request is attempted on API version v1.
-Exception: If the search request fails.
Dict[str, Any]: The search response containing success status and search results.
"""
-raise NotImplementedError("Search is not supported in v1.")
if params is None:
params = {}
if isinstance(params, dict):
search_params = SearchParams(query=query, **params)
else:
search_params = params
search_params.query = query
response = requests.post(
f"{self.api_url}/v1/search",
headers={"Authorization": f"Bearer {self.api_key}"},
json=search_params.dict(exclude_none=True)
)
if response.status_code != 200:
raise Exception(f"Request failed with status code {response.status_code}")
return response.json()
def crawl_url(self, url: str,
params: Optional[Dict[str, Any]] = None,