Merge branch 'main' into nsc/semantic-index-extract

Nicolas 2025-01-03 19:57:29 -03:00
commit 81cf05885b
12 changed files with 551 additions and 46 deletions

View File

@@ -365,19 +365,18 @@ curl -X POST https://api.firecrawl.dev/v1/batch/scrape \
    }'
```

-### Search (v0) (Beta)
+### Search

-Used to search the web, get the most relevant results, scrape each page and return the markdown.
+The search endpoint combines web search with Firecrawl's scraping capabilities to return full page content for any query.
+Include `scrapeOptions` with `formats: ["markdown"]` to get complete markdown content for each search result; otherwise it defaults to getting SERP results (url, title, description).

```bash
-curl -X POST https://api.firecrawl.dev/v0/search \
+curl -X POST https://api.firecrawl.dev/v1/search \
    -H 'Content-Type: application/json' \
    -H 'Authorization: Bearer YOUR_API_KEY' \
    -d '{
-      "query": "firecrawl",
-      "pageOptions": {
-        "fetchPageContent": true // false for a fast serp api
-      }
+      "query": "What is Mendable?"
    }'
```
@@ -387,14 +386,8 @@ curl -X POST https://api.firecrawl.dev/v0/search \
  "data": [
    {
      "url": "https://mendable.ai",
-      "markdown": "# Markdown Content",
-      "provider": "web-scraper",
-      "metadata": {
-        "title": "Mendable | AI for CX and Sales",
-        "description": "AI for CX and Sales",
-        "language": null,
-        "sourceURL": "https://www.mendable.ai/"
-      }
+      "title": "Mendable | AI for CX and Sales",
+      "description": "AI for CX and Sales"
    }
  ]
}
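To illustrate the `scrapeOptions` behaviour described above, here is a minimal sketch using the JS SDK that this commit updates (the API key is a placeholder; omitting `scrapeOptions`, or leaving `formats` empty, yields SERP-only results):

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

// Sketch: ask /v1/search to scrape each hit and return markdown.
// Omitting scrapeOptions (or leaving formats empty) returns SERP-only results.
const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" }); // placeholder key
const results = await app.search("What is Mendable?", {
  limit: 3,
  scrapeOptions: { formats: ["markdown"] },
});

for (const doc of results.data) {
  console.log(doc.url, doc.title, doc.markdown?.length);
}
```

The equivalent curl call simply adds `"scrapeOptions": {"formats": ["markdown"]}` to the JSON body shown above.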

View File

@@ -0,0 +1,224 @@
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
Document,
RequestWithAuth,
SearchRequest,
SearchResponse,
searchRequestSchema,
ScrapeOptions,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType, Mode } from "../../types";
import { getScrapeQueue } from "../../services/queue-service";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import * as Sentry from "@sentry/node";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
async function scrapeSearchResult(
searchResult: { url: string; title: string; description: string },
options: {
teamId: string;
plan: PlanType | undefined;
origin: string;
timeout: number;
scrapeOptions: ScrapeOptions;
},
): Promise<Document> {
const jobId = uuidv4();
const jobPriority = await getJobPriority({
plan: options.plan as PlanType,
team_id: options.teamId,
basePriority: 10,
});
try {
if (isUrlBlocked(searchResult.url)) {
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
}
await addScrapeJob(
{
url: searchResult.url,
mode: "single_urls" as Mode,
team_id: options.teamId,
scrapeOptions: options.scrapeOptions,
internalOptions: {},
plan: options.plan || "free",
origin: options.origin,
is_scrape: true,
},
{},
jobId,
jobPriority,
);
const doc = await waitForJob<Document>(jobId, options.timeout);
await getScrapeQueue().remove(jobId);
// Move SERP results to top level
return {
title: searchResult.title,
description: searchResult.description,
url: searchResult.url,
...doc,
};
} catch (error) {
logger.error(`Error in scrapeSearchResult: ${error}`, {
url: searchResult.url,
teamId: options.teamId,
});
let statusCode = 0;
if (error.message.includes("Could not scrape url")) {
statusCode = 403;
}
// Return a minimal document with SERP results at top level
return {
title: searchResult.title,
description: searchResult.description,
url: searchResult.url,
metadata: {
statusCode,
error: error.message,
},
};
}
}
export async function searchController(
req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
res: Response<SearchResponse>,
) {
try {
req.body = searchRequestSchema.parse(req.body);
const jobId = uuidv4();
const startTime = new Date().getTime();
let limit = req.body.limit;
// Buffer results by 50% to account for filtered URLs
const num_results_buffer = Math.floor(limit * 1.5);
let searchResults = await search({
query: req.body.query,
advanced: false,
num_results: num_results_buffer,
tbs: req.body.tbs,
filter: req.body.filter,
lang: req.body.lang,
country: req.body.country,
location: req.body.location,
});
// Filter blocked URLs early to avoid unnecessary billing
if (searchResults.length > limit) {
searchResults = searchResults.slice(0, limit);
}
if (searchResults.length === 0) {
return res.status(200).json({
success: true,
data: [],
warning: "No search results found",
});
}
if (
!req.body.scrapeOptions.formats ||
req.body.scrapeOptions.formats.length === 0
) {
billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch(
(error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`,
);
},
);
return res.status(200).json({
success: true,
data: searchResults.map((r) => ({
url: r.url,
title: r.title,
description: r.description,
})) as Document[],
});
}
// Scrape each non-blocked result, handling timeouts individually
const scrapePromises = searchResults.map((result) =>
scrapeSearchResult(result, {
teamId: req.auth.team_id,
plan: req.auth.plan,
origin: req.body.origin,
timeout: req.body.timeout,
scrapeOptions: req.body.scrapeOptions,
}),
);
const docs = await Promise.all(scrapePromises);
// Bill for successful scrapes only
billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`,
);
});
// Filter out empty content but keep docs with SERP results
const filteredDocs = docs.filter(
(doc) =>
doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
);
if (filteredDocs.length === 0) {
return res.status(200).json({
success: true,
data: docs,
warning: "No content found in search results",
});
}
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({
job_id: jobId,
success: true,
num_docs: filteredDocs.length,
docs: filteredDocs,
time_taken: timeTakenInSeconds,
team_id: req.auth.team_id,
mode: "search",
url: req.body.query,
origin: req.body.origin,
});
return res.status(200).json({
success: true,
data: filteredDocs,
});
} catch (error) {
if (
error instanceof Error &&
(error.message.startsWith("Job wait") || error.message === "timeout")
) {
return res.status(408).json({
success: false,
error: "Request timed out",
});
}
Sentry.captureException(error);
logger.error("Unhandled error occurred in search", { error });
return res.status(500).json({
success: false,
error: error.message,
});
}
}

View File

@@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions
export type MapRequest = z.infer<typeof mapRequestSchema>;

export type Document = {
+title?: string;
+description?: string;
+url?: string;
markdown?: string;
html?: string;
rawHtml?: string;
@@ -426,6 +429,11 @@ export type Document = {
error?: string;
[key: string]: string | string[] | number | undefined;
};
+serpResults?: {
+title: string;
+description: string;
+url: string;
+};
}

export type ErrorResponse = {
@@ -757,3 +765,36 @@ export function toLegacyDocument(
warning: document.warning,
};
}
export const searchRequestSchema = z.object({
query: z.string(),
limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
tbs: z.string().optional(),
filter: z.string().optional(),
lang: z.string().optional().default("en"),
country: z.string().optional().default("us"),
location: z.string().optional(),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: scrapeOptions.extend({
formats: z.array(z.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
"extract"
])).default([])
}).default({}),
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
export type SearchRequest = z.infer<typeof searchRequestSchema>;
export type SearchResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: Document[];
};
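As an aside on the schema above: only `query` is required, every other field is defaulted at parse time, and `.strict()` rejects unknown keys. A small illustrative sketch (the import path is assumed; the commented values follow the defaults declared above):

```ts
import { searchRequestSchema } from "./types"; // illustrative path: the schema defined above

// Only `query` is required; the schema fills in the documented defaults.
const parsed = searchRequestSchema.parse({ query: "firecrawl" });
// parsed.limit === 5, parsed.lang === "en", parsed.country === "us",
// parsed.origin === "api", parsed.timeout === 60000,
// parsed.scrapeOptions.formats is an empty array (plus whatever other defaults scrapeOptions declares),
// so the controller responds with SERP-only results unless formats are requested.

// Unknown keys are rejected by .strict():
// searchRequestSchema.parse({ query: "x", foo: 1 }); // throws a ZodError
```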

View File

@@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract";
// import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
+import { searchController } from "../controllers/v1/search";

function checkCreditsMiddleware(
minimum?: number,
@@ -169,6 +170,13 @@ v1Router.post(
wrap(batchScrapeController),
);

+v1Router.post(
+"/search",
+authMiddleware(RateLimiterMode.Search),
+checkCreditsMiddleware(),
+wrap(searchController),
+);

v1Router.post(
"/map",
authMiddleware(RateLimiterMode.Map),
@@ -231,3 +239,6 @@ v1Router.get(
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(creditUsageController),
);

View File

@@ -8,7 +8,7 @@ import { serper_search } from "./serper";
export async function search({
query,
advanced = false,
-num_results = 7,
+num_results = 5,
tbs = undefined,
filter = undefined,
lang = "en",

View File

@@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL";
type Mode = "crawl" | "single_urls" | "sitemap";
+export { Mode };

export interface CrawlResult {
source: string;
content: string;

View File

@@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
-"version": "1.10.1",
+"version": "1.11.2",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@@ -381,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => {
expect(filteredLinks?.length).toBeGreaterThan(0);
}, 30000); // 30 seconds timeout

-test('should throw NotImplementedError for search on v1', async () => {
+test('should search with string query', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
-await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
+const response = await app.search("firecrawl");
expect(response.success).toBe(true);
console.log(response.data);
expect(response.data?.length).toBeGreaterThan(0);
expect(response.data?.[0]?.markdown).toBeDefined();
expect(response.data?.[0]?.metadata).toBeDefined();
expect(response.data?.[0]?.metadata?.title).toBeDefined();
expect(response.data?.[0]?.metadata?.description).toBeDefined();
});
test('should search with params object', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
const response = await app.search("firecrawl", {
limit: 3,
lang: 'en',
country: 'us',
scrapeOptions: {
formats: ['markdown', 'html', 'links'],
onlyMainContent: true
}
});
expect(response.success).toBe(true);
expect(response.data.length).toBeLessThanOrEqual(3);
for (const doc of response.data) {
expect(doc.markdown).toBeDefined();
expect(doc.html).toBeDefined();
expect(doc.links).toBeDefined();
expect(doc.metadata).toBeDefined();
expect(doc.metadata?.title).toBeDefined();
expect(doc.metadata?.description).toBeDefined();
}
});
test('should handle invalid API key for search', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" });
await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404");
});
});

View File

@@ -1,5 +1,5 @@
import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios";
-import type * as zt from "zod";
+import * as zt from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";
import { WebSocket } from "isows";
import { TypedEventTarget } from "typescript-event-target";
@@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
screenshot?: string;
metadata?: FirecrawlDocumentMetadata;
actions: ActionsSchema;
+// v1 search only
+title?: string;
+description?: string;
}

/**
@@ -244,7 +247,7 @@
*/
export interface ExtractParams<LLMSchema extends zt.ZodSchema = any> {
prompt?: string;
-schema?: LLMSchema;
+schema?: LLMSchema | object;
systemPrompt?: string;
allowExternalLinks?: boolean;
includeSubdomains?: boolean;
@@ -282,6 +285,33 @@ export class FirecrawlError extends Error {
}
}
/**
* Parameters for search operations.
* Defines options for searching and scraping search results.
*/
export interface SearchParams {
limit?: number;
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
origin?: string;
timeout?: number;
scrapeOptions?: ScrapeParams;
}
/**
* Response interface for search operations.
* Defines the structure of the response received after a search operation.
*/
export interface SearchResponse {
success: boolean;
data: FirecrawlDocument<undefined>[];
warning?: string;
error?: string;
}
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
@@ -369,16 +399,80 @@
}

/**
-* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
+* Searches using the Firecrawl API and optionally scrapes the results.
* @param query - The search query string.
-* @param params - Additional parameters for the search.
-* @returns Throws an error advising to use version 0 of the API.
+* @param params - Optional parameters for the search request.
+* @returns The response from the search operation.
*/
-async search(
-query: string,
-params?: any
-): Promise<any> {
-throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400);
+async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
+const headers: AxiosRequestHeaders = {
+"Content-Type": "application/json",
+Authorization: `Bearer ${this.apiKey}`,
+} as AxiosRequestHeaders;
let jsonData: any = {
query,
limit: params?.limit ?? 5,
tbs: params?.tbs,
filter: params?.filter,
lang: params?.lang ?? "en",
country: params?.country ?? "us",
location: params?.location,
origin: params?.origin ?? "api",
timeout: params?.timeout ?? 60000,
scrapeOptions: params?.scrapeOptions ?? { formats: [] },
};
if (jsonData?.scrapeOptions?.extract?.schema) {
let schema = jsonData.scrapeOptions.extract.schema;
// Try parsing the schema as a Zod schema
try {
schema = zodToJsonSchema(schema);
} catch (error) {
}
jsonData = {
...jsonData,
scrapeOptions: {
...jsonData.scrapeOptions,
extract: {
...jsonData.scrapeOptions.extract,
schema: schema,
},
},
};
}
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/search`,
jsonData,
headers
);
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return {
success: true,
data: responseData.data as FirecrawlDocument<any>[],
warning: responseData.warning,
};
} else {
throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
}
} else {
this.handleError(response, "search");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error.", data: [] };
}

/**
@@ -741,16 +835,18 @@
async extract<T extends zt.ZodSchema = any>(urls: string[], params?: ExtractParams<T>): Promise<ExtractResponse<zt.infer<T>> | ErrorResponse> {
const headers = this.prepareHeaders();

+if (!params?.prompt) {
+throw new FirecrawlError("Prompt is required", 400);
+}

let jsonData: { urls: string[] } & ExtractParams<T> = { urls, ...params };
let jsonSchema: any;
try {
-jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
+if (!params?.schema) {
+jsonSchema = undefined;
+} else if (params.schema instanceof zt.ZodType) {
+jsonSchema = zodToJsonSchema(params.schema);
+} else {
+jsonSchema = params.schema;
+}
} catch (error: any) {
-throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
+throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400);
}

try {

View File

@@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa

-__version__ = "1.7.1"
+__version__ = "1.8.0"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@@ -371,4 +371,70 @@ def test_search_e2e():
# assert isinstance(llm_extraction['supports_sso'], bool)
# assert isinstance(llm_extraction['is_open_source'], bool)
def test_search_with_string_query():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.search("firecrawl")
assert response["success"] is True
assert len(response["data"]) > 0
assert response["data"][0]["markdown"] is not None
assert response["data"][0]["metadata"] is not None
assert response["data"][0]["metadata"]["title"] is not None
assert response["data"][0]["metadata"]["description"] is not None
def test_search_with_params_dict():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.search("firecrawl", {
"limit": 3,
"lang": "en",
"country": "us",
"scrapeOptions": {
"formats": ["markdown", "html", "links"],
"onlyMainContent": True
}
})
assert response["success"] is True
assert len(response["data"]) <= 3
for doc in response["data"]:
assert doc["markdown"] is not None
assert doc["html"] is not None
assert doc["links"] is not None
assert doc["metadata"] is not None
assert doc["metadata"]["title"] is not None
assert doc["metadata"]["description"] is not None
def test_search_with_params_object():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
params = SearchParams(
query="firecrawl",
limit=3,
lang="en",
country="us",
scrapeOptions={
"formats": ["markdown", "html", "links"],
"onlyMainContent": True
}
)
response = app.search(params.query, params)
assert response["success"] is True
assert len(response["data"]) <= 3
for doc in response["data"]:
assert doc["markdown"] is not None
assert doc["html"] is not None
assert doc["links"] is not None
assert doc["metadata"] is not None
assert doc["metadata"]["title"] is not None
assert doc["metadata"]["description"] is not None
def test_search_invalid_api_key():
app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as e:
app.search("test query")
assert "404" in str(e.value)
def test_search_with_invalid_params():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as e:
app.search("test query", {"invalid_param": "value"})
assert "ValidationError" in str(e.value)

View File

@@ -21,7 +21,28 @@ import websockets
logger : logging.Logger = logging.getLogger("firecrawl")
class SearchParams(pydantic.BaseModel):
query: str
limit: Optional[int] = 5
tbs: Optional[str] = None
filter: Optional[str] = None
lang: Optional[str] = "en"
country: Optional[str] = "us"
location: Optional[str] = None
origin: Optional[str] = "api"
timeout: Optional[int] = 60000
scrapeOptions: Optional[Dict[str, Any]] = None
class FirecrawlApp:
class SearchResponse(pydantic.BaseModel):
"""
Response from the search operation.
"""
success: bool
data: List[Dict[str, Any]]
warning: Optional[str] = None
error: Optional[str] = None
class ExtractParams(pydantic.BaseModel):
"""
Parameters for the extract operation.
@@ -109,22 +130,36 @@ class FirecrawlApp:
else:
self._handle_error(response, 'scrape URL')

-def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
+def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
"""
-Perform a search using the Firecrawl API.
+Search for content using the Firecrawl API.

Args:
-query (str): The search query.
-params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+query (str): The search query string.
+params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.

Returns:
-Any: The search results if the request is successful.
+Dict[str, Any]: The search response containing success status and search results.

-Raises:
-NotImplementedError: If the search request is attempted on API version v1.
-Exception: If the search request fails.
"""
-raise NotImplementedError("Search is not supported in v1.")
+if params is None:
params = {}
if isinstance(params, dict):
search_params = SearchParams(query=query, **params)
else:
search_params = params
search_params.query = query
response = requests.post(
f"{self.api_url}/v1/search",
headers={"Authorization": f"Bearer {self.api_key}"},
json=search_params.dict(exclude_none=True)
)
if response.status_code != 200:
raise Exception(f"Request failed with status code {response.status_code}")
return response.json()
def crawl_url(self, url: str,
params: Optional[Dict[str, Any]] = None,