diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts
new file mode 100644
index 00000000..37b469c9
--- /dev/null
+++ b/apps/api/src/controllers/v1/search.ts
@@ -0,0 +1,226 @@
+import { Response } from "express";
+import { logger } from "../../lib/logger";
+import {
+  Document,
+  RequestWithAuth,
+  SearchRequest,
+  SearchResponse,
+  searchRequestSchema,
+  ScrapeOptions,
+} from "./types";
+import { billTeam } from "../../services/billing/credit_billing";
+import { v4 as uuidv4 } from "uuid";
+import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
+import { logJob } from "../../services/logging/log_job";
+import { getJobPriority } from "../../lib/job-priority";
+import { PlanType, Mode } from "../../types";
+import { getScrapeQueue } from "../../services/queue-service";
+import { search } from "../../search";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
+import * as Sentry from "@sentry/node";
+
+async function scrapeSearchResult(
+  searchResult: { url: string; title: string; description: string },
+  options: {
+    teamId: string;
+    plan: PlanType | undefined;
+    origin: string;
+    timeout: number;
+    scrapeOptions: ScrapeOptions;
+  },
+): Promise<Document> {
+  const jobId = uuidv4();
+  const jobPriority = await getJobPriority({
+    plan: options.plan as PlanType,
+    team_id: options.teamId,
+    basePriority: 10,
+  });
+
+  try {
+    await addScrapeJob(
+      {
+        url: searchResult.url,
+        mode: "single_urls" as Mode,
+        team_id: options.teamId,
+        scrapeOptions: options.scrapeOptions,
+        internalOptions: {},
+        plan: options.plan || "free",
+        origin: options.origin,
+        is_scrape: true,
+      },
+      {},
+      jobId,
+      jobPriority,
+    );
+
+    const doc = await waitForJob(jobId, options.timeout);
+    await getScrapeQueue().remove(jobId);
+
+    // Move SERP results to top level
+    return {
+      title: searchResult.title,
+      description: searchResult.description,
+      url: searchResult.url,
+      ...doc,
+    };
+  } catch (error) {
+    logger.error(`Error in scrapeSearchResult: ${error}`, {
+      url: searchResult.url,
+      teamId: options.teamId,
+    });
+
+    // Return a minimal document with SERP results at top level
+    return {
+      title: searchResult.title,
+      description: searchResult.description,
+      url: searchResult.url,
+      metadata: {
+        title: searchResult.title,
+        description: searchResult.description,
+        sourceURL: searchResult.url,
+        statusCode: 0,
+        error: error.message,
+      },
+    };
+  }
+}
+
+export async function searchController(
+  req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
+  res: Response,
+) {
+  try {
+    req.body = searchRequestSchema.parse(req.body);
+
+    const jobId = uuidv4();
+    const startTime = new Date().getTime();
+
+    let limit = req.body.limit;
+
+    // Buffer results by 50% to account for filtered URLs
+    const num_results_buffer = Math.floor(limit * 1.5);
+
+    let searchResults = await search({
+      query: req.body.query,
+      advanced: false,
+      num_results: num_results_buffer,
+      tbs: req.body.tbs,
+      filter: req.body.filter,
+      lang: req.body.lang,
+      country: req.body.country,
+      location: req.body.location,
+    });
+
+    // Filter blocked URLs early to avoid unnecessary billing
+    searchResults = searchResults.filter((r) => !isUrlBlocked(r.url));
+    if (searchResults.length > limit) {
+      searchResults = searchResults.slice(0, limit);
+    }
+
+    if (searchResults.length === 0) {
+      return res.status(200).json({
+        success: true,
+        data: [],
+        warning: "No search results found",
+      });
+    }
+
+    if (
+      !req.body.scrapeOptions.formats ||
+      req.body.scrapeOptions.formats.length === 0
+    ) {
+      billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch(
+        (error) => {
+          logger.error(
+            `Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`,
+          );
+        },
+      );
+      return res.status(200).json({
+        success: true,
+        data: searchResults.map((r) => ({
+          url: r.url,
+          title: r.title,
+          description: r.description,
+          metadata: {
+            title: r.title,
+            description: r.description,
+            sourceURL: r.url,
+            statusCode: 0,
+          },
+        })) as Document[],
+      });
+    }
+
+    // Scrape each result, handling timeouts individually
+    const scrapePromises = searchResults.map((result) =>
+      scrapeSearchResult(result, {
+        teamId: req.auth.team_id,
+        plan: req.auth.plan,
+        origin: req.body.origin,
+        timeout: req.body.timeout,
+        scrapeOptions: req.body.scrapeOptions,
+      }),
+    );
+
+    const docs = await Promise.all(scrapePromises);
+
+    // Bill for successful scrapes only
+    billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
+      logger.error(
+        `Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`,
+      );
+    });
+
+    // Filter out empty content but keep docs with SERP results
+    const filteredDocs = docs.filter(
+      (doc) =>
+        doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
+    );
+
+    if (filteredDocs.length === 0) {
+      return res.status(200).json({
+        success: true,
+        data: docs,
+        warning: "No content found in search results",
+      });
+    }
+
+    const endTime = new Date().getTime();
+    const timeTakenInSeconds = (endTime - startTime) / 1000;
+
+    logJob({
+      job_id: jobId,
+      success: true,
+      num_docs: filteredDocs.length,
+      docs: filteredDocs,
+      time_taken: timeTakenInSeconds,
+      team_id: req.auth.team_id,
+      mode: "search",
+      url: req.body.query,
+      origin: req.body.origin,
+    });
+
+    return res.status(200).json({
+      success: true,
+      data: filteredDocs,
+    });
+  } catch (error) {
+    if (
+      error instanceof Error &&
+      (error.message.startsWith("Job wait") || error.message === "timeout")
+    ) {
+      return res.status(408).json({
+        success: false,
+        error: "Request timed out",
+      });
+    }
+
+    Sentry.captureException(error);
+    logger.error("Unhandled error occurred in search", { error });
+    return res.status(500).json({
+      success: false,
+      error: error.message,
+    });
+  }
+}
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 99c3aa6f..ccb11586 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions
 export type MapRequest = z.infer<typeof mapRequestSchema>;
 
 export type Document = {
+  title?: string;
+  description?: string;
+  url?: string;
   markdown?: string;
   html?: string;
   rawHtml?: string;
@@ -426,6 +429,11 @@ export type Document = {
     error?: string;
     [key: string]: string | string[] | number | undefined;
   };
+  serpResults?: {
+    title: string;
+    description: string;
+    url: string;
+  };
 }
 
 export type ErrorResponse = {
@@ -757,3 +765,36 @@ export function toLegacyDocument(
     warning: document.warning,
   };
 }
+
+export const searchRequestSchema = z.object({
+  query: z.string(),
+  limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
+  tbs: z.string().optional(),
+  filter: z.string().optional(),
+  lang: z.string().optional().default("en"),
+  country: z.string().optional().default("us"),
+  location: z.string().optional(),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000),
+  scrapeOptions: scrapeOptions.extend({
+    formats: z.array(z.enum([
"markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "extract" + ])).default([]) + }).default({}), +}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes"); + +export type SearchRequest = z.infer; + +export type SearchResponse = + | ErrorResponse + | { + success: true; + warning?: string; + data: Document[]; + }; diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 1ee191ef..b6ab2ee8 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract"; // import { readinessController } from "../controllers/v1/readiness"; import { creditUsageController } from "../controllers/v1/credit-usage"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; +import { searchController } from "../controllers/v1/search"; function checkCreditsMiddleware( minimum?: number, @@ -169,6 +170,13 @@ v1Router.post( wrap(batchScrapeController), ); +v1Router.post( + "/search", + authMiddleware(RateLimiterMode.Search), + checkCreditsMiddleware(), + wrap(searchController), +); + v1Router.post( "/map", authMiddleware(RateLimiterMode.Map), @@ -231,3 +239,6 @@ v1Router.get( authMiddleware(RateLimiterMode.CrawlStatus), wrap(creditUsageController), ); + + + diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 82a6b68f..d4e6ce9d 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -8,7 +8,7 @@ import { serper_search } from "./serper"; export async function search({ query, advanced = false, - num_results = 7, + num_results = 5, tbs = undefined, filter = undefined, lang = "en", diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 9db79bc5..8f6a39d9 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL"; type Mode = "crawl" | "single_urls" | "sitemap"; +export { Mode }; + export interface CrawlResult { source: string; content: string; diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index e5c04209..2e601dc4 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -381,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => { expect(filteredLinks?.length).toBeGreaterThan(0); }, 30000); // 30 seconds timeout - test('should throw NotImplementedError for search on v1', async () => { + + + test('should search with string query', async () => { const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY }); - await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1"); + const response = await app.search("firecrawl"); + expect(response.success).toBe(true); + console.log(response.data); + expect(response.data?.length).toBeGreaterThan(0); + expect(response.data?.[0]?.markdown).toBeDefined(); + expect(response.data?.[0]?.metadata).toBeDefined(); + expect(response.data?.[0]?.metadata?.title).toBeDefined(); + expect(response.data?.[0]?.metadata?.description).toBeDefined(); + }); + + test('should search with params object', async () => { + const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY }); + const response = await app.search("firecrawl", { + limit: 3, + lang: 'en', + country: 'us', + scrapeOptions: { + formats: ['markdown', 'html', 'links'], + onlyMainContent: 
+      }
+    });
+    expect(response.success).toBe(true);
+    expect(response.data.length).toBeLessThanOrEqual(3);
+    for (const doc of response.data) {
+      expect(doc.markdown).toBeDefined();
+      expect(doc.html).toBeDefined();
+      expect(doc.links).toBeDefined();
+      expect(doc.metadata).toBeDefined();
+      expect(doc.metadata?.title).toBeDefined();
+      expect(doc.metadata?.description).toBeDefined();
+    }
+  });
+
+  test('should handle invalid API key for search', async () => {
+    const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" });
+    await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404");
   });
 });
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 29fabf5d..af9dbc75 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any> {
+  // v1 search only
+  title?: string;
+  description?: string;
 }
 
+/**
+ * Parameters for search operations.
+ */
+export interface SearchParams {
+  limit?: number;
+  tbs?: string;
+  filter?: string;
+  lang?: string;
+  country?: string;
+  location?: string;
+  origin?: string;
+  timeout?: number;
+  scrapeOptions?: ScrapeParams;
+}
+
+/**
+ * Response interface for search operations.
+ */
+export interface SearchResponse {
+  success: boolean;
+  data: FirecrawlDocument<undefined>[];
+  warning?: string;
+  error?: string;
+}
+
 /**
  * Main class for interacting with the Firecrawl API.
  * Provides methods for scraping, searching, crawling, and mapping web content.
  */
@@ -369,16 +399,80 @@
   }
 
   /**
-   * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
+   * Searches using the Firecrawl API and optionally scrapes the results.
    * @param query - The search query string.
-   * @param params - Additional parameters for the search.
-   * @returns Throws an error advising to use version 0 of the API.
+   * @param params - Optional parameters for the search request.
+   * @returns The response from the search operation.
    */
-  async search(
-    query: string,
-    params?: any
-  ): Promise<any> {
-    throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400);
+  async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query,
+      limit: params?.limit ?? 5,
+      tbs: params?.tbs,
+      filter: params?.filter,
+      lang: params?.lang ?? "en",
+      country: params?.country ?? "us",
+      location: params?.location,
+      origin: params?.origin ?? "api",
+      timeout: params?.timeout ?? 60000,
+      scrapeOptions: params?.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data as FirecrawlDocument[],
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
   }
 
   /**
diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
index d25d43f3..eacec8da 100644
--- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
+++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py
@@ -371,4 +371,70 @@ def test_search_e2e():
 #     assert isinstance(llm_extraction['supports_sso'], bool)
 #     assert isinstance(llm_extraction['is_open_source'], bool)
 
+def test_search_with_string_query():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl")
+    assert response["success"] is True
+    assert len(response["data"]) > 0
+    assert response["data"][0]["markdown"] is not None
+    assert response["data"][0]["metadata"] is not None
+    assert response["data"][0]["metadata"]["title"] is not None
+    assert response["data"][0]["metadata"]["description"] is not None
+
+def test_search_with_params_dict():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    response = app.search("firecrawl", {
+        "limit": 3,
+        "lang": "en",
+        "country": "us",
+        "scrapeOptions": {
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    })
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_with_params_object():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    params = SearchParams(
+        query="firecrawl",
+        limit=3,
+        lang="en",
+        country="us",
+        scrapeOptions={
+            "formats": ["markdown", "html", "links"],
+            "onlyMainContent": True
+        }
+    )
+    response = app.search(params.query, params)
+    assert response["success"] is True
+    assert len(response["data"]) <= 3
+    for doc in response["data"]:
+        assert doc["markdown"] is not None
+        assert doc["html"] is not None
+        assert doc["links"] is not None
+        assert doc["metadata"] is not None
+        assert doc["metadata"]["title"] is not None
+        assert doc["metadata"]["description"] is not None
+
+def test_search_invalid_api_key():
+    app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
+    with pytest.raises(Exception) as e:
+        app.search("test query")
+    assert "404" in str(e.value)
+
+def test_search_with_invalid_params():
+    app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
+    with pytest.raises(Exception) as e:
+        app.search("test query", {"invalid_param": "value"})
+    assert "ValidationError" in str(e.value)
+
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 33d43b99..271a13f0 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -21,7 +21,28 @@ import websockets
 
 logger : logging.Logger = logging.getLogger("firecrawl")
 
+class SearchParams(pydantic.BaseModel):
+    query: str
+    limit: Optional[int] = 5
+    tbs: Optional[str] = None
+    filter: Optional[str] = None
+    lang: Optional[str] = "en"
+    country: Optional[str] = "us"
+    location: Optional[str] = None
+    origin: Optional[str] = "api"
+    timeout: Optional[int] = 60000
+    scrapeOptions: Optional[Dict[str, Any]] = None
+
 class FirecrawlApp:
+    class SearchResponse(pydantic.BaseModel):
+        """
+        Response from the search operation.
+        """
+        success: bool
+        data: List[Dict[str, Any]]
+        warning: Optional[str] = None
+        error: Optional[str] = None
+
     class ExtractParams(pydantic.BaseModel):
         """
         Parameters for the extract operation.
         """
@@ -109,22 +130,36 @@ class FirecrawlApp:
         else:
             self._handle_error(response, 'scrape URL')
 
-    def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
+    def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
         """
-        Perform a search using the Firecrawl API.
+        Search for content using the Firecrawl API.
 
         Args:
-            query (str): The search query.
-            params (Optional[Dict[str, Any]]): Additional parameters for the search request.
+            query (str): The search query string.
+            params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
 
         Returns:
-            Any: The search results if the request is successful.
-
-        Raises:
-            NotImplementedError: If the search request is attempted on API version v1.
-            Exception: If the search request fails.
+            Dict[str, Any]: The search response containing success status and search results.
         """
-        raise NotImplementedError("Search is not supported in v1.")
+        if params is None:
+            params = {}
+
+        if isinstance(params, dict):
+            search_params = SearchParams(query=query, **params)
+        else:
+            search_params = params
+            search_params.query = query
+
+        response = requests.post(
+            f"{self.api_url}/v1/search",
+            headers={"Authorization": f"Bearer {self.api_key}"},
+            json=search_params.dict(exclude_none=True)
+        )
+
+        if response.status_code != 200:
+            raise Exception(f"Request failed with status code {response.status_code}")
+
+        return response.json()
 
     def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None,
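
For reference, a minimal usage sketch of the flow this patch adds, written against the JS SDK. This is an illustration under assumptions, not part of the patch: it assumes the SDK is installed as @mendable/firecrawl-js and that FIRECRAWL_API_KEY is set; the query and option values are arbitrary.

// search-example.ts -- a sketch of the new /v1/search flow (assumed setup, see note above).
import FirecrawlApp from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY });

async function main() {
  // With no scrapeOptions.formats, the controller returns SERP fields only
  // (url/title/description) and bills one credit per search result.
  const serpOnly = await app.search("firecrawl", { limit: 3 });
  console.log(serpOnly.data.map((d) => d.url));

  // With formats set, each result is scraped through the job queue and billed
  // per returned document; searchRequestSchema caps limit at 10.
  const scraped = await app.search("firecrawl", {
    limit: 3,
    scrapeOptions: { formats: ["markdown"] },
  });
  console.log(scraped.data[0]?.markdown?.slice(0, 200));
}

main().catch(console.error);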