From d2742bec4d0d405d73ee104f2e414f2026cb6bef Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:31:03 -0300 Subject: [PATCH 01/16] Nick: v1 search --- apps/api/src/controllers/v1/search.ts | 229 ++++++++++++++++++++++++++ apps/api/src/controllers/v1/types.ts | 41 +++++ apps/api/src/routes/v1.ts | 11 ++ apps/api/src/search/index.ts | 2 +- apps/api/src/types.ts | 2 + apps/js-sdk/firecrawl/src/index.ts | 112 ++++++++++++- 6 files changed, 387 insertions(+), 10 deletions(-) create mode 100644 apps/api/src/controllers/v1/search.ts diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts new file mode 100644 index 00000000..b124df9d --- /dev/null +++ b/apps/api/src/controllers/v1/search.ts @@ -0,0 +1,229 @@ +import { Response } from "express"; +import { logger } from "../../lib/logger"; +import { + Document, + RequestWithAuth, + SearchRequest, + SearchResponse, + searchRequestSchema, + ScrapeOptions, +} from "./types"; +import { billTeam } from "../../services/billing/credit_billing"; +import { v4 as uuidv4 } from "uuid"; +import { addScrapeJob, waitForJob } from "../../services/queue-jobs"; +import { logJob } from "../../services/logging/log_job"; +import { getJobPriority } from "../../lib/job-priority"; +import { PlanType, Mode } from "../../types"; +import { getScrapeQueue } from "../../services/queue-service"; +import { search } from "../../search"; +import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; +import * as Sentry from "@sentry/node"; + +async function scrapeSearchResult( + searchResult: { url: string; title: string; description: string }, + options: { + teamId: string; + plan: PlanType | undefined; + origin: string; + timeout: number; + scrapeOptions: ScrapeOptions; + }, +): Promise { + const jobId = uuidv4(); + const jobPriority = await getJobPriority({ + plan: options.plan as PlanType, + team_id: options.teamId, + basePriority: 10, + }); + + try { + await addScrapeJob( + { + url: searchResult.url, + mode: "single_urls" as Mode, + team_id: options.teamId, + scrapeOptions: options.scrapeOptions, + internalOptions: {}, + plan: options.plan || "free", + origin: options.origin, + is_scrape: true, + }, + {}, + jobId, + jobPriority, + ); + + const doc = await waitForJob(jobId, options.timeout); + await getScrapeQueue().remove(jobId); + + // Move SERP results to top level + return { + title: searchResult.title, + description: searchResult.description, + url: searchResult.url, + ...doc, + }; + } catch (error) { + logger.error(`Error in scrapeSearchResult: ${error}`, { + url: searchResult.url, + teamId: options.teamId, + }); + + // Return a minimal document with SERP results at top level + return { + metadata: { + title: searchResult.title, + description: searchResult.description, + sourceURL: searchResult.url, + statusCode: 0, + error: error.message, + }, + title: searchResult.title, + description: searchResult.description, + url: searchResult.url, + }; + } +} + +export async function searchController( + req: RequestWithAuth<{}, SearchResponse, SearchRequest>, + res: Response, +) { + try { + req.body = searchRequestSchema.parse(req.body); + + const jobId = uuidv4(); + const startTime = new Date().getTime(); + + let limit = req.body.limit; + if (req.auth.team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") { + limit = 1; + } + + // Buffer results by 50% to account for filtered URLs + const num_results_buffer = Math.floor(limit * 1.5); + + let searchResults = await search({ + query: req.body.query, + advanced: false, + 
num_results: num_results_buffer, + tbs: req.body.tbs, + filter: req.body.filter, + lang: req.body.lang, + country: req.body.country, + location: req.body.location, + }); + + // Filter blocked URLs early to avoid unnecessary billing + searchResults = searchResults.filter((r) => !isUrlBlocked(r.url)); + if (searchResults.length > limit) { + searchResults = searchResults.slice(0, limit); + } + + if (searchResults.length === 0) { + return res.status(200).json({ + success: true, + data: [], + warning: "No search results found", + }); + } + + if ( + !req.body.scrapeOptions.formats || + req.body.scrapeOptions.formats.length === 0 + ) { + billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch( + (error) => { + logger.error( + `Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`, + ); + }, + ); + return res.status(200).json({ + success: true, + data: searchResults.map((r) => ({ + url: r.url, + title: r.title, + description: r.description, + metadata: { + title: r.title, + description: r.description, + sourceURL: r.url, + statusCode: 0, + }, + })) as Document[], + }); + } + + // Scrape each result, handling timeouts individually + const scrapePromises = searchResults.map((result) => + scrapeSearchResult(result, { + teamId: req.auth.team_id, + plan: req.auth.plan, + origin: req.body.origin, + timeout: req.body.timeout, + scrapeOptions: req.body.scrapeOptions, + }), + ); + + const docs = await Promise.all(scrapePromises); + + // Bill for successful scrapes only + billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => { + logger.error( + `Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`, + ); + }); + + // Filter out empty content but keep docs with SERP results + const filteredDocs = docs.filter( + (doc) => + doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0), + ); + + if (filteredDocs.length === 0) { + return res.status(200).json({ + success: true, + data: docs, + warning: "No content found in search results", + }); + } + + const endTime = new Date().getTime(); + const timeTakenInSeconds = (endTime - startTime) / 1000; + + logJob({ + job_id: jobId, + success: true, + num_docs: filteredDocs.length, + docs: filteredDocs, + time_taken: timeTakenInSeconds, + team_id: req.auth.team_id, + mode: "search", + url: req.body.query, + origin: req.body.origin, + }); + + return res.status(200).json({ + success: true, + data: filteredDocs, + }); + } catch (error) { + if ( + error instanceof Error && + (error.message.startsWith("Job wait") || error.message === "timeout") + ) { + return res.status(408).json({ + success: false, + error: "Request timed out", + }); + } + + Sentry.captureException(error); + logger.error("Unhandled error occurred in search", { error }); + return res.status(500).json({ + success: false, + error: error.message, + }); + } +} diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 99c3aa6f..fe532859 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions export type MapRequest = z.infer; export type Document = { + title?: string; + description?: string; + url?: string; markdown?: string; html?: string; rawHtml?: string; @@ -426,6 +429,11 @@ export type Document = { error?: string; [key: string]: string | string[] | number | undefined; }; + serpResults?: { + title: string; + description: string; + url: string; + }; } export type 
ErrorResponse = { @@ -757,3 +765,36 @@ export function toLegacyDocument( warning: document.warning, }; } + +export const searchRequestSchema = z.object({ + query: z.string(), + limit: z.number().int().positive().finite().safe().optional().default(5), + tbs: z.string().optional(), + filter: z.string().optional(), + lang: z.string().optional().default("en"), + country: z.string().optional().default("us"), + location: z.string().optional(), + origin: z.string().optional().default("api"), + timeout: z.number().int().positive().finite().safe().default(60000), + scrapeOptions: scrapeOptions.extend({ + formats: z.array(z.enum([ + "markdown", + "html", + "rawHtml", + "links", + "screenshot", + "screenshot@fullPage", + "extract" + ])).default([]) + }).default({}), +}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes"); + +export type SearchRequest = z.infer; + +export type SearchResponse = + | ErrorResponse + | { + success: true; + warning?: string; + data: Document[]; + }; diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 1ee191ef..b6ab2ee8 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract"; // import { readinessController } from "../controllers/v1/readiness"; import { creditUsageController } from "../controllers/v1/credit-usage"; import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; +import { searchController } from "../controllers/v1/search"; function checkCreditsMiddleware( minimum?: number, @@ -169,6 +170,13 @@ v1Router.post( wrap(batchScrapeController), ); +v1Router.post( + "/search", + authMiddleware(RateLimiterMode.Search), + checkCreditsMiddleware(), + wrap(searchController), +); + v1Router.post( "/map", authMiddleware(RateLimiterMode.Map), @@ -231,3 +239,6 @@ v1Router.get( authMiddleware(RateLimiterMode.CrawlStatus), wrap(creditUsageController), ); + + + diff --git a/apps/api/src/search/index.ts b/apps/api/src/search/index.ts index 82a6b68f..d4e6ce9d 100644 --- a/apps/api/src/search/index.ts +++ b/apps/api/src/search/index.ts @@ -8,7 +8,7 @@ import { serper_search } from "./serper"; export async function search({ query, advanced = false, - num_results = 7, + num_results = 5, tbs = undefined, filter = undefined, lang = "en", diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 9db79bc5..8f6a39d9 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL"; type Mode = "crawl" | "single_urls" | "sitemap"; +export { Mode }; + export interface CrawlResult { source: string; content: string; diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 29fabf5d..e97a2624 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -68,6 +68,9 @@ export interface FirecrawlDocument[]; + warning?: string; + error?: string; +} + /** * Main class for interacting with the Firecrawl API. * Provides methods for scraping, searching, crawling, and mapping web content. @@ -369,16 +400,79 @@ export default class FirecrawlApp { } /** - * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API. - * @param query - The search query string. - * @param params - Additional parameters for the search. - * @returns Throws an error advising to use version 0 of the API. 
+ * Searches using the Firecrawl API and optionally scrapes the results. + * @param params - Parameters for the search request. + * @returns The response from the search operation. */ - async search( - query: string, - params?: any - ): Promise { - throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400); + async search(params: SearchParams): Promise { + const headers: AxiosRequestHeaders = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + } as AxiosRequestHeaders; + + let jsonData: any = { + query: params.query, + limit: params.limit ?? 5, + tbs: params.tbs, + filter: params.filter, + lang: params.lang ?? "en", + country: params.country ?? "us", + location: params.location, + origin: params.origin ?? "api", + timeout: params.timeout ?? 60000, + scrapeOptions: params.scrapeOptions ?? { formats: [] }, + }; + + if (jsonData?.scrapeOptions?.extract?.schema) { + let schema = jsonData.scrapeOptions.extract.schema; + + // Try parsing the schema as a Zod schema + try { + schema = zodToJsonSchema(schema); + } catch (error) { + + } + jsonData = { + ...jsonData, + scrapeOptions: { + ...jsonData.scrapeOptions, + extract: { + ...jsonData.scrapeOptions.extract, + schema: schema, + }, + }, + }; + } + + try { + const response: AxiosResponse = await this.postRequest( + this.apiUrl + `/v1/search`, + jsonData, + headers + ); + + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return { + success: true, + data: responseData.data, + warning: responseData.warning, + }; + } else { + throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status); + } + } else { + this.handleError(response, "search"); + } + } catch (error: any) { + if (error.response?.data?.error) { + throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? 
` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status); + } else { + throw new FirecrawlError(error.message, 500); + } + } + return { success: false, error: "Internal server error.", data: [] }; } /** From 35d720289427ab93379dbf5cc5fa24c659ce011f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:33:21 -0300 Subject: [PATCH 02/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index b124df9d..0bee71cd 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -96,9 +96,6 @@ export async function searchController( const startTime = new Date().getTime(); let limit = req.body.limit; - if (req.auth.team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") { - limit = 1; - } // Buffer results by 50% to account for filtered URLs const num_results_buffer = Math.floor(limit * 1.5); From 07a6ba5d91ac40655d66523230f0f95482fdb51e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:34:37 -0300 Subject: [PATCH 03/16] Nick: --- apps/python-sdk/firecrawl/firecrawl.py | 78 +++++++++++++++++++++++--- 1 file changed, 70 insertions(+), 8 deletions(-) diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 33d43b99..249e94af 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -22,6 +22,30 @@ import websockets logger : logging.Logger = logging.getLogger("firecrawl") class FirecrawlApp: + class SearchParams(pydantic.BaseModel): + """ + Parameters for the search operation. + """ + query: str + limit: Optional[int] = 5 + tbs: Optional[str] = None + filter: Optional[str] = None + lang: Optional[str] = "en" + country: Optional[str] = "us" + location: Optional[str] = None + origin: Optional[str] = "api" + timeout: Optional[int] = 60000 + scrapeOptions: Optional[Dict[str, Any]] = None + + class SearchResponse(pydantic.BaseModel): + """ + Response from the search operation. + """ + success: bool + data: List[Dict[str, Any]] + warning: Optional[str] = None + error: Optional[str] = None + class ExtractParams(pydantic.BaseModel): """ Parameters for the extract operation. @@ -109,22 +133,60 @@ class FirecrawlApp: else: self._handle_error(response, 'scrape URL') - def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any: + def search(self, params: Union[str, SearchParams, Dict[str, Any]]) -> SearchResponse: """ - Perform a search using the Firecrawl API. + Search using the Firecrawl API and optionally scrape the results. Args: - query (str): The search query. - params (Optional[Dict[str, Any]]): Additional parameters for the search request. + params (Union[str, SearchParams, Dict[str, Any]]): Search parameters. Can be: + - A string representing the search query + - A SearchParams object + - A dictionary with search parameters Returns: - Any: The search results if the request is successful. + SearchResponse: The response from the search operation. Raises: - NotImplementedError: If the search request is attempted on API version v1. - Exception: If the search request fails. + Exception: If the search operation fails. 
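+
+        Example (illustrative sketch; assumes a configured FirecrawlApp and a
+        placeholder API key):
+            >>> app = FirecrawlApp(api_key="fc-YOUR-KEY")
+            >>> app.search("firecrawl")                          # plain string query
+            >>> app.search({"query": "firecrawl", "limit": 3})   # dict of parameters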
""" - raise NotImplementedError("Search is not supported in v1.") + # Convert string query to SearchParams + if isinstance(params, str): + params = self.SearchParams(query=params) + # Convert dict to SearchParams + elif isinstance(params, dict): + params = self.SearchParams(**params) + + # Validate params + if not isinstance(params, self.SearchParams): + raise ValueError("Invalid search parameters") + + # Convert to dict for request + json_data = params.model_dump(exclude_none=True) + + # Handle schema conversion if present in scrapeOptions + if json_data.get('scrapeOptions', {}).get('extract', {}).get('schema'): + try: + schema = json_data['scrapeOptions']['extract']['schema'] + if isinstance(schema, dict): + # Already a JSON schema + pass + else: + # Try to convert from a Pydantic model + schema = schema.model_json_schema() + json_data['scrapeOptions']['extract']['schema'] = schema + except Exception as e: + logger.warning(f"Failed to convert schema: {e}") + + headers = self._prepare_headers() + response = self._post_request(f'{self.api_url}/v1/search', json_data, headers) + + if response.status_code == 200: + response_data = response.json() + return self.SearchResponse(**response_data) + else: + self._handle_error(response, 'search') + + return self.SearchResponse(success=False, data=[], error="Internal server error.") def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, From eae393afb501e540c22f0225eeb2d45ee36e925a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:52:50 -0300 Subject: [PATCH 04/16] Nick: fixed js sdk --- apps/js-sdk/firecrawl/src/index.ts | 28 ++++---- apps/python-sdk/firecrawl/firecrawl.py | 93 +++++++++----------------- 2 files changed, 47 insertions(+), 74 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index e97a2624..af9dbc75 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -290,7 +290,6 @@ export class FirecrawlError extends Error { * Defines options for searching and scraping search results. */ export interface SearchParams { - query: string; limit?: number; tbs?: string; filter?: string; @@ -401,26 +400,27 @@ export default class FirecrawlApp { /** * Searches using the Firecrawl API and optionally scrapes the results. - * @param params - Parameters for the search request. + * @param query - The search query string. + * @param params - Optional parameters for the search request. * @returns The response from the search operation. */ - async search(params: SearchParams): Promise { + async search(query: string, params?: SearchParams | Record): Promise { const headers: AxiosRequestHeaders = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; let jsonData: any = { - query: params.query, - limit: params.limit ?? 5, - tbs: params.tbs, - filter: params.filter, - lang: params.lang ?? "en", - country: params.country ?? "us", - location: params.location, - origin: params.origin ?? "api", - timeout: params.timeout ?? 60000, - scrapeOptions: params.scrapeOptions ?? { formats: [] }, + query, + limit: params?.limit ?? 5, + tbs: params?.tbs, + filter: params?.filter, + lang: params?.lang ?? "en", + country: params?.country ?? "us", + location: params?.location, + origin: params?.origin ?? "api", + timeout: params?.timeout ?? 60000, + scrapeOptions: params?.scrapeOptions ?? 
{ formats: [] }, }; if (jsonData?.scrapeOptions?.extract?.schema) { @@ -456,7 +456,7 @@ export default class FirecrawlApp { if (responseData.success) { return { success: true, - data: responseData.data, + data: responseData.data as FirecrawlDocument[], warning: responseData.warning, }; } else { diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 249e94af..271a13f0 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -21,22 +21,19 @@ import websockets logger : logging.Logger = logging.getLogger("firecrawl") -class FirecrawlApp: - class SearchParams(pydantic.BaseModel): - """ - Parameters for the search operation. - """ - query: str - limit: Optional[int] = 5 - tbs: Optional[str] = None - filter: Optional[str] = None - lang: Optional[str] = "en" - country: Optional[str] = "us" - location: Optional[str] = None - origin: Optional[str] = "api" - timeout: Optional[int] = 60000 - scrapeOptions: Optional[Dict[str, Any]] = None +class SearchParams(pydantic.BaseModel): + query: str + limit: Optional[int] = 5 + tbs: Optional[str] = None + filter: Optional[str] = None + lang: Optional[str] = "en" + country: Optional[str] = "us" + location: Optional[str] = None + origin: Optional[str] = "api" + timeout: Optional[int] = 60000 + scrapeOptions: Optional[Dict[str, Any]] = None +class FirecrawlApp: class SearchResponse(pydantic.BaseModel): """ Response from the search operation. @@ -133,60 +130,36 @@ class FirecrawlApp: else: self._handle_error(response, 'scrape URL') - def search(self, params: Union[str, SearchParams, Dict[str, Any]]) -> SearchResponse: + def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]: """ - Search using the Firecrawl API and optionally scrape the results. + Search for content using the Firecrawl API. Args: - params (Union[str, SearchParams, Dict[str, Any]]): Search parameters. Can be: - - A string representing the search query - - A SearchParams object - - A dictionary with search parameters + query (str): The search query string. + params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters. Returns: - SearchResponse: The response from the search operation. - - Raises: - Exception: If the search operation fails. + Dict[str, Any]: The search response containing success status and search results. 
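+
+        Example (illustrative sketch; assumes a configured FirecrawlApp):
+            >>> app.search("firecrawl", {"limit": 3, "scrapeOptions": {"formats": ["markdown"]}})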
""" - # Convert string query to SearchParams - if isinstance(params, str): - params = self.SearchParams(query=params) - # Convert dict to SearchParams - elif isinstance(params, dict): - params = self.SearchParams(**params) - - # Validate params - if not isinstance(params, self.SearchParams): - raise ValueError("Invalid search parameters") + if params is None: + params = {} - # Convert to dict for request - json_data = params.model_dump(exclude_none=True) - - # Handle schema conversion if present in scrapeOptions - if json_data.get('scrapeOptions', {}).get('extract', {}).get('schema'): - try: - schema = json_data['scrapeOptions']['extract']['schema'] - if isinstance(schema, dict): - # Already a JSON schema - pass - else: - # Try to convert from a Pydantic model - schema = schema.model_json_schema() - json_data['scrapeOptions']['extract']['schema'] = schema - except Exception as e: - logger.warning(f"Failed to convert schema: {e}") - - headers = self._prepare_headers() - response = self._post_request(f'{self.api_url}/v1/search', json_data, headers) - - if response.status_code == 200: - response_data = response.json() - return self.SearchResponse(**response_data) + if isinstance(params, dict): + search_params = SearchParams(query=query, **params) else: - self._handle_error(response, 'search') + search_params = params + search_params.query = query - return self.SearchResponse(success=False, data=[], error="Internal server error.") + response = requests.post( + f"{self.api_url}/v1/search", + headers={"Authorization": f"Bearer {self.api_key}"}, + json=search_params.dict(exclude_none=True) + ) + + if response.status_code != 200: + raise Exception(f"Request failed with status code {response.status_code}") + + return response.json() def crawl_url(self, url: str, params: Optional[Dict[str, Any]] = None, From 25da20efd2bb346034736c4f13f1388e5cd7a80e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:53:54 -0300 Subject: [PATCH 05/16] Nick: e2e --- .../__tests__/v1/e2e_withAuth/index.test.ts | 41 +++++++++++- .../__tests__/v1/e2e_withAuth/test.py | 66 +++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index e5c04209..2e601dc4 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -381,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => { expect(filteredLinks?.length).toBeGreaterThan(0); }, 30000); // 30 seconds timeout - test('should throw NotImplementedError for search on v1', async () => { + + + test('should search with string query', async () => { const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY }); - await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1"); + const response = await app.search("firecrawl"); + expect(response.success).toBe(true); + console.log(response.data); + expect(response.data?.length).toBeGreaterThan(0); + expect(response.data?.[0]?.markdown).toBeDefined(); + expect(response.data?.[0]?.metadata).toBeDefined(); + expect(response.data?.[0]?.metadata?.title).toBeDefined(); + expect(response.data?.[0]?.metadata?.description).toBeDefined(); + }); + + test('should search with params object', async () => { + const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY }); + const response = await app.search("firecrawl", { + limit: 3, + lang: 'en', + 
country: 'us', + scrapeOptions: { + formats: ['markdown', 'html', 'links'], + onlyMainContent: true + } + }); + expect(response.success).toBe(true); + expect(response.data.length).toBeLessThanOrEqual(3); + for (const doc of response.data) { + expect(doc.markdown).toBeDefined(); + expect(doc.html).toBeDefined(); + expect(doc.links).toBeDefined(); + expect(doc.metadata).toBeDefined(); + expect(doc.metadata?.title).toBeDefined(); + expect(doc.metadata?.description).toBeDefined(); + } + }); + + test('should handle invalid API key for search', async () => { + const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" }); + await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404"); }); }); diff --git a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py index d25d43f3..eacec8da 100644 --- a/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py +++ b/apps/python-sdk/firecrawl/__tests__/v1/e2e_withAuth/test.py @@ -371,4 +371,70 @@ def test_search_e2e(): # assert isinstance(llm_extraction['supports_sso'], bool) # assert isinstance(llm_extraction['is_open_source'], bool) +def test_search_with_string_query(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.search("firecrawl") + assert response["success"] is True + assert len(response["data"]) > 0 + assert response["data"][0]["markdown"] is not None + assert response["data"][0]["metadata"] is not None + assert response["data"][0]["metadata"]["title"] is not None + assert response["data"][0]["metadata"]["description"] is not None + +def test_search_with_params_dict(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + response = app.search("firecrawl", { + "limit": 3, + "lang": "en", + "country": "us", + "scrapeOptions": { + "formats": ["markdown", "html", "links"], + "onlyMainContent": True + } + }) + assert response["success"] is True + assert len(response["data"]) <= 3 + for doc in response["data"]: + assert doc["markdown"] is not None + assert doc["html"] is not None + assert doc["links"] is not None + assert doc["metadata"] is not None + assert doc["metadata"]["title"] is not None + assert doc["metadata"]["description"] is not None + +def test_search_with_params_object(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + params = SearchParams( + query="firecrawl", + limit=3, + lang="en", + country="us", + scrapeOptions={ + "formats": ["markdown", "html", "links"], + "onlyMainContent": True + } + ) + response = app.search(params.query, params) + assert response["success"] is True + assert len(response["data"]) <= 3 + for doc in response["data"]: + assert doc["markdown"] is not None + assert doc["html"] is not None + assert doc["links"] is not None + assert doc["metadata"] is not None + assert doc["metadata"]["title"] is not None + assert doc["metadata"]["description"] is not None + +def test_search_invalid_api_key(): + app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key") + with pytest.raises(Exception) as e: + app.search("test query") + assert "404" in str(e.value) + +def test_search_with_invalid_params(): + app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY) + with pytest.raises(Exception) as e: + app.search("test query", {"invalid_param": "value"}) + assert "ValidationError" in str(e.value) + From a0dbf20c40c15730af91e1bc8354ebb1c7ee3e4b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:55:28 -0300 Subject: [PATCH 06/16] Update types.ts --- 
apps/api/src/controllers/v1/types.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index fe532859..ccb11586 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -768,7 +768,7 @@ export function toLegacyDocument( export const searchRequestSchema = z.object({ query: z.string(), - limit: z.number().int().positive().finite().safe().optional().default(5), + limit: z.number().int().positive().finite().safe().max(10).optional().default(5), tbs: z.string().optional(), filter: z.string().optional(), lang: z.string().optional().default("en"), From 22ae1730bdb0880bebb85522e4e1880782fe7ac4 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:57:41 -0300 Subject: [PATCH 07/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 0bee71cd..1e5c0881 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -71,6 +71,9 @@ async function scrapeSearchResult( // Return a minimal document with SERP results at top level return { + title: searchResult.title, + description: searchResult.description, + url: searchResult.url, metadata: { title: searchResult.title, description: searchResult.description, @@ -78,9 +81,7 @@ async function scrapeSearchResult( statusCode: 0, error: error.message, }, - title: searchResult.title, - description: searchResult.description, - url: searchResult.url, + }; } } From 21bf89b6ccb51ded60b92c0443853adfdbfea44b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 19:57:51 -0300 Subject: [PATCH 08/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 1e5c0881..37b469c9 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -71,7 +71,7 @@ async function scrapeSearchResult( // Return a minimal document with SERP results at top level return { - title: searchResult.title, + title: searchResult.title, description: searchResult.description, url: searchResult.url, metadata: { @@ -81,7 +81,6 @@ async function scrapeSearchResult( statusCode: 0, error: error.message, }, - }; } } From a4b6dfecd16170cf3d0f48c9442e5342838ff342 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 20:02:07 -0300 Subject: [PATCH 09/16] Nick: v1.8.0 - added /v1/search support --- apps/js-sdk/firecrawl/package.json | 2 +- apps/python-sdk/firecrawl/__init__.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 29679b8b..9aab848a 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.10.1", + "version": "1.11.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index c860967a..d4d246e9 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.7.1" +__version__ = "1.8.0" # Define the logger for the 
Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") From b244afbc82dfc1b73d56b09279f5ed16c6cb4902 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 20:26:51 -0300 Subject: [PATCH 10/16] Update README.md --- README.md | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d44bde8c..658fb8be 100644 --- a/README.md +++ b/README.md @@ -365,19 +365,18 @@ curl -X POST https://api.firecrawl.dev/v1/batch/scrape \ }' ``` -### Search (v0) (Beta) +### Search -Used to search the web, get the most relevant results, scrape each page and return the markdown. +The search endpoint combines web search with Firecrawl’s scraping capabilities to return full page content for any query. + +Include `scrapeOptions` with `formats: ["markdown"]` to get complete markdown content for each search result otherwise it defaults to getting SERP results (url, title, description). ```bash -curl -X POST https://api.firecrawl.dev/v0/search \ +curl -X POST https://api.firecrawl.dev/v1/search \ -H 'Content-Type: application/json' \ -H 'Authorization: Bearer YOUR_API_KEY' \ -d '{ - "query": "firecrawl", - "pageOptions": { - "fetchPageContent": true // false for a fast serp api - } + "query": "What is Mendable?" }' ``` @@ -387,14 +386,8 @@ curl -X POST https://api.firecrawl.dev/v0/search \ "data": [ { "url": "https://mendable.ai", - "markdown": "# Markdown Content", - "provider": "web-scraper", - "metadata": { - "title": "Mendable | AI for CX and Sales", - "description": "AI for CX and Sales", - "language": null, - "sourceURL": "https://www.mendable.ai/" - } + "title": "Mendable | AI for CX and Sales", + "description": "AI for CX and Sales" } ] } From 7ce780ac8113e7eccd97759d6272dca740b20bc6 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 20:40:38 -0300 Subject: [PATCH 11/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 37b469c9..ab1975c6 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -112,7 +112,6 @@ export async function searchController( }); // Filter blocked URLs early to avoid unnecessary billing - searchResults = searchResults.filter((r) => !isUrlBlocked(r.url)); if (searchResults.length > limit) { searchResults = searchResults.slice(0, limit); } @@ -152,7 +151,10 @@ export async function searchController( }); } - // Scrape each result, handling timeouts individually + // Filter out blocked URLs before scraping + searchResults = searchResults.filter((r) => !isUrlBlocked(r.url)); + + // Scrape each non-blocked result, handling timeouts individually const scrapePromises = searchResults.map((result) => scrapeSearchResult(result, { teamId: req.auth.team_id, From 8b64e915b37258d799a3681ea66d00f87567b692 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 21:02:55 -0300 Subject: [PATCH 12/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index ab1975c6..eecbf360 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -144,8 +144,7 @@ export async function searchController( metadata: { title: r.title, description: r.description, - sourceURL: r.url, - statusCode: 0, + sourceURL: r.url }, })) as 
Document[], }); From e37ab8431a0d6cf55e11f4e169e75737d45dcb5a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 21:07:14 -0300 Subject: [PATCH 13/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 5 ----- 1 file changed, 5 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index eecbf360..1648f3d2 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -141,11 +141,6 @@ export async function searchController( url: r.url, title: r.title, description: r.description, - metadata: { - title: r.title, - description: r.description, - sourceURL: r.url - }, })) as Document[], }); } From cbe07164397e047af67b7b2719723b859aacc29f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 21:13:24 -0300 Subject: [PATCH 14/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 1648f3d2..7d7bf9d3 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -18,6 +18,7 @@ import { getScrapeQueue } from "../../services/queue-service"; import { search } from "../../search"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import * as Sentry from "@sentry/node"; +import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings"; async function scrapeSearchResult( searchResult: { url: string; title: string; description: string }, @@ -37,6 +38,9 @@ async function scrapeSearchResult( }); try { + if (isUrlBlocked(searchResult.url)) { + throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE); + } await addScrapeJob( { url: searchResult.url, @@ -75,9 +79,6 @@ async function scrapeSearchResult( description: searchResult.description, url: searchResult.url, metadata: { - title: searchResult.title, - description: searchResult.description, - sourceURL: searchResult.url, statusCode: 0, error: error.message, }, @@ -145,10 +146,7 @@ export async function searchController( }); } - // Filter out blocked URLs before scraping - searchResults = searchResults.filter((r) => !isUrlBlocked(r.url)); - - // Scrape each non-blocked result, handling timeouts individually + // Scrape each non-blocked result, handling timeouts individually const scrapePromises = searchResults.map((result) => scrapeSearchResult(result, { teamId: req.auth.team_id, From ad49503f8abaafc06ac909e09c5a83f42525ac68 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 2 Jan 2025 21:15:47 -0300 Subject: [PATCH 15/16] Update search.ts --- apps/api/src/controllers/v1/search.ts | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts index 7d7bf9d3..1d8c59eb 100644 --- a/apps/api/src/controllers/v1/search.ts +++ b/apps/api/src/controllers/v1/search.ts @@ -73,13 +73,17 @@ async function scrapeSearchResult( teamId: options.teamId, }); + let statusCode = 0; + if (error.message.includes("Could not scrape url")) { + statusCode = 403; + } // Return a minimal document with SERP results at top level return { title: searchResult.title, description: searchResult.description, url: searchResult.url, metadata: { - statusCode: 0, + statusCode, error: error.message, }, }; From 87757d9b8e6bacc658b48832deb47c51eaf7412a Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 3 Jan 2025 13:19:08 -0300 Subject: [PATCH 16/16] Nick: fixed schemas on 
extract for node --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 18 ++++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 9aab848a..68140437 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.11.0", + "version": "1.11.2", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index af9dbc75..60f485d0 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,5 +1,5 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; -import type * as zt from "zod"; +import * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; @@ -247,7 +247,7 @@ export interface MapResponse { */ export interface ExtractParams { prompt?: string; - schema?: LLMSchema; + schema?: LLMSchema | object; systemPrompt?: string; allowExternalLinks?: boolean; includeSubdomains?: boolean; @@ -835,16 +835,18 @@ export default class FirecrawlApp { async extract(urls: string[], params?: ExtractParams): Promise> | ErrorResponse> { const headers = this.prepareHeaders(); - if (!params?.prompt) { - throw new FirecrawlError("Prompt is required", 400); - } - let jsonData: { urls: string[] } & ExtractParams = { urls, ...params }; let jsonSchema: any; try { - jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined; + if (!params?.schema) { + jsonSchema = undefined; + } else if (params.schema instanceof zt.ZodType) { + jsonSchema = zodToJsonSchema(params.schema); + } else { + jsonSchema = params.schema; + } } catch (error: any) { - throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400); + throw new FirecrawlError("Invalid schema. Schema must be either a valid Zod schema or JSON schema object.", 400); } try {