Merge pull request #1032 from mendableai/nsc/v1-search

(feat/v1) Search
Nicolas 2025-01-02 19:58:44 -03:00 committed by GitHub
commit b61a1ccfd3
9 changed files with 533 additions and 21 deletions

View File

@@ -0,0 +1,226 @@
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
Document,
RequestWithAuth,
SearchRequest,
SearchResponse,
searchRequestSchema,
ScrapeOptions,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType, Mode } from "../../types";
import { getScrapeQueue } from "../../services/queue-service";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import * as Sentry from "@sentry/node";
async function scrapeSearchResult(
searchResult: { url: string; title: string; description: string },
options: {
teamId: string;
plan: PlanType | undefined;
origin: string;
timeout: number;
scrapeOptions: ScrapeOptions;
},
): Promise<Document> {
const jobId = uuidv4();
const jobPriority = await getJobPriority({
plan: options.plan as PlanType,
team_id: options.teamId,
basePriority: 10,
});
try {
await addScrapeJob(
{
url: searchResult.url,
mode: "single_urls" as Mode,
team_id: options.teamId,
scrapeOptions: options.scrapeOptions,
internalOptions: {},
plan: options.plan || "free",
origin: options.origin,
is_scrape: true,
},
{},
jobId,
jobPriority,
);
const doc = await waitForJob<Document>(jobId, options.timeout);
await getScrapeQueue().remove(jobId);
// Move SERP results to top level
return {
title: searchResult.title,
description: searchResult.description,
url: searchResult.url,
...doc,
};
} catch (error) {
logger.error(`Error in scrapeSearchResult: ${error}`, {
url: searchResult.url,
teamId: options.teamId,
});
// Return a minimal document with SERP results at top level
return {
title: searchResult.title,
description: searchResult.description,
url: searchResult.url,
metadata: {
title: searchResult.title,
description: searchResult.description,
sourceURL: searchResult.url,
statusCode: 0,
error: error.message,
},
};
}
}
export async function searchController(
req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
res: Response<SearchResponse>,
) {
try {
req.body = searchRequestSchema.parse(req.body);
const jobId = uuidv4();
const startTime = new Date().getTime();
let limit = req.body.limit;
// Buffer results by 50% to account for filtered URLs
const num_results_buffer = Math.floor(limit * 1.5);
let searchResults = await search({
query: req.body.query,
advanced: false,
num_results: num_results_buffer,
tbs: req.body.tbs,
filter: req.body.filter,
lang: req.body.lang,
country: req.body.country,
location: req.body.location,
});
// Filter blocked URLs early to avoid unnecessary billing
searchResults = searchResults.filter((r) => !isUrlBlocked(r.url));
if (searchResults.length > limit) {
searchResults = searchResults.slice(0, limit);
}
if (searchResults.length === 0) {
return res.status(200).json({
success: true,
data: [],
warning: "No search results found",
});
}
if (
!req.body.scrapeOptions.formats ||
req.body.scrapeOptions.formats.length === 0
) {
billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch(
(error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`,
);
},
);
return res.status(200).json({
success: true,
data: searchResults.map((r) => ({
url: r.url,
title: r.title,
description: r.description,
metadata: {
title: r.title,
description: r.description,
sourceURL: r.url,
statusCode: 0,
},
})) as Document[],
});
}
// Scrape each result, handling timeouts individually
const scrapePromises = searchResults.map((result) =>
scrapeSearchResult(result, {
teamId: req.auth.team_id,
plan: req.auth.plan,
origin: req.body.origin,
timeout: req.body.timeout,
scrapeOptions: req.body.scrapeOptions,
}),
);
const docs = await Promise.all(scrapePromises);
// Bill for successful scrapes only
billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`,
);
});
// Filter out empty content but keep docs with SERP results
const filteredDocs = docs.filter(
(doc) =>
doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
);
if (filteredDocs.length === 0) {
return res.status(200).json({
success: true,
data: docs,
warning: "No content found in search results",
});
}
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
logJob({
job_id: jobId,
success: true,
num_docs: filteredDocs.length,
docs: filteredDocs,
time_taken: timeTakenInSeconds,
team_id: req.auth.team_id,
mode: "search",
url: req.body.query,
origin: req.body.origin,
});
return res.status(200).json({
success: true,
data: filteredDocs,
});
} catch (error) {
if (
error instanceof Error &&
(error.message.startsWith("Job wait") || error.message === "timeout")
) {
return res.status(408).json({
success: false,
error: "Request timed out",
});
}
Sentry.captureException(error);
logger.error("Unhandled error occurred in search", { error });
return res.status(500).json({
success: false,
error: error.message,
});
}
}
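
For orientation, here is a minimal sketch of what a request to this new controller looks like from a client's perspective. The host, API key, and query values are illustrative assumptions, not part of this commit; the body shape follows searchRequestSchema, and leaving scrapeOptions.formats empty returns SERP-only results, while requesting formats triggers per-result scraping.

// Hypothetical client call against the new POST /v1/search route.
async function exampleSearch(): Promise<void> {
  const res = await fetch("https://api.firecrawl.dev/v1/search", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR-API-KEY", // placeholder key
    },
    body: JSON.stringify({
      query: "firecrawl",
      limit: 3,
      scrapeOptions: { formats: ["markdown"] }, // omit formats for SERP-only results
    }),
  });
  const body = await res.json();
  if (res.status === 200 && body.success) {
    for (const doc of body.data) {
      console.log(doc.url, doc.title);
    }
  } else {
    console.error(body.error ?? `Unexpected status ${res.status}`);
  }
}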

View File

@@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions
export type MapRequest = z.infer<typeof mapRequestSchema>;
export type Document = {
title?: string;
description?: string;
url?: string;
markdown?: string;
html?: string;
rawHtml?: string;
@@ -426,6 +429,11 @@ export type Document = {
error?: string;
[key: string]: string | string[] | number | undefined;
};
serpResults?: {
title: string;
description: string;
url: string;
};
}
export type ErrorResponse = {
@@ -757,3 +765,36 @@ export function toLegacyDocument(
warning: document.warning,
};
}
export const searchRequestSchema = z.object({
query: z.string(),
limit: z.number().int().positive().finite().safe().max(10).optional().default(5),
tbs: z.string().optional(),
filter: z.string().optional(),
lang: z.string().optional().default("en"),
country: z.string().optional().default("us"),
location: z.string().optional(),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000),
scrapeOptions: scrapeOptions.extend({
formats: z.array(z.enum([
"markdown",
"html",
"rawHtml",
"links",
"screenshot",
"screenshot@fullPage",
"extract"
])).default([])
}).default({}),
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");
export type SearchRequest = z.infer<typeof searchRequestSchema>;
export type SearchResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: Document[];
};
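
To make the defaults above concrete, a small sketch of parsing a minimal body with this schema; the import path is an assumption for illustration.

import { searchRequestSchema } from "./types"; // assumed path to this types file

const parsed = searchRequestSchema.parse({ query: "firecrawl" });
// Defaults filled in by the schema:
//   parsed.limit === 5
//   parsed.lang === "en", parsed.country === "us"
//   parsed.origin === "api", parsed.timeout === 60000
//   parsed.scrapeOptions.formats is [] (plus the usual scrapeOptions defaults)
// .strict() rejects unknown keys with the message above:
//   searchRequestSchema.parse({ query: "x", foo: 1 }) throws a ZodError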

View File

@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract";
// import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
function checkCreditsMiddleware(
minimum?: number,
@@ -169,6 +170,13 @@ v1Router.post(
wrap(batchScrapeController),
);
v1Router.post(
"/search",
authMiddleware(RateLimiterMode.Search),
checkCreditsMiddleware(),
wrap(searchController),
);
v1Router.post(
"/map",
authMiddleware(RateLimiterMode.Map),
@@ -231,3 +239,6 @@ v1Router.get(
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(creditUsageController),
);

View File

@@ -8,7 +8,7 @@ import { serper_search } from "./serper";
export async function search({
query,
advanced = false,
-num_results = 7,
num_results = 5,
tbs = undefined,
filter = undefined,
lang = "en",

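Worth noting how this default interacts with the new controller: with the controller's default limit of 5 and its 50% result buffer, the underlying search is still asked for the old default of 7 results.

// In the controller above, with limit = 5 (the schema default):
const num_results_buffer = Math.floor(5 * 1.5); // 7, the previous default shown here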
View File

@@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL";
type Mode = "crawl" | "single_urls" | "sitemap";
export { Mode };
export interface CrawlResult {
source: string;
content: string;

View File

@@ -381,8 +381,45 @@ describe('FirecrawlApp E2E Tests', () => {
expect(filteredLinks?.length).toBeGreaterThan(0);
}, 30000); // 30 seconds timeout
-test('should throw NotImplementedError for search on v1', async () => {
test('should search with string query', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
-await expect(app.search("test query")).rejects.toThrow("Search is not supported in v1");
const response = await app.search("firecrawl");
expect(response.success).toBe(true);
console.log(response.data);
expect(response.data?.length).toBeGreaterThan(0);
expect(response.data?.[0]?.markdown).toBeDefined();
expect(response.data?.[0]?.metadata).toBeDefined();
expect(response.data?.[0]?.metadata?.title).toBeDefined();
expect(response.data?.[0]?.metadata?.description).toBeDefined();
});
test('should search with params object', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: TEST_API_KEY });
const response = await app.search("firecrawl", {
limit: 3,
lang: 'en',
country: 'us',
scrapeOptions: {
formats: ['markdown', 'html', 'links'],
onlyMainContent: true
}
});
expect(response.success).toBe(true);
expect(response.data.length).toBeLessThanOrEqual(3);
for (const doc of response.data) {
expect(doc.markdown).toBeDefined();
expect(doc.html).toBeDefined();
expect(doc.links).toBeDefined();
expect(doc.metadata).toBeDefined();
expect(doc.metadata?.title).toBeDefined();
expect(doc.metadata?.description).toBeDefined();
}
});
test('should handle invalid API key for search', async () => {
const app = new FirecrawlApp({ apiUrl: API_URL, apiKey: "invalid_api_key" });
await expect(app.search("test query")).rejects.toThrow("Request failed with status code 404");
});
});

View File

@@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
screenshot?: string;
metadata?: FirecrawlDocumentMetadata;
actions: ActionsSchema;
// v1 search only
title?: string;
description?: string;
}
/**
@@ -282,6 +285,33 @@ export class FirecrawlError extends Error {
}
}
/**
* Parameters for search operations.
* Defines options for searching and scraping search results.
*/
export interface SearchParams {
limit?: number;
tbs?: string;
filter?: string;
lang?: string;
country?: string;
location?: string;
origin?: string;
timeout?: number;
scrapeOptions?: ScrapeParams;
}
/**
* Response interface for search operations.
* Defines the structure of the response received after a search operation.
*/
export interface SearchResponse {
success: boolean;
data: FirecrawlDocument<undefined>[];
warning?: string;
error?: string;
}
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
@@ -369,16 +399,80 @@ export default class FirecrawlApp {
}
/**
-* This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
* Searches using the Firecrawl API and optionally scrapes the results.
* @param query - The search query string.
-* @param params - Additional parameters for the search.
-* @returns Throws an error advising to use version 0 of the API.
* @param params - Optional parameters for the search request.
* @returns The response from the search operation.
*/
-async search(
-query: string,
-params?: any
-): Promise<any> {
-throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400);
async search(query: string, params?: SearchParams | Record<string, any>): Promise<SearchResponse> {
const headers: AxiosRequestHeaders = {
"Content-Type": "application/json",
Authorization: `Bearer ${this.apiKey}`,
} as AxiosRequestHeaders;
let jsonData: any = {
query,
limit: params?.limit ?? 5,
tbs: params?.tbs,
filter: params?.filter,
lang: params?.lang ?? "en",
country: params?.country ?? "us",
location: params?.location,
origin: params?.origin ?? "api",
timeout: params?.timeout ?? 60000,
scrapeOptions: params?.scrapeOptions ?? { formats: [] },
};
if (jsonData?.scrapeOptions?.extract?.schema) {
let schema = jsonData.scrapeOptions.extract.schema;
// Try parsing the schema as a Zod schema
try {
schema = zodToJsonSchema(schema);
} catch (error) {
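// Conversion failed (e.g. the schema is already plain JSON Schema); pass it through unchanged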
}
jsonData = {
...jsonData,
scrapeOptions: {
...jsonData.scrapeOptions,
extract: {
...jsonData.scrapeOptions.extract,
schema: schema,
},
},
};
}
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/search`,
jsonData,
headers
);
if (response.status === 200) {
const responseData = response.data;
if (responseData.success) {
return {
success: true,
data: responseData.data as FirecrawlDocument<any>[],
warning: responseData.warning,
};
} else {
throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
}
} else {
this.handleError(response, "search");
}
} catch (error: any) {
if (error.response?.data?.error) {
throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
} else {
throw new FirecrawlError(error.message, 500);
}
}
return { success: false, error: "Internal server error.", data: [] };
}
/**

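For completeness, a brief usage sketch of the new SDK method. It assumes the published package name @mendable/firecrawl-js; the key and option values are placeholders, and error handling leans on the FirecrawlError class defined earlier in this file.

import FirecrawlApp, { FirecrawlError } from "@mendable/firecrawl-js";

async function runSearch(): Promise<void> {
  const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key
  try {
    // SERP-only results: no formats requested, so nothing is scraped
    const serpOnly = await app.search("firecrawl");
    console.log(serpOnly.data.map((d) => d.url));

    // Scrape the top 3 results as markdown
    const scraped = await app.search("firecrawl", {
      limit: 3,
      scrapeOptions: { formats: ["markdown"] },
    });
    for (const doc of scraped.data) {
      console.log(doc.metadata?.title, doc.markdown?.slice(0, 80));
    }
  } catch (err) {
    if (err instanceof FirecrawlError) {
      console.error(`Search failed: ${err.message}`);
    } else {
      throw err;
    }
  }
}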
View File

@@ -371,4 +371,70 @@ def test_search_e2e():
# assert isinstance(llm_extraction['supports_sso'], bool)
# assert isinstance(llm_extraction['is_open_source'], bool)
def test_search_with_string_query():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.search("firecrawl")
assert response["success"] is True
assert len(response["data"]) > 0
assert response["data"][0]["markdown"] is not None
assert response["data"][0]["metadata"] is not None
assert response["data"][0]["metadata"]["title"] is not None
assert response["data"][0]["metadata"]["description"] is not None
def test_search_with_params_dict():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
response = app.search("firecrawl", {
"limit": 3,
"lang": "en",
"country": "us",
"scrapeOptions": {
"formats": ["markdown", "html", "links"],
"onlyMainContent": True
}
})
assert response["success"] is True
assert len(response["data"]) <= 3
for doc in response["data"]:
assert doc["markdown"] is not None
assert doc["html"] is not None
assert doc["links"] is not None
assert doc["metadata"] is not None
assert doc["metadata"]["title"] is not None
assert doc["metadata"]["description"] is not None
def test_search_with_params_object():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
params = SearchParams(
query="firecrawl",
limit=3,
lang="en",
country="us",
scrapeOptions={
"formats": ["markdown", "html", "links"],
"onlyMainContent": True
}
)
response = app.search(params.query, params)
assert response["success"] is True
assert len(response["data"]) <= 3
for doc in response["data"]:
assert doc["markdown"] is not None
assert doc["html"] is not None
assert doc["links"] is not None
assert doc["metadata"] is not None
assert doc["metadata"]["title"] is not None
assert doc["metadata"]["description"] is not None
def test_search_invalid_api_key():
app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
with pytest.raises(Exception) as e:
app.search("test query")
assert "404" in str(e.value)
def test_search_with_invalid_params():
app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)
with pytest.raises(Exception) as e:
app.search("test query", {"invalid_param": "value"})
assert "ValidationError" in str(e.value)

View File

@@ -21,7 +21,28 @@ import websockets
logger : logging.Logger = logging.getLogger("firecrawl")
class SearchParams(pydantic.BaseModel):
query: str
limit: Optional[int] = 5
tbs: Optional[str] = None
filter: Optional[str] = None
lang: Optional[str] = "en"
country: Optional[str] = "us"
location: Optional[str] = None
origin: Optional[str] = "api"
timeout: Optional[int] = 60000
scrapeOptions: Optional[Dict[str, Any]] = None
class FirecrawlApp:
class SearchResponse(pydantic.BaseModel):
"""
Response from the search operation.
"""
success: bool
data: List[Dict[str, Any]]
warning: Optional[str] = None
error: Optional[str] = None
class ExtractParams(pydantic.BaseModel):
"""
Parameters for the extract operation.
@@ -109,22 +130,36 @@ class FirecrawlApp:
else:
self._handle_error(response, 'scrape URL')
-def search(self, query: str, params: Optional[Dict[str, Any]] = None) -> Any:
def search(self, query: str, params: Optional[Union[Dict[str, Any], SearchParams]] = None) -> Dict[str, Any]:
"""
-Perform a search using the Firecrawl API.
Search for content using the Firecrawl API.
Args:
-query (str): The search query.
-params (Optional[Dict[str, Any]]): Additional parameters for the search request.
query (str): The search query string.
params (Optional[Union[Dict[str, Any], SearchParams]]): Additional search parameters.
Returns:
-Any: The search results if the request is successful.
-Raises:
-NotImplementedError: If the search request is attempted on API version v1.
-Exception: If the search request fails.
Dict[str, Any]: The search response containing success status and search results.
"""
-raise NotImplementedError("Search is not supported in v1.")
if params is None:
params = {}
if isinstance(params, dict):
search_params = SearchParams(query=query, **params)
else:
search_params = params
search_params.query = query
response = requests.post(
f"{self.api_url}/v1/search",
headers={"Authorization": f"Bearer {self.api_key}"},
json=search_params.dict(exclude_none=True)
)
if response.status_code != 200:
raise Exception(f"Request failed with status code {response.status_code}")
return response.json()
def crawl_url(self, url: str,
params: Optional[Dict[str, Any]] = None,