mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 19:29:01 +08:00
Nick: v1 search
This commit is contained in:
parent c822e34d37
commit d2742bec4d
apps/api/src/controllers/v1/search.ts (new file, 229 lines)
@ -0,0 +1,229 @@
import { Response } from "express";
import { logger } from "../../lib/logger";
import {
  Document,
  RequestWithAuth,
  SearchRequest,
  SearchResponse,
  searchRequestSchema,
  ScrapeOptions,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
import { addScrapeJob, waitForJob } from "../../services/queue-jobs";
import { logJob } from "../../services/logging/log_job";
import { getJobPriority } from "../../lib/job-priority";
import { PlanType, Mode } from "../../types";
import { getScrapeQueue } from "../../services/queue-service";
import { search } from "../../search";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import * as Sentry from "@sentry/node";

async function scrapeSearchResult(
  searchResult: { url: string; title: string; description: string },
  options: {
    teamId: string;
    plan: PlanType | undefined;
    origin: string;
    timeout: number;
    scrapeOptions: ScrapeOptions;
  },
): Promise<Document> {
  const jobId = uuidv4();
  const jobPriority = await getJobPriority({
    plan: options.plan as PlanType,
    team_id: options.teamId,
    basePriority: 10,
  });

  try {
    await addScrapeJob(
      {
        url: searchResult.url,
        mode: "single_urls" as Mode,
        team_id: options.teamId,
        scrapeOptions: options.scrapeOptions,
        internalOptions: {},
        plan: options.plan || "free",
        origin: options.origin,
        is_scrape: true,
      },
      {},
      jobId,
      jobPriority,
    );

    const doc = await waitForJob<Document>(jobId, options.timeout);
    await getScrapeQueue().remove(jobId);

    // Move SERP results to top level
    return {
      title: searchResult.title,
      description: searchResult.description,
      url: searchResult.url,
      ...doc,
    };
  } catch (error) {
    logger.error(`Error in scrapeSearchResult: ${error}`, {
      url: searchResult.url,
      teamId: options.teamId,
    });

    // Return a minimal document with SERP results at top level
    return {
      metadata: {
        title: searchResult.title,
        description: searchResult.description,
        sourceURL: searchResult.url,
        statusCode: 0,
        error: error.message,
      },
      title: searchResult.title,
      description: searchResult.description,
      url: searchResult.url,
    };
  }
}

export async function searchController(
  req: RequestWithAuth<{}, SearchResponse, SearchRequest>,
  res: Response<SearchResponse>,
) {
  try {
    req.body = searchRequestSchema.parse(req.body);

    const jobId = uuidv4();
    const startTime = new Date().getTime();

    let limit = req.body.limit;
    if (req.auth.team_id === "d97c4ceb-290b-4957-8432-2b2a02727d95") {
      limit = 1;
    }

    // Buffer results by 50% to account for filtered URLs
    const num_results_buffer = Math.floor(limit * 1.5);

    let searchResults = await search({
      query: req.body.query,
      advanced: false,
      num_results: num_results_buffer,
      tbs: req.body.tbs,
      filter: req.body.filter,
      lang: req.body.lang,
      country: req.body.country,
      location: req.body.location,
    });

    // Filter blocked URLs early to avoid unnecessary billing
    searchResults = searchResults.filter((r) => !isUrlBlocked(r.url));
    if (searchResults.length > limit) {
      searchResults = searchResults.slice(0, limit);
    }

    if (searchResults.length === 0) {
      return res.status(200).json({
        success: true,
        data: [],
        warning: "No search results found",
      });
    }

    if (
      !req.body.scrapeOptions.formats ||
      req.body.scrapeOptions.formats.length === 0
    ) {
      billTeam(req.auth.team_id, req.acuc?.sub_id, searchResults.length).catch(
        (error) => {
          logger.error(
            `Failed to bill team ${req.auth.team_id} for ${searchResults.length} credits: ${error}`,
          );
        },
      );
      return res.status(200).json({
        success: true,
        data: searchResults.map((r) => ({
          url: r.url,
          title: r.title,
          description: r.description,
          metadata: {
            title: r.title,
            description: r.description,
            sourceURL: r.url,
            statusCode: 0,
          },
        })) as Document[],
      });
    }

    // Scrape each result, handling timeouts individually
    const scrapePromises = searchResults.map((result) =>
      scrapeSearchResult(result, {
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        origin: req.body.origin,
        timeout: req.body.timeout,
        scrapeOptions: req.body.scrapeOptions,
      }),
    );

    const docs = await Promise.all(scrapePromises);

    // Bill for successful scrapes only
    billTeam(req.auth.team_id, req.acuc?.sub_id, docs.length).catch((error) => {
      logger.error(
        `Failed to bill team ${req.auth.team_id} for ${docs.length} credits: ${error}`,
      );
    });

    // Filter out empty content but keep docs with SERP results
    const filteredDocs = docs.filter(
      (doc) =>
        doc.serpResults || (doc.markdown && doc.markdown.trim().length > 0),
    );

    if (filteredDocs.length === 0) {
      return res.status(200).json({
        success: true,
        data: docs,
        warning: "No content found in search results",
      });
    }

    const endTime = new Date().getTime();
    const timeTakenInSeconds = (endTime - startTime) / 1000;

    logJob({
      job_id: jobId,
      success: true,
      num_docs: filteredDocs.length,
      docs: filteredDocs,
      time_taken: timeTakenInSeconds,
      team_id: req.auth.team_id,
      mode: "search",
      url: req.body.query,
      origin: req.body.origin,
    });

    return res.status(200).json({
      success: true,
      data: filteredDocs,
    });
  } catch (error) {
    if (
      error instanceof Error &&
      (error.message.startsWith("Job wait") || error.message === "timeout")
    ) {
      return res.status(408).json({
        success: false,
        error: "Request timed out",
      });
    }

    Sentry.captureException(error);
    logger.error("Unhandled error occurred in search", { error });
    return res.status(500).json({
      success: false,
      error: error.message,
    });
  }
}
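The controller has two paths: when scrapeOptions.formats is empty it returns the SERP entries as-is and bills one credit per result; otherwise it enqueues one scrape job per result, waits for them in parallel, and bills per returned document. A rough sketch of the two response shapes, with illustrative values rather than real output:

// Illustrative only: approximate bodies returned by POST /v1/search.

// scrapeOptions.formats omitted or []: SERP entries only.
const serpOnlyResponse = {
  success: true,
  data: [
    {
      url: "https://example.com/",
      title: "Example Domain",
      description: "Search-engine snippet for the result",
      metadata: {
        title: "Example Domain",
        description: "Search-engine snippet for the result",
        sourceURL: "https://example.com/",
        statusCode: 0,
      },
    },
  ],
};

// scrapeOptions.formats = ["markdown"]: each result is also scraped.
const scrapedResponse = {
  success: true,
  data: [
    {
      url: "https://example.com/",
      title: "Example Domain",
      description: "Search-engine snippet for the result",
      markdown: "# Example Domain\n...",
      metadata: { sourceURL: "https://example.com/", statusCode: 200 },
    },
  ],
};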
apps/api/src/controllers/v1/types.ts
@ -379,6 +379,9 @@ export const mapRequestSchema = crawlerOptions
export type MapRequest = z.infer<typeof mapRequestSchema>;

export type Document = {
  title?: string;
  description?: string;
  url?: string;
  markdown?: string;
  html?: string;
  rawHtml?: string;
@ -426,6 +429,11 @@ export type Document = {
    error?: string;
    [key: string]: string | string[] | number | undefined;
  };
  serpResults?: {
    title: string;
    description: string;
    url: string;
  };
}

export type ErrorResponse = {
@ -757,3 +765,36 @@ export function toLegacyDocument(
    warning: document.warning,
  };
}

export const searchRequestSchema = z.object({
  query: z.string(),
  limit: z.number().int().positive().finite().safe().optional().default(5),
  tbs: z.string().optional(),
  filter: z.string().optional(),
  lang: z.string().optional().default("en"),
  country: z.string().optional().default("us"),
  location: z.string().optional(),
  origin: z.string().optional().default("api"),
  timeout: z.number().int().positive().finite().safe().default(60000),
  scrapeOptions: scrapeOptions.extend({
    formats: z.array(z.enum([
      "markdown",
      "html",
      "rawHtml",
      "links",
      "screenshot",
      "screenshot@fullPage",
      "extract"
    ])).default([])
  }).default({}),
}).strict("Unrecognized key in body -- please review the v1 API documentation for request body changes");

export type SearchRequest = z.infer<typeof searchRequestSchema>;

export type SearchResponse =
  | ErrorResponse
  | {
      success: true;
      warning?: string;
      data: Document[];
    };
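As a quick sanity check on the schema, a minimal sketch of parsing a hypothetical request body with it, imported the same way the controller imports it; the query, limit, and formats values are made up for illustration:

// Illustrative only: the schema fills in defaults for omitted fields.
import { searchRequestSchema } from "./types";

const parsed = searchRequestSchema.parse({
  query: "firecrawl",
  limit: 3,
  scrapeOptions: { formats: ["markdown"] },
});
// parsed.lang === "en", parsed.country === "us",
// parsed.origin === "api", parsed.timeout === 60000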
apps/api/src/routes/v1.ts
@ -33,6 +33,7 @@ import { extractController } from "../controllers/v1/extract";
// import { readinessController } from "../controllers/v1/readiness";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";

function checkCreditsMiddleware(
  minimum?: number,
@ -169,6 +170,13 @@ v1Router.post(
  wrap(batchScrapeController),
);

v1Router.post(
  "/search",
  authMiddleware(RateLimiterMode.Search),
  checkCreditsMiddleware(),
  wrap(searchController),
);

v1Router.post(
  "/map",
  authMiddleware(RateLimiterMode.Map),
@ -231,3 +239,6 @@ v1Router.get(
  authMiddleware(RateLimiterMode.CrawlStatus),
  wrap(creditUsageController),
);
apps/api/src/search/index.ts
@ -8,7 +8,7 @@ import { serper_search } from "./serper";
export async function search({
  query,
  advanced = false,
-  num_results = 7,
+  num_results = 5,
  tbs = undefined,
  filter = undefined,
  lang = "en",
apps/api/src/types.ts
@ -10,6 +10,8 @@ import { InternalOptions } from "./scraper/scrapeURL";
type Mode = "crawl" | "single_urls" | "sitemap";

export { Mode };

export interface CrawlResult {
  source: string;
  content: string;
apps/js-sdk/firecrawl/src/index.ts
@ -68,6 +68,9 @@ export interface FirecrawlDocument<T = any, ActionsSchema extends (ActionsResult
  screenshot?: string;
  metadata?: FirecrawlDocumentMetadata;
  actions: ActionsSchema;
  // v1 search only
  title?: string;
  description?: string;
}

/**
@ -282,6 +285,34 @@ export class FirecrawlError extends Error {
  }
}

/**
 * Parameters for search operations.
 * Defines options for searching and scraping search results.
 */
export interface SearchParams {
  query: string;
  limit?: number;
  tbs?: string;
  filter?: string;
  lang?: string;
  country?: string;
  location?: string;
  origin?: string;
  timeout?: number;
  scrapeOptions?: ScrapeParams;
}

/**
 * Response interface for search operations.
 * Defines the structure of the response received after a search operation.
 */
export interface SearchResponse {
  success: boolean;
  data: FirecrawlDocument<undefined>[];
  warning?: string;
  error?: string;
}

/**
 * Main class for interacting with the Firecrawl API.
 * Provides methods for scraping, searching, crawling, and mapping web content.
@ -369,16 +400,79 @@ export default class FirecrawlApp {
  }

  /**
-   * This method is intended to search for a query using the Firecrawl API. However, it is not supported in version 1 of the API.
-   * @param query - The search query string.
-   * @param params - Additional parameters for the search.
-   * @returns Throws an error advising to use version 0 of the API.
+   * Searches using the Firecrawl API and optionally scrapes the results.
+   * @param params - Parameters for the search request.
+   * @returns The response from the search operation.
   */
-  async search(
-    query: string,
-    params?: any
-  ): Promise<any> {
-    throw new FirecrawlError("Search is not supported in v1, please downgrade Firecrawl to 0.0.36.", 400);
-  }
+  async search(params: SearchParams): Promise<SearchResponse> {
+    const headers: AxiosRequestHeaders = {
+      "Content-Type": "application/json",
+      Authorization: `Bearer ${this.apiKey}`,
+    } as AxiosRequestHeaders;
+
+    let jsonData: any = {
+      query: params.query,
+      limit: params.limit ?? 5,
+      tbs: params.tbs,
+      filter: params.filter,
+      lang: params.lang ?? "en",
+      country: params.country ?? "us",
+      location: params.location,
+      origin: params.origin ?? "api",
+      timeout: params.timeout ?? 60000,
+      scrapeOptions: params.scrapeOptions ?? { formats: [] },
+    };
+
+    if (jsonData?.scrapeOptions?.extract?.schema) {
+      let schema = jsonData.scrapeOptions.extract.schema;
+
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        scrapeOptions: {
+          ...jsonData.scrapeOptions,
+          extract: {
+            ...jsonData.scrapeOptions.extract,
+            schema: schema,
+          },
+        },
+      };
+    }
+
+    try {
+      const response: AxiosResponse = await this.postRequest(
+        this.apiUrl + `/v1/search`,
+        jsonData,
+        headers
+      );
+
+      if (response.status === 200) {
+        const responseData = response.data;
+        if (responseData.success) {
+          return {
+            success: true,
+            data: responseData.data,
+            warning: responseData.warning,
+          };
+        } else {
+          throw new FirecrawlError(`Failed to search. Error: ${responseData.error}`, response.status);
+        }
+      } else {
+        this.handleError(response, "search");
+      }
+    } catch (error: any) {
+      if (error.response?.data?.error) {
+        throw new FirecrawlError(`Request failed with status code ${error.response.status}. Error: ${error.response.data.error} ${error.response.data.details ? ` - ${JSON.stringify(error.response.data.details)}` : ''}`, error.response.status);
+      } else {
+        throw new FirecrawlError(error.message, 500);
+      }
+    }
+    return { success: false, error: "Internal server error.", data: [] };
+  }

  /**
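Finally, a minimal usage sketch for the new SDK method. It assumes the published package name and apiKey constructor from earlier SDK versions; the key and query are placeholders. Only the search(params) signature, the SearchParams fields, and the SearchResponse shape come from the diff above.

// Illustrative only.
import FirecrawlApp from "@mendable/firecrawl-js";

async function main() {
  // Placeholder key and query; replace with real values.
  const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

  const results = await app.search({
    query: "open source web scraping",
    limit: 3,
    // Omit formats (or pass []) for SERP results only;
    // request "markdown" to also scrape each hit.
    scrapeOptions: { formats: ["markdown"] },
  });

  if (results.success) {
    for (const doc of results.data) {
      console.log(doc.url, doc.title, doc.markdown?.length);
    }
  }
}

main();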