Merge branch 'v1-webscraper' of https://github.com/mendableai/firecrawl into v1-webscraper

2025-08-13 06:39:07 +08:00 · 2024-08-16 14:16:35 -04:00 · 2024-08-16 14:16:35 -04:00 · 200ce8e2ce
commit 200ce8e2ce
parent 21d3798e49 3f998b688d
6 changed files with 123 additions and 213 deletions
--- a/apps/api/src/controllers/v1/auth.ts
+++ b/apps/api/src/controllers/v1/auth.ts
@ -26,13 +26,7 @@ export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode
-): Promise<{
-  success: boolean;
-  team_id?: string;
-  error?: string;
-  status?: number;
-  plan?: string;
-}> {
+): Promise<AuthResponse> {
  const authHeader = req.headers.authorization;
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
@ -106,7 +100,7 @@ export async function supaAuthenticateUser(
    setTrace(team_id, normalizedApi);
    subscriptionData = {
      team_id: team_id,
-      plan: plan
+      plan: plan,
    }
    switch (mode) {
      case RateLimiterMode.Crawl:
@ -195,7 +189,12 @@ export async function supaAuthenticateUser(
    subscriptionData = data[0];
  }

-  return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
+  return {
+    success: true,
+    team_id: subscriptionData.team_id,
+    plan: subscriptionData.plan ?? "",
+    api_key: normalizedApi
+  };
 }
 function getPlanByPriceId(price_id: string) {
  switch (price_id) {
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@ -1,217 +1,120 @@
 import { Request, Response } from "express";
 import { Logger } from '../../lib/logger';
-import { checkAndUpdateURL } from '../../lib/validateUrl';
-import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { Document, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { billTeam } from "../../services/billing/credit_billing";
+import { v4 as uuidv4 } from 'uuid';
+import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
+import { addScrapeJob } from "../../services/queue-jobs";
+import { scrapeQueueEvents } from '../../services/queue-service';
+import { logJob } from "../../services/logging/log_job";

 export async function scrapeController(req: RequestWithAuth<ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
-  req.body = scrapeRequestSchema.parse(req.body);
-  console.log(req.body);
-
-  // TODO: check req.body
-  // mockup req.body
-  // req.body = {
-  //   url: "test",
-  //   headers: {
-  //     "x-key": "test"
-  //   },
-  //   formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
-  //   includeTags: ["test"],
-  //   excludeTags: ["test"],
-  //   onlyMainContent: false,
-  //   timeout: 30000,
-  //   waitFor: number
-  // }
-
+  req.body = scrapeRequestSchema.parse(req.body);  
  let earlyReturn = false;
-  // make sure to authenticate user first, Bearer <token>

-  // check credits
+  const origin = req.body.origin;
+  const timeout = req.body.timeout;
+  const pageOptions = legacyScrapeOptions(req.body);

-  const result: ScrapeResponse = {
-    success: true,
-    warning: "test",
-    data: {
-      markdown: "test",
-      html: "test",
-      rawHtml: "test",
-      links: ["test1", "test2"],
-      screenshot: "test",
-      metadata: {
-        title: "test",
-        description: "test",
-        language: "test",
-        sourceURL: "test",
-        statusCode: 200,
-        error: "test"
-      }
+  const jobId = uuidv4();
+
+  const startTime = new Date().getTime();
+  const job = await addScrapeJob({
+    url: req.body.url,
+    mode: "single_urls",
+    crawlerOptions: {},
+    team_id: req.auth.team_id,
+    pageOptions,
+    extractorOptions: {},
+    origin: req.body.origin,
+  }, {}, jobId);
+
+  let doc: any | undefined;
+  try {
+    doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0]; // waits at most `timeout` (taken from req.body.timeout)
+  } catch (e) {
+    Logger.error(`Error in scrapeController: ${e}`);
+    if (e instanceof Error && e.message.startsWith("Job wait")) {
+      return res.status(408).json({
+        success: false,
+        error: "Request timed out",
+      });
+    } else {
+      return res.status(500).json({
+        success: false,
+        error: "Internal server error",
+      });
    }
  }

-  return res.status(200).json(result);
+  await job.remove();

-  // const crawlerOptions = req.body.crawlerOptions ?? {};
-  // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
-  // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
-  // const origin = req.body.origin ?? defaultOrigin;
-  // let timeout = req.body.timeout ?? defaultTimeout;
+  if (!doc) {
+    console.error("!!! PANIC DOC IS", doc, job);
+    return res.status(200).json({
+      success: true,
+      warning: "No page found",
+      data: doc
+    });
+  }

-  // if (extractorOptions.mode.includes("llm-extraction")) {
-  //   pageOptions.onlyMainContent = true;
-  //   timeout = req.body.timeout ?? 90000;
-  // }
+  delete doc.index;
+  delete doc.provider;

-  // const checkCredits = async () => {
-  //   try {
-  //     const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
-  //     if (!creditsCheckSuccess) {
-  //       earlyReturn = true;
-  //       return res.status(402).json({ error: "Insufficient credits" });
-  //     }
-  //   } catch (error) {
-  //     Logger.error(error);
-  //     earlyReturn = true;
-  //     return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
-  //   }
-  // };
+  const endTime = new Date().getTime();
+  const timeTakenInSeconds = (endTime - startTime) / 1000;
+  const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;

+  let creditsToBeBilled = 1; // Assuming 1 credit per document
+  if (earlyReturn) {
+    // Don't bill if we're early returning
+    return;
+  }

-  // await checkCredits();
+  const billingResult = await billTeam(
+    req.auth.team_id,
+    creditsToBeBilled
+  );
+  if (!billingResult.success) {
+    return res.status(402).json({
+      success: false,
+      error: "Failed to bill team. Insufficient credits or subscription not found.",
+    });
+  }

-  // const jobId = uuidv4();
+  logJob({
+    job_id: jobId,
+    success: true,
+    message: "Scrape completed",
+    num_docs: 1,
+    docs: [doc],
+    time_taken: timeTakenInSeconds,
+    team_id: req.auth.team_id,
+    mode: "scrape",
+    url: req.body.url,
+    crawlerOptions: {},
+    pageOptions: pageOptions,
+    origin: origin, 
+    extractor_options: { mode: "markdown" },
+    num_tokens: numTokens,
+  });

-  // const startTime = new Date().getTime();
-  // const result = await scrapeHelper(
-  //   jobId,
-  //   req,
-  //   team_id,
-  //   crawlerOptions,
-  //   pageOptions,
-  //   extractorOptions,
-  //   timeout,
-  //   plan
-  // );
-  // const endTime = new Date().getTime();
-  // const timeTakenInSeconds = (endTime - startTime) / 1000;
-  // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
-
-  // if (result.success) {
-  //   let creditsToBeBilled = 1; // Assuming 1 credit per document
-  //   const creditsPerLLMExtract = 50;
-
-  //   if (extractorOptions.mode.includes("llm-extraction")) {
-  //     // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
-  //     creditsToBeBilled += creditsPerLLMExtract;
-  //   }
-
-  //   let startTimeBilling = new Date().getTime();
-
-  //   if (earlyReturn) {
-  //     // Don't bill if we're early returning
-  //     return;
-  //   }
-  //   const billingResult = await billTeam(
-  //     team_id,
-  //     creditsToBeBilled
-  //   );
-  //   if (!billingResult.success) {
-  //     return res.status(402).json({
-  //       success: false,
-  //       error: "Failed to bill team. Insufficient credits or subscription not found.",
-  //     });
-  //   }
-  // }
-
-  // logJob({
-  //   job_id: jobId,
-  //   success: result.success,
-  //   message: result.error,
-  //   num_docs: 1,
-  //   docs: [result.data],
-  //   time_taken: timeTakenInSeconds,
-  //   team_id: team_id,
-  //   mode: "scrape",
-  //   url: req.body.url,
-  //   crawlerOptions: crawlerOptions,
-  //   pageOptions: pageOptions,
-  //   origin: origin, 
-  //   extractor_options: extractorOptions,
-  //   num_tokens: numTokens,
-  // });
-
-  
-  // return res.status(result.returnCode).json(result);
-}
-
-
-// export async function scrapeHelper(
-//   jobId: string,
-//   req: Request,
-//   team_id: string,
-//   crawlerOptions: any,
-//   pageOptions: PageOptions,
-//   extractorOptions: ExtractorOptions,
-//   timeout: number,
-//   plan?: string
-// ): Promise<{
-//   success: boolean;
-//   error?: string;
-//   data?: Document;
-//   returnCode: number;
-// }> {
-
-  // const url = req.body.url;
-  // if (!url) {
-  //   return { success: false, error: "Url is required", returnCode: 400 };
-  // }
-
-  // if (isUrlBlocked(url)) {
-  //   return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
-  // }
-
-  // const a = new WebScraperDataProvider();
-  // await a.setOptions({
-  //   jobId,
-  //   mode: "single_urls",
-  //   urls: [url],
-  //   crawlerOptions: {
-  //     ...crawlerOptions,
-  //   },
-  //   pageOptions: pageOptions,
-  //   extractorOptions: extractorOptions,
-  // });
-
-  // const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
-  //   setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
-  // );
-
-  // const docsPromise = a.getDocuments(false);
-
-  // let docs;
-  // try {
-  //   docs = await Promise.race([docsPromise, timeoutPromise]);
-  // } catch (error) {
-  //   return error;
-  // }
-
-  // // make sure doc.content is not empty
-  // let filteredDocs = docs.filter(
-  //   (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
-  // );
-  // if (filteredDocs.length === 0) {
-  //   return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
-  // }
-
- 
-  // // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
-  // if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
-  //   filteredDocs.forEach(doc => {
-  //     delete doc.rawHtml;
-  //   });
-  // }
-
-  // return {
-  //   success: true,
-  //   data: filteredDocs[0],
-  //   returnCode: 200,
-  // };
-// }
+  return res.status(200).json({
+    success: true,
+    data: {
+      markdown: doc.markdown,
+      links: doc.linksOnPage,
+      rawHtml: doc.rawHtml,
+      html: doc.html,
+      screenshot: doc.screenshot,
+      fullPageScreenshot: doc.fullPageScreenshot,
+      metadata: {
+        ...doc.metadata,
+        pageError: undefined,
+        pageStatusCode: undefined,
+        error: doc.metadata.pageError,
+        statusCode: doc.metadata.pageStatusCode,
+      },
+    } as Document
+  });
+}
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@ -204,5 +204,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
    removeTags: x.excludeTags,
    onlyMainContent: x.onlyMainContent,
    waitFor: x.waitFor,
+    includeLinks: x.formats.includes("links"),
+    screenshot: x.formats.includes("screenshot"),
+    fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
  };
 }
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@ -24,6 +24,7 @@ export type PageOptions = {
  parsePDF?: boolean;
  removeTags?: string | string[];
  onlyIncludeTags?: string | string[];
+  includeLinks?: boolean;
 };

 export type ExtractorOptions = {
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -130,6 +130,7 @@ export async function scrapSingleUrl(
    screenshot: false,
    fullPageScreenshot: false,
    headers: undefined,
+    includeLinks: true
  },
  extractorOptions: ExtractorOptions = {
    mode: "llm-extraction-from-markdown",
@ -361,7 +362,9 @@ export async function scrapSingleUrl(

    let linksOnPage: string[] | undefined;

-    linksOnPage = extractLinks(rawHtml, urlToScrap);
+    if (pageOptions.includeLinks) {
+      linksOnPage = extractLinks(rawHtml, urlToScrap);
+    }

    let document: Document;
    if (screenshot && screenshot.length > 0) {
@ -374,7 +377,7 @@ export async function scrapSingleUrl(
            extractorOptions.mode === "llm-extraction-from-raw-html"
            ? rawHtml
            : undefined,
-        linksOnPage,
+        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
        metadata: {
          ...metadata,
          screenshot: screenshot,
@ -399,7 +402,7 @@ export async function scrapSingleUrl(
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
-        linksOnPage,
+        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
      };
    }

@ -415,7 +418,7 @@ export async function scrapSingleUrl(
      content: "",
      markdown: "",
      html: "",
-      linksOnPage: [],
+      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@ -114,6 +114,7 @@ export interface AuthResponse {
  error?: string;
  status?: number;
  plan?: string;
+  api_key?: string;
 }