Merge branch 'v1-webscraper' of https://github.com/mendableai/firecrawl into v1-webscraper

2025-08-14 03:25:56 +08:00 · 2024-08-16 14:16:35 -04:00 · 2024-08-16 14:16:35 -04:00 · 200ce8e2ce
commit 200ce8e2ce
parent 21d3798e49 3f998b688d
6 changed files with 123 additions and 213 deletions
--- a/apps/api/src/controllers/v1/auth.ts
+++ b/apps/api/src/controllers/v1/auth.ts
@@ -26,13 +26,7 @@ export async function supaAuthenticateUser(
  req,
  res,
  mode?: RateLimiterMode
-): Promise<{
+): Promise<AuthResponse> {
  success: boolean;
  team_id?: string;
  error?: string;
  status?: number;
  plan?: string;
 }> {
  const authHeader = req.headers.authorization;
  if (!authHeader) {
    return { success: false, error: "Unauthorized", status: 401 };
@@ -106,7 +100,7 @@ export async function supaAuthenticateUser(
    setTrace(team_id, normalizedApi);
    subscriptionData = {
      team_id: team_id,
-      plan: plan
+      plan: plan,
    }
    switch (mode) {
      case RateLimiterMode.Crawl:
@@ -195,7 +189,12 @@ export async function supaAuthenticateUser(
    subscriptionData = data[0];
  }
-  return { success: true, team_id: subscriptionData.team_id, plan: subscriptionData.plan ?? ""};
+  return {
    success: true,
    team_id: subscriptionData.team_id,
    plan: subscriptionData.plan ?? "",
    api_key: normalizedApi
  };
 }
 function getPlanByPriceId(price_id: string) {
  switch (price_id) {
--- a/apps/api/src/controllers/v1/scrape.ts
+++ b/apps/api/src/controllers/v1/scrape.ts
@@ -1,217 +1,120 @@
 import { Request, Response } from "express";
 import { Logger } from '../../lib/logger';
-import { checkAndUpdateURL } from '../../lib/validateUrl';
+import { Document, legacyScrapeOptions, RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
-import { RequestWithAuth, ScrapeRequest, scrapeRequestSchema, ScrapeResponse } from "./types";
+import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from 'uuid';
 import { numTokensFromString } from "../../lib/LLM-extraction/helpers";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { scrapeQueueEvents } from '../../services/queue-service';
 import { logJob } from "../../services/logging/log_job";
 export async function scrapeController(req: RequestWithAuth<ScrapeResponse, ScrapeRequest>, res: Response<ScrapeResponse>) {
-  req.body = scrapeRequestSchema.parse(req.body);
+  req.body = scrapeRequestSchema.parse(req.body);  
  console.log(req.body);
  // TODO: check req.body
  // mockup req.body
  // req.body = {
  //   url: "test",
  //   headers: {
  //     "x-key": "test"
  //   },
  //   formats: ["markdown", "html", "rawHtml", "content", "linksOnPage", "screenshot", "fullPageScreenshot"],
  //   includeTags: ["test"],
  //   excludeTags: ["test"],
  //   onlyMainContent: false,
  //   timeout: 30000,
  //   waitFor: number
  // }
  let earlyReturn = false;
  // make sure to authenticate user first, Bearer <token>
-  // check credits
+  const origin = req.body.origin;
  const timeout = req.body.timeout;
  const pageOptions = legacyScrapeOptions(req.body);
-  const result: ScrapeResponse = {
+  const jobId = uuidv4();
-    success: true,
+
-    warning: "test",
+  const startTime = new Date().getTime();
-    data: {
+  const job = await addScrapeJob({
-      markdown: "test",
+    url: req.body.url,
-      html: "test",
+    mode: "single_urls",
-      rawHtml: "test",
+    crawlerOptions: {},
-      links: ["test1", "test2"],
+    team_id: req.auth.team_id,
-      screenshot: "test",
+    pageOptions,
-      metadata: {
+    extractorOptions: {},
-        title: "test",
+    origin: req.body.origin,
-        description: "test",
+  }, {}, jobId);
-        language: "test",
+
-        sourceURL: "test",
+  let doc: any | undefined;
-        statusCode: 200,
+  try {
-        error: "test"
+    doc = (await job.waitUntilFinished(scrapeQueueEvents, timeout))[0]; // 60 seconds timeout
-      }
+  } catch (e) {
    Logger.error(`Error in scrapeController: ${e}`);
    if (e instanceof Error && e.message.startsWith("Job wait")) {
      return res.status(408).json({
        success: false,
        error: "Request timed out",
      });
    } else {
      return res.status(500).json({
        success: false,
        error: "Internal server error",
      });
    }
  }
-  return res.status(200).json(result);
+  await job.remove();
-  // const crawlerOptions = req.body.crawlerOptions ?? {};
+  if (!doc) {
-  // const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions };
+    console.error("!!! PANIC DOC IS", doc, job);
-  // const extractorOptions = { ...defaultExtractorOptions, ...req.body.extractorOptions };
+    return res.status(200).json({
-  // const origin = req.body.origin ?? defaultOrigin;
+      success: true,
-  // let timeout = req.body.timeout ?? defaultTimeout;
+      warning: "No page found",
      data: doc
    });
  }
-  // if (extractorOptions.mode.includes("llm-extraction")) {
+  delete doc.index;
-  //   pageOptions.onlyMainContent = true;
+  delete doc.provider;
  //   timeout = req.body.timeout ?? 90000;
  // }
-  // const checkCredits = async () => {
+  const endTime = new Date().getTime();
-  //   try {
+  const timeTakenInSeconds = (endTime - startTime) / 1000;
-  //     const { success: creditsCheckSuccess, message: creditsCheckMessage } = await checkTeamCredits(team_id, 1);
+  const numTokens = (doc && doc.markdown) ? numTokensFromString(doc.markdown, "gpt-3.5-turbo") : 0;
  //     if (!creditsCheckSuccess) {
  //       earlyReturn = true;
  //       return res.status(402).json({ error: "Insufficient credits" });
  //     }
  //   } catch (error) {
  //     Logger.error(error);
  //     earlyReturn = true;
  //     return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." });
  //   }
  // };
  let creditsToBeBilled = 1; // Assuming 1 credit per document
  if (earlyReturn) {
    // Don't bill if we're early returning
    return;
  }
-  // await checkCredits();
+  const billingResult = await billTeam(
    req.auth.team_id,
    creditsToBeBilled
  );
  if (!billingResult.success) {
    return res.status(402).json({
      success: false,
      error: "Failed to bill team. Insufficient credits or subscription not found.",
    });
  }
-  // const jobId = uuidv4();
+  logJob({
    job_id: jobId,
    success: true,
    message: "Scrape completed",
    num_docs: 1,
    docs: [doc],
    time_taken: timeTakenInSeconds,
    team_id: req.auth.team_id,
    mode: "scrape",
    url: req.body.url,
    crawlerOptions: {},
    pageOptions: pageOptions,
    origin: origin, 
    extractor_options: { mode: "markdown" },
    num_tokens: numTokens,
  });
-  // const startTime = new Date().getTime();
+  return res.status(200).json({
-  // const result = await scrapeHelper(
+    success: true,
-  //   jobId,
+    data: {
-  //   req,
+      markdown: doc.markdown,
-  //   team_id,
+      links: doc.linksOnPage,
-  //   crawlerOptions,
+      rawHtml: doc.rawHtml,
-  //   pageOptions,
+      html: doc.html,
-  //   extractorOptions,
+      screenshot: doc.screenshot,
-  //   timeout,
+      fullPageScreenshot: doc.fullPageScreenshot,
-  //   plan
+      metadata: {
-  // );
+        ...doc.metadata,
-  // const endTime = new Date().getTime();
+        pageError: undefined,
-  // const timeTakenInSeconds = (endTime - startTime) / 1000;
+        pageStatusCode: undefined,
-  // const numTokens = (result.data && result.data.markdown) ? numTokensFromString(result.data.markdown, "gpt-3.5-turbo") : 0;
+        error: doc.metadata.pageError,
-
+        statusCode: doc.metadata.pageStatusCode,
-  // if (result.success) {
+      },
-  //   let creditsToBeBilled = 1; // Assuming 1 credit per document
+    } as Document
-  //   const creditsPerLLMExtract = 50;
+  });
-
+}
  //   if (extractorOptions.mode.includes("llm-extraction")) {
  //     // creditsToBeBilled = creditsToBeBilled + (creditsPerLLMExtract * filteredDocs.length);
  //     creditsToBeBilled += creditsPerLLMExtract;
  //   }
  //   let startTimeBilling = new Date().getTime();
  //   if (earlyReturn) {
  //     // Don't bill if we're early returning
  //     return;
  //   }
  //   const billingResult = await billTeam(
  //     team_id,
  //     creditsToBeBilled
  //   );
  //   if (!billingResult.success) {
  //     return res.status(402).json({
  //       success: false,
  //       error: "Failed to bill team. Insufficient credits or subscription not found.",
  //     });
  //   }
  // }
  // logJob({
  //   job_id: jobId,
  //   success: result.success,
  //   message: result.error,
  //   num_docs: 1,
  //   docs: [result.data],
  //   time_taken: timeTakenInSeconds,
  //   team_id: team_id,
  //   mode: "scrape",
  //   url: req.body.url,
  //   crawlerOptions: crawlerOptions,
  //   pageOptions: pageOptions,
  //   origin: origin, 
  //   extractor_options: extractorOptions,
  //   num_tokens: numTokens,
  // });
  // return res.status(result.returnCode).json(result);
 }
 // export async function scrapeHelper(
 //   jobId: string,
 //   req: Request,
 //   team_id: string,
 //   crawlerOptions: any,
 //   pageOptions: PageOptions,
 //   extractorOptions: ExtractorOptions,
 //   timeout: number,
 //   plan?: string
 // ): Promise<{
 //   success: boolean;
 //   error?: string;
 //   data?: Document;
 //   returnCode: number;
 // }> {
  // const url = req.body.url;
  // if (!url) {
  //   return { success: false, error: "Url is required", returnCode: 400 };
  // }
  // if (isUrlBlocked(url)) {
  //   return { success: false, error: "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.", returnCode: 403 };
  // }
  // const a = new WebScraperDataProvider();
  // await a.setOptions({
  //   jobId,
  //   mode: "single_urls",
  //   urls: [url],
  //   crawlerOptions: {
  //     ...crawlerOptions,
  //   },
  //   pageOptions: pageOptions,
  //   extractorOptions: extractorOptions,
  // });
  // const timeoutPromise = new Promise<{ success: boolean; error?: string; returnCode: number }>((_, reject) =>
  //   setTimeout(() => reject({ success: false, error: "Request timed out. Increase the timeout by passing `timeout` param to the request.", returnCode: 408 }), timeout)
  // );
  // const docsPromise = a.getDocuments(false);
  // let docs;
  // try {
  //   docs = await Promise.race([docsPromise, timeoutPromise]);
  // } catch (error) {
  //   return error;
  // }
  // // make sure doc.content is not empty
  // let filteredDocs = docs.filter(
  //   (doc: { content?: string }) => doc.content && doc.content.trim().length > 0
  // );
  // if (filteredDocs.length === 0) {
  //   return { success: true, error: "No page found", returnCode: 200, data: docs[0] };
  // }
  // // Remove rawHtml if pageOptions.rawHtml is false and extractorOptions.mode is llm-extraction-from-raw-html
  // if (!pageOptions.includeRawHtml && extractorOptions.mode == "llm-extraction-from-raw-html") {
  //   filteredDocs.forEach(doc => {
  //     delete doc.rawHtml;
  //   });
  // }
  // return {
  //   success: true,
  //   data: filteredDocs[0],
  //   returnCode: 200,
  // };
 // }
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -204,5 +204,8 @@ export function legacyScrapeOptions(x: ScrapeOptions): PageOptions {
    removeTags: x.excludeTags,
    onlyMainContent: x.onlyMainContent,
    waitFor: x.waitFor,
    includeLinks: x.formats.includes("links"),
    screenshot: x.formats.includes("screenshot"),
    fullPageScreenshot: x.formats.includes("screenshot@fullPage"),
  };
 }
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -24,6 +24,7 @@ export type PageOptions = {
  parsePDF?: boolean;
  removeTags?: string | string[];
  onlyIncludeTags?: string | string[];
  includeLinks?: boolean;
 };
 export type ExtractorOptions = {
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -130,6 +130,7 @@ export async function scrapSingleUrl(
    screenshot: false,
    fullPageScreenshot: false,
    headers: undefined,
    includeLinks: true
  },
  extractorOptions: ExtractorOptions = {
    mode: "llm-extraction-from-markdown",
@@ -361,7 +362,9 @@ export async function scrapSingleUrl(
    let linksOnPage: string[] | undefined;
-    linksOnPage = extractLinks(rawHtml, urlToScrap);
+    if (pageOptions.includeLinks) {
      linksOnPage = extractLinks(rawHtml, urlToScrap);
    }
    let document: Document;
    if (screenshot && screenshot.length > 0) {
@@ -374,7 +377,7 @@ export async function scrapSingleUrl(
            extractorOptions.mode === "llm-extraction-from-raw-html"
            ? rawHtml
            : undefined,
-        linksOnPage,
+        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
        metadata: {
          ...metadata,
          screenshot: screenshot,
@@ -399,7 +402,7 @@ export async function scrapSingleUrl(
          pageStatusCode: pageStatusCode,
          pageError: pageError,
        },
-        linksOnPage,
+        linksOnPage: pageOptions.includeLinks ? linksOnPage : undefined,
      };
    }
@@ -415,7 +418,7 @@ export async function scrapSingleUrl(
      content: "",
      markdown: "",
      html: "",
-      linksOnPage: [],
+      linksOnPage: pageOptions.includeLinks ? [] : undefined,
      metadata: {
        sourceURL: urlToScrap,
        pageStatusCode: pageStatusCode,
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -114,6 +114,7 @@ export interface AuthResponse {
  error?: string;
  status?: number;
  plan?: string;
  api_key?: string;
 }