Merge remote-tracking branch 'origin/main' into nsc/new-extract

rafaelmmiller 2024-11-19 09:34:08 -03:00
commit 36cf49c959
25 changed files with 271 additions and 219 deletions

View File

@@ -75,7 +75,7 @@ export async function crawlController(req: Request, res: Response) {
    await checkTeamCredits(chunk, team_id, limitCheck);
    if (!creditsCheckSuccess) {
-     return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at hello@firecrawl.com" });
+     return res.status(402).json({ error: "Insufficient credits. You may be requesting with a higher limit than the amount of credits you have left. If not, upgrade your plan at https://firecrawl.dev/pricing or contact us at help@firecrawl.com" });
    }
    // TODO: need to do this to v1

View File

@@ -209,7 +209,7 @@ export async function scrapeController(req: Request, res: Response) {
      earlyReturn = true;
      return res.status(500).json({
        error:
-         "Error checking team credits. Please contact hello@firecrawl.com for help.",
+         "Error checking team credits. Please contact help@firecrawl.com for help.",
      });
    }

View File

@@ -16,6 +16,7 @@ import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { getJobPriority } from "../../lib/job-priority";
import { addScrapeJobs } from "../../services/queue-jobs";
+ import { callWebhook } from "../../services/webhook";

export async function batchScrapeController(
  req: RequestWithAuth<{}, CrawlResponse, BatchScrapeRequest>,
@@ -66,6 +67,7 @@ export async function batchScrapeController(
        crawl_id: id,
        sitemapped: true,
        v1: true,
+       webhook: req.body.webhook,
      },
      opts: {
        jobId: uuidv4(),
@@ -85,6 +87,10 @@
  );
  await addScrapeJobs(jobs);

+ if(req.body.webhook) {
+   await callWebhook(req.auth.team_id, id, null, req.body.webhook, true, "batch_scrape.started");
+ }

  const protocol = process.env.ENV === "local" ? req.protocol : "https";
  return res.status(200).json({

View File

@@ -175,7 +175,7 @@ export async function crawlStatusWSController(ws: WebSocket, req: RequestWithAut
    logger.error("Error occurred in WebSocket! (" + req.path + ") -- ID " + id + " -- " + verbose);
    return close(ws, 1011, {
      type: "error",
-     error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id
+     error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id
    });
  }
}

View File

@@ -1,11 +1,6 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
- import {
-   MapDocument,
-   mapRequestSchema,
-   RequestWithAuth,
-   scrapeOptions,
- } from "./types";
+ import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
@@ -65,11 +60,13 @@ export async function getMapResults({
}): Promise<MapResult> {
  const id = uuidv4();
  let links: string[] = [url];
+ let mapResults: MapDocument[] = [];

  const sc: StoredCrawl = {
    originUrl: url,
    crawlerOptions: {
      ...crawlerOptions,
+     limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
      scrapeOptions: undefined,
    },
    scrapeOptions: scrapeOptions.parse({}),
@@ -81,6 +78,29 @@
  const crawler = crawlToCrawler(id, sc);

+ // If sitemapOnly is true, only get links from sitemap
+ if (crawlerOptions.sitemapOnly) {
+   if (includeMetadata) {
+     throw new Error("includeMetadata is not supported with sitemapOnly");
+   }
+   const sitemap = await crawler.tryGetSitemap(true, true);
+   if (sitemap !== null) {
+     sitemap.forEach((x) => {
+       links.push(x.url);
+     });
+     links = links.slice(1)
+       .map((x) => {
+         try {
+           return checkAndUpdateURLForMap(x).url.trim();
+         } catch (_) {
+           return null;
+         }
+       })
+       .filter((x) => x !== null) as string[];
+     // links = links.slice(1, limit); // don't slice, unnecessary
+   }
+ } else {
    let urlWithoutWww = url.replace("www.", "");
    let mapUrl = search && allowExternalLinks
@@ -107,13 +127,14 @@
        });
      };

-     pagePromises = Array.from({ length: maxPages }, (_, i) => fetchPage(i + 1));
+     pagePromises = Array.from({ length: maxPages }, (_, i) =>
+       fetchPage(i + 1)
+     );
      allResults = await Promise.all(pagePromises);
      await redis.set(cacheKey, JSON.stringify(allResults), "EX", 24 * 60 * 60); // Cache for 24 hours
    }
-   console.log("allResults", allResults);

    // Parallelize sitemap fetch with serper search
    const [sitemap, ...searchResults] = await Promise.all([
      ignoreSitemap ? null : crawler.tryGetSitemap(),
@@ -130,7 +151,7 @@
      });
    }

-   let mapResults : MapDocument[] = allResults
+   mapResults = allResults
      .flat()
      .filter((result) => result !== null && result !== undefined);
@@ -180,6 +201,7 @@
    // remove duplicates that could be due to http/https or www
    links = removeDuplicateUrls(links);
+ }

  const linksToReturn = links.slice(0, limit);
@@ -242,51 +264,3 @@ export async function mapController(
  return res.status(200).json(response);
}

- // Subdomain sitemap url checking
- // // For each result, check for subdomains, get their sitemaps and add them to the links
- // const processedUrls = new Set();
- // const processedSubdomains = new Set();
- // for (const result of links) {
- //   let url;
- //   let hostParts;
- //   try {
- //     url = new URL(result);
- //     hostParts = url.hostname.split('.');
- //   } catch (e) {
- //     continue;
- //   }
- //   console.log("hostParts", hostParts);
- //   // Check if it's a subdomain (more than 2 parts, and not 'www')
- //   if (hostParts.length > 2 && hostParts[0] !== 'www') {
- //     const subdomain = hostParts[0];
- //     console.log("subdomain", subdomain);
- //     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
- //     console.log("subdomainUrl", subdomainUrl);
- //     if (!processedSubdomains.has(subdomainUrl)) {
- //       processedSubdomains.add(subdomainUrl);
- //       const subdomainCrawl = crawlToCrawler(id, {
- //         originUrl: subdomainUrl,
- //         crawlerOptions: legacyCrawlerOptions(req.body),
- //         pageOptions: {},
- //         team_id: req.auth.team_id,
- //         createdAt: Date.now(),
- //         plan: req.auth.plan,
- //       });
- //       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
- //       if (subdomainSitemap) {
- //         subdomainSitemap.forEach((x) => {
- //           if (!processedUrls.has(x.url)) {
- //             processedUrls.add(x.url);
- //             links.push(x.url);
- //           }
- //         });
- //       }
- //     }
- //   }
- // }
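The sitemapOnly branch added above short-circuits the usual search-and-rank path and returns only what the sitemap yields (with the crawl limit effectively lifted via the 10000000 override). A rough sketch of how a caller might exercise it through getMapResults, assuming placeholder values and showing only the options relevant to the new branch:

// hypothetical invocation of the function changed in this file
const result = await getMapResults({
  url: "https://example.com",
  crawlerOptions: { sitemapOnly: true },  // take links from the sitemap only
  limit: 5000,
  includeMetadata: false,                 // must stay false: the new branch throws otherwise
});
// the returned links are the cleaned sitemap URLs, capped at `limit` by the final slice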

View File

@@ -11,8 +11,12 @@ export async function scrapeStatusController(req: any, res: any) {
  await rateLimiter.consume(iptoken);

  const job = await supabaseGetJobByIdOnlyData(req.params.jobId);
+ const allowedTeams = [
+   "41bdbfe1-0579-4d9b-b6d5-809f16be12f5",
+   "511544f2-2fce-4183-9c59-6c29b02c69b5"
+ ];

- if(job?.team_id !== "41bdbfe1-0579-4d9b-b6d5-809f16be12f5"){
+ if(!allowedTeams.includes(job?.team_id)){
    return res.status(403).json({
      success: false,
      error: "You are not allowed to access this resource.",

View File

@@ -119,7 +119,7 @@ export const scrapeOptions = z.object({
  includeTags: z.string().array().optional(),
  excludeTags: z.string().array().optional(),
  onlyMainContent: z.boolean().default(true),
- timeout: z.number().int().positive().finite().safe().default(30000),
+ timeout: z.number().int().positive().finite().safe().optional(),
  waitFor: z.number().int().nonnegative().finite().safe().default(0),
  extract: extractOptions.optional(),
  mobile: z.boolean().default(false),
@@ -170,9 +170,10 @@ export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
export type ExtractRequest = z.infer<typeof extractRequestSchema>;

- export const scrapeRequestSchema = scrapeOptions.extend({
+ export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
    url,
    origin: z.string().optional().default("api"),
+   timeout: z.number().int().positive().finite().safe().default(30000),
  }).strict(strictMessage).refine(
    (obj) => {
      const hasExtractFormat = obj.formats?.includes("extract");
@@ -194,9 +195,21 @@ export const scrapeRequestSchema = scrapeOptions.extend({
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;

+ export const webhookSchema = z.preprocess(x => {
+   if (typeof x === "string") {
+     return { url: x };
+   } else {
+     return x;
+   }
+ }, z.object({
+   url: z.string().url(),
+   headers: z.record(z.string(), z.string()).default({}),
+ }).strict(strictMessage))

export const batchScrapeRequestSchema = scrapeOptions.extend({
  urls: url.array(),
  origin: z.string().optional().default("api"),
+ webhook: webhookSchema.optional(),
}).strict(strictMessage).refine(
  (obj) => {
    const hasExtractFormat = obj.formats?.includes("extract");
@@ -206,12 +219,7 @@ export const batchScrapeRequestSchema = scrapeOptions.extend({
  {
    message: "When 'extract' format is specified, 'extract' options must be provided, and vice versa",
  }
- ).transform((obj) => {
-   if ((obj.formats?.includes("extract") || obj.extract) && !obj.timeout) {
-     return { ...obj, timeout: 60000 };
-   }
-   return obj;
- });
+ );

export type BatchScrapeRequest = z.infer<typeof batchScrapeRequestSchema>;
@@ -239,21 +247,10 @@ const crawlerOptions = z.object({
export type CrawlerOptions = z.infer<typeof crawlerOptions>;

- export const webhookSchema = z.preprocess(x => {
-   if (typeof x === "string") {
-     return { url: x };
-   } else {
-     return x;
-   }
- }, z.object({
-   url: z.string().url(),
-   headers: z.record(z.string(), z.string()).default({}),
- }).strict(strictMessage))

export const crawlRequestSchema = crawlerOptions.extend({
  url,
  origin: z.string().optional().default("api"),
- scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
+ scrapeOptions: scrapeOptions.default({}),
  webhook: webhookSchema.optional(),
  limit: z.number().default(10000),
}).strict(strictMessage);
@@ -279,6 +276,7 @@ export const mapRequestSchema = crawlerOptions.extend({
  includeSubdomains: z.boolean().default(true),
  search: z.string().optional(),
  ignoreSitemap: z.boolean().default(false),
+ sitemapOnly: z.boolean().default(false),
  limit: z.number().min(1).max(5000).default(5000),
}).strict(strictMessage);
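Because webhookSchema (now declared before the batch scrape schema so it can be referenced there) preprocesses its input, a webhook may be supplied either as a bare URL string or as an object; both normalize to the same shape. A quick sketch with placeholder values:

// both forms parse to { url, headers } thanks to the z.preprocess step
webhookSchema.parse("https://example.com/hook");
// => { url: "https://example.com/hook", headers: {} }
webhookSchema.parse({ url: "https://example.com/hook", headers: { "X-Secret": "replace-me" } });
// => { url: "https://example.com/hook", headers: { "X-Secret": "replace-me" } }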

View File

@@ -207,7 +207,7 @@ app.use((err: unknown, req: Request<{}, ErrorResponse, undefined>, res: Response
  }

  logger.error("Error occurred in request! (" + req.path + ") -- ID " + id + " -- " + verbose);
- res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact hello@firecrawl.com for help. Your exception ID is " + id });
+ res.status(500).json({ success: false, error: "An unexpected error occurred. Please contact help@firecrawl.com for help. Your exception ID is " + id });
});

logger.info(`Worker ${process.pid} started`);

View File

@@ -52,7 +52,7 @@ export async function addCrawlJobs(id: string, job_ids: string[]) {
export async function addCrawlJobDone(id: string, job_id: string) {
  await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id);
- await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id);
+ await redisConnection.rpush("crawl:" + id + ":jobs_done_ordered", job_id);
  await redisConnection.expire("crawl:" + id + ":jobs_done", 24 * 60 * 60, "NX");
  await redisConnection.expire("crawl:" + id + ":jobs_done_ordered", 24 * 60 * 60, "NX");
}
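Switching from lpush to rpush appends each finished job id to the tail of jobs_done_ordered, so the list reads back in completion order. A small sketch of the assumed consumer side (the reader shown here is an assumption, not part of this diff):

// reading the whole list front-to-back
const doneOrdered = await redisConnection.lrange("crawl:" + id + ":jobs_done_ordered", 0, -1);
// with rpush, doneOrdered[0] is the first job that finished; lpush would have reversed the order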

View File

@@ -6,22 +6,28 @@ import * as Sentry from "@sentry/node";
import dotenv from 'dotenv';
import { logger } from './logger';
+ import { stat } from 'fs/promises';

dotenv.config();

// TODO: add a timeout to the Go parser
+ const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');

class GoMarkdownConverter {
  private static instance: GoMarkdownConverter;
  private convert: any;

  private constructor() {
-   const goExecutablePath = join(process.cwd(), 'sharedLibs', 'go-html-to-md', 'html-to-markdown.so');
    const lib = koffi.load(goExecutablePath);
    this.convert = lib.func('ConvertHTMLToMarkdown', 'string', ['string']);
  }

- public static getInstance(): GoMarkdownConverter {
+ public static async getInstance(): Promise<GoMarkdownConverter> {
    if (!GoMarkdownConverter.instance) {
+     try {
+       await stat(goExecutablePath);
+     } catch (_) {
+       throw Error("Go shared library not found");
+     }
      GoMarkdownConverter.instance = new GoMarkdownConverter();
    }
    return GoMarkdownConverter.instance;
@@ -47,7 +53,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
  try {
    if (process.env.USE_GO_MARKDOWN_PARSER == "true") {
-     const converter = GoMarkdownConverter.getInstance();
+     const converter = await GoMarkdownConverter.getInstance();
      let markdownContent = await converter.convertHTMLToMarkdown(html);

      markdownContent = processMultiLineLinks(markdownContent);
@@ -56,8 +62,12 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
      return markdownContent;
    }
  } catch (error) {
+   if (!(error instanceof Error) || error.message !== "Go shared library not found") {
      Sentry.captureException(error);
      logger.error(`Error converting HTML to Markdown with Go parser: ${error}`);
+   } else {
+     logger.warn("Tried to use Go parser, but it doesn't exist in the file system.", { goExecutablePath });
+   }
  }

  // Fallback to TurndownService if Go parser fails or is not enabled
@@ -89,7 +99,7 @@ export async function parseMarkdown(html: string | null | undefined): Promise<st
    return markdownContent;
  } catch (error) {
-   console.error("Error converting HTML to Markdown: ", error);
+   logger.error("Error converting HTML to Markdown", {error});
    return ""; // Optionally return an empty string or handle the error as needed
  }
}
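Taken together, these changes make a missing shared library degrade gracefully instead of surfacing as a Sentry error. A sketch of the expected behavior, assuming USE_GO_MARKDOWN_PARSER is enabled but html-to-markdown.so is absent (the input HTML is illustrative):

process.env.USE_GO_MARKDOWN_PARSER = "true";
// getInstance() now stat()s goExecutablePath before touching koffi and throws "Go shared library not found",
// so parseMarkdown logs a warning rather than capturing an exception, and falls back to TurndownService
const markdown = await parseMarkdown("<h1>Hello</h1><p>world</p>");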

View File

@@ -65,7 +65,12 @@ export class WebCrawler {
    this.allowExternalContentLinks = allowExternalContentLinks ?? false;
  }

- public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
+ public filterLinks(sitemapLinks: string[], limit: number, maxDepth: number, fromMap: boolean = false): string[] {
+   // If the initial URL is a sitemap.xml, skip filtering
+   if (this.initialUrl.endsWith('sitemap.xml') && fromMap) {
+     return sitemapLinks.slice(0, limit);
+   }
    return sitemapLinks
      .filter((link) => {
        let url: URL;
@@ -159,11 +164,14 @@
    this.robots = robotsParser(this.robotsTxtUrl, txt);
  }

- public async tryGetSitemap(): Promise<{ url: string; html: string; }[] | null> {
+ public async tryGetSitemap(fromMap: boolean = false, onlySitemap: boolean = false): Promise<{ url: string; html: string; }[] | null> {
    logger.debug(`Fetching sitemap links from ${this.initialUrl}`);
    const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
+   if(fromMap && onlySitemap) {
+     return sitemapLinks.map(link => ({ url: link, html: "" }));
+   }
    if (sitemapLinks.length > 0) {
-     let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth);
+     let filteredLinks = this.filterLinks(sitemapLinks, this.limit, this.maxCrawledDepth, fromMap);
      return filteredLinks.map(link => ({ url: link, html: "" }));
    }
    return null;
@@ -353,7 +361,8 @@
    return url;
  };

- const sitemapUrl = url.endsWith("/sitemap.xml")
+ const sitemapUrl = url.endsWith(".xml")
    ? url
    : `${url}/sitemap.xml`;

View File

@@ -24,7 +24,7 @@ export async function getLinksFromSitemap(
    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
    content = response.data;
  } else if (mode === 'fire-engine') {
-   const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;playwright" });
+   const response = await scrapeURL("sitemap", sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }), { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true });
    if (!response.success) {
      throw response.error;
    }

View File

@@ -87,6 +87,7 @@ export async function scrapeURLWithFireEngineChromeCDP(meta: Meta): Promise<Engi
    priority: meta.internalOptions.priority,
    geolocation: meta.options.geolocation,
    mobile: meta.options.mobile,
+   timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
    // TODO: scrollXPaths
  };
@@ -95,7 +96,9 @@
  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
-   defaultTimeout + totalWait,
+   meta.options.timeout !== undefined
+     ? defaultTimeout + totalWait
+     : Infinity, // TODO: better timeout handling
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck" }), response.responseHeaders);
@@ -140,12 +143,16 @@ export async function scrapeURLWithFireEnginePlaywright(meta: Meta): Promise<Eng
    fullPageScreenshot: meta.options.formats.includes("screenshot@fullPage"),
    wait: meta.options.waitFor,
    geolocation: meta.options.geolocation,
+   timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
  };

  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
-   defaultTimeout + meta.options.waitFor
+   meta.options.timeout !== undefined
+     ? defaultTimeout + meta.options.waitFor
+     : Infinity, // TODO: better timeout handling
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck" }), response.responseHeaders);
@@ -179,11 +186,16 @@ export async function scrapeURLWithFireEngineTLSClient(meta: Meta): Promise<Engi
    atsv: meta.internalOptions.atsv,
    geolocation: meta.options.geolocation,
    disableJsDom: meta.internalOptions.v0DisableJsDom,
+   timeout: meta.options.timeout === undefined ? 300000 : undefined, // TODO: better timeout logic
  };

  let response = await performFireEngineScrape(
    meta.logger.child({ method: "scrapeURLWithFireEngineChromeCDP/callFireEngine", request }),
    request,
+   meta.options.timeout !== undefined
+     ? defaultTimeout
+     : Infinity, // TODO: better timeout handling
  );

  specialtyScrapeCheck(meta.logger.child({ method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck" }), response.responseHeaders);

View File

@@ -25,6 +25,8 @@ export type FireEngineScrapeRequestCommon = {
  logRequest?: boolean; // default: true
  instantReturn?: boolean; // default: false
  geolocation?: { country?: string; languages?: string[]; };
+ timeout?: number;
}

export type FireEngineScrapeRequestChromeCDP = {

View File

@@ -13,12 +13,12 @@ export async function scrapeURLWithPlaywright(meta: Meta): Promise<EngineScrapeR
    headers: {
      "Content-Type": "application/json",
    },
-   body: JSON.stringify({
+   body: {
      url: meta.url,
      wait_after_load: meta.options.waitFor,
      timeout,
      headers: meta.options.headers,
-   }),
+   },
    method: "POST",
    logger: meta.logger.child("scrapeURLWithPlaywright/robustFetch"),
    schema: z.object({

View File

@@ -18,7 +18,7 @@ export class NoEnginesLeftError extends Error {
  public results: EngineResultsTracker;

  constructor(fallbackList: Engine[], results: EngineResultsTracker) {
-   super("All scraping engines failed!");
+   super("All scraping engines failed! -- Double check the URL to make sure it's not broken. If the issue persists, contact us at help@firecrawl.com.");
    this.fallbackList = fallbackList;
    this.results = results;
  }

View File

@@ -5,7 +5,6 @@ import { logger } from "../lib/logger";
dotenv.config();

export async function fireEngineMap(
  q: string,
  options: {
@@ -40,14 +39,13 @@ export async function fireEngineMap(
      method: "POST",
      headers: {
        "Content-Type": "application/json",
-       "X-Disable-Cache": "true"
+       "X-Disable-Cache": "true",
      },
-     body: data
+     body: data,
    });

    if (response.ok) {
      const responseData = await response.json();
-     console.log("response", responseData);
      return responseData;
    } else {
      return [];

View File

@@ -7,7 +7,7 @@ import { logger } from "../../lib/logger";
import { configDotenv } from "dotenv";
configDotenv();

- export async function logJob(job: FirecrawlJob) {
+ export async function logJob(job: FirecrawlJob, force: boolean = false) {
  try {
    const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true';
    if (!useDbAuthentication) {
@@ -23,11 +23,7 @@ export async function logJob(job: FirecrawlJob) {
      job.scrapeOptions.headers["Authorization"] = "REDACTED";
      job.docs = [{ content: "REDACTED DUE TO AUTHORIZATION HEADER", html: "REDACTED DUE TO AUTHORIZATION HEADER" }];
    }

-   const { data, error } = await supabase_service
-     .from("firecrawl_jobs")
-     .insert([
-       {
+   const jobColumn = {
      job_id: job.job_id ? job.job_id : null,
      success: job.success,
      message: job.message,
@@ -43,8 +39,36 @@
      num_tokens: job.num_tokens,
      retry: !!job.retry,
      crawl_id: job.crawl_id,
-     },
-   ]);
+   };
+
+   if (force) {
+     while (true) {
+       try {
+         const { error } = await supabase_service
+           .from("firecrawl_jobs")
+           .insert([jobColumn]);
+         if (error) {
+           logger.error("Failed to log job due to Supabase error -- trying again", { error, scrapeId: job.job_id });
+           await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+         } else {
+           break;
+         }
+       } catch (error) {
+         logger.error("Failed to log job due to thrown error -- trying again", { error, scrapeId: job.job_id });
+         await new Promise<void>((resolve) => setTimeout(() => resolve(), 75));
+       }
+     }
+     logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+   } else {
+     const { error } = await supabase_service
+       .from("firecrawl_jobs")
+       .insert([jobColumn]);
+     if (error) {
+       logger.error(`Error logging job: ${error.message}`, { error, scrapeId: job.job_id });
+     } else {
+       logger.debug("Job logged successfully!", { scrapeId: job.job_id });
+     }
+   }

    if (process.env.POSTHOG_API_KEY && !job.crawl_id) {
      let phLog = {
@@ -72,9 +96,7 @@
      posthog.capture(phLog);
    }
  }
- if (error) {
-   logger.error(`Error logging job: ${error.message}`);
- }
  } catch (error) {
    logger.error(`Error logging job: ${error.message}`);
  }

View File

@@ -23,7 +23,7 @@ const emailTemplates: Record<
  },
  [NotificationType.RATE_LIMIT_REACHED]: {
    subject: "Rate Limit Reached - Firecrawl",
-   html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
+   html: "Hey there,<br/><p>You've hit one of the Firecrawl endpoint's rate limit! Take a breather and try again in a few moments. If you need higher rate limits, consider upgrading your plan. Check out our <a href='https://firecrawl.dev/pricing'>pricing page</a> for more info.</p><p>If you have any questions, feel free to reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/><br/>Ps. this email is only sent once every 7 days if you reach a rate limit.",
  },
  [NotificationType.AUTO_RECHARGE_SUCCESS]: {
    subject: "Auto recharge successful - Firecrawl",
@@ -31,7 +31,7 @@
  },
  [NotificationType.AUTO_RECHARGE_FAILED]: {
    subject: "Auto recharge failed - Firecrawl",
-   html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:hello@firecrawl.com'>hello@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
+   html: "Hey there,<br/><p>Your auto recharge failed. Please try again manually. If the issue persists, please reach out to us at <a href='mailto:help@firecrawl.com'>help@firecrawl.com</a></p><br/>Thanks,<br/>Firecrawl Team<br/>",
  },
};
@@ -63,7 +63,7 @@ export async function sendEmailNotification(
  const { data, error } = await resend.emails.send({
    from: "Firecrawl <firecrawl@getmendableai.com>",
    to: [email],
-   reply_to: "hello@firecrawl.com",
+   reply_to: "help@firecrawl.com",
    subject: emailTemplates[notificationType].subject,
    html: emailTemplates[notificationType].html,
  });

View File

@@ -262,7 +262,7 @@ async function processJob(job: Job & { id: string }, token: string) {
      document: null,
      project_id: job.data.project_id,
      error:
-       "URL is blocked. Suspecious activity detected. Please contact hello@firecrawl.com if you believe this is an error.",
+       "URL is blocked. Suspecious activity detected. Please contact help@firecrawl.com if you believe this is an error.",
    };
    return data;
  }
@@ -346,7 +346,7 @@
      scrapeOptions: job.data.scrapeOptions,
      origin: job.data.origin,
      crawl_id: job.data.crawl_id,
-   });
+   }, true);

    await addCrawlJobDone(job.data.crawl_id, job.id);
@@ -486,7 +486,7 @@
        url: sc?.originUrl ?? (job.data.crawlerOptions === null ? "Batch Scrape" : "Unknown"),
        crawlerOptions: sc.crawlerOptions,
        origin: job.data.origin,
-     });
+     }, true);
      }
    }
  }
@@ -566,7 +566,7 @@
      scrapeOptions: job.data.scrapeOptions,
      origin: job.data.origin,
      crawl_id: job.data.crawl_id,
-   });
+   }, true);

    // await logJob({
    //   job_id: job.data.crawl_id,

View File

@@ -46,6 +46,8 @@ export const callWebhook = async (
    webhookUrl = webhooksData[0].url;
  }

+ logger.debug("Calling webhook...", { webhookUrl, teamId, specified, v1, eventType, awaitWebhook });

  if (!webhookUrl) {
    return null;
  }
@@ -128,7 +130,6 @@
        "Content-Type": "application/json",
        ...webhookUrl.headers,
      },
-     timeout: v1 ? 10000 : 30000, // 10 seconds timeout (v1)
    }
  )
  .catch((error) => {

View File

@@ -175,4 +175,4 @@ export type PlanType =
  | "";

- export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";
+ export type WebhookEventType = "crawl.page" | "batch_scrape.page" | "crawl.started" | "batch_scrape.started" | "crawl.completed" | "batch_scrape.completed" | "crawl.failed";

View File

@@ -221,6 +221,7 @@ export interface MapParams {
  search?: string;
  ignoreSitemap?: boolean;
  includeSubdomains?: boolean;
+ sitemapOnly?: boolean;
  limit?: number;
}
@@ -563,16 +564,18 @@ export default class FirecrawlApp {
   * @param params - Additional parameters for the scrape request.
   * @param pollInterval - Time in seconds for job status checks.
   * @param idempotencyKey - Optional idempotency key for the request.
+  * @param webhook - Optional webhook for the batch scrape.
   * @returns The response from the crawl operation.
   */
  async batchScrapeUrls(
    urls: string[],
    params?: ScrapeParams,
    pollInterval: number = 2,
-   idempotencyKey?: string
+   idempotencyKey?: string,
+   webhook?: CrawlParams["webhook"],
  ): Promise<BatchScrapeStatusResponse | ErrorResponse> {
    const headers = this.prepareHeaders(idempotencyKey);
-   let jsonData: any = { urls, ...(params ?? {}) };
+   let jsonData: any = { urls, ...(params ?? {}), webhook };
    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/batch/scrape`,
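With the extra parameter, SDK callers pass the webhook after the idempotency key and it is merged into the request body. A brief usage sketch, assuming a placeholder API key and URLs (the scrape params shown are illustrative):

// hypothetical SDK usage reflecting the new signature
const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });
const result = await app.batchScrapeUrls(
  ["https://example.com/a", "https://example.com/b"],
  { formats: ["markdown"] },                      // ScrapeParams
  2,                                              // pollInterval in seconds
  undefined,                                      // idempotencyKey
  { url: "https://example.com/firecrawl-hook" },  // webhook; the API also accepts a bare URL string
);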

View File

@@ -0,0 +1,6 @@
# AGI News ✨
AGI News is a daily AI newsletter that's completely sourced by autonomous AI agents. It is live at [https://www.aginews.io/](https://www.aginews.io/)
Here is a link to the repo:
[https://github.com/ericciarla/aginews](https://github.com/ericciarla/aginews)

View File

@@ -0,0 +1,7 @@
# Generate AI podcasts based on real time news 🎙️
This example crawls the web for interesting news stories, then records a podcast with your own voice.
Here is a link to the repo:
[https://github.com/ericciarla/aginews-podcast](https://github.com/ericciarla/aginews-podcast)