mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 06:05:51 +08:00)

Merge branch 'main' into mog/mineru

Commit 5fcf3fa97e
@@ -116,6 +116,10 @@ If you’d like to test the crawl endpoint, you can run this:

This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.

### API Keys for SDK Usage

**Note:** When using Firecrawl SDKs with a self-hosted instance, API keys are optional. API keys are only required when connecting to the cloud service (api.firecrawl.dev).
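For example, a minimal sketch of pointing the Python SDK at a self-hosted instance (the base URL below is an assumption; replace it with your own deployment):

```python
from firecrawl import FirecrawlApp

# Self-hosted instance: no API key needed, just pass your instance URL
# (http://localhost:3002 is an assumed local deployment).
app = FirecrawlApp(api_url="http://localhost:3002")

# Cloud service (api.firecrawl.dev): an API key is required.
# app = FirecrawlApp(api_key="fc-YOUR-API-KEY")

result = app.scrape_url("https://firecrawl.dev")
```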
### Supabase client is not configured

**Symptom:**
@@ -70,8 +70,8 @@ content-type: application/json
"urls": ["firecrawl.dev"],
"prompt": "What is the title, description and main product of the page?",
"schema": {
"title": "string",
"description": "string",
"mainProduct": "string"
"title": { "type": "string" },
"description": { "type": "string" },
"mainProduct": { "type": "string" }
}
}
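The corrected request above uses JSON Schema type objects for each field. As a hedged sketch (assuming this snippet documents the v1 extract endpoint and a self-hosted base URL), the same body could be sent from Python like this:

```python
import requests

API_URL = "http://localhost:3002"  # assumed self-hosted deployment

payload = {
    "urls": ["firecrawl.dev"],
    "prompt": "What is the title, description and main product of the page?",
    "schema": {
        "title": {"type": "string"},
        "description": {"type": "string"},
        "mainProduct": {"type": "string"},
    },
}

# Against the cloud API, also pass headers={"Authorization": "Bearer fc-YOUR-API-KEY"}.
response = requests.post(f"{API_URL}/v1/extract", json=payload, timeout=60)
print(response.json())
```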
@@ -177,20 +177,13 @@ export async function crawlController(req: Request, res: Response) {

await saveCrawl(id, sc);

const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();
const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
if (urls.length === 0) return;

if (sitemap !== null && sitemap.length > 0) {
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
}
const jobs = sitemap.map((x) => {
const url = x.url;
let jobPriority = await getJobPriority({ plan, team_id, basePriority: 21 });
const jobs = urls.map(url => {
const uuid = uuidv4();
return {
name: uuid,

@@ -226,7 +219,9 @@ export async function crawlController(req: Request, res: Response) {
// add with sentry instrumentation
await addScrapeJob(job.data as any, {}, job.opts.jobId);
}
} else {
});

if (sitemap === 0) {
await lockURL(id, sc, url);

// Not needed, first one should be 15.
@@ -113,11 +113,9 @@ export async function crawlPreviewController(req: Request, res: Response) {
const crawler = crawlToCrawler(id, sc);

const sitemap = sc.crawlerOptions?.ignoreSitemap
? null
: await crawler.tryGetSitemap();

if (sitemap !== null) {
for (const url of sitemap.map((x) => x.url)) {
? 0
: await crawler.tryGetSitemap(async urls => {
for (const url of urls) {
await lockURL(id, sc, url);
const jobId = uuidv4();
await addScrapeJob(

@@ -138,7 +136,9 @@ export async function crawlPreviewController(req: Request, res: Response) {
);
await addCrawlJob(id, jobId);
}
} else {
});

if (sitemap === 0) {
await lockURL(id, sc, url);
const jobId = uuidv4();
await addScrapeJob(
@@ -115,7 +115,7 @@ export async function crawlStatusController(
const status: Exclude<CrawlStatusResponse, ErrorResponse>["status"] =
sc.cancelled
? "cancelled"
: validJobStatuses.every((x) => x[1] === "completed")
: (validJobStatuses.every((x) => x[1] === "completed") && validJobStatuses.length > 0)
? "completed"
: "scraping";
@@ -18,7 +18,7 @@ import {
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { _addScrapeJobToBullMQ, addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
import { logger as _logger } from "../../lib/logger";
import { getJobPriority } from "../../lib/job-priority";
import { callWebhook } from "../../services/webhook";

@@ -111,112 +111,19 @@ export async function crawlController(
await saveCrawl(id, sc);

const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();

if (sitemap !== null && sitemap.length > 0) {
logger.debug("Using sitemap of length " + sitemap.length, {
sitemapLength: sitemap.length,
});
let jobPriority = 20;
// If it is over 1000, we need to get the job priority,
// otherwise we can use the default priority of 20
if (sitemap.length > 1000) {
// set base to 21
jobPriority = await getJobPriority({
plan: req.auth.plan,
team_id: req.auth.team_id,
basePriority: 21,
});
}
logger.debug("Using job priority " + jobPriority, { jobPriority });

const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls" as const,
team_id: req.auth.team_id,
plan: req.auth.plan!,
crawlerOptions,
scrapeOptions,
internalOptions: sc.internalOptions,
origin: "api",
crawl_id: id,
sitemapped: true,
webhook: req.body.webhook,
v1: true,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});

logger.debug("Locking URLs...");
await lockURLs(
id,
sc,
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await addScrapeJobs(jobs);
} else {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
});

logger.debug("Locking URL...");
await lockURL(id, sc, req.body.url);
const jobId = uuidv4();
logger.debug("Adding scrape job to Redis...", { jobId });
await addScrapeJob(
{
await _addScrapeJobToBullMQ({
url: req.body.url,
mode: "single_urls",
mode: "kickoff" as const,
team_id: req.auth.team_id,
plan: req.auth.plan,
crawlerOptions,
scrapeOptions: scrapeOptionsSchema.parse(scrapeOptions),
scrapeOptions: sc.scrapeOptions,
internalOptions: sc.internalOptions,
plan: req.auth.plan!,
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
v1: true,
},
{
priority: 15,
},
jobId,
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(id, jobId);
}
logger.debug("Done queueing jobs!");

if (req.body.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: req.body.webhook,
});
await callWebhook(
req.auth.team_id,
id,
null,
req.body.webhook,
true,
"crawl.started",
);
}
}, {}, crypto.randomUUID(), 10);

const protocol = process.env.ENV === "local" ? req.protocol : "https";
@@ -86,11 +86,12 @@ export async function getMapResults({

// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
const sitemap = await crawler.tryGetSitemap(urls => {
urls.forEach((x) => {
links.push(x);
});
}, true, true);
if (sitemap > 0) {
links = links
.slice(1)
.map((x) => {

@@ -143,8 +144,10 @@ export async function getMapResults({
}

// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(true),
const [_, ...searchResults] = await Promise.all([
ignoreSitemap ? null : crawler.tryGetSitemap(urls => {
links.push(...urls);
}, true),
...(cachedResult ? [] : pagePromises),
]);

@@ -152,12 +155,6 @@ export async function getMapResults({
allResults = searchResults;
}

if (sitemap !== null) {
sitemap.forEach((x) => {
links.push(x.url);
});
}

mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
@@ -17,7 +17,7 @@ export function withAuth<T, U extends any[]>(
logger.warn("You're bypassing authentication");
warningCount++;
}
return { success: true } as T;
return { success: true, ...(mockSuccess || {}) } as T;
} else {
return await originalFunction(...args);
}
@@ -4,9 +4,10 @@ import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { logger as _logger } from "../../../src/lib/logger";
import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
import https from "https";
import { redisConnection } from "../../services/queue-service";
export class WebCrawler {
private jobId: string;
private initialUrl: string;

@@ -198,26 +199,60 @@ export class WebCrawler {
}

public async tryGetSitemap(
urlsHandler: (urls: string[]) => unknown,
fromMap: boolean = false,
onlySitemap: boolean = false,
): Promise<{ url: string; html: string }[] | null> {
): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap",
});
const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
if (fromMap && onlySitemap) {
return sitemapLinks.map((link) => ({ url: link, html: "" }));
let leftOfLimit = this.limit;

const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
if (sitemapLinks.length > 0) {
return url;
};

const _urlsHandler = async (urls: string[]) => {
let uniqueURLs: string[] = [];
for (const url of urls) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(url))) {
uniqueURLs.push(url);
}
}

await redisConnection.expire("sitemap:" + this.jobId + ":links", 3600, "NX");
if (uniqueURLs.length > 0) {
urlsHandler(uniqueURLs);
}
};

let count = await this.tryFetchSitemapLinks(this.initialUrl, (urls: string[]) => {
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
let filteredLinks = this.filterLinks(
[...new Set(sitemapLinks)],
this.limit,
[...new Set(urls)],
leftOfLimit,
this.maxCrawledDepth,
fromMap,
);
return filteredLinks.map((link) => ({ url: link, html: "" }));
leftOfLimit -= filteredLinks.length;
return _urlsHandler(filteredLinks);
}
return null;
});

if (count > 0) {
if (await redisConnection.sadd("sitemap:" + this.jobId + ":links", normalizeUrl(this.initialUrl))) {
urlsHandler([this.initialUrl]);
}
count++;
}

return count;
}

public filterURL(href: string, url: string): string | null {
@@ -436,54 +471,74 @@ export class WebCrawler {
return socialMediaOrEmail.some((ext) => url.includes(ext));
}

private async tryFetchSitemapLinks(url: string): Promise<string[]> {
const normalizeUrl = (url: string) => {
url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
return url;
};

private async tryFetchSitemapLinks(url: string, urlsHandler: (urls: string[]) => unknown): Promise<number> {
const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;

let sitemapLinks: string[] = [];
let sitemapCount: number = 0;

// Try to get sitemap from the provided URL first
try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
}
sitemapCount = await getLinksFromSitemap(
{ sitemapUrl, urlsHandler, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
`Failed to fetch sitemap from ${sitemapUrl}`,
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
);
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap(
{ sitemapUrl, mode: "fire-engine" },
this.logger,
);
if (response) {
sitemapLinks = response;
}
}
}

if (sitemapLinks.length === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
// If this is a subdomain, also try to get sitemap from the main domain
try {
const response = await axios.get(baseUrlSitemap, {
timeout: axiosTimeout,
});
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const domainParts = hostname.split('.');

// Check if this is a subdomain (has more than 2 parts and not www)
if (domainParts.length > 2 && domainParts[0] !== 'www') {
// Get the main domain by taking the last two parts
const mainDomain = domainParts.slice(-2).join('.');
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;

try {
// Get all links from the main domain's sitemap
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: mainDomainSitemapUrl, urlsHandler(urls) {
urlsHandler(urls.filter(link => {
try {
const linkUrl = new URL(link);
return linkUrl.hostname.endsWith(hostname);
} catch {
}
}))
}, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
);
}
}
} catch (error) {
this.logger.debug(`Error processing main domain sitemap`, {
method: "tryFetchSitemapLinks",
url,
error,
});
}

// If no sitemap found yet, try the baseUrl as a last resort
if (sitemapCount === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try {
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
);
} catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks",

@@ -493,25 +548,14 @@ export class WebCrawler {
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
sitemapCount += await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, urlsHandler, mode: "fire-engine" },
this.logger,
);
}
}
}

const normalizedUrl = normalizeUrl(url);
const normalizedSitemapLinks = sitemapLinks.map((link) =>
normalizeUrl(link),
);
// has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
if (
!normalizedSitemapLinks.includes(normalizedUrl) &&
sitemapLinks.length > 0
) {
sitemapLinks.push(url);
}
return sitemapLinks;
return sitemapCount;
}
}
@@ -5,26 +5,25 @@ import { WebCrawler } from "./crawler";
import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types";
import type { Logger } from "winston";

const useFireEngine =
process.env.FIRE_ENGINE_BETA_URL !== "" &&
process.env.FIRE_ENGINE_BETA_URL !== undefined;
export async function getLinksFromSitemap(
{
sitemapUrl,
allUrls = [],
urlsHandler,
mode = "axios",
}: {
sitemapUrl: string;
allUrls?: string[];
urlsHandler(urls: string[]): unknown,
mode?: "axios" | "fire-engine";
},
logger: Logger,
): Promise<string[]> {
): Promise<number> {
try {
let content: string = "";
try {
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === "fire-engine") {
if (mode === "fire-engine" && useFireEngine) {
const response = await scrapeURL(
"sitemap",
sitemapUrl,

@@ -32,10 +31,16 @@ export async function getLinksFromSitemap(
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (!response.success) {
throw response.error;
}
logger.debug("Failed to scrape sitemap via TLSClient, falling back to axios...", { error: response.error })
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = ar.data;
} else {
content = response.document.rawHtml!;
}
} else {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
}
} catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, {
method: "getLinksFromSitemap",

@@ -44,33 +49,64 @@ export async function getLinksFromSitemap(
error,
});

return allUrls;
return 0;
}

const parsed = await parseStringPromise(content);
const root = parsed.urlset || parsed.sitemapindex;
let count = 0;

if (root && root.sitemap) {
const sitemapPromises = root.sitemap
// Handle sitemap index files
const sitemapUrls = root.sitemap
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
.map((sitemap) =>
.map((sitemap) => sitemap.loc[0]);

const sitemapPromises: Promise<number>[] = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
{ sitemapUrl, urlsHandler, mode },
logger,
),
);
await Promise.all(sitemapPromises);

const results = await Promise.all(sitemapPromises);
count = results.reduce((a,x) => a + x)
} else if (root && root.url) {
// Check if any URLs point to additional sitemaps
const xmlSitemaps: string[] = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
url.loc[0].toLowerCase().endsWith('.xml')
)
.map((url) => url.loc[0]);

if (xmlSitemaps.length > 0) {
// Recursively fetch links from additional sitemaps
const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl: sitemapUrl, urlsHandler, mode },
logger,
),
);
count += (await Promise.all(sitemapPromises)).reduce((a,x) => a + x, 0);
}

const validUrls = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
!url.loc[0].toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0]),
)
.map((url) => url.loc[0]);
allUrls.push(...validUrls);
count += validUrls.length;
urlsHandler(validUrls);
}

return count;
} catch (error) {
logger.debug(`Error processing sitemapUrl: ${sitemapUrl}`, {
method: "getLinksFromSitemap",

@@ -80,7 +116,7 @@ export async function getLinksFromSitemap(
});
}

return allUrls;
return 0;
}

export const fetchSitemapData = async (
@@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError
import * as Sentry from "@sentry/node";
import { Action } from "../../../../lib/entities";
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
import { fireEngineDelete } from "./delete";

// This function does not take `Meta` on purpose. It may not access any
// meta values to construct the request -- that must be done by the

@@ -44,6 +45,13 @@ async function performFireEngineScrape<
while (status === undefined) {
if (errors.length >= errorLimit) {
logger.error("Error limit hit.", { errors });
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
afterErrors: errors,
}),
scrape.jobId,
);
throw new Error("Error limit hit. See e.cause.errors for errors.", {
cause: { errors },
});

@@ -74,6 +82,13 @@ async function performFireEngineScrape<
error instanceof ActionError ||
error instanceof UnsupportedFileError
) {
fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
afterError: error,
}),
scrape.jobId,
);
logger.debug("Fire-engine scrape job failed.", {
error,
jobId: scrape.jobId,

@@ -105,6 +120,13 @@ async function performFireEngineScrape<
status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
}

fireEngineDelete(
logger.child({
method: "performFireEngineScrape/fireEngineDelete",
}),
scrape.jobId,
);

return status;
}
@@ -8,6 +8,7 @@ export function extractMetadata(
): Partial<Document["metadata"]> {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let favicon: string | undefined = undefined;
let language: string | undefined = undefined;
let keywords: string | undefined = undefined;
let robots: string | undefined = undefined;

@@ -43,6 +44,12 @@ export function extractMetadata(
title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined;

const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
if (faviconLink) {
const baseUrl = new URL(meta.url).origin;
favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
}

// Assuming the language is part of the URL as per the regex pattern
language = soup("html").attr("lang") || undefined;

@@ -121,6 +128,7 @@ export function extractMetadata(
return {
title,
description,
favicon,
language,
keywords,
robots,
@@ -0,0 +1,33 @@
import { removeDefaultProperty } from "./llmExtract";

describe("removeDefaultProperty", () => {
it("should remove the default property from a simple object", () => {
const input = { default: "test", test: "test" };
const expectedOutput = { test: "test" };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should remove the default property from a nested object", () => {
const input = { default: "test", nested: { default: "nestedTest", test: "nestedTest" } };
const expectedOutput = { nested: { test: "nestedTest" } };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should remove the default property from an array of objects", () => {
const input = { array: [{ default: "test1", test: "test1" }, { default: "test2", test: "test2" }] };
const expectedOutput = { array: [{ test: "test1" }, { test: "test2" }] };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should handle objects without a default property", () => {
const input = { test: "test" };
const expectedOutput = { test: "test" };
expect(removeDefaultProperty(input)).toEqual(expectedOutput);
});

it("should handle null and non-object inputs", () => {
expect(removeDefaultProperty(null)).toBeNull();
expect(removeDefaultProperty("string")).toBe("string");
expect(removeDefaultProperty(123)).toBe(123);
});
});
@@ -121,6 +121,10 @@ export async function generateOpenAICompletions(
}

let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}

if (schema && schema.type === "array") {
schema = {
type: "object",

@@ -134,10 +138,12 @@ export async function generateOpenAICompletions(
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => [key, { type: value }]),
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
})
),
required: Object.keys(schema),
additionalProperties: false,
additionalProperties: false
};
}

@@ -232,3 +238,19 @@ export async function performLLMExtract(

return document;
}

export function removeDefaultProperty(schema: any): any {
if (typeof schema !== 'object' || schema === null) return schema;

const { default: _, ...rest } = schema;

for (const key in rest) {
if (Array.isArray(rest[key])) {
rest[key] = rest[key].map((item: any) => removeDefaultProperty(item));
} else if (typeof rest[key] === 'object' && rest[key] !== null) {
rest[key] = removeDefaultProperty(rest[key]);
}
}

return rest;
}
@@ -29,7 +29,7 @@ async function _addScrapeJobToConcurrencyQueue(
});
}

async function _addScrapeJobToBullMQ(
export async function _addScrapeJobToBullMQ(
webScraperOptions: any,
options: any,
jobId: string,

@@ -138,7 +138,6 @@ export async function addScrapeJobs(
if (jobs[0].data && jobs[0].data.team_id && jobs[0].data.plan) {
const now = Date.now();
const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
console.log("CC limit", limit);
cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);

countCanBeDirectlyAdded = Math.max(
@@ -18,16 +18,18 @@ import { v4 as uuidv4 } from "uuid";
import {
addCrawlJob,
addCrawlJobDone,
addCrawlJobs,
crawlToCrawler,
finishCrawl,
generateURLPermutations,
getCrawl,
getCrawlJobs,
lockURL,
lockURLs,
normalizeURL,
} from "../lib/crawl-redis";
import { StoredCrawl } from "../lib/crawl-redis";
import { addScrapeJob } from "./queue-jobs";
import { addScrapeJob, addScrapeJobs } from "./queue-jobs";
import {
addJobPriority,
deleteJobPriority,

@@ -191,6 +193,17 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
await addJobPriority(job.data.team_id, job.id);
let err = null;
try {
if (job.data?.mode === "kickoff") {
const result = await processKickoffJob(job, token);
if (result.success) {
try {
await job.moveToCompleted(null, token, false);
} catch (e) {}
} else {
logger.debug("Job failed", { result, mode: job.data.mode });
await job.moveToFailed((result as any).error, token, false);
}
} else {
const result = await processJob(job, token);
if (result.success) {
try {

@@ -208,6 +221,7 @@ const processJobInternal = async (token: string, job: Job & { id: string }) => {
logger.debug("Job failed", { result });
await job.moveToFailed((result as any).error, token, false);
}
}
} catch (error) {
logger.debug("Job failed", { error });
Sentry.captureException(error);

@@ -379,6 +393,130 @@ const workerFun = async (

workerFun(getScrapeQueue(), processJobInternal);

async function processKickoffJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",
method: "processKickoffJob",
jobId: job.id,
scrapeId: job.id,
crawlId: job.data?.crawl_id ?? undefined,
teamId: job.data?.team_id ?? undefined,
});

try {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
const crawler = crawlToCrawler(job.data.crawl_id, sc);

const sitemap = sc.crawlerOptions.ignoreSitemap
? 0
: await crawler.tryGetSitemap(async urls => {
if (urls.length === 0) return;

logger.debug("Using sitemap chunk of length " + urls.length, {
sitemapLength: urls.length,
});

let jobPriority = await getJobPriority({
plan: job.data.plan,
team_id: job.data.team_id,
basePriority: 21,
});
logger.debug("Using job priority " + jobPriority, { jobPriority });

const jobs = urls.map(url => {
const uuid = uuidv4();
return {
name: uuid,
data: {
url,
mode: "single_urls" as const,
team_id: job.data.team_id,
plan: job.data.plan!,
crawlerOptions: job.data.crawlerOptions,
scrapeOptions: job.data.scrapeOptions,
internalOptions: sc.internalOptions,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
sitemapped: true,
webhook: job.data.webhook,
v1: job.data.v1,
},
opts: {
jobId: uuid,
priority: 20,
},
};
});

logger.debug("Locking URLs...");
await lockURLs(
job.data.crawl_id,
sc,
jobs.map((x) => x.data.url),
);
logger.debug("Adding scrape jobs to Redis...");
await addCrawlJobs(
job.data.crawl_id,
jobs.map((x) => x.opts.jobId),
);
logger.debug("Adding scrape jobs to BullMQ...");
await addScrapeJobs(jobs);
});

if (sitemap === 0) {
logger.debug("Sitemap not found or ignored.", {
ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
});

logger.debug("Locking URL...");
await lockURL(job.data.crawl_id, sc, job.data.url);
const jobId = uuidv4();
logger.debug("Adding scrape job to Redis...", { jobId });
await addScrapeJob(
{
url: job.data.url,
mode: "single_urls",
team_id: job.data.team_id,
crawlerOptions: job.data.crawlerOptions,
scrapeOptions: scrapeOptions.parse(job.data.scrapeOptions),
internalOptions: sc.internalOptions,
plan: job.data.plan!,
origin: job.data.origin,
crawl_id: job.data.crawl_id,
webhook: job.data.webhook,
v1: job.data.v1,
},
{
priority: 15,
},
jobId,
);
logger.debug("Adding scrape job to BullMQ...", { jobId });
await addCrawlJob(job.data.crawl_id, jobId);
}
logger.debug("Done queueing jobs!");

if (job.data.webhook) {
logger.debug("Calling webhook with crawl.started...", {
webhook: job.data.webhook,
});
await callWebhook(
job.data.team_id,
job.data.crawl_id,
null,
job.data.webhook,
true,
"crawl.started",
);
}

return { success: true }
} catch (error) {
logger.error("An error occurred!", { error })
return { success: false, error };
}
}

async function processJob(job: Job & { id: string }, token: string) {
const logger = _logger.child({
module: "queue-worker",
(File diff suppressed because it is too large.)
@@ -13,7 +13,7 @@ import os

from .firecrawl import FirecrawlApp # noqa

__version__ = "1.6.8"
__version__ = "1.7.0"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
@@ -8,7 +8,7 @@ from datetime import datetime

load_dotenv()

API_URL = "http://127.0.0.1:3002";
API_URL = os.getenv('API_URL', 'http://127.0.0.1:3002')
ABSOLUTE_FIRECRAWL_PATH = "firecrawl/firecrawl.py"
TEST_API_KEY = os.getenv('TEST_API_KEY')

@@ -20,15 +20,26 @@ spec.loader.exec_module(firecrawl)
FirecrawlApp = firecrawl.FirecrawlApp

def test_no_api_key():
    if 'api.firecrawl.dev' in API_URL:
        with pytest.raises(Exception) as excinfo:
            invalid_app = FirecrawlApp(api_url=API_URL)
        assert "No API key provided" in str(excinfo.value)
    else:
        # Should not raise error for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        assert app is not None

def test_scrape_url_invalid_api_key():
    if 'api.firecrawl.dev' in API_URL:
        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
        with pytest.raises(Exception) as excinfo:
            invalid_app.scrape_url('https://firecrawl.dev')
        assert "Unauthorized: Invalid token" in str(excinfo.value)
    else:
        # Should work without API key for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        response = app.scrape_url('https://firecrawl.dev')
        assert response is not None

# def test_blocklisted_url():
#     blocklisted_url = "https://facebook.com/fake-test"

@@ -131,10 +142,16 @@ def test_successful_response_for_valid_scrape_with_pdf_file_without_explicit_ext
    assert 'We present spectrophotometric observations of the Broad Line Radio Galaxy' in response['markdown']

def test_crawl_url_invalid_api_key():
    if 'api.firecrawl.dev' in API_URL:
        invalid_app = FirecrawlApp(api_url=API_URL, api_key="invalid_api_key")
        with pytest.raises(Exception) as excinfo:
            invalid_app.crawl_url('https://firecrawl.dev')
        assert "Unauthorized: Invalid token" in str(excinfo.value)
    else:
        # Should work without API key for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        response = app.crawl_url('https://firecrawl.dev')
        assert response is not None

# def test_should_return_error_for_blocklisted_url():
#     app = FirecrawlApp(api_url=API_URL, api_key=TEST_API_KEY)

@@ -291,10 +308,16 @@ def test_check_crawl_status_e2e():
    assert 'error' not in status_response['data'][0]['metadata']

def test_invalid_api_key_on_map():
    if 'api.firecrawl.dev' in API_URL:
        invalid_app = FirecrawlApp(api_key="invalid_api_key", api_url=API_URL)
        with pytest.raises(Exception) as excinfo:
            invalid_app.map_url('https://roastmywebsite.ai')
        assert "Unauthorized: Invalid token" in str(excinfo.value)
    else:
        # Should work without API key for self-hosted
        app = FirecrawlApp(api_url=API_URL)
        response = app.map_url('https://roastmywebsite.ai')
        assert response is not None

# def test_blocklisted_url_on_map():
#     app = FirecrawlApp(api_key=TEST_API_KEY, api_url=API_URL)

@@ -349,4 +372,3 @@ def test_search_e2e():
    # assert isinstance(llm_extraction['is_open_source'], bool)
@@ -49,10 +49,13 @@ class FirecrawlApp:
        """
        self.api_key = api_key or os.getenv('FIRECRAWL_API_KEY')
        self.api_url = api_url or os.getenv('FIRECRAWL_API_URL', 'https://api.firecrawl.dev')
        if self.api_key is None:
            logger.warning("No API key provided")

        # Only require API key when using cloud service
        if 'api.firecrawl.dev' in self.api_url and self.api_key is None:
            logger.warning("No API key provided for cloud service")
            raise ValueError('No API key provided')
        logger.debug(f"Initialized FirecrawlApp with API key: {self.api_key}")

        logger.debug(f"Initialized FirecrawlApp with API URL: {self.api_url}")

    def scrape_url(self, url: str, params: Optional[Dict[str, Any]] = None) -> Any:
        """
@@ -12,7 +12,8 @@ dependencies = [
"requests",
"python-dotenv",
"websockets",
"nest-asyncio"
"nest-asyncio",
"pydantic>=2.10.3",
]
authors = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
maintainers = [{name = "Mendable.ai",email = "nick@mendable.ai"}]
@@ -3,3 +3,4 @@ pytest
python-dotenv
websockets
nest-asyncio
pydantic
@@ -9,7 +9,7 @@ use crate::crawl::CrawlStatus;
#[derive(Debug, Deserialize, Serialize, Clone)]
pub struct FirecrawlAPIError {
    /// Always false.
    success: bool,
    pub success: bool,

    /// Error message
    pub error: String,
@@ -9,6 +9,7 @@ pub mod map;
pub mod scrape;

pub use error::FirecrawlError;
use error::FirecrawlAPIError;

#[derive(Clone, Debug)]
pub struct FirecrawlApp {

@@ -18,16 +19,30 @@ pub struct FirecrawlApp {
}

pub(crate) const API_VERSION: &str = "/v1";
const CLOUD_API_URL: &str = "https://api.firecrawl.dev";

impl FirecrawlApp {
    pub fn new(api_key: impl AsRef<str>) -> Result<Self, FirecrawlError> {
        FirecrawlApp::new_selfhosted("https://api.firecrawl.dev", Some(api_key))
        FirecrawlApp::new_selfhosted(CLOUD_API_URL, Some(api_key))
    }

    pub fn new_selfhosted(api_url: impl AsRef<str>, api_key: Option<impl AsRef<str>>) -> Result<Self, FirecrawlError> {
        let url = api_url.as_ref().to_string();

        if url == CLOUD_API_URL && api_key.is_none() {
            return Err(FirecrawlError::APIError(
                "Configuration".to_string(),
                FirecrawlAPIError {
                    success: false,
                    error: "API key is required for cloud service".to_string(),
                    details: None,
                }
            ));
        }

        Ok(FirecrawlApp {
            api_key: api_key.map(|x| x.as_ref().to_string()),
            api_url: api_url.as_ref().to_string(),
            api_url: url,
            client: Client::new(),
        })
    }
@@ -1,7 +1,7 @@
use assert_matches::assert_matches;
use dotenvy::dotenv;
use firecrawl::scrape::{ExtractOptions, ScrapeFormats, ScrapeOptions};
use firecrawl::FirecrawlApp;
use firecrawl::{FirecrawlApp, FirecrawlError};
use serde_json::json;
use std::env;

@@ -155,3 +155,29 @@ async fn test_llm_extraction() {
    assert!(llm_extraction["supports_sso"].is_boolean());
    assert!(llm_extraction["is_open_source"].is_boolean());
}

#[test]
fn test_api_key_requirements() {
    dotenv().ok();

    let api_url = env::var("API_URL").unwrap_or("http://localhost:3002".to_string());
    let api_key = env::var("TEST_API_KEY").ok();

    match (api_url.contains("api.firecrawl.dev"), api_key) {
        (false, _) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
            assert!(result.is_ok(), "Local setup failed: {:?}", result.err().unwrap());
        }
        (true, None) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, None::<String>);
            assert!(matches!(
                result,
                Err(FirecrawlError::APIError(msg, _)) if msg == "Configuration"
            ));
        }
        (true, Some(key)) => {
            let result = FirecrawlApp::new_selfhosted(&api_url, Some(&key));
            assert!(result.is_ok());
        }
    }
}