Merge branch 'main' into nsc/llm-usage-extract

Nicolas 2025-01-17 23:02:12 -03:00
commit 260a726f37
15 changed files with 438 additions and 112 deletions

View File

@ -14,7 +14,7 @@
"test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'",
"test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'",
"test:full": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth)'",
"test:prod": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='(src/__tests__/e2e_noAuth|src/__tests__/e2e_full_withAuth|src/scraper/scrapeURL)'",
"test:snips": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false src/__tests__/snips/*.test.ts",
"workers": "nodemon --exec ts-node src/services/queue-worker.ts",
"worker:production": "node dist/src/services/queue-worker.js",

View File

@ -60,9 +60,6 @@ content-type: application/json
"sitemapOnly": true
}
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Extract Firecrawl Title
# @name extractFirecrawl
POST {{baseUrl}}/v1/extract HTTP/1.1

View File

@ -0,0 +1,81 @@
import { Response } from "express";
import {
CrawlErrorsResponse,
CrawlStatusParams,
CrawlStatusResponse,
ErrorResponse,
RequestWithAuth,
} from "./types";
import {
getCrawl,
getCrawlExpiry,
getCrawlJobs,
getDoneJobsOrdered,
getDoneJobsOrderedLength,
getThrottledJobs,
isCrawlFinished,
} from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import {
supabaseGetJobById,
supabaseGetJobsById,
} from "../../lib/supabase-jobs";
import { configDotenv } from "dotenv";
import { Job, JobState } from "bullmq";
import { logger } from "../../lib/logger";
configDotenv();
export async function getJob(id: string) {
const job = await getScrapeQueue().getJob(id);
if (!job) return job;
return job;
}
export async function getJobs(ids: string[]) {
const jobs: (Job & { id: string })[] = (
await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
).filter((x) => x) as (Job & { id: string })[];
return jobs;
}
export async function crawlErrorsController(
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
res: Response<CrawlErrorsResponse>,
) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
}
if (sc.team_id !== req.auth.team_id) {
return res.status(403).json({ success: false, error: "Forbidden" });
}
let jobStatuses = await Promise.all(
(await getCrawlJobs(req.params.jobId)).map(
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
),
);
const failedJobIDs: string[] = [];
for (const [id, status] of jobStatuses) {
if (
status === "failed"
) {
failedJobIDs.push(id);
}
}
res.status(200).json({
errors: (await getJobs(failedJobIDs)).map(x => ({
id: x.id,
timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
url: x.data.url,
error: x.failedReason,
})),
robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
});
}
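For orientation, here is a minimal client-side sketch of how this new errors endpoint could be called once the route is registered later in this diff. It is not part of the commit: the base URL, API key, and job ID are placeholders, and the response shape follows the CrawlErrorsResponse type added in this commit.

// Hypothetical client-side call to the new errors endpoint (placeholders throughout).
async function fetchCrawlErrors(jobId: string, apiKey: string) {
  const res = await fetch(`https://api.firecrawl.dev/v1/crawl/${jobId}/errors`, {
    headers: { Authorization: `Bearer ${apiKey}` },
  });
  const body = await res.json();
  // body.errors: { id, timestamp?, url, error }[]; body.robotsBlocked: string[]
  for (const e of body.errors ?? []) {
    console.log(`${e.url} failed: ${e.error}`);
  }
  console.log(`${(body.robotsBlocked ?? []).length} URLs blocked by robots.txt`);
}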

View File

@ -13,6 +13,7 @@ import {
getDoneJobsOrderedLength,
getThrottledJobs,
isCrawlFinished,
isCrawlFinishedLocked,
} from "../../lib/crawl-redis";
import { getScrapeQueue } from "../../services/queue-service";
import {
@ -117,7 +118,7 @@ export async function crawlStatusController(
sc.cancelled
? "cancelled"
: validJobStatuses.every((x) => x[1] === "completed") &&
await isCrawlFinished(req.params.jobId)
(await isCrawlFinishedLocked(req.params.jobId) || await isCrawlFinished(req.params.jobId))
? "completed"
: "scraping";

View File

@ -85,6 +85,11 @@ export async function getMapResults({
const crawler = crawlToCrawler(id, sc);
try {
sc.robots = await crawler.getRobotsTxt();
await crawler.importRobotsTxt(sc.robots);
} catch (_) {}
// If sitemapOnly is true, only get links from sitemap
if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(

View File

@ -34,7 +34,7 @@ export const url = z.preprocess(
.url()
.regex(/^https?:\/\//, "URL uses unsupported protocol")
.refine(
(x) => /\.[a-z]{2,}([\/?#]|$)/i.test(x),
(x) => /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i.test(x),
"URL must have a valid top-level domain or be a valid path",
)
.refine((x) => {
@ -569,6 +569,19 @@ export type CrawlStatusResponse =
data: Document[];
};
export type CrawlErrorsResponse =
| ErrorResponse
| {
errors: {
id: string,
timestamp?: string,
url: string,
error: string,
}[];
robotsBlocked: string[];
};
type AuthObject = {
team_id: string;
plan: PlanType | undefined;
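Circling back to the url schema change at the start of this file's hunks: the only difference is the optional (:\d+)? group, which lets hostnames with an explicit port pass the TLD check. A quick comparison, illustrative only and not part of the diff:

// Illustrative only: the old pattern rejects a port after the TLD, the new one accepts it.
const oldTldCheck = /\.[a-z]{2,}([\/?#]|$)/i;
const newTldCheck = /\.[a-z]{2,}(:\d+)?([\/?#]|$)/i;
const withPort = "https://example.com:8080/page";
console.log(oldTldCheck.test(withPort)); // false
console.log(newTldCheck.test(withPort)); // true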

View File

@ -128,6 +128,7 @@ export async function isCrawlFinished(id: string) {
return (
(await redisConnection.scard("crawl:" + id + ":jobs_done")) ===
(await redisConnection.scard("crawl:" + id + ":jobs"))
&& (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null
);
}
@ -135,6 +136,10 @@ export async function isCrawlFinishedLocked(id: string) {
return await redisConnection.exists("crawl:" + id + ":finish");
}
export async function finishCrawlKickoff(id: string) {
await redisConnection.set("crawl:" + id + ":kickoff:finish", "yes", "EX", 24 * 60 * 60);
}
export async function finishCrawl(id: string) {
if (await isCrawlFinished(id)) {
_logger.debug("Marking crawl as finished.", {
@ -152,6 +157,9 @@ export async function finishCrawl(id: string) {
module: "crawl-redis",
method: "finishCrawl",
crawlId: id,
jobs_done: (await redisConnection.scard("crawl:" + id + ":jobs_done")),
jobs: (await redisConnection.scard("crawl:" + id + ":jobs")),
kickoff_finished: (await redisConnection.get("crawl:" + id + ":kickoff:finish")) !== null,
});
}
}
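In plain terms, the change above means a crawl only counts as finished once every queued job is done and the kickoff job has flagged (via the crawl:<id>:kickoff:finish key set by finishCrawlKickoff) that it is done enqueuing URLs. A condensed restatement of the check, assuming an ioredis client as used elsewhere in the codebase; this is a sketch, not the library code:

// Condensed restatement of the finished check above; `redis` is assumed to be an ioredis client.
import Redis from "ioredis";

async function crawlLooksFinished(redis: Redis, id: string): Promise<boolean> {
  const [done, total, kickoffDone] = await Promise.all([
    redis.scard(`crawl:${id}:jobs_done`),
    redis.scard(`crawl:${id}:jobs`),
    redis.get(`crawl:${id}:kickoff:finish`),
  ]);
  // Without the kickoff flag, done === total can be true while the kickoff job is still queueing URLs.
  return done === total && kickoffDone !== null;
}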

View File

@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
import { crawlErrorsController } from "../controllers/v1/crawl-errors";
function checkCreditsMiddleware(
minimum?: number,
@ -192,6 +193,18 @@ v1Router.get(
wrap((req: any, res): any => crawlStatusController(req, res, true)),
);
v1Router.get(
"/crawl/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/batch/scrape/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),

View File

@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
import cheerio, { load } from "cheerio";
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser from "robots-parser";
import robotsParser, { Robot } from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
@ -20,7 +20,7 @@ export class WebCrawler {
private crawledUrls: Map<string, string> = new Map();
private limit: number;
private robotsTxtUrl: string;
public robots: any;
public robots: Robot;
private generateImgAltText: boolean;
private allowBackwardCrawling: boolean;
private allowExternalContentLinks: boolean;
@ -63,7 +63,7 @@ export class WebCrawler {
this.includes = Array.isArray(includes) ? includes : [];
this.excludes = Array.isArray(excludes) ? excludes : [];
this.limit = limit;
this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
this.robots = robotsParser(this.robotsTxtUrl, "");
// Deprecated, use limit instead
this.maxCrawledLinks = maxCrawledLinks ?? limit;
@ -217,45 +217,46 @@ export class WebCrawler {
};
const _urlsHandler = async (urls: string[]) => {
let uniqueURLs: string[] = [];
for (const url of urls) {
if (
await redisConnection.sadd(
"sitemap:" + this.jobId + ":links",
normalizeUrl(url),
)
) {
uniqueURLs.push(url);
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
let filteredLinks = this.filterLinks(
[...new Set(urls)],
leftOfLimit,
this.maxCrawledDepth,
fromMap,
);
leftOfLimit -= filteredLinks.length;
let uniqueURLs: string[] = [];
for (const url of filteredLinks) {
if (
await redisConnection.sadd(
"sitemap:" + this.jobId + ":links",
normalizeUrl(url),
)
) {
uniqueURLs.push(url);
}
}
}
await redisConnection.expire(
"sitemap:" + this.jobId + ":links",
3600,
"NX",
);
if (uniqueURLs.length > 0) {
urlsHandler(uniqueURLs);
await redisConnection.expire(
"sitemap:" + this.jobId + ":links",
3600,
"NX",
);
if (uniqueURLs.length > 0) {
return urlsHandler(uniqueURLs);
}
}
};
let count = await this.tryFetchSitemapLinks(
this.initialUrl,
(urls: string[]) => {
if (fromMap && onlySitemap) {
return urlsHandler(urls);
} else {
let filteredLinks = this.filterLinks(
[...new Set(urls)],
leftOfLimit,
this.maxCrawledDepth,
fromMap,
);
leftOfLimit -= filteredLinks.length;
return _urlsHandler(filteredLinks);
}
},
);
let count = (await Promise.all([
this.tryFetchSitemapLinks(
this.initialUrl,
_urlsHandler,
),
...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
])).reduce((a,x) => a+x, 0);
if (count > 0) {
if (
@ -298,6 +299,16 @@ export class WebCrawler {
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) {
return fullUrl;
} else if (
this.isInternalLink(fullUrl) &&
this.noSections(fullUrl) &&
!this.matchesExcludes(path) &&
!this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) {
(async() => {
await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
})();
}
} else {
// EXTERNAL LINKS

View File

@ -49,12 +49,14 @@ const excludeNonMainTags = [
const forceIncludeMainTags = ["#main"];
export const removeUnwantedElements = (
export const htmlTransform = (
html: string,
url: string,
scrapeOptions: ScrapeOptions,
) => {
const soup = load(html);
let soup = load(html);
// remove unwanted elements
if (
scrapeOptions.includeTags &&
scrapeOptions.includeTags.filter((x) => x.trim().length !== 0).length > 0
@ -66,7 +68,8 @@ export const removeUnwantedElements = (
newRoot.append(soup(element).clone());
});
});
return newRoot.html() ?? "";
soup = load(newRoot.html() ?? "");
}
soup("script, style, noscript, meta, head").remove();
@ -114,6 +117,42 @@ export const removeUnwantedElements = (
});
}
// always return biggest image
soup("img[srcset]").each((_, el) => {
const sizes = el.attribs.srcset.split(",").map(x => {
const tok = x.trim().split(" ");
return {
url: tok[0],
size: parseInt((tok[1] ?? "1x").slice(0, -1), 10),
isX: (tok[1] ?? "").endsWith("x")
};
});
if (sizes.every(x => x.isX) && el.attribs.src) {
sizes.push({
url: el.attribs.src,
size: 1,
isX: true,
});
}
sizes.sort((a,b) => b.size - a.size);
el.attribs.src = sizes[0]?.url;
});
// absolute links
soup("img[src]").each((_, el) => {
try {
el.attribs.src = new URL(el.attribs.src, url).href;
} catch (_) {}
});
soup("a[href]").each((_, el) => {
try {
el.attribs.href = new URL(el.attribs.href, url).href;
} catch (_) {}
});
const cleanedHtml = soup.html();
return cleanedHtml;
};
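As a worked example of the srcset handling introduced above: the transform parses each candidate's descriptor, sorts descending by the numeric value, and promotes the largest candidate to src. A standalone rerun of that parsing logic with an invented srcset value:

// Standalone rerun of the srcset parsing above, with an invented example value.
const srcset = "hero-480.jpg 480w, hero-1024.jpg 1024w, hero-1600.jpg 1600w";
const candidates = srcset.split(",").map((x) => {
  const tok = x.trim().split(" ");
  return {
    url: tok[0],
    size: parseInt((tok[1] ?? "1x").slice(0, -1), 10), // "1600w" -> 1600, "2x" -> 2
    isX: (tok[1] ?? "").endsWith("x"),
  };
});
candidates.sort((a, b) => b.size - a.size);
console.log(candidates[0].url); // "hero-1600.jpg" becomes the <img> src

When every candidate uses an x descriptor, the existing src is also pushed in as a 1x entry, so the original image stays in the comparison.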

View File

@ -1,7 +1,7 @@
import { parseMarkdown } from "../../../lib/html-to-markdown";
import { Meta } from "..";
import { Document } from "../../../controllers/v1/types";
import { removeUnwantedElements } from "../lib/removeUnwantedElements";
import { htmlTransform } from "../lib/removeUnwantedElements";
import { extractLinks } from "../lib/extractLinks";
import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
@ -41,7 +41,7 @@ export function deriveHTMLFromRawHTML(
);
}
document.html = removeUnwantedElements(document.rawHtml, meta.options);
document.html = htmlTransform(document.rawHtml, document.metadata.url ?? document.metadata.sourceURL ?? meta.url, meta.options);
return document;
}

View File

@ -23,6 +23,7 @@ import {
addCrawlJobs,
crawlToCrawler,
finishCrawl,
finishCrawlKickoff,
generateURLPermutations,
getCrawl,
getCrawlJobCount,
@ -675,9 +676,17 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
logger.debug("Done queueing jobs!");
await finishCrawlKickoff(job.data.crawl_id);
await finishCrawlIfNeeded(job, sc);
return { success: true };
} catch (error) {
logger.error("An error occurred!", { error });
await finishCrawlKickoff(job.data.crawl_id);
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
if (sc) {
await finishCrawlIfNeeded(job, sc);
}
return { success: false, error };
}
}
@ -711,6 +720,7 @@ async function processJob(job: Job & { id: string }, token: string) {
teamId: job.data?.team_id ?? undefined,
});
logger.info(`🐂 Worker taking job ${job.id}`, { url: job.data.url });
const start = Date.now();
// Check if the job URL is researchhub and block it immediately
// TODO: remove this once solve the root issue
@ -737,7 +747,6 @@ async function processJob(job: Job & { id: string }, token: string) {
current_step: "SCRAPING",
current_url: "",
});
const start = Date.now();
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
@ -988,6 +997,19 @@ async function processJob(job: Job & { id: string }, token: string) {
logger.info(`🐂 Job done ${job.id}`);
return data;
} catch (error) {
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, false);
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(job.data.url, sc),
);
await finishCrawlIfNeeded(job, sc);
}
const isEarlyTimeout =
error instanceof Error && error.message === "timeout";
const isCancelled =
@ -1041,6 +1063,9 @@ async function processJob(job: Job & { id: string }, token: string) {
);
}
const end = Date.now();
const timeTakenInSeconds = (end - start) / 1000;
logger.debug("Logging job to DB...");
await logJob(
{
@ -1053,7 +1078,7 @@ async function processJob(job: Job & { id: string }, token: string) {
"Something went wrong... Contact help@mendable.ai"),
num_docs: 0,
docs: [],
time_taken: 0,
time_taken: timeTakenInSeconds,
team_id: job.data.team_id,
mode: job.data.mode,
url: job.data.url,
@ -1064,39 +1089,6 @@ async function processJob(job: Job & { id: string }, token: string) {
},
true,
);
if (job.data.crawl_id) {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
logger.debug("Declaring job as done...");
await addCrawlJobDone(job.data.crawl_id, job.id, false);
await redisConnection.srem(
"crawl:" + job.data.crawl_id + ":visited_unique",
normalizeURL(job.data.url, sc),
);
await finishCrawlIfNeeded(job, sc);
// await logJob({
// job_id: job.data.crawl_id,
// success: false,
// message:
// typeof error === "string"
// ? error
// : error.message ??
// "Something went wrong... Contact help@mendable.ai",
// num_docs: 0,
// docs: [],
// time_taken: 0,
// team_id: job.data.team_id,
// mode: job.data.crawlerOptions !== null ? "crawl" : "batch_scrape",
// url: sc ? sc.originUrl ?? job.data.url : job.data.url,
// crawlerOptions: sc ? sc.crawlerOptions : undefined,
// scrapeOptions: sc ? sc.scrapeOptions : job.data.scrapeOptions,
// origin: job.data.origin,
// });
}
// done(null, data);
return data;
}
}
@ -1126,5 +1118,6 @@ async function processJob(job: Job & { id: string }, token: string) {
await new Promise((resolve) => setTimeout(resolve, 500));
}
console.log("All jobs finished. Worker out!");
process.exit(0);
})();
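One behavioral fix in the worker above: `const start = Date.now()` now runs before the scrape is attempted, so the failure branch logs a real duration (timeTakenInSeconds) instead of time_taken: 0. A minimal, generic illustration of the pattern, not the worker code itself:

// Generic sketch: capture the start time outside the try so the catch path can report a real duration.
async function timedJob<T>(run: () => Promise<T>): Promise<T> {
  const start = Date.now();
  try {
    return await run();
  } catch (error) {
    const timeTakenInSeconds = (Date.now() - start) / 1000;
    console.error(`job failed after ${timeTakenInSeconds}s`, error);
    throw error;
  }
}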

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.14.0",
"version": "1.14.1",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -279,9 +279,11 @@ export interface ErrorResponse {
*/
export class FirecrawlError extends Error {
statusCode: number;
constructor(message: string, statusCode: number) {
details?: any;
constructor(message: string, statusCode: number, details?: any) {
super(message);
this.statusCode = statusCode;
this.details = details;
}
}
@ -312,6 +314,26 @@ export interface SearchResponse {
error?: string;
}
/**
* Response interface for crawl/batch scrape error monitoring.
*/
export interface CrawlErrorsResponse {
/**
* Scrapes that errored out + error details
*/
errors: {
id: string,
timestamp?: string,
url: string,
error: string,
}[];
/**
* URLs blocked by robots.txt
*/
robotsBlocked: string[];
};
/**
* Main class for interacting with the Firecrawl API.
* Provides methods for scraping, searching, crawling, and mapping web content.
@ -619,6 +641,29 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Returns information about crawl errors.
* @param id - The ID of the crawl operation.
* @returns Information about crawl errors.
*/
async checkCrawlErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
const headers = this.prepareHeaders();
try {
const response: AxiosResponse = await this.deleteRequest(
`${this.apiUrl}/v1/crawl/${id}/errors`,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "check crawl errors");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Cancels a crawl job using the Firecrawl API.
* @param id - The ID of the crawl operation.
@ -881,6 +926,29 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Returns information about batch scrape errors.
* @param id - The ID of the batch scrape operation.
* @returns Information about batch scrape errors.
*/
async checkBatchScrapeErrors(id: string): Promise<CrawlErrorsResponse | ErrorResponse> {
const headers = this.prepareHeaders();
try {
const response: AxiosResponse = await this.deleteRequest(
`${this.apiUrl}/v1/batch/scrape/${id}/errors`,
headers
);
if (response.status === 200) {
return response.data;
} else {
this.handleError(response, "check batch scrape errors");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Extracts information from URLs using the Firecrawl API.
* Currently in Beta. Expect breaking changes on future minor versions.
@ -941,9 +1009,9 @@ export default class FirecrawlApp {
this.handleError(response, "extract");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
}
return { success: false, error: "Internal server error." };
return { success: false, error: "Internal server error."};
}
/**
@ -985,7 +1053,7 @@ export default class FirecrawlApp {
this.handleError(response, "start extract job");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
throw new FirecrawlError(error.message, 500, error.response?.data?.details);
}
return { success: false, error: "Internal server error." };
}
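A hypothetical usage sketch for the SDK surface added above (checkCrawlErrors, checkBatchScrapeErrors, and the new details field on FirecrawlError). The API key and job ID are placeholders:

// Hypothetical usage of the new SDK methods; apiKey and jobId are placeholders.
import FirecrawlApp, { CrawlErrorsResponse, ErrorResponse } from "@mendable/firecrawl-js";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

async function reportCrawlErrors(jobId: string): Promise<void> {
  const result: CrawlErrorsResponse | ErrorResponse = await app.checkCrawlErrors(jobId);
  if ("errors" in result) {
    for (const e of result.errors) {
      console.log(`[${e.timestamp ?? "no timestamp"}] ${e.url}: ${e.error}`);
    }
    console.log(`${result.robotsBlocked.length} URLs blocked by robots.txt`);
  } else {
    console.error("Could not fetch errors:", result.error);
  }
}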

View File

@ -120,7 +120,10 @@ class FirecrawlApp:
json=scrape_params,
)
if response.status_code == 200:
response = response.json()
try:
response = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if response['success'] and 'data' in response:
return response['data']
elif "error" in response:
@ -159,7 +162,10 @@ class FirecrawlApp:
if response.status_code != 200:
raise Exception(f"Request failed with status code {response.status_code}")
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
def crawl_url(self, url: str,
params: Optional[Dict[str, Any]] = None,
@ -194,7 +200,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
id = response.json().get('id')
try:
id = response.json().get('id')
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
return self._monitor_job_status(id, headers, poll_interval)
else:
@ -223,7 +232,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, 'start crawl job')
@ -245,7 +257,10 @@ class FirecrawlApp:
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
status_data = response.json()
try:
status_data = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
@ -261,7 +276,10 @@ class FirecrawlApp:
if status_response.status_code != 200:
logger.error(f"Failed to fetch next page: {status_response.status_code}")
break
next_data = status_response.json()
try:
next_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
data.extend(next_data.get('data', []))
status_data = next_data
except Exception as e:
@ -291,6 +309,26 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check crawl status')
def check_crawl_errors(self, id: str) -> Dict[str, Any]:
"""
Returns information about crawl errors.
Args:
id (str): The ID of the crawl job.
Returns:
Dict[str, Any]: Information about crawl errors.
"""
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}/v1/crawl/{id}/errors', headers)
if response.status_code == 200:
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "check crawl errors")
def cancel_crawl(self, id: str) -> Dict[str, Any]:
"""
Cancel an asynchronous crawl job using the Firecrawl API.
@ -304,7 +342,10 @@ class FirecrawlApp:
headers = self._prepare_headers()
response = self._delete_request(f'{self.api_url}/v1/crawl/{id}', headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "cancel crawl job")
@ -352,7 +393,10 @@ class FirecrawlApp:
json=json_data,
)
if response.status_code == 200:
response = response.json()
try:
response = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if response['success'] and 'links' in response:
return response
elif 'error' in response:
@ -395,7 +439,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
id = response.json().get('id')
try:
id = response.json().get('id')
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
return self._monitor_job_status(id, headers, poll_interval)
else:
@ -424,7 +471,10 @@ class FirecrawlApp:
json_data.update(params)
response = self._post_request(f'{self.api_url}{endpoint}', json_data, headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, 'start batch scrape job')
@ -464,7 +514,10 @@ class FirecrawlApp:
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}{endpoint}', headers)
if response.status_code == 200:
status_data = response.json()
try:
status_data = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
@ -480,7 +533,10 @@ class FirecrawlApp:
if status_response.status_code != 200:
logger.error(f"Failed to fetch next page: {status_response.status_code}")
break
next_data = status_response.json()
try:
next_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
data.extend(next_data.get('data', []))
status_data = next_data
except Exception as e:
@ -510,6 +566,25 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check batch scrape status')
def check_batch_scrape_errors(self, id: str) -> Dict[str, Any]:
"""
Returns information about batch scrape errors.
Args:
id (str): The ID of the batch scrape job.
Returns:
Dict[str, Any]: Information about batch scrape errors.
"""
headers = self._prepare_headers()
response = self._get_request(f'{self.api_url}/v1/batch/scrape/{id}/errors', headers)
if response.status_code == 200:
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "check batch scrape errors")
def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Any:
"""
@ -550,7 +625,10 @@ class FirecrawlApp:
headers
)
if response.status_code == 200:
data = response.json()
try:
data = response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if data['success']:
job_id = data.get('id')
if not job_id:
@ -563,7 +641,10 @@ class FirecrawlApp:
headers
)
if status_response.status_code == 200:
status_data = status_response.json()
try:
status_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if status_data['success']:
return status_data
@ -601,7 +682,10 @@ class FirecrawlApp:
try:
response = self._get_request(f'{self.api_url}/v1/extract/{job_id}', headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "get extract status")
except Exception as e:
@ -641,7 +725,10 @@ class FirecrawlApp:
try:
response = self._post_request(f'{self.api_url}/v1/extract', request_data, headers)
if response.status_code == 200:
return response.json()
try:
return response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
else:
self._handle_error(response, "async extract")
except Exception as e:
@ -771,16 +858,22 @@ class FirecrawlApp:
status_response = self._get_request(api_url, headers)
if status_response.status_code == 200:
status_data = status_response.json()
try:
status_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
if status_data['status'] == 'completed':
if 'data' in status_data:
data = status_data['data']
while 'next' in status_data:
if len(status_data['data']) == 0:
break
status_response = self._get_request(status_data['next'], headers)
status_data = status_response.json()
data.extend(status_data.get('data', []))
if len(status_data['data']) == 0:
break
status_response = self._get_request(status_data['next'], headers)
try:
status_data = status_response.json()
except:
raise Exception(f'Failed to parse Firecrawl response as JSON.')
data.extend(status_data.get('data', []))
status_data['data'] = data
return status_data
else:
@ -804,8 +897,12 @@ class FirecrawlApp:
Raises:
Exception: An exception with a message containing the status code and error details from the response.
"""
error_message = response.json().get('error', 'No error message provided.')
error_details = response.json().get('details', 'No additional error details provided.')
try:
error_message = response.json().get('error', 'No error message provided.')
error_details = response.json().get('details', 'No additional error details provided.')
except:
raise requests.exceptions.HTTPError(f'Failed to parse Firecrawl error response as JSON. Status code: {response.status_code}', response=response)
if response.status_code == 402:
message = f"Payment Required: Failed to {action}. {error_message} - {error_details}"