Merge pull request #915 from mendableai/nsc/new-extract

Extract (beta)
Nicolas 2024-11-26 10:02:09 -08:00 committed by GitHub
commit 6c33b978f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 2767 additions and 162 deletions
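
This PR adds a beta /v1/extract endpoint: it maps the given sites, reranks and scrapes the most relevant pages, and runs an LLM extraction over the combined content. A minimal client-side sketch of calling the new endpoint, assuming a local API at http://127.0.0.1:3002 and a TEST_API_KEY environment variable (both taken from the E2E test setup further down; adjust for your own deployment):

// Hedged sketch: the request/response shape mirrors the .http examples and E2E tests in this diff.
const BASE_URL = "http://127.0.0.1:3002"; // assumption: local dev API, as in the tests

async function extractExample() {
  const res = await fetch(`${BASE_URL}/v1/extract`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      urls: ["https://firecrawl.dev/*"], // a trailing /* (or allowExternalLinks) routes the URL through the map + rerank path
      prompt: "Who are the founders of the company?",
      schema: {
        type: "object",
        properties: { founders: { type: "array", items: { type: "string" } } },
      },
      allowExternalLinks: true,
    }),
  });
  const body = await res.json(); // { success, data, scrape_id?, warning? } on success
  console.log(body.data);
}

extractExample();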

View File

@ -55,7 +55,7 @@
"@bull-board/api": "^5.20.5",
"@bull-board/express": "^5.20.5",
"@devil7softwares/pos": "^1.0.2",
-"@dqbd/tiktoken": "^1.0.16",
+"@dqbd/tiktoken": "^1.0.17",
"@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
@ -73,6 +73,7 @@
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
"cohere": "^1.1.1",
+"cohere-ai": "^7.14.0",
"cors": "^2.8.5",
"cron-parser": "^4.9.0",
"date-fns": "^3.6.0",

apps/api/pnpm-lock.yaml (generated, 1316 changed lines)

File diff suppressed because it is too large

View File

@ -47,3 +47,31 @@ content-type: application/json

# @name batchScrapeStatus
GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
+
+### Map Website
+# @name map
+POST {{baseUrl}}/v1/map HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+"url": "firecrawl.dev",
+"sitemapOnly": true
+}
+
+### Extract
+# @name extract
+POST {{baseUrl}}/v1/extract HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+"urls": ["firecrawl.dev"],
+"prompt": "What is the title, description and main product of the page?",
+"schema": {
+"title": "string",
+"description": "string",
+"mainProduct": "string"
+}
+}

View File

@ -0,0 +1,249 @@
import request from "supertest";
import dotenv from "dotenv";
import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for Extract API Routes", () => {
it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://firecrawl.dev/*"],
prompt: "Who are the authors of the blog posts?",
schema: {
type: "object",
properties: { authors: { type: "array", items: { type: "string" } } },
},
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("authors");
let gotItRight = 0;
for (const author of response.body.data?.authors) {
if (author.includes("Caleb Peffer")) gotItRight++;
if (author.includes("Gergő Móricz")) gotItRight++;
if (author.includes("Eric Ciarla")) gotItRight++;
if (author.includes("Nicolas Camara")) gotItRight++;
if (author.includes("Jon")) gotItRight++;
if (author.includes("Wendong")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(1);
}, 60000);
it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["firecrawl.dev/*"],
prompt: "Who are the founders of the company?",
allowExternalLinks: true,
schema: {
type: "object",
properties: { founders: { type: "array", items: { type: "string" } } },
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("founders");
console.log(response.body.data?.founders);
let gotItRight = 0;
for (const founder of response.body.data?.founders) {
if (founder.includes("Caleb")) gotItRight++;
if (founder.includes("Eric")) gotItRight++;
if (founder.includes("Nicolas")) gotItRight++;
if (founder.includes("nick")) gotItRight++;
if (founder.includes("eric")) gotItRight++;
if (founder.includes("jon-noronha")) gotItRight++;
}
expect(gotItRight).toBeGreaterThanOrEqual(2);
}, 60000);
it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://firecrawl.dev/*"],
prompt: "What are they hiring for?",
allowExternalLinks: true,
schema: {
type: "array",
items: {
type: "string"
},
required: ["items"]
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
console.log(response.body.data);
let gotItRight = 0;
for (const hiring of response.body.data?.items) {
if (hiring.includes("Developer Support Engineer")) gotItRight++;
if (hiring.includes("Dev Ops Engineer")) gotItRight++;
if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(2);
}, 60000);
it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["fivetran.com/*"],
prompt: "Does Fivetran have PCI DSS compliance?",
allowExternalLinks: true,
schema: {
type: "object",
properties: {
pciDssCompliance: { type: "boolean" }
}
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["fivetran.com/*"],
prompt: "What are the Azure Data Connectors they offer?",
schema: {
type: "array",
items: {
type: "object",
properties: {
connector: { type: "string" },
description: { type: "string" },
supportsCaptureDelete: { type: "boolean" }
}
}
}
})
console.log(response.body);
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("data");
// expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
it.concurrent("should return Greenhouse Applicant Tracking System for Abnormal Security", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"],
prompt: "what applicant tracking system is this company using?",
schema: {
type: "object",
properties: {
isGreenhouseATS: { type: "boolean" },
answer: { type: "string" }
}
},
allowExternalLinks: true
})
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.isGreenhouseATS).toBe(true);
}, 60000);
it.concurrent("should return mintlify api components", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://mintlify.com/docs/*"],
prompt: "what are the 4 API components?",
schema: {
type: "array",
items: {
type: "object",
properties: {
component: { type: "string" }
}
},
required: ["items"]
},
allowExternalLinks: true
})
console.log(response.body.data?.items);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.items.length).toBe(4);
let gotItRight = 0;
for (const component of response.body.data?.items) {
if (component.component.toLowerCase().includes("parameter")) gotItRight++;
if (component.component.toLowerCase().includes("response")) gotItRight++;
if (component.component.toLowerCase().includes("expandable")) gotItRight++;
if (component.component.toLowerCase().includes("sticky")) gotItRight++;
if (component.component.toLowerCase().includes("examples")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(2);
}, 60000);
it.concurrent("should return information about Eric Ciarla", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://ericciarla.com/"],
prompt: "Who is Eric Ciarla? Where does he work? Where did he go to school?",
schema: {
type: "object",
properties: {
name: { type: "string" },
work: { type: "string" },
education: { type: "string" }
},
required: ["name", "work", "education"]
},
allowExternalLinks: true
})
console.log(response.body.data);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.name).toBe("Eric Ciarla");
expect(response.body.data?.work).toBeDefined();
expect(response.body.data?.education).toBeDefined();
}, 60000);
});

View File

@ -0,0 +1,117 @@
import request from "supertest";
import dotenv from "dotenv";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for Map API Routes", () => {
it.concurrent(
"(feat-search)should return links containing 'smart-crawl'",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
search: "smart-crawl",
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
},
60000
);
it.concurrent(
"(feat-subdomains) should return mapped links for firecrawl.dev with subdomains included",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
includeSubdomains: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).toContain(
"docs.firecrawl.dev"
);
},
60000
);
it.concurrent(
"(feat-sitemap-only) should return mapped links for firecrawl.dev with sitemap only",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).not.toContain(
"docs.firecrawl.dev"
);
},
60000
);
it.concurrent(
"(feat-limit) should return mapped links for firecrawl.dev with a limit",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
limit: 10,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeLessThanOrEqual(10);
},
60000
);
it.concurrent(
"(feat-sitemap-large) should return more than 1900 links for geekflare sitemap",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://geekflare.com/sitemap_index.xml",
sitemapOnly: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(1900);
},
60000
);
});

View File

@ -0,0 +1,265 @@
import { Request, Response } from "express";
import {
// Document,
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
ExtractResponse,
MapDocument,
scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;
/**
* Extracts data from the provided URLs based on the request parameters.
* Currently in beta.
* @param req - The request object containing authentication and extraction details.
* @param res - The response object to send the extraction results.
* @returns A promise that resolves when the extraction process is complete.
*/
export async function extractController(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
res: Response<ExtractResponse>
) {
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
req.body = extractRequestSchema.parse(req.body);
const id = crypto.randomUUID();
let links: string[] = [];
let docs: Document[] = [];
const earlyReturn = false;
// Process all URLs in parallel
const urlPromises = req.body.urls.map(async (url) => {
if (url.includes('/*') || req.body.allowExternalLinks) {
// Handle glob pattern URLs
const baseUrl = url.replace('/*', '');
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
const allowExternalLinks = req.body.allowExternalLinks ?? true;
let urlWithoutWww = baseUrl.replace("www.", "");
let mapUrl = req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
const mapResults = await getMapResults({
url: baseUrl,
search: req.body.prompt,
teamId: req.auth.team_id,
plan: req.auth.plan,
allowExternalLinks,
origin: req.body.origin,
limit: req.body.limit,
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted ? true : false,
includeMetadata: true,
includeSubdomains: req.body.includeSubdomains,
});
let mappedLinks = mapResults.links as MapDocument[];
// Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
// Filter by path prefix if present
// wrong
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) {
// Get similarity scores between the search query and each link's context
const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
// First try with high threshold
let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD);
// If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`);
filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD);
if (filteredLinks.length === 0) {
// If still no results, take top N results regardless of score
logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`);
filteredLinks = linksAndScores
.sort((a, b) => b.score - a.score)
.slice(0, MIN_REQUIRED_LINKS)
.map(x => mappedLinks.find(link => link.url === x.link))
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}
}
mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
}
return mappedLinks.map(x => x.url) as string[];
} else {
// Handle direct URLs without glob pattern
if (!isUrlBlocked(url)) {
return [url];
}
return [];
}
});
// Wait for all URL processing to complete and flatten results
const processedUrls = await Promise.all(urlPromises);
links.push(...processedUrls.flat());
if (links.length === 0) {
return res.status(400).json({
success: false,
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
});
}
// Scrape all links in parallel with retries
const scrapePromises = links.map(async (url) => {
const origin = req.body.origin || "api";
const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
const jobId = crypto.randomUUID();
const jobPriority = await getJobPriority({
plan: req.auth.plan as PlanType,
team_id: req.auth.team_id,
basePriority: 10,
});
await addScrapeJob(
{
url,
mode: "single_urls",
team_id: req.auth.team_id,
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
plan: req.auth.plan!,
origin,
is_scrape: true,
},
{},
jobId,
jobPriority
);
try {
const doc = await waitForJob<Document>(jobId, timeout);
await getScrapeQueue().remove(jobId);
if (earlyReturn) {
return null;
}
return doc;
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
throw {
status: 408,
error: "Request timed out"
};
} else {
throw {
status: 500,
error: `(Internal server error) - ${(e && e.message) ? e.message : e}`
};
}
}
});
try {
const results = await Promise.all(scrapePromises);
docs.push(...results.filter(doc => doc !== null).map(x => x!));
} catch (e) {
return res.status(e.status).json({
success: false,
error: e.error
});
}
const completions = await generateOpenAICompletions(
logger.child({ method: "extractController/generateOpenAICompletions" }),
{
mode: "llm",
systemPrompt: "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided.",
prompt: req.body.prompt,
schema: req.body.schema,
},
docs.map(x => buildDocument(x)).join('\n')
);
// TODO: change this later
// While on beta, we're billing 5 credits per link discovered/scraped.
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(error => {
logger.error(`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`);
});
let data = completions.extract ?? {};
let warning = completions.warning;
logJob({
job_id: id,
success: true,
message: "Extract completed",
num_docs: 1,
docs: data,
time_taken: (new Date().getTime() - Date.now()) / 1000,
team_id: req.auth.team_id,
mode: "extract",
url: req.body.urls.join(", "),
scrapeOptions: req.body,
origin: req.body.origin ?? "api",
num_tokens: completions.numTokens ?? 0
});
return res.status(200).json({
success: true,
data: data,
scrape_id: id,
warning: warning
});
}
/**
* Filters links based on their similarity score to the search query.
* @param mappedLinks - The list of mapped links to filter.
* @param linksAndScores - The list of links and their similarity scores.
* @param threshold - The score threshold to filter by.
* @returns The filtered list of links.
*/
function filterAndProcessLinks(
mappedLinks: MapDocument[],
linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[],
threshold: number
): MapDocument[] {
return linksAndScores
.filter(x => x.score > threshold)
.map(x => mappedLinks.find(link => link.url === x.link))
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}

View File

@ -1,6 +1,6 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
-import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
@ -25,37 +25,61 @@ const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;
-export async function mapController(
-req: RequestWithAuth<{}, MapResponse, MapRequest>,
-res: Response<MapResponse>
-) {
-const startTime = new Date().getTime();
-req.body = mapRequestSchema.parse(req.body);
-const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
+interface MapResult {
+success: boolean;
+links: string[] | any[];
+scrape_id?: string;
+job_id: string;
+time_taken: number;
+}
+export async function getMapResults({
+url,
+search,
+limit = MAX_MAP_LIMIT,
+ignoreSitemap = false,
+includeSubdomains = true,
+crawlerOptions = {},
+teamId,
+plan,
+origin,
+includeMetadata = false,
+allowExternalLinks
+}: {
+url: string;
+search?: string;
+limit?: number;
+ignoreSitemap?: boolean;
+includeSubdomains?: boolean;
+crawlerOptions?: any;
+teamId: string;
+plan?: string;
+origin?: string;
+includeMetadata?: boolean;
+allowExternalLinks?: boolean;
+}): Promise<MapResult> {
const id = uuidv4();
-let links: string[] = [req.body.url];
+let links: string[] = [url];
+let mapResults: MapDocument[] = [];
const sc: StoredCrawl = {
-originUrl: req.body.url,
+originUrl: url,
crawlerOptions: {
-...req.body,
-limit: req.body.sitemapOnly ? 10000000 : limit,
+...crawlerOptions,
+limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
-team_id: req.auth.team_id,
+team_id: teamId,
createdAt: Date.now(),
-plan: req.auth.plan,
+plan: plan,
};
const crawler = crawlToCrawler(id, sc);
// If sitemapOnly is true, only get links from sitemap
-if (req.body.sitemapOnly) {
+if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
@ -73,19 +97,18 @@ export async function mapController(
// links = links.slice(1, limit); // don't slice, unnecessary
}
} else {
-let urlWithoutWww = req.body.url.replace("www.", "");
-let mapUrl = req.body.search
-? `"${req.body.search}" site:${urlWithoutWww}`
-: `site:${req.body.url}`;
+let urlWithoutWww = url.replace("www.", "");
+let mapUrl = search && allowExternalLinks
+? `${search} ${urlWithoutWww}`
+: search ? `${search} site:${urlWithoutWww}`
+: `site:${url}`;
const resultsPerPage = 100;
-const maxPages = Math.ceil(
-Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
-);
+const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
const cacheKey = `fireEngineMap:${mapUrl}`;
-const cachedResult = null;
+const cachedResult = await redis.get(cacheKey);
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
@ -110,7 +133,7 @@ export async function mapController(
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
-req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
+ignoreSitemap ? null : crawler.tryGetSitemap(true),
...(cachedResult ? [] : pagePromises),
]);
@ -124,7 +147,7 @@ export async function mapController(
});
}
-let mapResults = allResults
+mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
@ -134,7 +157,7 @@ export async function mapController(
}
if (mapResults.length > 0) {
-if (req.body.search) {
+if (search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
@ -149,9 +172,8 @@ export async function mapController(
}
// Perform cosine similarity between the search query and the list of links
-if (req.body.search) {
-const searchQuery = req.body.search.toLowerCase();
+if (search) {
+const searchQuery = search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
@ -166,95 +188,75 @@ export async function mapController(
.filter((x) => x !== null) as string[];
// allows for subdomains to be included
-links = links.filter((x) => isSameDomain(x, req.body.url));
+links = links.filter((x) => isSameDomain(x, url));
// if includeSubdomains is false, filter out subdomains
-if (!req.body.includeSubdomains) {
-links = links.filter((x) => isSameSubdomain(x, req.body.url));
+if (!includeSubdomains) {
+links = links.filter((x) => isSameSubdomain(x, url));
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
-links.slice(0, limit);
}
+const linksToReturn = crawlerOptions.sitemapOnly ? links : links.slice(0, limit);
+return {
+success: true,
+links: includeMetadata ? mapResults : linksToReturn,
+scrape_id: origin?.includes("website") ? id : undefined,
+job_id: id,
+time_taken: (new Date().getTime() - Date.now()) / 1000,
+};
+}
+export async function mapController(
+req: RequestWithAuth<{}, MapResponse, MapRequest>,
+res: Response<MapResponse>
+) {
+req.body = mapRequestSchema.parse(req.body);
+const result = await getMapResults({
+url: req.body.url,
+search: req.body.search,
+limit: req.body.limit,
+ignoreSitemap: req.body.ignoreSitemap,
+includeSubdomains: req.body.includeSubdomains,
+crawlerOptions: req.body,
+origin: req.body.origin,
+teamId: req.auth.team_id,
+plan: req.auth.plan,
+});
+// Bill the team
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
);
-// Optionally, you could notify an admin or add to a retry queue here
});
-const endTime = new Date().getTime();
-const timeTakenInSeconds = (endTime - startTime) / 1000;
+// Log the job
logJob({
-job_id: id,
-success: links.length > 0,
+job_id: result.job_id,
+success: result.links.length > 0,
message: "Map completed",
-num_docs: links.length,
-docs: links,
-time_taken: timeTakenInSeconds,
+num_docs: result.links.length,
+docs: result.links,
+time_taken: result.time_taken,
team_id: req.auth.team_id,
mode: "map",
url: req.body.url,
crawlerOptions: {},
scrapeOptions: {},
-origin: req.body.origin,
+origin: req.body.origin ?? "api",
num_tokens: 0,
});
-return res.status(200).json({
-success: true,
-links: links,
-scrape_id: req.body.origin?.includes("website") ? id : undefined,
-});
+const response = {
+success: true as const,
+links: result.links,
+scrape_id: result.scrape_id
+};
+return res.status(200).json(response);
}
// Subdomain sitemap url checking
// // For each result, check for subdomains, get their sitemaps and add them to the links
// const processedUrls = new Set();
// const processedSubdomains = new Set();
// for (const result of links) {
// let url;
// let hostParts;
// try {
// url = new URL(result);
// hostParts = url.hostname.split('.');
// } catch (e) {
// continue;
// }
// console.log("hostParts", hostParts);
// // Check if it's a subdomain (more than 2 parts, and not 'www')
// if (hostParts.length > 2 && hostParts[0] !== 'www') {
// const subdomain = hostParts[0];
// console.log("subdomain", subdomain);
// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
// console.log("subdomainUrl", subdomainUrl);
// if (!processedSubdomains.has(subdomainUrl)) {
// processedSubdomains.add(subdomainUrl);
// const subdomainCrawl = crawlToCrawler(id, {
// originUrl: subdomainUrl,
// crawlerOptions: legacyCrawlerOptions(req.body),
// pageOptions: {},
// team_id: req.auth.team_id,
// createdAt: Date.now(),
// plan: req.auth.plan,
// });
// const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
// if (subdomainSitemap) {
// subdomainSitemap.forEach((x) => {
// if (!processedUrls.has(x.url)) {
// processedUrls.add(x.url);
// links.push(x.url);
// }
// });
// }
// }
// }
// }

View File

@ -151,8 +151,25 @@ export const scrapeOptions = z.object({
}).strict(strictMessage)
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
+export const extractV1Options = z.object({
+urls: url.array(),
+prompt: z.string().optional(),
+schema: z.any().optional(),
+limit: z.number().int().positive().finite().safe().optional(),
+ignoreSitemap: z.boolean().default(false),
+includeSubdomains: z.boolean().default(true),
+allowExternalLinks: z.boolean().default(false),
+origin: z.string().optional().default("api"),
+timeout: z.number().int().positive().finite().safe().default(60000)
+}).strict(strictMessage)
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
url,
origin: z.string().optional().default("api"),
@ -173,6 +190,8 @@ export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend(
return obj;
});
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@ -340,6 +359,21 @@ export interface ScrapeResponseRequestTest {
error?: string;
}
+export type ExtractResponse =
+| ErrorResponse
+| {
+success: true;
+warning?: string;
+data: z.infer<typeof extractRequestSchema>;
+scrape_id?: string;
+};
+export interface ExtractResponseRequestTest {
+statusCode: number;
+body: ExtractResponse;
+error?: string;
+}
export type CrawlResponse =
| ErrorResponse
| {
@ -496,6 +530,13 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
};
}
+export interface MapDocument {
+url: string;
+title?: string;
+description?: string;
+}
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
return {
scrapeOptions: scrapeOptions.parse({

apps/api/src/lib/cache.ts (new file, 50 lines)
View File

@ -0,0 +1,50 @@
import IORedis from "ioredis";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";
import { logger as _logger } from "./logger";
const logger = _logger.child({module: "cache"});
export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
maxRetriesPerRequest: null,
}) : null;
export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
if (!cacheRedis) return null;
// these options disqualify a cache
if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
|| (scrapeOptions.actions && scrapeOptions.actions.length > 0)
) {
return null;
}
return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor;
}
export type CacheEntry = {
url: string;
html: string;
statusCode: number;
error?: string;
};
export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return;
try {
await cacheRedis.set(key, JSON.stringify(entry));
} catch (error) {
logger.warn("Failed to save to cache", { key, error });
}
}
export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
if (!cacheRedis) return null;
try {
return JSON.parse(await cacheRedis.get(key) ?? "null");
} catch (error) {
logger.warn("Failed to get from cache", { key, error });
return null;
}
}

View File

@ -0,0 +1,15 @@
import { Document } from "../../controllers/v1/types";
export function buildDocument(document: Document): string {
const metadata = document.metadata;
const markdown = document.markdown;
// for each key in the metadata allow up to 250 characters
const metadataString = Object.entries(metadata).map(([key, value]) => {
return `${key}: ${value?.toString().slice(0, 250)}`;
}).join('\n');
const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
const documentString = `${markdown}${documentMetadataString}`;
return documentString;
}

View File

@ -0,0 +1,124 @@
// use llmExtract.ts instead
// import OpenAI from "openai";
// import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";
// const maxTokens = 32000;
// const modifier = 4;
// export class LLMRefusalError extends Error {
// constructor(refusal: string) {
// super("LLM refused to extract the website's content");
// this.name = "LLMRefusalError";
// }
// }
// interface GenerateCompletionsParams {
// systemPrompt?: string;
// prompt?: string;
// schema?: any;
// pagesContent: string;
// }
// export async function generateBasicCompletion(prompt: string) {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// const completion = await openai.chat.completions.create({
// model,
// messages: [{ role: "user", content: prompt }],
// });
// return completion.choices[0].message.content;
// }
// export async function generateFinalExtraction({
// pagesContent,
// systemPrompt,
// prompt,
// schema,
// }: GenerateCompletionsParams): Promise<{
// content: string;
// metadata: { numTokens: number; warning: string };
// }> {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// let extractionContent = pagesContent;
// let numTokens = 0;
// let warning = "";
// const encoder = encoding_for_model(model);
// try {
// const tokens = encoder.encode(extractionContent);
// numTokens = tokens.length;
// } catch (error) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
// } finally {
// encoder.free();
// }
// if (numTokens > maxTokens) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
// }
// if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
// schema = {
// type: "object",
// properties: {
// items: schema,
// },
// required: ["items"],
// additionalProperties: false,
// };
// } else if (schema) {
// schema.additionalProperties = false;
// schema.required = Object.keys(schema.properties);
// }
// const jsonCompletion = await openai.beta.chat.completions.parse({
// temperature: 0,
// model,
// messages: [
// { role: "system", content: systemPrompt ?? "" },
// { role: "user", content: [{ type: "text", text: extractionContent }] },
// {
// role: "user",
// content: prompt
// ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
// : "Transform the above content into structured JSON output.",
// },
// ],
// response_format: schema
// ? {
// type: "json_schema",
// json_schema: {
// name: "websiteContent",
// schema: schema,
// strict: true,
// },
// }
// : { type: "json_object" },
// });
// if (jsonCompletion.choices[0].message.refusal !== null) {
// throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
// }
// const extraction = jsonCompletion.choices[0].message.parsed;
// return {
// content: extraction ?? "",
// metadata: {
// numTokens,
// warning,
// },
// };
// }

View File

@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
});
export async function rerankDocuments(
documents: (string | Record<string, string>)[],
query: string,
topN = 3,
model = "rerank-english-v3.0"
) {
const rerank = await cohere.v2.rerank({
documents,
query,
topN,
model,
returnDocuments: true,
});
return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}

View File

@ -0,0 +1,68 @@
import { performRanking } from './ranker';
describe('performRanking', () => {
it('should rank links based on similarity to search query', async () => {
const linksWithContext = [
'url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds',
'url: https://example.com/cats, title: Cat care guide, description: Everything about cats',
'url: https://example.com/pets, title: General pet care, description: Care for all types of pets'
];
const links = [
'https://example.com/dogs',
'https://example.com/cats',
'https://example.com/pets'
];
const searchQuery = 'cats training';
const result = await performRanking(linksWithContext, links, searchQuery);
// Should return array of objects with link, linkWithContext, score, originalIndex
expect(result).toBeInstanceOf(Array);
expect(result.length).toBe(3);
// First result should be the cats page since the query is about cats
expect(result[0].link).toBe('https://example.com/cats');
// Each result should have required properties
result.forEach(item => {
expect(item).toHaveProperty('link');
expect(item).toHaveProperty('linkWithContext');
expect(item).toHaveProperty('score');
expect(item).toHaveProperty('originalIndex');
expect(typeof item.score).toBe('number');
expect(item.score).toBeGreaterThanOrEqual(0);
expect(item.score).toBeLessThanOrEqual(1);
});
// Scores should be in descending order
for (let i = 1; i < result.length; i++) {
expect(result[i].score).toBeLessThanOrEqual(result[i-1].score);
}
});
it('should handle empty inputs', async () => {
const result = await performRanking([], [], '');
expect(result).toEqual([]);
});
it('should maintain original order for equal scores', async () => {
const linksWithContext = [
'url: https://example.com/1, title: Similar content A, description: test',
'url: https://example.com/2, title: Similar content B, description: test'
];
const links = [
'https://example.com/1',
'https://example.com/2'
];
const searchQuery = 'test';
const result = await performRanking(linksWithContext, links, searchQuery);
// If scores are equal, original order should be maintained
expect(result[0].originalIndex).toBeLessThan(result[1].originalIndex);
});
});

View File

@ -0,0 +1,92 @@
import axios from 'axios';
import { configDotenv } from 'dotenv';
import OpenAI from "openai";
configDotenv();
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
async function getEmbedding(text: string) {
const embedding = await openai.embeddings.create({
model: "text-embedding-ada-002",
input: text,
encoding_format: "float",
});
return embedding.data[0].embedding;
}
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
const magnitude1 = Math.sqrt(
vec1.reduce((sum, val) => sum + val * val, 0)
);
const magnitude2 = Math.sqrt(
vec2.reduce((sum, val) => sum + val * val, 0)
);
if (magnitude1 === 0 || magnitude2 === 0) return 0;
return dotProduct / (magnitude1 * magnitude2);
};
// Function to convert text to vector
const textToVector = (searchQuery: string, text: string): number[] => {
const words = searchQuery.toLowerCase().split(/\W+/);
return words.map((word) => {
const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
.length;
return count / text.length;
});
};
async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
try {
// Handle invalid inputs
if (!searchQuery || !linksWithContext.length || !links.length) {
return [];
}
// Sanitize search query by removing null characters
const sanitizedQuery = searchQuery;
// Generate embeddings for the search query
const queryEmbedding = await getEmbedding(sanitizedQuery);
// Generate embeddings for each link and calculate similarity
const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
try {
const linkEmbedding = await getEmbedding(linkWithContext);
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
return {
link: links[index],
linkWithContext,
score,
originalIndex: index
};
} catch (err) {
// If embedding fails for a link, return with score 0
return {
link: links[index],
linkWithContext,
score: 0,
originalIndex: index
};
}
}));
// Sort links based on similarity scores while preserving original order for equal scores
linksAndScores.sort((a, b) => {
const scoreDiff = b.score - a.score;
return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
});
return linksAndScores;
} catch (error) {
console.error(`Error performing semantic search: ${error}`);
return [];
}
}
export { performRanking };

View File

@ -1 +1 @@
-export const axiosTimeout = 3000;
+export const axiosTimeout = 5000;

View File

@ -18,6 +18,7 @@ import { logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
+import { extractController } from "../controllers/v1/extract";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@ -98,7 +99,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
if (!res.headersSent) {
-return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+return res.status(403).json({ success: false, error: "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
}
next();
@ -178,6 +179,13 @@ v1Router.ws(
crawlStatusWSController
);
+v1Router.post(
+"/extract",
+authMiddleware(RateLimiterMode.Scrape),
+checkCreditsMiddleware(1),
+wrap(extractController)
+);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
@ -199,3 +207,4 @@ v1Router.delete(
// Health/Probe routes
// v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController);

View File

@ -0,0 +1,19 @@
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { EngineError } from "../../error";
export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key === null) throw new EngineError("Scrape not eligible for caching");
const entry = await getEntryFromCache(key);
if (entry === null) throw new EngineError("Cache missed");
return {
url: entry.url,
html: entry.html,
statusCode: entry.statusCode,
error: entry.error,
};
}

View File

@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
+import { scrapeCache } from "./cache";
-export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
+export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
+const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [
+// ...(useCache ? [ "cache" as const ] : []),
...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
...(usePlaywright ? [ "playwright" as const ] : []),
@ -74,6 +77,7 @@ export type EngineScrapeResult = {
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
} = {
+"cache": scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@ -95,6 +99,22 @@ export const engineOptions: {
quality: number,
}
} = {
+"cache": {
+features: {
+"actions": false,
+"waitFor": true,
+"screenshot": false,
+"screenshot@fullScreen": false,
+"pdf": false, // TODO: figure this out
+"docx": false, // TODO: figure this out
+"atsv": false,
+"location": false,
+"mobile": false,
+"skipTlsVerification": false,
+"useFastMode": false,
+},
+quality: 1000, // cache should always be tried first
+},
"fire-engine;chrome-cdp": {
features: {
"actions": true,

View File

@ -0,0 +1,26 @@
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
export function saveToCache(meta: Meta, document: Document): Document {
if (document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300) return document;
if (document.rawHtml === undefined) {
throw new Error("rawHtml is undefined -- this transformer is being called out of order");
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) {
const entry: CacheEntry = {
html: document.rawHtml!,
statusCode: document.metadata.statusCode!,
url: document.metadata.url ?? document.metadata.sourceURL!,
error: document.metadata.error ?? undefined,
};
saveEntryToCache(key, entry);
}
return document;
}

View File

@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
+import { saveToCache } from "./cache";
export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;
@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
// TODO: allow some of these to run in parallel
export const transformerStack: Transformer[] = [
+saveToCache,
deriveHTMLFromRawHTML,
deriveMarkdownFromHTML,
deriveLinksFromHTML,

View File

@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
} }
} }
async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> { export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
let extract: any;
let warning: string | undefined;
const openai = new OpenAI(); const openai = new OpenAI();
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini"; const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
if (document.markdown === undefined) { if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected"); throw new Error("document.markdown is undefined -- this is unexpected");
} }
let extractionContent = document.markdown;
// count number of tokens // count number of tokens
let numTokens = 0; let numTokens = 0;
const encoder = encoding_for_model(model as TiktokenModel); const encoder = encoding_for_model(model as TiktokenModel);
try { try {
// Encode the message into tokens // Encode the message into tokens
const tokens = encoder.encode(extractionContent); const tokens = encoder.encode(markdown);
// Return the number of tokens // Return the number of tokens
numTokens = tokens.length; numTokens = tokens.length;
} catch (error) { } catch (error) {
logger.warn("Calculating num tokens of string failed", { error, extractionContent }); logger.warn("Calculating num tokens of string failed", { error, markdown });
extractionContent = extractionContent.slice(0, maxTokens * modifier); markdown = markdown.slice(0, maxTokens * modifier);
const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
document.warning = document.warning === undefined ? warning : " " + warning; warning = previousWarning === undefined ? w : w + " " + previousWarning;
} finally { } finally {
// Free the encoder resources after use // Free the encoder resources after use
encoder.free(); encoder.free();
@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
if (numTokens > maxTokens) { if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters // trim the document to the maximum number of tokens, tokens != characters
extractionContent = extractionContent.slice(0, maxTokens * modifier); markdown = markdown.slice(0, maxTokens * modifier);
const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed."; const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
document.warning = document.warning === undefined ? warning : " " + warning; warning = previousWarning === undefined ? w : w + " " + previousWarning;
} }
let schema = options.schema; let schema = options.schema;
@ -107,12 +108,22 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
required: ["items"], required: ["items"],
additionalProperties: false, additionalProperties: false,
}; };
} else if (schema && typeof schema === 'object' && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => [key, { type: value }])
),
required: Object.keys(schema),
additionalProperties: false
};
} }
schema = normalizeSchema(schema); schema = normalizeSchema(schema);
const jsonCompletion = await openai.beta.chat.completions.parse({ const jsonCompletion = await openai.beta.chat.completions.parse({
model, model,
temperature: 0,
messages: [ messages: [
{ {
role: "system", role: "system",
@ -120,7 +131,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
},
{
role: "user",
-content: [{ type: "text", text: extractionContent }],
+content: [{ type: "text", text: markdown }],
},
{
role: "user",
@ -143,26 +154,35 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
}
-document.extract = jsonCompletion.choices[0].message.parsed;
+extract = jsonCompletion.choices[0].message.parsed;
-if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
+if (extract === null && jsonCompletion.choices[0].message.content !== null) {
try {
-document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
+extract = JSON.parse(jsonCompletion.choices[0].message.content);
} catch (e) {
logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
}
}
-if (options.schema && options.schema.type === "array") {
-document.extract = document.extract?.items;
+// If the users actually wants the items object, they can specify it as 'required' in the schema
+// otherwise, we just return the items array
+if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
+extract = extract?.items;
}
-return document;
+return { extract, warning, numTokens };
}
export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
if (meta.options.formats.includes("extract")) {
-document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
+const { extract, warning } = await generateOpenAICompletions(
+meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
+meta.options.extract!,
+document.markdown,
+document.warning,
+);
+document.extract = extract;
+document.warning = warning;
}
return document;
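Two behavioural notes on the refactor above: generateOpenAICompletions no longer mutates the document and instead returns { extract, warning, numTokens }, and for array-type schemas the result is unwrapped from its items wrapper unless the caller lists "items" as required. A shape-only illustration (the values are assumed; the wrapping code is only partly visible in the hunks above):

// Assumed illustration of the array-schema behaviour described by the comment in the diff.
const userSchema = { type: "array", items: { type: "string" } }; // what the caller passed
const parsed = { items: ["Author A", "Author B"] };              // model output, wrapped in an object
// Default: the items array itself is returned as the extract
const extract = parsed.items;                                    // ["Author A", "Author B"]
// If the caller marks "items" as required, the wrapped object is returned unchanged.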

View File

@ -109,6 +109,6 @@ export function waitForJob<T = unknown>(jobId: string, timeout: number): Promise
}
}
}
-}, 500);
+}, 250);
})
}

View File

@ -106,6 +106,15 @@ export interface FirecrawlCrawlStatusResponse {
error?: string;
}
export interface FirecrawlExtractResponse {
statusCode: number;
body: {
success: boolean;
data: any[];
};
error?: string;
}
export enum RateLimiterMode {
Crawl = "crawl",
CrawlStatus = "crawlStatus",

View File

@ -1,4 +1,5 @@
import FirecrawlApp from 'firecrawl';
import { z } from 'zod';
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@ -42,6 +43,18 @@ const main = async () => {
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// Extract information from a website using LLM:
const extractSchema = z.object({
title: z.string(),
description: z.string(),
links: z.array(z.string())
});
const extractResult = await app.extract(['https://firecrawl.dev'], {
prompt: "Extract the title, description, and links from the website",
schema: extractSchema
});
console.log(extractResult);
// Crawl a website with WebSockets:
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});

View File

@ -42,6 +42,19 @@ const main = async () => {
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// // Extract information from a website using LLM:
// const extractSchema = z.object({
// title: z.string(),
// description: z.string(),
// links: z.array(z.string())
// });
// const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
// prompt: "Extract the title, description, and links from the website",
// schema: extractSchema
// });
// console.log(extractResult);
// Crawl a website with WebSockets:
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
-"version": "1.8.5",
+"version": "1.9.0",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -236,6 +236,27 @@ export interface MapResponse {
error?: string;
}
/**
* Parameters for extracting information from URLs.
* Defines options for extracting information from URLs.
*/
export interface ExtractParams {
prompt: string;
schema?: zt.ZodSchema;
systemPrompt?: string;
allowExternalLinks?: boolean;
}
/**
* Response interface for extracting information from URLs.
* Defines the structure of the response received after extracting information from URLs.
*/
export interface ExtractResponse {
success: true;
data: zt.infer<zt.ZodSchema>;
error?: string;
}
/**
* Error response interface.
* Defines the structure of the response received when an error occurs.
@ -245,7 +266,6 @@ export interface ErrorResponse {
error: string;
}
/**
* Custom error class for Firecrawl.
* Extends the built-in Error class to include a status code.
@ -679,6 +699,44 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Extracts information from URLs using the Firecrawl API.
* @param url - The URL to extract information from.
* @param params - Additional parameters for the extract request.
* @returns The response from the extract operation.
*/
async extract(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
const headers = this.prepareHeaders();
if (!params?.prompt) {
throw new FirecrawlError("Prompt is required", 400);
}
let jsonData: { urls: string[] } & ExtractParams= { urls, ...params };
let jsonSchema: any;
try {
jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
} catch (error: any) {
throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
}
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/extract`,
{ ...jsonData, schema: jsonSchema },
headers
);
if (response.status === 200) {
return response.data as ExtractResponse;
} else {
this.handleError(response, "extract");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.
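Beyond the happy-path call shown in example.js earlier in this diff, the new extract method throws FirecrawlError for a missing prompt or an invalid Zod schema and returns an ErrorResponse on non-200 statuses. A hypothetical caller-side sketch (the guard style and variable names are assumptions):

import FirecrawlApp from 'firecrawl';
import { z } from 'zod';

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

const main = async () => {
  try {
    const result = await app.extract(['https://firecrawl.dev'], {
      prompt: "Extract the page title",
      schema: z.object({ title: z.string() }), // converted to JSON Schema via zodToJsonSchema before the POST
    });
    if ('data' in result) {
      console.log(result.data);    // ExtractResponse branch
    } else {
      console.error(result.error); // ErrorResponse branch
    }
  } catch (err) {
    // Thrown for a missing prompt, an invalid schema, or a transport failure (per the method above).
    console.error(err);
  }
};

main();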

View File

@ -2,6 +2,8 @@ import time
import nest_asyncio
import uuid
from firecrawl.firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List
app = FirecrawlApp(api_key="fc-")
@ -50,9 +52,6 @@ print(crawl_status)
# LLM Extraction:
# Define schema to extract contents into using pydantic
-from pydantic import BaseModel, Field
-from typing import List
class ArticleSchema(BaseModel):
title: str
points: int
@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
print(map_result)
# Extract URLs:
class ExtractSchema(BaseModel):
title: str
description: str
links: List[str]
# Define the schema using Pydantic
extract_schema = ExtractSchema.schema()
# Perform the extraction
extract_result = app.extract(['https://firecrawl.dev'], {
'prompt': "Extract the title, description, and links from the website",
'schema': extract_schema
})
print(extract_result)
# Crawl a website with WebSockets:
# inside an async function...
nest_asyncio.apply()

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa
-__version__ = "1.5.0"
+__version__ = "1.6.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -12,15 +12,40 @@ Classes:
import logging
import os
import time
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, Optional, List, Union
import json
import requests
import pydantic
import websockets
logger : logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp:
class ExtractParams(pydantic.BaseModel):
"""
Parameters for the extract operation.
"""
prompt: str
schema: Optional[Any] = None
system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False
class ExtractResponse(pydantic.BaseModel):
"""
Response from the extract operation.
"""
success: bool
data: Optional[Any] = None
error: Optional[str] = None
class ErrorResponse(pydantic.BaseModel):
"""
Error response.
"""
success: bool
error: str
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
"""
Initialize the FirecrawlApp instance with API key, API URL.
@ -434,6 +459,48 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check batch scrape status')
def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
"""
Extracts information from a URL using the Firecrawl API.
Args:
urls (List[str]): The URLs to extract information from.
params (Optional[ExtractParams]): Additional parameters for the extract request.
Returns:
Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
"""
headers = self._prepare_headers()
if not params or not params.get('prompt'):
raise ValueError("Prompt is required")
if not params.get('schema'):
raise ValueError("Schema is required for extraction")
jsonData = {'urls': urls, **params}
jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None
try:
response = self._post_request(
f'{self.api_url}/v1/extract',
{
**jsonData,
'allowExternalLinks': params.get('allow_external_links', False),
'schema': jsonSchema
},
headers
)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, "extract")
except Exception as e:
raise ValueError(str(e), 500)
return {'success': False, 'error': "Internal server error."}
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.
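Both SDK helpers above wrap the same REST call to /v1/extract; a minimal sketch of the equivalent raw request (the base URL, key placeholder, and field values are assumptions):

// Hypothetical raw call mirroring what the JS and Python extract() helpers POST.
const res = await fetch("https://api.firecrawl.dev/v1/extract", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR_API_KEY",
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev"],
    prompt: "Extract the title, description, and links from the website",
    allowExternalLinks: false, // optional; mirrors allow_external_links in the Python params
    schema: {
      type: "object",
      properties: {
        title: { type: "string" },
        description: { type: "string" },
        links: { type: "array", items: { type: "string" } },
      },
      required: ["title", "description", "links"],
    },
  }),
});
const { success, data } = await res.json();
console.log(success, data);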