Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
Synced 2025-08-05 15:40:37 +08:00
Commit 6c33b978f3
@@ -55,7 +55,7 @@
     "@bull-board/api": "^5.20.5",
     "@bull-board/express": "^5.20.5",
     "@devil7softwares/pos": "^1.0.2",
-    "@dqbd/tiktoken": "^1.0.16",
+    "@dqbd/tiktoken": "^1.0.17",
     "@nangohq/node": "^0.40.8",
     "@sentry/cli": "^2.33.1",
     "@sentry/node": "^8.26.0",
@@ -73,6 +73,7 @@
     "cacheable-lookup": "^6.1.0",
     "cheerio": "^1.0.0-rc.12",
     "cohere": "^1.1.1",
+    "cohere-ai": "^7.14.0",
     "cors": "^2.8.5",
     "cron-parser": "^4.9.0",
     "date-fns": "^3.6.0",
apps/api/pnpm-lock.yaml (generated, 1316 changed lines): file diff suppressed because it is too large.
@@ -46,4 +46,32 @@ content-type: application/json

@batchScrapeId = {{batchScrape.response.body.$.id}}

# @name batchScrapeStatus
GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}


### Map Website
# @name map
POST {{baseUrl}}/v1/map HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "url": "firecrawl.dev",
  "sitemapOnly": true
}

### Extract
# @name extract
POST {{baseUrl}}/v1/extract HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json

{
  "urls": ["firecrawl.dev"],
  "prompt": "What is the title, description and main product of the page?",
  "schema": {
    "title": "string",
    "description": "string",
    "mainProduct": "string"
  }
}
apps/api/src/__tests__/e2e_extract/index.test.ts (new file, 249 lines)
@@ -0,0 +1,249 @@
import request from "supertest";
import dotenv from "dotenv";
import {
  FirecrawlCrawlResponse,
  FirecrawlCrawlStatusResponse,
  FirecrawlScrapeResponse,
} from "../../types";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Extract API Routes", () => {
  it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://firecrawl.dev/*"],
        prompt: "Who are the authors of the blog posts?",
        schema: {
          type: "object",
          properties: { authors: { type: "array", items: { type: "string" } } },
        },
      });

    console.log(response.body);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data).toHaveProperty("authors");

    let gotItRight = 0;
    for (const author of response.body.data?.authors) {
      if (author.includes("Caleb Peffer")) gotItRight++;
      if (author.includes("Gergő Móricz")) gotItRight++;
      if (author.includes("Eric Ciarla")) gotItRight++;
      if (author.includes("Nicolas Camara")) gotItRight++;
      if (author.includes("Jon")) gotItRight++;
      if (author.includes("Wendong")) gotItRight++;
    }

    expect(gotItRight).toBeGreaterThan(1);
  }, 60000);

  it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["firecrawl.dev/*"],
        prompt: "Who are the founders of the company?",
        allowExternalLinks: true,
        schema: {
          type: "object",
          properties: { founders: { type: "array", items: { type: "string" } } },
        },
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data).toHaveProperty("founders");

    console.log(response.body.data?.founders);
    let gotItRight = 0;
    for (const founder of response.body.data?.founders) {
      if (founder.includes("Caleb")) gotItRight++;
      if (founder.includes("Eric")) gotItRight++;
      if (founder.includes("Nicolas")) gotItRight++;
      if (founder.includes("nick")) gotItRight++;
      if (founder.includes("eric")) gotItRight++;
      if (founder.includes("jon-noronha")) gotItRight++;
    }

    expect(gotItRight).toBeGreaterThanOrEqual(2);
  }, 60000);

  it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://firecrawl.dev/*"],
        prompt: "What are they hiring for?",
        allowExternalLinks: true,
        schema: {
          type: "array",
          items: {
            type: "string"
          },
          required: ["items"]
        },
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    console.log(response.body.data);

    let gotItRight = 0;
    for (const hiring of response.body.data?.items) {
      if (hiring.includes("Developer Support Engineer")) gotItRight++;
      if (hiring.includes("Dev Ops Engineer")) gotItRight++;
      if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
    }

    expect(gotItRight).toBeGreaterThan(2);
  }, 60000);

  it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["fivetran.com/*"],
        prompt: "Does Fivetran have PCI DSS compliance?",
        allowExternalLinks: true,
        schema: {
          type: "object",
          properties: {
            pciDssCompliance: { type: "boolean" }
          }
        },
      });

    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.pciDssCompliance).toBe(true);
  }, 60000);

  it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["fivetran.com/*"],
        prompt: "What are the Azure Data Connectors they offer?",
        schema: {
          type: "array",
          items: {
            type: "object",
            properties: {
              connector: { type: "string" },
              description: { type: "string" },
              supportsCaptureDelete: { type: "boolean" }
            }
          }
        }
      })

    console.log(response.body);
    // expect(response.statusCode).toBe(200);
    // expect(response.body).toHaveProperty("data");
    // expect(response.body.data?.pciDssCompliance).toBe(true);
  }, 60000);

  it.concurrent("should return Greenhouse Applicant Tracking System for Abnormal Security", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"],
        prompt: "what applicant tracking system is this company using?",
        schema: {
          type: "object",
          properties: {
            isGreenhouseATS: { type: "boolean" },
            answer: { type: "string" }
          }
        },
        allowExternalLinks: true
      })

    console.log(response.body);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.isGreenhouseATS).toBe(true);
  }, 60000);

  it.concurrent("should return mintlify api components", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://mintlify.com/docs/*"],
        prompt: "what are the 4 API components?",
        schema: {
          type: "array",
          items: {
            type: "object",
            properties: {
              component: { type: "string" }
            }
          },
          required: ["items"]
        },
        allowExternalLinks: true
      })

    console.log(response.body.data?.items);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.items.length).toBe(4);
    let gotItRight = 0;
    for (const component of response.body.data?.items) {
      if (component.component.toLowerCase().includes("parameter")) gotItRight++;
      if (component.component.toLowerCase().includes("response")) gotItRight++;
      if (component.component.toLowerCase().includes("expandable")) gotItRight++;
      if (component.component.toLowerCase().includes("sticky")) gotItRight++;
      if (component.component.toLowerCase().includes("examples")) gotItRight++;
    }
    expect(gotItRight).toBeGreaterThan(2);
  }, 60000);

  it.concurrent("should return information about Eric Ciarla", async () => {
    const response = await request(TEST_URL)
      .post("/v1/extract")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send({
        urls: ["https://ericciarla.com/"],
        prompt: "Who is Eric Ciarla? Where does he work? Where did he go to school?",
        schema: {
          type: "object",
          properties: {
            name: { type: "string" },
            work: { type: "string" },
            education: { type: "string" }
          },
          required: ["name", "work", "education"]
        },
        allowExternalLinks: true
      })

    console.log(response.body.data);
    expect(response.statusCode).toBe(200);
    expect(response.body).toHaveProperty("data");
    expect(response.body.data?.name).toBe("Eric Ciarla");
    expect(response.body.data?.work).toBeDefined();
    expect(response.body.data?.education).toBeDefined();
  }, 60000);
});
apps/api/src/__tests__/e2e_map/index.test.ts (new file, 117 lines)
@@ -0,0 +1,117 @@
import request from "supertest";
import dotenv from "dotenv";

dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";

describe("E2E Tests for Map API Routes", () => {
  it.concurrent(
    "(feat-search)should return links containing 'smart-crawl'",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: false,
          search: "smart-crawl",
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(0);
      expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
    },
    60000
  );

  it.concurrent(
    "(feat-subdomains) should return mapped links for firecrawl.dev with subdomains included",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: false,
          includeSubdomains: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(0);
      expect(response.body.links[response.body.links.length - 1]).toContain(
        "docs.firecrawl.dev"
      );
    },
    60000
  );

  it.concurrent(
    "(feat-sitemap-only) should return mapped links for firecrawl.dev with sitemap only",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(0);
      expect(response.body.links[response.body.links.length - 1]).not.toContain(
        "docs.firecrawl.dev"
      );
    },
    60000
  );

  it.concurrent(
    "(feat-limit) should return mapped links for firecrawl.dev with a limit",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://firecrawl.dev",
          sitemapOnly: false,
          limit: 10,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeLessThanOrEqual(10);
    },
    60000
  );

  it.concurrent(
    "(feat-sitemap-large) should return more than 1900 links for geekflare sitemap",
    async () => {
      const response = await request(TEST_URL)
        .post("/v1/map")
        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
        .set("Content-Type", "application/json")
        .send({
          url: "https://geekflare.com/sitemap_index.xml",
          sitemapOnly: true,
        });

      console.log(response.body);
      expect(response.statusCode).toBe(200);
      expect(response.body).toHaveProperty("links");
      expect(response.body.links.length).toBeGreaterThan(1900);
    },
    60000
  );
});
apps/api/src/controllers/v1/extract.ts (new file, 265 lines)
@@ -0,0 +1,265 @@
import { Request, Response } from "express";
import {
  // Document,
  RequestWithAuth,
  ExtractRequest,
  extractRequestSchema,
  ExtractResponse,
  MapDocument,
  scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";

configDotenv();
const redis = new Redis(process.env.REDIS_URL!);

const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;

/**
 * Extracts data from the provided URLs based on the request parameters.
 * Currently in beta.
 * @param req - The request object containing authentication and extraction details.
 * @param res - The response object to send the extraction results.
 * @returns A promise that resolves when the extraction process is complete.
 */
export async function extractController(
  req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
  res: Response<ExtractResponse>
) {
  const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";

  req.body = extractRequestSchema.parse(req.body);

  const id = crypto.randomUUID();
  let links: string[] = [];
  let docs: Document[] = [];
  const earlyReturn = false;

  // Process all URLs in parallel
  const urlPromises = req.body.urls.map(async (url) => {
    if (url.includes('/*') || req.body.allowExternalLinks) {
      // Handle glob pattern URLs
      const baseUrl = url.replace('/*', '');
      // const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any

      const allowExternalLinks = req.body.allowExternalLinks ?? true;
      let urlWithoutWww = baseUrl.replace("www.", "");
      let mapUrl = req.body.prompt && allowExternalLinks
        ? `${req.body.prompt} ${urlWithoutWww}`
        : req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
        : `site:${urlWithoutWww}`;

      const mapResults = await getMapResults({
        url: baseUrl,
        search: req.body.prompt,
        teamId: req.auth.team_id,
        plan: req.auth.plan,
        allowExternalLinks,
        origin: req.body.origin,
        limit: req.body.limit,
        // If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
        ignoreSitemap: !selfHosted ? true : false,
        includeMetadata: true,
        includeSubdomains: req.body.includeSubdomains,
      });

      let mappedLinks = mapResults.links as MapDocument[];
      // Limit number of links to MAX_EXTRACT_LIMIT
      mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);

      let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);

      // Filter by path prefix if present
      // wrong
      // if (pathPrefix) {
      //   mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
      // }

      if (req.body.prompt) {
        // Get similarity scores between the search query and each link's context
        const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);

        // First try with high threshold
        let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD);

        // If we don't have enough high-quality links, try with lower threshold
        if (filteredLinks.length < MIN_REQUIRED_LINKS) {
          logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`);
          filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD);

          if (filteredLinks.length === 0) {
            // If still no results, take top N results regardless of score
            logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`);
            filteredLinks = linksAndScores
              .sort((a, b) => b.score - a.score)
              .slice(0, MIN_REQUIRED_LINKS)
              .map(x => mappedLinks.find(link => link.url === x.link))
              .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
          }
        }

        mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
      }

      return mappedLinks.map(x => x.url) as string[];

    } else {
      // Handle direct URLs without glob pattern
      if (!isUrlBlocked(url)) {
        return [url];
      }
      return [];
    }
  });

  // Wait for all URL processing to complete and flatten results
  const processedUrls = await Promise.all(urlPromises);
  links.push(...processedUrls.flat());

  if (links.length === 0) {
    return res.status(400).json({
      success: false,
      error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
    });
  }

  // Scrape all links in parallel with retries
  const scrapePromises = links.map(async (url) => {
    const origin = req.body.origin || "api";
    const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
    const jobId = crypto.randomUUID();

    const jobPriority = await getJobPriority({
      plan: req.auth.plan as PlanType,
      team_id: req.auth.team_id,
      basePriority: 10,
    });

    await addScrapeJob(
      {
        url,
        mode: "single_urls",
        team_id: req.auth.team_id,
        scrapeOptions: scrapeOptions.parse({}),
        internalOptions: {},
        plan: req.auth.plan!,
        origin,
        is_scrape: true,
      },
      {},
      jobId,
      jobPriority
    );

    try {
      const doc = await waitForJob<Document>(jobId, timeout);
      await getScrapeQueue().remove(jobId);
      if (earlyReturn) {
        return null;
      }
      return doc;
    } catch (e) {
      logger.error(`Error in scrapeController: ${e}`);
      if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
        throw {
          status: 408,
          error: "Request timed out"
        };
      } else {
        throw {
          status: 500,
          error: `(Internal server error) - ${(e && e.message) ? e.message : e}`
        };
      }
    }
  });

  try {
    const results = await Promise.all(scrapePromises);
    docs.push(...results.filter(doc => doc !== null).map(x => x!));
  } catch (e) {
    return res.status(e.status).json({
      success: false,
      error: e.error
    });
  }

  const completions = await generateOpenAICompletions(
    logger.child({ method: "extractController/generateOpenAICompletions" }),
    {
      mode: "llm",
      systemPrompt: "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided.",
      prompt: req.body.prompt,
      schema: req.body.schema,
    },
    docs.map(x => buildDocument(x)).join('\n')
  );

  // TODO: change this later
  // While on beta, we're billing 5 credits per link discovered/scraped.
  billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(error => {
    logger.error(`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`);
  });

  let data = completions.extract ?? {};
  let warning = completions.warning;

  logJob({
    job_id: id,
    success: true,
    message: "Extract completed",
    num_docs: 1,
    docs: data,
    time_taken: (new Date().getTime() - Date.now()) / 1000,
    team_id: req.auth.team_id,
    mode: "extract",
    url: req.body.urls.join(", "),
    scrapeOptions: req.body,
    origin: req.body.origin ?? "api",
    num_tokens: completions.numTokens ?? 0
  });

  return res.status(200).json({
    success: true,
    data: data,
    scrape_id: id,
    warning: warning
  });
}

/**
 * Filters links based on their similarity score to the search query.
 * @param mappedLinks - The list of mapped links to filter.
 * @param linksAndScores - The list of links and their similarity scores.
 * @param threshold - The score threshold to filter by.
 * @returns The filtered list of links.
 */
function filterAndProcessLinks(
  mappedLinks: MapDocument[],
  linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[],
  threshold: number
): MapDocument[] {
  return linksAndScores
    .filter(x => x.score > threshold)
    .map(x => mappedLinks.find(link => link.url === x.link))
    .filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}
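For reference, a minimal sketch of exercising the new beta endpoint from a standalone TypeScript script. It assumes the API is running locally on port 3002 with TEST_API_KEY set in the environment (the same assumptions the e2e tests above make) and Node 18+ for the global fetch; the prompt and schema values are illustrative only, not part of this commit.

// Sketch: call the beta /v1/extract endpoint directly (assumes local API + TEST_API_KEY).
async function runExtractExample() {
  const response = await fetch("http://127.0.0.1:3002/v1/extract", {
    method: "POST",
    headers: {
      "Authorization": `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      urls: ["https://firecrawl.dev/*"],
      prompt: "Who are the founders of the company?",
      allowExternalLinks: true,
      schema: {
        type: "object",
        properties: { founders: { type: "array", items: { type: "string" } } },
      },
    }),
  });

  // On success the controller above responds with { success: true, data, scrape_id, warning }.
  const body = await response.json();
  console.log(body.data);
}

runExtractExample().catch(console.error);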
@@ -1,6 +1,6 @@
 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
 import { MapResponse, MapRequest } from "./types";
 import { configDotenv } from "dotenv";
@@ -25,37 +25,61 @@ const MAX_MAP_LIMIT = 5000;
 // Max Links that "Smart /map" can return
 const MAX_FIRE_ENGINE_RESULTS = 1000;

-export async function mapController(
-  req: RequestWithAuth<{}, MapResponse, MapRequest>,
-  res: Response<MapResponse>
-) {
-  const startTime = new Date().getTime();
-
-  req.body = mapRequestSchema.parse(req.body);
-
-  const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
+interface MapResult {
+  success: boolean;
+  links: string[] | any[];
+  scrape_id?: string;
+  job_id: string;
+  time_taken: number;
+}
+
+export async function getMapResults({
+  url,
+  search,
+  limit = MAX_MAP_LIMIT,
+  ignoreSitemap = false,
+  includeSubdomains = true,
+  crawlerOptions = {},
+  teamId,
+  plan,
+  origin,
+  includeMetadata = false,
+  allowExternalLinks
+}: {
+  url: string;
+  search?: string;
+  limit?: number;
+  ignoreSitemap?: boolean;
+  includeSubdomains?: boolean;
+  crawlerOptions?: any;
+  teamId: string;
+  plan?: string;
+  origin?: string;
+  includeMetadata?: boolean;
+  allowExternalLinks?: boolean;
+}): Promise<MapResult> {
   const id = uuidv4();
-  let links: string[] = [req.body.url];
+  let links: string[] = [url];
+  let mapResults: MapDocument[] = [];

   const sc: StoredCrawl = {
-    originUrl: req.body.url,
+    originUrl: url,
     crawlerOptions: {
-      ...req.body,
-      limit: req.body.sitemapOnly ? 10000000 : limit,
+      ...crawlerOptions,
+      limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
       scrapeOptions: undefined,
     },
     scrapeOptions: scrapeOptions.parse({}),
     internalOptions: {},
-    team_id: req.auth.team_id,
+    team_id: teamId,
     createdAt: Date.now(),
-    plan: req.auth.plan,
+    plan: plan,
   };

   const crawler = crawlToCrawler(id, sc);

   // If sitemapOnly is true, only get links from sitemap
-  if (req.body.sitemapOnly) {
+  if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(true, true);
     if (sitemap !== null) {
       sitemap.forEach((x) => {
@@ -73,19 +97,18 @@ export async function mapController(
     // links = links.slice(1, limit); // don't slice, unnecessary
     }
   } else {
-    let urlWithoutWww = req.body.url.replace("www.", "");
+    let urlWithoutWww = url.replace("www.", "");

-    let mapUrl = req.body.search
-      ? `"${req.body.search}" site:${urlWithoutWww}`
-      : `site:${req.body.url}`;
+    let mapUrl = search && allowExternalLinks
+      ? `${search} ${urlWithoutWww}`
+      : search ? `${search} site:${urlWithoutWww}`
+      : `site:${url}`;

     const resultsPerPage = 100;
-    const maxPages = Math.ceil(
-      Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
-    );
+    const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);

     const cacheKey = `fireEngineMap:${mapUrl}`;
-    const cachedResult = null;
+    const cachedResult = await redis.get(cacheKey);

     let allResults: any[] = [];
     let pagePromises: Promise<any>[] = [];
@@ -110,7 +133,7 @@

     // Parallelize sitemap fetch with serper search
     const [sitemap, ...searchResults] = await Promise.all([
-      req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
+      ignoreSitemap ? null : crawler.tryGetSitemap(true),
       ...(cachedResult ? [] : pagePromises),
     ]);

@@ -124,7 +147,7 @@
       });
     }

-    let mapResults = allResults
+    mapResults = allResults
       .flat()
       .filter((result) => result !== null && result !== undefined);

@@ -134,7 +157,7 @@
     }

     if (mapResults.length > 0) {
-      if (req.body.search) {
+      if (search) {
         // Ensure all map results are first, maintaining their order
         links = [
           mapResults[0].url,
@@ -149,9 +172,8 @@
     }

     // Perform cosine similarity between the search query and the list of links
-    if (req.body.search) {
-      const searchQuery = req.body.search.toLowerCase();
-
+    if (search) {
+      const searchQuery = search.toLowerCase();
       links = performCosineSimilarity(links, searchQuery);
     }

@@ -166,95 +188,75 @@
       .filter((x) => x !== null) as string[];

     // allows for subdomains to be included
-    links = links.filter((x) => isSameDomain(x, req.body.url));
+    links = links.filter((x) => isSameDomain(x, url));

     // if includeSubdomains is false, filter out subdomains
-    if (!req.body.includeSubdomains) {
-      links = links.filter((x) => isSameSubdomain(x, req.body.url));
+    if (!includeSubdomains) {
+      links = links.filter((x) => isSameSubdomain(x, url));
     }

     // remove duplicates that could be due to http/https or www
     links = removeDuplicateUrls(links);
-    links.slice(0, limit);
   }

+  const linksToReturn = crawlerOptions.sitemapOnly ? links : links.slice(0, limit);
+
+  return {
+    success: true,
+    links: includeMetadata ? mapResults : linksToReturn,
+    scrape_id: origin?.includes("website") ? id : undefined,
+    job_id: id,
+    time_taken: (new Date().getTime() - Date.now()) / 1000,
+  };
+}
+
+export async function mapController(
+  req: RequestWithAuth<{}, MapResponse, MapRequest>,
+  res: Response<MapResponse>
+) {
+  req.body = mapRequestSchema.parse(req.body);
+
+  const result = await getMapResults({
+    url: req.body.url,
+    search: req.body.search,
+    limit: req.body.limit,
+    ignoreSitemap: req.body.ignoreSitemap,
+    includeSubdomains: req.body.includeSubdomains,
+    crawlerOptions: req.body,
+    origin: req.body.origin,
+    teamId: req.auth.team_id,
+    plan: req.auth.plan,
+  });
+
   // Bill the team
   billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
     logger.error(
       `Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
     );
     // Optionally, you could notify an admin or add to a retry queue here
   });

-  const endTime = new Date().getTime();
-  const timeTakenInSeconds = (endTime - startTime) / 1000;
-
   // Log the job
   logJob({
-    job_id: id,
-    success: links.length > 0,
+    job_id: result.job_id,
+    success: result.links.length > 0,
     message: "Map completed",
-    num_docs: links.length,
-    docs: links,
-    time_taken: timeTakenInSeconds,
+    num_docs: result.links.length,
+    docs: result.links,
+    time_taken: result.time_taken,
     team_id: req.auth.team_id,
     mode: "map",
     url: req.body.url,
     crawlerOptions: {},
     scrapeOptions: {},
-    origin: req.body.origin,
+    origin: req.body.origin ?? "api",
     num_tokens: 0,
   });

-  return res.status(200).json({
-    success: true,
-    links: links,
-    scrape_id: req.body.origin?.includes("website") ? id : undefined,
-  });
-}
+  const response = {
+    success: true as const,
+    links: result.links,
+    scrape_id: result.scrape_id
+  };
+
-  // Subdomain sitemap url checking
-
-  // // For each result, check for subdomains, get their sitemaps and add them to the links
-  // const processedUrls = new Set();
-  // const processedSubdomains = new Set();
-
-  // for (const result of links) {
-  //   let url;
-  //   let hostParts;
-  //   try {
-  //     url = new URL(result);
-  //     hostParts = url.hostname.split('.');
-  //   } catch (e) {
-  //     continue;
-  //   }
-
-  //   console.log("hostParts", hostParts);
-  //   // Check if it's a subdomain (more than 2 parts, and not 'www')
-  //   if (hostParts.length > 2 && hostParts[0] !== 'www') {
-  //     const subdomain = hostParts[0];
-  //     console.log("subdomain", subdomain);
-  //     const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
-  //     console.log("subdomainUrl", subdomainUrl);
-
-  //     if (!processedSubdomains.has(subdomainUrl)) {
-  //       processedSubdomains.add(subdomainUrl);
-
-  //       const subdomainCrawl = crawlToCrawler(id, {
-  //         originUrl: subdomainUrl,
-  //         crawlerOptions: legacyCrawlerOptions(req.body),
-  //         pageOptions: {},
-  //         team_id: req.auth.team_id,
-  //         createdAt: Date.now(),
-  //         plan: req.auth.plan,
-  //       });
-  //       const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
-  //       if (subdomainSitemap) {
-  //         subdomainSitemap.forEach((x) => {
-  //           if (!processedUrls.has(x.url)) {
-  //             processedUrls.add(x.url);
-  //             links.push(x.url);
-  //           }
-  //         });
-  //       }
-  //     }
-  //   }
-  // }
+  return res.status(200).json(response);
+}
@@ -151,8 +151,25 @@ export const scrapeOptions = z.object({
 }).strict(strictMessage)


 export type ScrapeOptions = z.infer<typeof scrapeOptions>;

+export const extractV1Options = z.object({
+  urls: url.array(),
+  prompt: z.string().optional(),
+  schema: z.any().optional(),
+  limit: z.number().int().positive().finite().safe().optional(),
+  ignoreSitemap: z.boolean().default(false),
+  includeSubdomains: z.boolean().default(true),
+  allowExternalLinks: z.boolean().default(false),
+  origin: z.string().optional().default("api"),
+  timeout: z.number().int().positive().finite().safe().default(60000)
+}).strict(strictMessage)
+
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
+
 export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
   url,
   origin: z.string().optional().default("api"),
@@ -173,6 +190,8 @@ export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend(
   return obj;
 });
+
+

 export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
 export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;

@@ -340,6 +359,21 @@ export interface ScrapeResponseRequestTest {
   error?: string;
 }

+export type ExtractResponse =
+  | ErrorResponse
+  | {
+      success: true;
+      warning?: string;
+      data: z.infer<typeof extractRequestSchema>;
+      scrape_id?: string;
+    };
+
+export interface ExtractResponseRequestTest {
+  statusCode: number;
+  body: ExtractResponse;
+  error?: string;
+}
+
 export type CrawlResponse =
   | ErrorResponse
   | {
@@ -496,6 +530,13 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
   };
 }

+
+export interface MapDocument {
+  url: string;
+  title?: string;
+  description?: string;
+}
 export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
   return {
     scrapeOptions: scrapeOptions.parse({
apps/api/src/lib/cache.ts (new file, 50 lines)
@@ -0,0 +1,50 @@
import IORedis from "ioredis";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";
import { logger as _logger } from "./logger";
const logger = _logger.child({module: "cache"});

export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
  maxRetriesPerRequest: null,
}) : null;

export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
  if (!cacheRedis) return null;

  // these options disqualify a cache
  if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
    || (scrapeOptions.actions && scrapeOptions.actions.length > 0)
  ) {
    return null;
  }

  return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor;
}

export type CacheEntry = {
  url: string;
  html: string;
  statusCode: number;
  error?: string;
};

export async function saveEntryToCache(key: string, entry: CacheEntry) {
  if (!cacheRedis) return;

  try {
    await cacheRedis.set(key, JSON.stringify(entry));
  } catch (error) {
    logger.warn("Failed to save to cache", { key, error });
  }
}

export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
  if (!cacheRedis) return null;

  try {
    return JSON.parse(await cacheRedis.get(key) ?? "null");
  } catch (error) {
    logger.warn("Failed to get from cache", { key, error });
    return null;
  }
}
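A small usage sketch of these helpers, for illustration only: it assumes CACHE_REDIS_URL is configured so cacheRedis is non-null, that default scrape options (scrapeOptions.parse({})) are acceptable, and that an empty object is a valid InternalOptions value; the example URL and HTML are made up.

// Sketch: store and re-read a cache entry for a URL (assumes CACHE_REDIS_URL is set).
import { cacheKey, saveEntryToCache, getEntryFromCache, CacheEntry } from "./cache";
import { scrapeOptions } from "../controllers/v1/types";

async function cacheRoundTripExample() {
  // Assumption: {} is an acceptable InternalOptions value (no disqualifying flags set).
  const key = cacheKey("https://example.com", scrapeOptions.parse({}), {});
  if (key === null) return; // caching disabled or the options disqualify it

  const entry: CacheEntry = {
    url: "https://example.com",
    html: "<html><body>Hello</body></html>",
    statusCode: 200,
  };

  await saveEntryToCache(key, entry);
  const cached = await getEntryFromCache(key); // the entry above, or null on a miss
  console.log(cached?.statusCode);
}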
apps/api/src/lib/extract/build-document.ts (new file, 15 lines)
@@ -0,0 +1,15 @@
import { Document } from "../../controllers/v1/types";

export function buildDocument(document: Document): string {
  const metadata = document.metadata;
  const markdown = document.markdown;

  // for each key in the metadata allow up to 250 characters
  const metadataString = Object.entries(metadata).map(([key, value]) => {
    return `${key}: ${value?.toString().slice(0, 250)}`;
  }).join('\n');

  const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
  const documentString = `${markdown}${documentMetadataString}`;
  return documentString;
}
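For illustration, a worked example of what buildDocument produces for a minimal document; the markdown and metadata values here are invented, and the cast sidesteps the other optional Document fields.

// Sketch: output of buildDocument for a minimal document (illustrative values).
const example = buildDocument({
  markdown: "# Hello",
  metadata: { title: "Example", sourceURL: "https://example.com" },
} as Document);
// example ===
// "# Hello\n- - - - - Page metadata - - - - -\ntitle: Example\nsourceURL: https://example.com"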
apps/api/src/lib/extract/completions.ts (new file, 124 lines)
@@ -0,0 +1,124 @@
// use llmExtract.ts instead

// import OpenAI from "openai";
// import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";

// const maxTokens = 32000;
// const modifier = 4;

// export class LLMRefusalError extends Error {
//   constructor(refusal: string) {
//     super("LLM refused to extract the website's content");
//     this.name = "LLMRefusalError";
//   }
// }

// interface GenerateCompletionsParams {
//   systemPrompt?: string;
//   prompt?: string;
//   schema?: any;
//   pagesContent: string;
// }

// export async function generateBasicCompletion(prompt: string) {
//   const openai = new OpenAI();
//   const model: TiktokenModel =
//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

//   const completion = await openai.chat.completions.create({
//     model,
//     messages: [{ role: "user", content: prompt }],
//   });

//   return completion.choices[0].message.content;
// }

// export async function generateFinalExtraction({
//   pagesContent,
//   systemPrompt,
//   prompt,
//   schema,
// }: GenerateCompletionsParams): Promise<{
//   content: string;
//   metadata: { numTokens: number; warning: string };
// }> {
//   const openai = new OpenAI();
//   const model: TiktokenModel =
//     (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

//   let extractionContent = pagesContent;
//   let numTokens = 0;
//   let warning = "";

//   const encoder = encoding_for_model(model);
//   try {
//     const tokens = encoder.encode(extractionContent);
//     numTokens = tokens.length;
//   } catch (error) {
//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
//     warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
//   } finally {
//     encoder.free();
//   }

//   if (numTokens > maxTokens) {
//     extractionContent = extractionContent.slice(0, maxTokens * modifier);
//     warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
//   }

//   if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
//     schema = {
//       type: "object",
//       properties: {
//         items: schema,
//       },
//       required: ["items"],
//       additionalProperties: false,
//     };
//   } else if (schema) {
//     schema.additionalProperties = false;
//     schema.required = Object.keys(schema.properties);
//   }

//   const jsonCompletion = await openai.beta.chat.completions.parse({
//     temperature: 0,
//     model,
//     messages: [
//       { role: "system", content: systemPrompt ?? "" },
//       { role: "user", content: [{ type: "text", text: extractionContent }] },
//       {
//         role: "user",
//         content: prompt
//           ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
//           : "Transform the above content into structured JSON output.",
//       },
//     ],
//     response_format: schema
//       ? {
//           type: "json_schema",
//           json_schema: {
//             name: "websiteContent",
//             schema: schema,
//             strict: true,
//           },
//         }
//       : { type: "json_object" },
//   });

//   if (jsonCompletion.choices[0].message.refusal !== null) {
//     throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
//   }

//   const extraction = jsonCompletion.choices[0].message.parsed;
//   return {
//     content: extraction ?? "",
//     metadata: {
//       numTokens,
//       warning,
//     },
//   };
// }
apps/api/src/lib/extract/reranker.ts (new file, 22 lines)
@@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
  token: process.env.COHERE_API_KEY,
});

export async function rerankDocuments(
  documents: (string | Record<string, string>)[],
  query: string,
  topN = 3,
  model = "rerank-english-v3.0"
) {
  const rerank = await cohere.v2.rerank({
    documents,
    query,
    topN,
    model,
    returnDocuments: true,
  });

  return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
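A hedged usage sketch of rerankDocuments: it assumes COHERE_API_KEY is set and the rerank-english-v3.0 model is available on that account; the documents and query are invented for illustration.

// Sketch: rerank a few page snippets against a query (assumes COHERE_API_KEY is set).
import { rerankDocuments } from "./reranker";

async function rerankExample() {
  const results = await rerankDocuments(
    [
      "Firecrawl turns websites into LLM-ready markdown.",
      "A guide to caring for houseplants.",
      "Careers at Firecrawl: we are hiring engineers.",
    ],
    "Who is Firecrawl hiring?",
    2 // topN
  );

  // Each result carries the original index, the document, and a relevanceScore (higher is more relevant).
  for (const r of results) {
    console.log(r.index, r.relevanceScore);
  }
}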
apps/api/src/lib/ranker.test.ts (new file, 68 lines)
@@ -0,0 +1,68 @@
import { performRanking } from './ranker';

describe('performRanking', () => {
  it('should rank links based on similarity to search query', async () => {
    const linksWithContext = [
      'url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds',
      'url: https://example.com/cats, title: Cat care guide, description: Everything about cats',
      'url: https://example.com/pets, title: General pet care, description: Care for all types of pets'
    ];

    const links = [
      'https://example.com/dogs',
      'https://example.com/cats',
      'https://example.com/pets'
    ];

    const searchQuery = 'cats training';

    const result = await performRanking(linksWithContext, links, searchQuery);

    // Should return array of objects with link, linkWithContext, score, originalIndex
    expect(result).toBeInstanceOf(Array);
    expect(result.length).toBe(3);

    // First result should be the cats page since the query is about cats
    expect(result[0].link).toBe('https://example.com/cats');

    // Each result should have required properties
    result.forEach(item => {
      expect(item).toHaveProperty('link');
      expect(item).toHaveProperty('linkWithContext');
      expect(item).toHaveProperty('score');
      expect(item).toHaveProperty('originalIndex');
      expect(typeof item.score).toBe('number');
      expect(item.score).toBeGreaterThanOrEqual(0);
      expect(item.score).toBeLessThanOrEqual(1);
    });

    // Scores should be in descending order
    for (let i = 1; i < result.length; i++) {
      expect(result[i].score).toBeLessThanOrEqual(result[i-1].score);
    }
  });

  it('should handle empty inputs', async () => {
    const result = await performRanking([], [], '');
    expect(result).toEqual([]);
  });

  it('should maintain original order for equal scores', async () => {
    const linksWithContext = [
      'url: https://example.com/1, title: Similar content A, description: test',
      'url: https://example.com/2, title: Similar content B, description: test'
    ];

    const links = [
      'https://example.com/1',
      'https://example.com/2'
    ];

    const searchQuery = 'test';

    const result = await performRanking(linksWithContext, links, searchQuery);

    // If scores are equal, original order should be maintained
    expect(result[0].originalIndex).toBeLessThan(result[1].originalIndex);
  });
});
apps/api/src/lib/ranker.ts (new file, 92 lines)
@@ -0,0 +1,92 @@
import axios from 'axios';
import { configDotenv } from 'dotenv';
import OpenAI from "openai";

configDotenv();

const openai = new OpenAI({
  apiKey: process.env.OPENAI_API_KEY,
});

async function getEmbedding(text: string) {
  const embedding = await openai.embeddings.create({
    model: "text-embedding-ada-002",
    input: text,
    encoding_format: "float",
  });

  return embedding.data[0].embedding;
}

const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
  const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
  const magnitude1 = Math.sqrt(
    vec1.reduce((sum, val) => sum + val * val, 0)
  );
  const magnitude2 = Math.sqrt(
    vec2.reduce((sum, val) => sum + val * val, 0)
  );
  if (magnitude1 === 0 || magnitude2 === 0) return 0;
  return dotProduct / (magnitude1 * magnitude2);
};

// Function to convert text to vector
const textToVector = (searchQuery: string, text: string): number[] => {
  const words = searchQuery.toLowerCase().split(/\W+/);
  return words.map((word) => {
    const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
      .length;
    return count / text.length;
  });
};

async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
  try {
    // Handle invalid inputs
    if (!searchQuery || !linksWithContext.length || !links.length) {
      return [];
    }

    // Use the search query as-is (no sanitization is currently applied)
    const sanitizedQuery = searchQuery;

    // Generate embeddings for the search query
    const queryEmbedding = await getEmbedding(sanitizedQuery);

    // Generate embeddings for each link and calculate similarity
    const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
      try {
        const linkEmbedding = await getEmbedding(linkWithContext);
        const score = cosineSimilarity(queryEmbedding, linkEmbedding);

        return {
          link: links[index],
          linkWithContext,
          score,
          originalIndex: index
        };
      } catch (err) {
        // If embedding fails for a link, return with score 0
        return {
          link: links[index],
          linkWithContext,
          score: 0,
          originalIndex: index
        };
      }
    }));

    // Sort links based on similarity scores while preserving original order for equal scores
    linksAndScores.sort((a, b) => {
      const scoreDiff = b.score - a.score;
      return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
    });

    return linksAndScores;
  } catch (error) {
    console.error(`Error performing semantic search: ${error}`);
    return [];
  }
}

export { performRanking };
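As a quick sanity check of the cosine-similarity computation used above, a worked toy example (vector values chosen purely for illustration):

// Toy example of the cosineSimilarity helper defined above:
// vec1 = [1, 2, 0], vec2 = [2, 1, 0]
// dot product = 1*2 + 2*1 + 0*0 = 4
// |vec1| = sqrt(1 + 4 + 0) = sqrt(5)
// |vec2| = sqrt(4 + 1 + 0) = sqrt(5)
// similarity = 4 / (sqrt(5) * sqrt(5)) = 4 / 5 = 0.8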
@@ -1 +1 @@
-export const axiosTimeout = 3000;
+export const axiosTimeout = 5000;
@@ -27,4 +27,4 @@ v0Router.post("/v0/search", searchController);

 // Health/Probe routes
 v0Router.get("/v0/health/liveness", livenessController);
-v0Router.get("/v0/health/readiness", readinessController);
+v0Router.get("/v0/health/readiness", readinessController);
@@ -18,6 +18,7 @@ import { logger } from "../lib/logger";
 import { scrapeStatusController } from "../controllers/v1/scrape-status";
 import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
 import { batchScrapeController } from "../controllers/v1/batch-scrape";
+import { extractController } from "../controllers/v1/extract";
 // import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
 // import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
 // import { searchController } from "../../src/controllers/v1/search";
@@ -98,7 +99,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
 function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
   if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
     if (!res.headersSent) {
-      return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+      return res.status(403).json({ success: false, error: "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." });
     }
   }
   next();
@@ -178,6 +179,13 @@ v1Router.ws(
   crawlStatusWSController
 );

+v1Router.post(
+  "/extract",
+  authMiddleware(RateLimiterMode.Scrape),
+  checkCreditsMiddleware(1),
+  wrap(extractController)
+);
+

 // v1Router.post("/crawlWebsitePreview", crawlPreviewController);
@@ -199,3 +207,4 @@ v1Router.delete(
 // Health/Probe routes
 // v1Router.get("/health/liveness", livenessController);
 // v1Router.get("/health/readiness", readinessController);
+
apps/api/src/scraper/scrapeURL/engines/cache/index.ts (new file, vendored, 19 lines)
@@ -0,0 +1,19 @@
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { EngineError } from "../../error";

export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
  const key = cacheKey(meta.url, meta.options, meta.internalOptions);
  if (key === null) throw new EngineError("Scrape not eligible for caching");

  const entry = await getEntryFromCache(key);
  if (entry === null) throw new EngineError("Cache missed");

  return {
    url: entry.url,
    html: entry.html,
    statusCode: entry.statusCode,
    error: entry.error,
  };
}
@@ -103,4 +103,4 @@ export async function fireEngineCheckStatus(logger: Logger, jobId: string): Prom
     }
   });
 }
-}
+}
@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
import { scrapeCache } from "./cache";

export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";

const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;

export const engines: Engine[] = [
  // ...(useCache ? [ "cache" as const ] : []),
  ...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
  ...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
  ...(usePlaywright ? [ "playwright" as const ] : []),
@ -74,6 +77,7 @@ export type EngineScrapeResult = {
const engineHandlers: {
  [E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
} = {
  "cache": scrapeCache,
  "fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
  "fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
  "fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@ -95,6 +99,22 @@ export const engineOptions: {
    quality: number,
  }
} = {
  "cache": {
    features: {
      "actions": false,
      "waitFor": true,
      "screenshot": false,
      "screenshot@fullScreen": false,
      "pdf": false, // TODO: figure this out
      "docx": false, // TODO: figure this out
      "atsv": false,
      "location": false,
      "mobile": false,
      "skipTlsVerification": false,
      "useFastMode": false,
    },
    quality: 1000, // cache should always be tried first
  },
  "fire-engine;chrome-cdp": {
    features: {
      "actions": true,
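How this table is consumed is outside the hunk, but its shape suggests the idea: filter engines by the feature flags a request needs, then try them in descending quality, which is why the cache entry is given quality 1000. A rough illustration of that idea only; the names and signature below are illustrative, not the repo's actual selection code.

// Illustrative only: rank engines that cover the required feature flags,
// highest quality first, so "cache" would be attempted before the others.
type FeatureFlags = Record<string, boolean>;

function rankEngines(
  engines: string[],
  engineOptions: Record<string, { features: FeatureFlags; quality: number }>,
  requiredFeatures: string[],
): string[] {
  return engines
    .filter((engine) =>
      requiredFeatures.every((flag) => engineOptions[engine].features[flag]),
    )
    .sort((a, b) => engineOptions[b].quality - engineOptions[a].quality);
}

// e.g. rankEngines(["cache", "fetch"], engineOptions, ["waitFor"]) would put
// "cache" ahead of any lower-quality engine that also supports waitFor.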
26
apps/api/src/scraper/scrapeURL/transformers/cache.ts
Normal file
@ -0,0 +1,26 @@
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";

export function saveToCache(meta: Meta, document: Document): Document {
  if (document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300) return document;

  if (document.rawHtml === undefined) {
    throw new Error("rawHtml is undefined -- this transformer is being called out of order");
  }

  const key = cacheKey(meta.url, meta.options, meta.internalOptions);

  if (key !== null) {
    const entry: CacheEntry = {
      html: document.rawHtml!,
      statusCode: document.metadata.statusCode!,
      url: document.metadata.url ?? document.metadata.sourceURL!,
      error: document.metadata.error ?? undefined,
    };

    saveEntryToCache(key, entry);
  }

  return document;
}
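Both new files import cacheKey, getEntryFromCache, saveEntryToCache, and CacheEntry from lib/cache, which is not part of this diff. A hedged sketch of what such a module could look like, inferred only from the call sites above and the CACHE_REDIS_URL flag; the hashing, TTL, and eligibility rules here are assumptions, not the actual implementation.

// Sketch of a Redis-backed lib/cache. Entry shape mirrors what scrapeCache
// reads and saveToCache writes in this commit; everything else is assumed.
import { createHash } from "crypto";
import IORedis from "ioredis";

export type CacheEntry = {
  url: string;
  html: string;
  statusCode: number;
  error?: string;
};

const redis = new IORedis(process.env.CACHE_REDIS_URL ?? "redis://localhost:6379");

export function cacheKey(url: string, options: unknown, internalOptions: unknown): string | null {
  // The real helper returns null when a scrape is not eligible for caching;
  // those checks are omitted in this sketch.
  const digest = createHash("sha256")
    .update(JSON.stringify({ url, options, internalOptions }))
    .digest("hex");
  return "cache:" + digest;
}

export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
  const raw = await redis.get(key);
  return raw === null ? null : (JSON.parse(raw) as CacheEntry);
}

export async function saveEntryToCache(key: string, entry: CacheEntry): Promise<void> {
  await redis.set(key, JSON.stringify(entry), "EX", 60 * 60); // 1h TTL, assumed
}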
@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
import { saveToCache } from "./cache";

export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;

@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document

// TODO: allow some of these to run in parallel
export const transformerStack: Transformer[] = [
  saveToCache,
  deriveHTMLFromRawHTML,
  deriveMarkdownFromHTML,
  deriveLinksFromHTML,
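Placing saveToCache first means the raw HTML is persisted before any derived formats are computed. The stack itself is applied in order (per the TODO, not yet in parallel); the runner below is a minimal illustration of that loop, not the file's actual executor.

// Illustrative runner: each transformer receives the document produced by the
// previous one, matching the shape of the Transformer type declared above.
type TransformerFn<M, D> = (meta: M, document: D) => D | Promise<D>;

async function runTransformers<M, D>(
  meta: M,
  document: D,
  stack: TransformerFn<M, D>[],
): Promise<D> {
  for (const transform of stack) {
    document = await transform(meta, document); // sequential by design today
  }
  return document;
}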
@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
  }
}

async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> {
export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
  let extract: any;
  let warning: string | undefined;

  const openai = new OpenAI();
  const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";

  if (document.markdown === undefined) {
  if (markdown === undefined) {
    throw new Error("document.markdown is undefined -- this is unexpected");
  }

  let extractionContent = document.markdown;

  // count number of tokens
  let numTokens = 0;
  const encoder = encoding_for_model(model as TiktokenModel);
  try {
    // Encode the message into tokens
    const tokens = encoder.encode(extractionContent);
    const tokens = encoder.encode(markdown);

    // Return the number of tokens
    numTokens = tokens.length;
  } catch (error) {
    logger.warn("Calculating num tokens of string failed", { error, extractionContent });
    logger.warn("Calculating num tokens of string failed", { error, markdown });

    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  } finally {
    // Free the encoder resources after use
    encoder.free();
@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt

  if (numTokens > maxTokens) {
    // trim the document to the maximum number of tokens, tokens != characters
    extractionContent = extractionContent.slice(0, maxTokens * modifier);
    markdown = markdown.slice(0, maxTokens * modifier);

    const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    document.warning = document.warning === undefined ? warning : " " + warning;
    const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
    warning = previousWarning === undefined ? w : w + " " + previousWarning;
  }

  let schema = options.schema;
@ -107,12 +108,22 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
      required: ["items"],
      additionalProperties: false,
    };
  } else if (schema && typeof schema === 'object' && !schema.type) {
    schema = {
      type: "object",
      properties: Object.fromEntries(
        Object.entries(schema).map(([key, value]) => [key, { type: value }])
      ),
      required: Object.keys(schema),
      additionalProperties: false
    };
  }

  schema = normalizeSchema(schema);

  const jsonCompletion = await openai.beta.chat.completions.parse({
    model,
    temperature: 0,
    messages: [
      {
        role: "system",
@ -120,7 +131,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
      },
      {
        role: "user",
        content: [{ type: "text", text: extractionContent }],
        content: [{ type: "text", text: markdown }],
      },
      {
        role: "user",
@ -143,26 +154,35 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
    throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
  }

  document.extract = jsonCompletion.choices[0].message.parsed;
  extract = jsonCompletion.choices[0].message.parsed;

  if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
  if (extract === null && jsonCompletion.choices[0].message.content !== null) {
    try {
      document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
      extract = JSON.parse(jsonCompletion.choices[0].message.content);
    } catch (e) {
      logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
      throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
    }
  }

  if (options.schema && options.schema.type === "array") {
    document.extract = document.extract?.items;
  // If the users actually wants the items object, they can specify it as 'required' in the schema
  // otherwise, we just return the items array
  if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
    extract = extract?.items;
  }
  return document;
  return { extract, warning, numTokens };
}

export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
  if (meta.options.formats.includes("extract")) {
    document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
    const { extract, warning } = await generateOpenAICompletions(
      meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
      meta.options.extract!,
      document.markdown,
      document.warning,
    );
    document.extract = extract;
    document.warning = warning;
  }

  return document;
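The new else-if branch lets callers pass a flat map of field names to type strings instead of a full JSON schema. A worked example of what that branch produces before normalizeSchema runs, derived directly from the code above; the field names are illustrative.

// Shorthand accepted by the new branch:
const shorthand = { title: "string", summary: "string" };

// Equivalent expansion produced by the branch above:
const expanded = {
  type: "object",
  properties: {
    title: { type: "string" },
    summary: { type: "string" },
  },
  required: ["title", "summary"],
  additionalProperties: false,
};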
@ -109,6 +109,6 @@ export function waitForJob<T = unknown>(jobId: string, timeout: number): Promise
        }
      }
    }
  }, 500);
  }, 250);
  })
}
@ -106,6 +106,15 @@ export interface FirecrawlCrawlStatusResponse {
  error?: string;
}

export interface FirecrawlExtractResponse {
  statusCode: number;
  body: {
    success: boolean;
    data: any[];
  };
  error?: string;
}

export enum RateLimiterMode {
  Crawl = "crawl",
  CrawlStatus = "crawlStatus",
@ -1,4 +1,5 @@
import FirecrawlApp from 'firecrawl';
import { z } from 'zod';

const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});

@ -42,6 +43,18 @@ const main = async () => {
  const mapResult = await app.mapUrl('https://firecrawl.dev');
  console.log(mapResult)

  // Extract information from a website using LLM:
  const extractSchema = z.object({
    title: z.string(),
    description: z.string(),
    links: z.array(z.string())
  });

  const extractResult = await app.extract(['https://firecrawl.dev'], {
    prompt: "Extract the title, description, and links from the website",
    schema: extractSchema
  });
  console.log(extractResult);

  // Crawl a website with WebSockets:
  const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
@ -42,6 +42,19 @@ const main = async () => {
  const mapResult = await app.mapUrl('https://firecrawl.dev');
  console.log(mapResult)

  // // Extract information from a website using LLM:
  // const extractSchema = z.object({
  //   title: z.string(),
  //   description: z.string(),
  //   links: z.array(z.string())
  // });

  // const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
  //   prompt: "Extract the title, description, and links from the website",
  //   schema: extractSchema
  // });
  // console.log(extractResult);

  // Crawl a website with WebSockets:
  const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});
@ -1,6 +1,6 @@
{
  "name": "@mendable/firecrawl-js",
  "version": "1.8.5",
  "version": "1.9.0",
  "description": "JavaScript SDK for Firecrawl API",
  "main": "dist/index.js",
  "types": "dist/index.d.ts",
@ -236,6 +236,27 @@ export interface MapResponse {
  error?: string;
}

/**
 * Parameters for extracting information from URLs.
 * Defines options for extracting information from URLs.
 */
export interface ExtractParams {
  prompt: string;
  schema?: zt.ZodSchema;
  systemPrompt?: string;
  allowExternalLinks?: boolean;
}

/**
 * Response interface for extracting information from URLs.
 * Defines the structure of the response received after extracting information from URLs.
 */
export interface ExtractResponse {
  success: true;
  data: zt.infer<zt.ZodSchema>;
  error?: string;
}

/**
 * Error response interface.
 * Defines the structure of the response received when an error occurs.
@ -245,7 +266,6 @@ export interface ErrorResponse {
  error: string;
}


/**
 * Custom error class for Firecrawl.
 * Extends the built-in Error class to include a status code.
@ -679,6 +699,44 @@ export default class FirecrawlApp {
    return { success: false, error: "Internal server error." };
  }

  /**
   * Extracts information from URLs using the Firecrawl API.
   * @param url - The URL to extract information from.
   * @param params - Additional parameters for the extract request.
   * @returns The response from the extract operation.
   */
  async extract(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
    const headers = this.prepareHeaders();

    if (!params?.prompt) {
      throw new FirecrawlError("Prompt is required", 400);
    }

    let jsonData: { urls: string[] } & ExtractParams = { urls, ...params };
    let jsonSchema: any;
    try {
      jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
    } catch (error: any) {
      throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
    }

    try {
      const response: AxiosResponse = await this.postRequest(
        this.apiUrl + `/v1/extract`,
        { ...jsonData, schema: jsonSchema },
        headers
      );
      if (response.status === 200) {
        return response.data as ExtractResponse;
      } else {
        this.handleError(response, "extract");
      }
    } catch (error: any) {
      throw new FirecrawlError(error.message, 500);
    }
    return { success: false, error: "Internal server error." };
  }

  /**
   * Prepares the headers for an API request.
   * @param idempotencyKey - Optional key to ensure idempotency.
@ -2,6 +2,8 @@ import time
import nest_asyncio
import uuid
from firecrawl.firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List

app = FirecrawlApp(api_key="fc-")

@ -50,9 +52,6 @@ print(crawl_status)

# LLM Extraction:
# Define schema to extract contents into using pydantic
from pydantic import BaseModel, Field
from typing import List

class ArticleSchema(BaseModel):
    title: str
    points: int
@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
print(map_result)

# Extract URLs:
class ExtractSchema(BaseModel):
    title: str
    description: str
    links: List[str]

# Define the schema using Pydantic
extract_schema = ExtractSchema.schema()

# Perform the extraction
extract_result = app.extract(['https://firecrawl.dev'], {
    'prompt': "Extract the title, description, and links from the website",
    'schema': extract_schema
})
print(extract_result)

# Crawl a website with WebSockets:
# inside an async function...
nest_asyncio.apply()
@ -13,7 +13,7 @@ import os

from .firecrawl import FirecrawlApp # noqa

__version__ = "1.5.0"
__version__ = "1.6.0"

# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")
@ -12,15 +12,40 @@ Classes:
import logging
import os
import time
from typing import Any, Dict, Optional, List
from typing import Any, Dict, Optional, List, Union
import json

import requests
import pydantic
import websockets

logger : logging.Logger = logging.getLogger("firecrawl")

class FirecrawlApp:
    class ExtractParams(pydantic.BaseModel):
        """
        Parameters for the extract operation.
        """
        prompt: str
        schema: Optional[Any] = None
        system_prompt: Optional[str] = None
        allow_external_links: Optional[bool] = False

    class ExtractResponse(pydantic.BaseModel):
        """
        Response from the extract operation.
        """
        success: bool
        data: Optional[Any] = None
        error: Optional[str] = None

    class ErrorResponse(pydantic.BaseModel):
        """
        Error response.
        """
        success: bool
        error: str

    def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
        """
        Initialize the FirecrawlApp instance with API key, API URL.
@ -434,6 +459,48 @@ class FirecrawlApp:
        else:
            self._handle_error(response, 'check batch scrape status')


    def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
        """
        Extracts information from a URL using the Firecrawl API.

        Args:
            urls (List[str]): The URLs to extract information from.
            params (Optional[ExtractParams]): Additional parameters for the extract request.

        Returns:
            Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
        """
        headers = self._prepare_headers()

        if not params or not params.get('prompt'):
            raise ValueError("Prompt is required")

        if not params.get('schema'):
            raise ValueError("Schema is required for extraction")

        jsonData = {'urls': urls, **params}
        jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None

        try:
            response = self._post_request(
                f'{self.api_url}/v1/extract',
                {
                    **jsonData,
                    'allowExternalLinks': params.get('allow_external_links', False),
                    'schema': jsonSchema
                },
                headers
            )
            if response.status_code == 200:
                return response.json()
            else:
                self._handle_error(response, "extract")
        except Exception as e:
            raise ValueError(str(e), 500)

        return {'success': False, 'error': "Internal server error."}

    def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
        """
        Prepare the headers for API requests.