Merge pull request #915 from mendableai/nsc/new-extract

Extract (beta)
Nicolas 2024-11-26 10:02:09 -08:00 committed by GitHub
commit 6c33b978f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 2767 additions and 162 deletions
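
This PR adds a beta /v1/extract endpoint: it maps the given sites, reranks and scrapes the most relevant pages, and runs an LLM extraction over the combined content. A minimal client-side sketch of calling the new endpoint, assuming a local API at http://127.0.0.1:3002 and a TEST_API_KEY environment variable (both taken from the E2E test setup further down; adjust for your own deployment):

// Hedged sketch: the request/response shape mirrors the .http examples and E2E tests in this diff.
const BASE_URL = "http://127.0.0.1:3002"; // assumption: local dev API, as in the tests

async function extractExample() {
  const res = await fetch(`${BASE_URL}/v1/extract`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      urls: ["https://firecrawl.dev/*"], // a trailing /* (or allowExternalLinks) routes the URL through the map + rerank path
      prompt: "Who are the founders of the company?",
      schema: {
        type: "object",
        properties: { founders: { type: "array", items: { type: "string" } } },
      },
      allowExternalLinks: true,
    }),
  });
  const body = await res.json(); // { success, data, scrape_id?, warning? } on success
  console.log(body.data);
}

extractExample();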

View File

@ -55,7 +55,7 @@
"@bull-board/api": "^5.20.5",
"@bull-board/express": "^5.20.5",
"@devil7softwares/pos": "^1.0.2",
-"@dqbd/tiktoken": "^1.0.16",
+"@dqbd/tiktoken": "^1.0.17",
"@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
@ -73,6 +73,7 @@
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
"cohere": "^1.1.1",
+"cohere-ai": "^7.14.0",
"cors": "^2.8.5",
"cron-parser": "^4.9.0",
"date-fns": "^3.6.0",

apps/api/pnpm-lock.yaml (generated, 1316 changed lines)

File diff suppressed because it is too large

View File

@ -47,3 +47,31 @@ content-type: application/json

# @name batchScrapeStatus
GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
+
+### Map Website
+# @name map
+POST {{baseUrl}}/v1/map HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+"url": "firecrawl.dev",
+"sitemapOnly": true
+}
+
+### Extract
+# @name extract
+POST {{baseUrl}}/v1/extract HTTP/1.1
+Authorization: Bearer {{$dotenv TEST_API_KEY}}
+content-type: application/json
+
+{
+"urls": ["firecrawl.dev"],
+"prompt": "What is the title, description and main product of the page?",
+"schema": {
+"title": "string",
+"description": "string",
+"mainProduct": "string"
+}
+}

View File

@ -0,0 +1,249 @@
import request from "supertest";
import dotenv from "dotenv";
import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for Extract API Routes", () => {
it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://firecrawl.dev/*"],
prompt: "Who are the authors of the blog posts?",
schema: {
type: "object",
properties: { authors: { type: "array", items: { type: "string" } } },
},
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("authors");
let gotItRight = 0;
for (const author of response.body.data?.authors) {
if (author.includes("Caleb Peffer")) gotItRight++;
if (author.includes("Gergő Móricz")) gotItRight++;
if (author.includes("Eric Ciarla")) gotItRight++;
if (author.includes("Nicolas Camara")) gotItRight++;
if (author.includes("Jon")) gotItRight++;
if (author.includes("Wendong")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(1);
}, 60000);
it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["firecrawl.dev/*"],
prompt: "Who are the founders of the company?",
allowExternalLinks: true,
schema: {
type: "object",
properties: { founders: { type: "array", items: { type: "string" } } },
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("founders");
console.log(response.body.data?.founders);
let gotItRight = 0;
for (const founder of response.body.data?.founders) {
if (founder.includes("Caleb")) gotItRight++;
if (founder.includes("Eric")) gotItRight++;
if (founder.includes("Nicolas")) gotItRight++;
if (founder.includes("nick")) gotItRight++;
if (founder.includes("eric")) gotItRight++;
if (founder.includes("jon-noronha")) gotItRight++;
}
expect(gotItRight).toBeGreaterThanOrEqual(2);
}, 60000);
it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://firecrawl.dev/*"],
prompt: "What are they hiring for?",
allowExternalLinks: true,
schema: {
type: "array",
items: {
type: "string"
},
required: ["items"]
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
console.log(response.body.data);
let gotItRight = 0;
for (const hiring of response.body.data?.items) {
if (hiring.includes("Developer Support Engineer")) gotItRight++;
if (hiring.includes("Dev Ops Engineer")) gotItRight++;
if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(2);
}, 60000);
it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["fivetran.com/*"],
prompt: "Does Fivetran have PCI DSS compliance?",
allowExternalLinks: true,
schema: {
type: "object",
properties: {
pciDssCompliance: { type: "boolean" }
}
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["fivetran.com/*"],
prompt: "What are the Azure Data Connectors they offer?",
schema: {
type: "array",
items: {
type: "object",
properties: {
connector: { type: "string" },
description: { type: "string" },
supportsCaptureDelete: { type: "boolean" }
}
}
}
})
console.log(response.body);
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("data");
// expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
it.concurrent("should return Greenhouse Applicant Tracking System for Abnormal Security", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"],
prompt: "what applicant tracking system is this company using?",
schema: {
type: "object",
properties: {
isGreenhouseATS: { type: "boolean" },
answer: { type: "string" }
}
},
allowExternalLinks: true
})
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.isGreenhouseATS).toBe(true);
}, 60000);
it.concurrent("should return mintlify api components", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://mintlify.com/docs/*"],
prompt: "what are the 4 API components?",
schema: {
type: "array",
items: {
type: "object",
properties: {
component: { type: "string" }
}
},
required: ["items"]
},
allowExternalLinks: true
})
console.log(response.body.data?.items);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.items.length).toBe(4);
let gotItRight = 0;
for (const component of response.body.data?.items) {
if (component.component.toLowerCase().includes("parameter")) gotItRight++;
if (component.component.toLowerCase().includes("response")) gotItRight++;
if (component.component.toLowerCase().includes("expandable")) gotItRight++;
if (component.component.toLowerCase().includes("sticky")) gotItRight++;
if (component.component.toLowerCase().includes("examples")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(2);
}, 60000);
it.concurrent("should return information about Eric Ciarla", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://ericciarla.com/"],
prompt: "Who is Eric Ciarla? Where does he work? Where did he go to school?",
schema: {
type: "object",
properties: {
name: { type: "string" },
work: { type: "string" },
education: { type: "string" }
},
required: ["name", "work", "education"]
},
allowExternalLinks: true
})
console.log(response.body.data);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.name).toBe("Eric Ciarla");
expect(response.body.data?.work).toBeDefined();
expect(response.body.data?.education).toBeDefined();
}, 60000);
});

View File

@ -0,0 +1,117 @@
import request from "supertest";
import dotenv from "dotenv";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for Map API Routes", () => {
it.concurrent(
"(feat-search)should return links containing 'smart-crawl'",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
search: "smart-crawl",
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
},
60000
);
it.concurrent(
"(feat-subdomains) should return mapped links for firecrawl.dev with subdomains included",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
includeSubdomains: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).toContain(
"docs.firecrawl.dev"
);
},
60000
);
it.concurrent(
"(feat-sitemap-only) should return mapped links for firecrawl.dev with sitemap only",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).not.toContain(
"docs.firecrawl.dev"
);
},
60000
);
it.concurrent(
"(feat-limit) should return mapped links for firecrawl.dev with a limit",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
limit: 10,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeLessThanOrEqual(10);
},
60000
);
it.concurrent(
"(feat-sitemap-large) should return more than 1900 links for geekflare sitemap",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://geekflare.com/sitemap_index.xml",
sitemapOnly: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(1900);
},
60000
);
});

View File

@ -0,0 +1,265 @@
import { Request, Response } from "express";
import {
// Document,
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
ExtractResponse,
MapDocument,
scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;
/**
* Extracts data from the provided URLs based on the request parameters.
* Currently in beta.
* @param req - The request object containing authentication and extraction details.
* @param res - The response object to send the extraction results.
* @returns A promise that resolves when the extraction process is complete.
*/
export async function extractController(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
res: Response<ExtractResponse>
) {
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
req.body = extractRequestSchema.parse(req.body);
const id = crypto.randomUUID();
let links: string[] = [];
let docs: Document[] = [];
const earlyReturn = false;
// Process all URLs in parallel
const urlPromises = req.body.urls.map(async (url) => {
if (url.includes('/*') || req.body.allowExternalLinks) {
// Handle glob pattern URLs
const baseUrl = url.replace('/*', '');
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
const allowExternalLinks = req.body.allowExternalLinks ?? true;
let urlWithoutWww = baseUrl.replace("www.", "");
let mapUrl = req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
const mapResults = await getMapResults({
url: baseUrl,
search: req.body.prompt,
teamId: req.auth.team_id,
plan: req.auth.plan,
allowExternalLinks,
origin: req.body.origin,
limit: req.body.limit,
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted ? true : false,
includeMetadata: true,
includeSubdomains: req.body.includeSubdomains,
});
let mappedLinks = mapResults.links as MapDocument[];
// Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
// Filter by path prefix if present
// wrong
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) {
// Get similarity scores between the search query and each link's context
const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
// First try with high threshold
let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD);
// If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`);
filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD);
if (filteredLinks.length === 0) {
// If still no results, take top N results regardless of score
logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`);
filteredLinks = linksAndScores
.sort((a, b) => b.score - a.score)
.slice(0, MIN_REQUIRED_LINKS)
.map(x => mappedLinks.find(link => link.url === x.link))
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}
}
mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
}
return mappedLinks.map(x => x.url) as string[];
} else {
// Handle direct URLs without glob pattern
if (!isUrlBlocked(url)) {
return [url];
}
return [];
}
});
// Wait for all URL processing to complete and flatten results
const processedUrls = await Promise.all(urlPromises);
links.push(...processedUrls.flat());
if (links.length === 0) {
return res.status(400).json({
success: false,
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
});
}
// Scrape all links in parallel with retries
const scrapePromises = links.map(async (url) => {
const origin = req.body.origin || "api";
const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
const jobId = crypto.randomUUID();
const jobPriority = await getJobPriority({
plan: req.auth.plan as PlanType,
team_id: req.auth.team_id,
basePriority: 10,
});
await addScrapeJob(
{
url,
mode: "single_urls",
team_id: req.auth.team_id,
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
plan: req.auth.plan!,
origin,
is_scrape: true,
},
{},
jobId,
jobPriority
);
try {
const doc = await waitForJob<Document>(jobId, timeout);
await getScrapeQueue().remove(jobId);
if (earlyReturn) {
return null;
}
return doc;
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
throw {
status: 408,
error: "Request timed out"
};
} else {
throw {
status: 500,
error: `(Internal server error) - ${(e && e.message) ? e.message : e}`
};
}
}
});
try {
const results = await Promise.all(scrapePromises);
docs.push(...results.filter(doc => doc !== null).map(x => x!));
} catch (e) {
return res.status(e.status).json({
success: false,
error: e.error
});
}
const completions = await generateOpenAICompletions(
logger.child({ method: "extractController/generateOpenAICompletions" }),
{
mode: "llm",
systemPrompt: "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided.",
prompt: req.body.prompt,
schema: req.body.schema,
},
docs.map(x => buildDocument(x)).join('\n')
);
// TODO: change this later
// While on beta, we're billing 5 credits per link discovered/scraped.
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(error => {
logger.error(`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`);
});
let data = completions.extract ?? {};
let warning = completions.warning;
logJob({
job_id: id,
success: true,
message: "Extract completed",
num_docs: 1,
docs: data,
time_taken: (new Date().getTime() - Date.now()) / 1000,
team_id: req.auth.team_id,
mode: "extract",
url: req.body.urls.join(", "),
scrapeOptions: req.body,
origin: req.body.origin ?? "api",
num_tokens: completions.numTokens ?? 0
});
return res.status(200).json({
success: true,
data: data,
scrape_id: id,
warning: warning
});
}
/**
* Filters links based on their similarity score to the search query.
* @param mappedLinks - The list of mapped links to filter.
* @param linksAndScores - The list of links and their similarity scores.
* @param threshold - The score threshold to filter by.
* @returns The filtered list of links.
*/
function filterAndProcessLinks(
mappedLinks: MapDocument[],
linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[],
threshold: number
): MapDocument[] {
return linksAndScores
.filter(x => x.score > threshold)
.map(x => mappedLinks.find(link => link.url === x.link))
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}

View File

@ -1,6 +1,6 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
-import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
+import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
@ -25,37 +25,61 @@ const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;
-export async function mapController(
-req: RequestWithAuth<{}, MapResponse, MapRequest>,
-res: Response<MapResponse>
-) {
-const startTime = new Date().getTime();
-req.body = mapRequestSchema.parse(req.body);
-const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
+interface MapResult {
+success: boolean;
+links: string[] | any[];
+scrape_id?: string;
+job_id: string;
+time_taken: number;
+}
+export async function getMapResults({
+url,
+search,
+limit = MAX_MAP_LIMIT,
+ignoreSitemap = false,
+includeSubdomains = true,
+crawlerOptions = {},
+teamId,
+plan,
+origin,
+includeMetadata = false,
+allowExternalLinks
+}: {
+url: string;
+search?: string;
+limit?: number;
+ignoreSitemap?: boolean;
+includeSubdomains?: boolean;
+crawlerOptions?: any;
+teamId: string;
+plan?: string;
+origin?: string;
+includeMetadata?: boolean;
+allowExternalLinks?: boolean;
+}): Promise<MapResult> {
const id = uuidv4();
-let links: string[] = [req.body.url];
+let links: string[] = [url];
+let mapResults: MapDocument[] = [];
const sc: StoredCrawl = {
-originUrl: req.body.url,
+originUrl: url,
crawlerOptions: {
-...req.body,
-limit: req.body.sitemapOnly ? 10000000 : limit,
+...crawlerOptions,
+limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
-team_id: req.auth.team_id,
+team_id: teamId,
createdAt: Date.now(),
-plan: req.auth.plan,
+plan: plan,
};
const crawler = crawlToCrawler(id, sc);
// If sitemapOnly is true, only get links from sitemap
-if (req.body.sitemapOnly) {
+if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
@ -73,19 +97,18 @@ export async function mapController(
// links = links.slice(1, limit); // don't slice, unnecessary
}
} else {
-let urlWithoutWww = req.body.url.replace("www.", "");
-let mapUrl = req.body.search
-? `"${req.body.search}" site:${urlWithoutWww}`
-: `site:${req.body.url}`;
+let urlWithoutWww = url.replace("www.", "");
+let mapUrl = search && allowExternalLinks
+? `${search} ${urlWithoutWww}`
+: search ? `${search} site:${urlWithoutWww}`
+: `site:${url}`;
const resultsPerPage = 100;
-const maxPages = Math.ceil(
-Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
-);
+const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
const cacheKey = `fireEngineMap:${mapUrl}`;
-const cachedResult = null;
+const cachedResult = await redis.get(cacheKey);
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
@ -110,7 +133,7 @@ export async function mapController(
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
-req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
+ignoreSitemap ? null : crawler.tryGetSitemap(true),
...(cachedResult ? [] : pagePromises),
]);
@ -124,7 +147,7 @@ export async function mapController(
});
}
-let mapResults = allResults
+mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
@ -134,7 +157,7 @@ export async function mapController(
}
if (mapResults.length > 0) {
-if (req.body.search) {
+if (search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
@ -149,9 +172,8 @@ export async function mapController(
}
// Perform cosine similarity between the search query and the list of links
-if (req.body.search) {
-const searchQuery = req.body.search.toLowerCase();
+if (search) {
+const searchQuery = search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
@ -166,95 +188,75 @@ export async function mapController(
.filter((x) => x !== null) as string[];
// allows for subdomains to be included
-links = links.filter((x) => isSameDomain(x, req.body.url));
+links = links.filter((x) => isSameDomain(x, url));
// if includeSubdomains is false, filter out subdomains
-if (!req.body.includeSubdomains) {
-links = links.filter((x) => isSameSubdomain(x, req.body.url));
+if (!includeSubdomains) {
+links = links.filter((x) => isSameSubdomain(x, url));
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
-links.slice(0, limit);
}
+const linksToReturn = crawlerOptions.sitemapOnly ? links : links.slice(0, limit);
+return {
+success: true,
+links: includeMetadata ? mapResults : linksToReturn,
+scrape_id: origin?.includes("website") ? id : undefined,
+job_id: id,
+time_taken: (new Date().getTime() - Date.now()) / 1000,
+};
+}
+export async function mapController(
+req: RequestWithAuth<{}, MapResponse, MapRequest>,
+res: Response<MapResponse>
+) {
+req.body = mapRequestSchema.parse(req.body);
+const result = await getMapResults({
+url: req.body.url,
+search: req.body.search,
+limit: req.body.limit,
+ignoreSitemap: req.body.ignoreSitemap,
+includeSubdomains: req.body.includeSubdomains,
+crawlerOptions: req.body,
+origin: req.body.origin,
+teamId: req.auth.team_id,
+plan: req.auth.plan,
+});
+// Bill the team
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
);
-// Optionally, you could notify an admin or add to a retry queue here
});
-const endTime = new Date().getTime();
-const timeTakenInSeconds = (endTime - startTime) / 1000;
+// Log the job
logJob({
-job_id: id,
-success: links.length > 0,
+job_id: result.job_id,
+success: result.links.length > 0,
message: "Map completed",
-num_docs: links.length,
-docs: links,
-time_taken: timeTakenInSeconds,
+num_docs: result.links.length,
+docs: result.links,
+time_taken: result.time_taken,
team_id: req.auth.team_id,
mode: "map",
url: req.body.url,
crawlerOptions: {},
scrapeOptions: {},
-origin: req.body.origin,
+origin: req.body.origin ?? "api",
num_tokens: 0,
});
-return res.status(200).json({
-success: true,
-links: links,
-scrape_id: req.body.origin?.includes("website") ? id : undefined,
-});
+const response = {
+success: true as const,
+links: result.links,
+scrape_id: result.scrape_id
+};
+return res.status(200).json(response);
}
// Subdomain sitemap url checking
// // For each result, check for subdomains, get their sitemaps and add them to the links
// const processedUrls = new Set();
// const processedSubdomains = new Set();
// for (const result of links) {
// let url;
// let hostParts;
// try {
// url = new URL(result);
// hostParts = url.hostname.split('.');
// } catch (e) {
// continue;
// }
// console.log("hostParts", hostParts);
// // Check if it's a subdomain (more than 2 parts, and not 'www')
// if (hostParts.length > 2 && hostParts[0] !== 'www') {
// const subdomain = hostParts[0];
// console.log("subdomain", subdomain);
// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
// console.log("subdomainUrl", subdomainUrl);
// if (!processedSubdomains.has(subdomainUrl)) {
// processedSubdomains.add(subdomainUrl);
// const subdomainCrawl = crawlToCrawler(id, {
// originUrl: subdomainUrl,
// crawlerOptions: legacyCrawlerOptions(req.body),
// pageOptions: {},
// team_id: req.auth.team_id,
// createdAt: Date.now(),
// plan: req.auth.plan,
// });
// const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
// if (subdomainSitemap) {
// subdomainSitemap.forEach((x) => {
// if (!processedUrls.has(x.url)) {
// processedUrls.add(x.url);
// links.push(x.url);
// }
// });
// }
// }
// }
// }

View File

@ -151,8 +151,25 @@ export const scrapeOptions = z.object({
}).strict(strictMessage)
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
+export const extractV1Options = z.object({
+urls: url.array(),
+prompt: z.string().optional(),
+schema: z.any().optional(),
+limit: z.number().int().positive().finite().safe().optional(),
+ignoreSitemap: z.boolean().default(false),
+includeSubdomains: z.boolean().default(true),
+allowExternalLinks: z.boolean().default(false),
+origin: z.string().optional().default("api"),
+timeout: z.number().int().positive().finite().safe().default(60000)
+}).strict(strictMessage)
+export type ExtractV1Options = z.infer<typeof extractV1Options>;
+export const extractRequestSchema = extractV1Options;
+export type ExtractRequest = z.infer<typeof extractRequestSchema>;
export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
url,
origin: z.string().optional().default("api"),
@ -173,6 +190,8 @@ export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend(
return obj;
});
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@ -340,6 +359,21 @@ export interface ScrapeResponseRequestTest {
error?: string;
}
+export type ExtractResponse =
+| ErrorResponse
+| {
+success: true;
+warning?: string;
+data: z.infer<typeof extractRequestSchema>;
+scrape_id?: string;
+};
+export interface ExtractResponseRequestTest {
+statusCode: number;
+body: ExtractResponse;
+error?: string;
+}
export type CrawlResponse =
| ErrorResponse
| {
@ -496,6 +530,13 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
};
}
+export interface MapDocument {
+url: string;
+title?: string;
+description?: string;
+}
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
return {
scrapeOptions: scrapeOptions.parse({

apps/api/src/lib/cache.ts (new file, 50 lines)
View File

@ -0,0 +1,50 @@
import IORedis from "ioredis";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";
import { logger as _logger } from "./logger";
const logger = _logger.child({module: "cache"});
export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
maxRetriesPerRequest: null,
}) : null;
export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
if (!cacheRedis) return null;
// these options disqualify a cache
if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
|| (scrapeOptions.actions && scrapeOptions.actions.length > 0)
) {
return null;
}
return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor;
}
export type CacheEntry = {
url: string;
html: string;
statusCode: number;
error?: string;
};
export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return;
try {
await cacheRedis.set(key, JSON.stringify(entry));
} catch (error) {
logger.warn("Failed to save to cache", { key, error });
}
}
export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
if (!cacheRedis) return null;
try {
return JSON.parse(await cacheRedis.get(key) ?? "null");
} catch (error) {
logger.warn("Failed to get from cache", { key, error });
return null;
}
}

View File

@ -0,0 +1,15 @@
import { Document } from "../../controllers/v1/types";
export function buildDocument(document: Document): string {
const metadata = document.metadata;
const markdown = document.markdown;
// for each key in the metadata allow up to 250 characters
const metadataString = Object.entries(metadata).map(([key, value]) => {
return `${key}: ${value?.toString().slice(0, 250)}`;
}).join('\n');
const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
const documentString = `${markdown}${documentMetadataString}`;
return documentString;
}

View File

@ -0,0 +1,124 @@
// use llmExtract.ts instead
// import OpenAI from "openai";
// import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";
// const maxTokens = 32000;
// const modifier = 4;
// export class LLMRefusalError extends Error {
// constructor(refusal: string) {
// super("LLM refused to extract the website's content");
// this.name = "LLMRefusalError";
// }
// }
// interface GenerateCompletionsParams {
// systemPrompt?: string;
// prompt?: string;
// schema?: any;
// pagesContent: string;
// }
// export async function generateBasicCompletion(prompt: string) {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// const completion = await openai.chat.completions.create({
// model,
// messages: [{ role: "user", content: prompt }],
// });
// return completion.choices[0].message.content;
// }
// export async function generateFinalExtraction({
// pagesContent,
// systemPrompt,
// prompt,
// schema,
// }: GenerateCompletionsParams): Promise<{
// content: string;
// metadata: { numTokens: number; warning: string };
// }> {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// let extractionContent = pagesContent;
// let numTokens = 0;
// let warning = "";
// const encoder = encoding_for_model(model);
// try {
// const tokens = encoder.encode(extractionContent);
// numTokens = tokens.length;
// } catch (error) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
// } finally {
// encoder.free();
// }
// if (numTokens > maxTokens) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
// }
// if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
// schema = {
// type: "object",
// properties: {
// items: schema,
// },
// required: ["items"],
// additionalProperties: false,
// };
// } else if (schema) {
// schema.additionalProperties = false;
// schema.required = Object.keys(schema.properties);
// }
// const jsonCompletion = await openai.beta.chat.completions.parse({
// temperature: 0,
// model,
// messages: [
// { role: "system", content: systemPrompt ?? "" },
// { role: "user", content: [{ type: "text", text: extractionContent }] },
// {
// role: "user",
// content: prompt
// ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
// : "Transform the above content into structured JSON output.",
// },
// ],
// response_format: schema
// ? {
// type: "json_schema",
// json_schema: {
// name: "websiteContent",
// schema: schema,
// strict: true,
// },
// }
// : { type: "json_object" },
// });
// if (jsonCompletion.choices[0].message.refusal !== null) {
// throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
// }
// const extraction = jsonCompletion.choices[0].message.parsed;
// return {
// content: extraction ?? "",
// metadata: {
// numTokens,
// warning,
// },
// };
// }

View File

@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
});
export async function rerankDocuments(
documents: (string | Record<string, string>)[],
query: string,
topN = 3,
model = "rerank-english-v3.0"
) {
const rerank = await cohere.v2.rerank({
documents,
query,
topN,
model,
returnDocuments: true,
});
return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}

View File

@ -0,0 +1,68 @@
import { performRanking } from './ranker';
describe('performRanking', () => {
it('should rank links based on similarity to search query', async () => {
const linksWithContext = [
'url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds',
'url: https://example.com/cats, title: Cat care guide, description: Everything about cats',
'url: https://example.com/pets, title: General pet care, description: Care for all types of pets'
];
const links = [
'https://example.com/dogs',
'https://example.com/cats',
'https://example.com/pets'
];
const searchQuery = 'cats training';
const result = await performRanking(linksWithContext, links, searchQuery);
// Should return array of objects with link, linkWithContext, score, originalIndex
expect(result).toBeInstanceOf(Array);
expect(result.length).toBe(3);
// First result should be the cats page since the query is about cats
expect(result[0].link).toBe('https://example.com/cats');
// Each result should have required properties
result.forEach(item => {
expect(item).toHaveProperty('link');
expect(item).toHaveProperty('linkWithContext');
expect(item).toHaveProperty('score');
expect(item).toHaveProperty('originalIndex');
expect(typeof item.score).toBe('number');
expect(item.score).toBeGreaterThanOrEqual(0);
expect(item.score).toBeLessThanOrEqual(1);
});
// Scores should be in descending order
for (let i = 1; i < result.length; i++) {
expect(result[i].score).toBeLessThanOrEqual(result[i-1].score);
}
});
it('should handle empty inputs', async () => {
const result = await performRanking([], [], '');
expect(result).toEqual([]);
});
it('should maintain original order for equal scores', async () => {
const linksWithContext = [
'url: https://example.com/1, title: Similar content A, description: test',
'url: https://example.com/2, title: Similar content B, description: test'
];
const links = [
'https://example.com/1',
'https://example.com/2'
];
const searchQuery = 'test';
const result = await performRanking(linksWithContext, links, searchQuery);
// If scores are equal, original order should be maintained
expect(result[0].originalIndex).toBeLessThan(result[1].originalIndex);
});
});

View File

@ -0,0 +1,92 @@
import axios from 'axios';
import { configDotenv } from 'dotenv';
import OpenAI from "openai";
configDotenv();
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
async function getEmbedding(text: string) {
const embedding = await openai.embeddings.create({
model: "text-embedding-ada-002",
input: text,
encoding_format: "float",
});
return embedding.data[0].embedding;
}
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
const magnitude1 = Math.sqrt(
vec1.reduce((sum, val) => sum + val * val, 0)
);
const magnitude2 = Math.sqrt(
vec2.reduce((sum, val) => sum + val * val, 0)
);
if (magnitude1 === 0 || magnitude2 === 0) return 0;
return dotProduct / (magnitude1 * magnitude2);
};
// Function to convert text to vector
const textToVector = (searchQuery: string, text: string): number[] => {
const words = searchQuery.toLowerCase().split(/\W+/);
return words.map((word) => {
const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
.length;
return count / text.length;
});
};
async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
try {
// Handle invalid inputs
if (!searchQuery || !linksWithContext.length || !links.length) {
return [];
}
// Sanitize search query by removing null characters
const sanitizedQuery = searchQuery;
// Generate embeddings for the search query
const queryEmbedding = await getEmbedding(sanitizedQuery);
// Generate embeddings for each link and calculate similarity
const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
try {
const linkEmbedding = await getEmbedding(linkWithContext);
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
return {
link: links[index],
linkWithContext,
score,
originalIndex: index
};
} catch (err) {
// If embedding fails for a link, return with score 0
return {
link: links[index],
linkWithContext,
score: 0,
originalIndex: index
};
}
}));
// Sort links based on similarity scores while preserving original order for equal scores
linksAndScores.sort((a, b) => {
const scoreDiff = b.score - a.score;
return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
});
return linksAndScores;
} catch (error) {
console.error(`Error performing semantic search: ${error}`);
return [];
}
}
export { performRanking };

View File

@ -1 +1 @@
-export const axiosTimeout = 3000;
+export const axiosTimeout = 5000;

View File

@ -18,6 +18,7 @@ import { logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
+import { extractController } from "../controllers/v1/extract";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@ -98,7 +99,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
if (!res.headersSent) {
-return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
+return res.status(403).json({ success: false, error: "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
}
next();
@ -178,6 +179,13 @@ v1Router.ws(
crawlStatusWSController
);
+v1Router.post(
+"/extract",
+authMiddleware(RateLimiterMode.Scrape),
+checkCreditsMiddleware(1),
+wrap(extractController)
+);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
@ -199,3 +207,4 @@ v1Router.delete(
// Health/Probe routes
// v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController);

View File

@ -0,0 +1,19 @@
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { EngineError } from "../../error";
export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key === null) throw new EngineError("Scrape not eligible for caching");
const entry = await getEntryFromCache(key);
if (entry === null) throw new EngineError("Cache missed");
return {
url: entry.url,
html: entry.html,
statusCode: entry.statusCode,
error: entry.error,
};
}

View File

@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
+import { scrapeCache } from "./cache";
-export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
+export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
+const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [
+// ...(useCache ? [ "cache" as const ] : []),
...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
...(usePlaywright ? [ "playwright" as const ] : []),
@ -74,6 +77,7 @@ export type EngineScrapeResult = {
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
} = {
+"cache": scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@ -95,6 +99,22 @@ export const engineOptions: {
quality: number,
}
} = {
+"cache": {
+features: {
+"actions": false,
+"waitFor": true,
+"screenshot": false,
+"screenshot@fullScreen": false,
+"pdf": false, // TODO: figure this out
+"docx": false, // TODO: figure this out
+"atsv": false,
+"location": false,
+"mobile": false,
+"skipTlsVerification": false,
+"useFastMode": false,
+},
+quality: 1000, // cache should always be tried first
+},
"fire-engine;chrome-cdp": {
features: {
"actions": true,

View File

@ -0,0 +1,26 @@
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
export function saveToCache(meta: Meta, document: Document): Document {
if (document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300) return document;
if (document.rawHtml === undefined) {
throw new Error("rawHtml is undefined -- this transformer is being called out of order");
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) {
const entry: CacheEntry = {
html: document.rawHtml!,
statusCode: document.metadata.statusCode!,
url: document.metadata.url ?? document.metadata.sourceURL!,
error: document.metadata.error ?? undefined,
};
saveEntryToCache(key, entry);
}
return document;
}

View File

@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
+import { saveToCache } from "./cache";
export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;
@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
// TODO: allow some of these to run in parallel
export const transformerStack: Transformer[] = [
+saveToCache,
deriveHTMLFromRawHTML,
deriveMarkdownFromHTML,
deriveLinksFromHTML,

View File

@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
} }
} }
async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> { export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
let extract: any;
let warning: string | undefined;
const openai = new OpenAI(); const openai = new OpenAI();
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini"; const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
if (document.markdown === undefined) { if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected"); throw new Error("document.markdown is undefined -- this is unexpected");
} }
let extractionContent = document.markdown;
// count number of tokens // count number of tokens
let numTokens = 0; let numTokens = 0;
const encoder = encoding_for_model(model as TiktokenModel); const encoder = encoding_for_model(model as TiktokenModel);
try { try {
// Encode the message into tokens // Encode the message into tokens
const tokens = encoder.encode(extractionContent); const tokens = encoder.encode(markdown);
// Return the number of tokens // Return the number of tokens
numTokens = tokens.length; numTokens = tokens.length;
} catch (error) { } catch (error) {
logger.warn("Calculating num tokens of string failed", { error, extractionContent }); logger.warn("Calculating num tokens of string failed", { error, markdown });
extractionContent = extractionContent.slice(0, maxTokens * modifier); markdown = markdown.slice(0, maxTokens * modifier);
const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support."; let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
document.warning = document.warning === undefined ? warning : " " + warning; warning = previousWarning === undefined ? w : w + " " + previousWarning;
} finally { } finally {
// Free the encoder resources after use // Free the encoder resources after use
encoder.free(); encoder.free();
@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
if (numTokens > maxTokens) { if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters // trim the document to the maximum number of tokens, tokens != characters
extractionContent = extractionContent.slice(0, maxTokens * modifier); markdown = markdown.slice(0, maxTokens * modifier);
const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed."; const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
document.warning = document.warning === undefined ? warning : " " + warning; warning = previousWarning === undefined ? w : w + " " + previousWarning;
} }
let schema = options.schema; let schema = options.schema;
@ -107,12 +108,22 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
required: ["items"], required: ["items"],
additionalProperties: false, additionalProperties: false,
}; };
} else if (schema && typeof schema === 'object' && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => [key, { type: value }])
),
required: Object.keys(schema),
additionalProperties: false
};
} }
schema = normalizeSchema(schema); schema = normalizeSchema(schema);
const jsonCompletion = await openai.beta.chat.completions.parse({ const jsonCompletion = await openai.beta.chat.completions.parse({
model, model,
temperature: 0,
messages: [ messages: [
{ {
role: "system", role: "system",
@ -120,7 +131,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
},
{
role: "user",
-content: [{ type: "text", text: extractionContent }],
+content: [{ type: "text", text: markdown }],
},
{
role: "user",
@ -143,26 +154,35 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
}
-document.extract = jsonCompletion.choices[0].message.parsed;
+extract = jsonCompletion.choices[0].message.parsed;
-if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
+if (extract === null && jsonCompletion.choices[0].message.content !== null) {
try {
-document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
+extract = JSON.parse(jsonCompletion.choices[0].message.content);
} catch (e) {
logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
}
}
-if (options.schema && options.schema.type === "array") {
-document.extract = document.extract?.items;
+// If the users actually wants the items object, they can specify it as 'required' in the schema
+// otherwise, we just return the items array
+if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
+extract = extract?.items;
}
-return document;
+return { extract, warning, numTokens };
}
export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
if (meta.options.formats.includes("extract")) {
-document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
+const { extract, warning } = await generateOpenAICompletions(
+meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
+meta.options.extract!,
+document.markdown,
+document.warning,
+);
+document.extract = extract;
+document.warning = warning;
}
return document;
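Two behavioural notes on the refactor above: generateOpenAICompletions no longer mutates the document and instead returns { extract, warning, numTokens }, and for array-type schemas the result is unwrapped from its items wrapper unless the caller lists "items" as required. A shape-only illustration (the values are assumed; the wrapping code is only partly visible in the hunks above):

// Assumed illustration of the array-schema behaviour described by the comment in the diff.
const userSchema = { type: "array", items: { type: "string" } }; // what the caller passed
const parsed = { items: ["Author A", "Author B"] };              // model output, wrapped in an object
// Default: the items array itself is returned as the extract
const extract = parsed.items;                                    // ["Author A", "Author B"]
// If the caller marks "items" as required, the wrapped object is returned unchanged.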

View File

@ -109,6 +109,6 @@ export function waitForJob<T = unknown>(jobId: string, timeout: number): Promise
}
}
}
-}, 500);
+}, 250);
})
}

View File

@ -106,6 +106,15 @@ export interface FirecrawlCrawlStatusResponse {
error?: string;
}
export interface FirecrawlExtractResponse {
statusCode: number;
body: {
success: boolean;
data: any[];
};
error?: string;
}
export enum RateLimiterMode {
Crawl = "crawl",
CrawlStatus = "crawlStatus",

View File

@ -1,4 +1,5 @@
import FirecrawlApp from 'firecrawl';
import { z } from 'zod';
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@ -42,6 +43,18 @@ const main = async () => {
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// Extract information from a website using LLM:
const extractSchema = z.object({
title: z.string(),
description: z.string(),
links: z.array(z.string())
});
const extractResult = await app.extract(['https://firecrawl.dev'], {
prompt: "Extract the title, description, and links from the website",
schema: extractSchema
});
console.log(extractResult);
// Crawl a website with WebSockets:
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});

View File

@ -42,6 +42,19 @@ const main = async () => {
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// // Extract information from a website using LLM:
// const extractSchema = z.object({
// title: z.string(),
// description: z.string(),
// links: z.array(z.string())
// });
// const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
// prompt: "Extract the title, description, and links from the website",
// schema: extractSchema
// });
// console.log(extractResult);
// Crawl a website with WebSockets:
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
-"version": "1.8.5",
+"version": "1.9.0",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -236,6 +236,27 @@ export interface MapResponse {
error?: string;
}
/**
* Parameters for extracting information from URLs.
* Defines options for extracting information from URLs.
*/
export interface ExtractParams {
prompt: string;
schema?: zt.ZodSchema;
systemPrompt?: string;
allowExternalLinks?: boolean;
}
/**
* Response interface for extracting information from URLs.
* Defines the structure of the response received after extracting information from URLs.
*/
export interface ExtractResponse {
success: true;
data: zt.infer<zt.ZodSchema>;
error?: string;
}
/**
* Error response interface.
* Defines the structure of the response received when an error occurs.
@ -245,7 +266,6 @@ export interface ErrorResponse {
error: string;
}
/**
* Custom error class for Firecrawl.
* Extends the built-in Error class to include a status code.
@ -679,6 +699,44 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Extracts information from URLs using the Firecrawl API.
* @param url - The URL to extract information from.
* @param params - Additional parameters for the extract request.
* @returns The response from the extract operation.
*/
async extract(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
const headers = this.prepareHeaders();
if (!params?.prompt) {
throw new FirecrawlError("Prompt is required", 400);
}
let jsonData: { urls: string[] } & ExtractParams= { urls, ...params };
let jsonSchema: any;
try {
jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
} catch (error: any) {
throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
}
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/extract`,
{ ...jsonData, schema: jsonSchema },
headers
);
if (response.status === 200) {
return response.data as ExtractResponse;
} else {
this.handleError(response, "extract");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.
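Beyond the happy-path call shown in example.js earlier in this diff, the new extract method throws FirecrawlError for a missing prompt or an invalid Zod schema and returns an ErrorResponse on non-200 statuses. A hypothetical caller-side sketch (the guard style and variable names are assumptions):

import FirecrawlApp from 'firecrawl';
import { z } from 'zod';

const app = new FirecrawlApp({ apiKey: "fc-YOUR_API_KEY" });

const main = async () => {
  try {
    const result = await app.extract(['https://firecrawl.dev'], {
      prompt: "Extract the page title",
      schema: z.object({ title: z.string() }), // converted to JSON Schema via zodToJsonSchema before the POST
    });
    if ('data' in result) {
      console.log(result.data);    // ExtractResponse branch
    } else {
      console.error(result.error); // ErrorResponse branch
    }
  } catch (err) {
    // Thrown for a missing prompt, an invalid schema, or a transport failure (per the method above).
    console.error(err);
  }
};

main();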

View File

@ -2,6 +2,8 @@ import time
import nest_asyncio
import uuid
from firecrawl.firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List
app = FirecrawlApp(api_key="fc-")
@ -50,9 +52,6 @@ print(crawl_status)
# LLM Extraction:
# Define schema to extract contents into using pydantic
-from pydantic import BaseModel, Field
-from typing import List
class ArticleSchema(BaseModel):
title: str
points: int
@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
print(map_result)
# Extract URLs:
class ExtractSchema(BaseModel):
title: str
description: str
links: List[str]
# Define the schema using Pydantic
extract_schema = ExtractSchema.schema()
# Perform the extraction
extract_result = app.extract(['https://firecrawl.dev'], {
'prompt': "Extract the title, description, and links from the website",
'schema': extract_schema
})
print(extract_result)
# Crawl a website with WebSockets:
# inside an async function...
nest_asyncio.apply()

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa
-__version__ = "1.5.0"
+__version__ = "1.6.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -12,15 +12,40 @@ Classes:
import logging
import os
import time
-from typing import Any, Dict, Optional, List
+from typing import Any, Dict, Optional, List, Union
import json
import requests
import pydantic
import websockets
logger : logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp:
class ExtractParams(pydantic.BaseModel):
"""
Parameters for the extract operation.
"""
prompt: str
schema: Optional[Any] = None
system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False
class ExtractResponse(pydantic.BaseModel):
"""
Response from the extract operation.
"""
success: bool
data: Optional[Any] = None
error: Optional[str] = None
class ErrorResponse(pydantic.BaseModel):
"""
Error response.
"""
success: bool
error: str
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
"""
Initialize the FirecrawlApp instance with API key, API URL.
@ -434,6 +459,48 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check batch scrape status')
def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
"""
Extracts information from a URL using the Firecrawl API.
Args:
urls (List[str]): The URLs to extract information from.
params (Optional[ExtractParams]): Additional parameters for the extract request.
Returns:
Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
"""
headers = self._prepare_headers()
if not params or not params.get('prompt'):
raise ValueError("Prompt is required")
if not params.get('schema'):
raise ValueError("Schema is required for extraction")
jsonData = {'urls': urls, **params}
jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None
try:
response = self._post_request(
f'{self.api_url}/v1/extract',
{
**jsonData,
'allowExternalLinks': params.get('allow_external_links', False),
'schema': jsonSchema
},
headers
)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, "extract")
except Exception as e:
raise ValueError(str(e), 500)
return {'success': False, 'error': "Internal server error."}
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.
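Both SDK helpers above wrap the same REST call to /v1/extract; a minimal sketch of the equivalent raw request (the base URL, key placeholder, and field values are assumptions):

// Hypothetical raw call mirroring what the JS and Python extract() helpers POST.
const res = await fetch("https://api.firecrawl.dev/v1/extract", {
  method: "POST",
  headers: {
    "Content-Type": "application/json",
    Authorization: "Bearer fc-YOUR_API_KEY",
  },
  body: JSON.stringify({
    urls: ["https://firecrawl.dev"],
    prompt: "Extract the title, description, and links from the website",
    allowExternalLinks: false, // optional; mirrors allow_external_links in the Python params
    schema: {
      type: "object",
      properties: {
        title: { type: "string" },
        description: { type: "string" },
        links: { type: "array", items: { type: "string" } },
      },
      required: ["title", "description", "links"],
    },
  }),
});
const { success, data } = await res.json();
console.log(success, data);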