Merge pull request #915 from mendableai/nsc/new-extract

Extract (beta)
Nicolas 2024-11-26 10:02:09 -08:00 committed by GitHub
commit 6c33b978f3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
32 changed files with 2767 additions and 162 deletions

View File

@ -55,7 +55,7 @@
"@bull-board/api": "^5.20.5",
"@bull-board/express": "^5.20.5",
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.16",
"@dqbd/tiktoken": "^1.0.17",
"@nangohq/node": "^0.40.8",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
@ -73,6 +73,7 @@
"cacheable-lookup": "^6.1.0",
"cheerio": "^1.0.0-rc.12",
"cohere": "^1.1.1",
"cohere-ai": "^7.14.0",
"cors": "^2.8.5",
"cron-parser": "^4.9.0",
"date-fns": "^3.6.0",

1316
apps/api/pnpm-lock.yaml generated

File diff suppressed because it is too large

View File

@ -46,4 +46,32 @@ content-type: application/json
@batchScrapeId = {{batchScrape.response.body.$.id}}
# @name batchScrapeStatus
GET {{baseUrl}}/v1/crawl/{{batchScrapeId}} HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
Authorization: Bearer {{$dotenv TEST_API_KEY}}
### Map Website
# @name map
POST {{baseUrl}}/v1/map HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
"url": "firecrawl.dev",
"sitemapOnly": true
}
### Extract
# @name extract
POST {{baseUrl}}/v1/extract HTTP/1.1
Authorization: Bearer {{$dotenv TEST_API_KEY}}
content-type: application/json
{
"urls": ["firecrawl.dev"],
"prompt": "What is the title, description and main product of the page?",
"schema": {
"title": "string",
"description": "string",
"mainProduct": "string"
}
}
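For reference, the same /v1/extract call as a minimal TypeScript sketch; the local base URL and TEST_API_KEY variable are the same assumptions as the requests above, and a Node 18+ global fetch is assumed.

const baseUrl = process.env.BASE_URL ?? "http://127.0.0.1:3002"; // assumed local default

async function extractExample() {
  const res = await fetch(`${baseUrl}/v1/extract`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.TEST_API_KEY}`,
      "content-type": "application/json",
    },
    body: JSON.stringify({
      urls: ["firecrawl.dev"],
      prompt: "What is the title, description and main product of the page?",
      schema: { title: "string", description: "string", mainProduct: "string" },
    }),
  });
  const json = await res.json();
  console.log(json); // expected shape: { success, data, scrape_id, warning? }
}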

View File

@ -0,0 +1,249 @@
import request from "supertest";
import dotenv from "dotenv";
import {
FirecrawlCrawlResponse,
FirecrawlCrawlStatusResponse,
FirecrawlScrapeResponse,
} from "../../types";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for Extract API Routes", () => {
it.concurrent("should return authors of blog posts on firecrawl.dev", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://firecrawl.dev/*"],
prompt: "Who are the authors of the blog posts?",
schema: {
type: "object",
properties: { authors: { type: "array", items: { type: "string" } } },
},
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("authors");
let gotItRight = 0;
for (const author of response.body.data?.authors) {
if (author.includes("Caleb Peffer")) gotItRight++;
if (author.includes("Gergő Móricz")) gotItRight++;
if (author.includes("Eric Ciarla")) gotItRight++;
if (author.includes("Nicolas Camara")) gotItRight++;
if (author.includes("Jon")) gotItRight++;
if (author.includes("Wendong")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(1);
}, 60000);
it.concurrent("should return founders of firecrawl.dev (allowExternalLinks = true)", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["firecrawl.dev/*"],
prompt: "Who are the founders of the company?",
allowExternalLinks: true,
schema: {
type: "object",
properties: { founders: { type: "array", items: { type: "string" } } },
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data).toHaveProperty("founders");
console.log(response.body.data?.founders);
let gotItRight = 0;
for (const founder of response.body.data?.founders) {
if (founder.includes("Caleb")) gotItRight++;
if (founder.includes("Eric")) gotItRight++;
if (founder.includes("Nicolas")) gotItRight++;
if (founder.includes("nick")) gotItRight++;
if (founder.includes("eric")) gotItRight++;
if (founder.includes("jon-noronha")) gotItRight++;
}
expect(gotItRight).toBeGreaterThanOrEqual(2);
}, 60000);
it.concurrent("should return hiring opportunities on firecrawl.dev (allowExternalLinks = true)", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://firecrawl.dev/*"],
prompt: "What are they hiring for?",
allowExternalLinks: true,
schema: {
type: "array",
items: {
type: "string"
},
required: ["items"]
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
console.log(response.body.data);
let gotItRight = 0;
for (const hiring of response.body.data?.items) {
if (hiring.includes("Developer Support Engineer")) gotItRight++;
if (hiring.includes("Dev Ops Engineer")) gotItRight++;
if (hiring.includes("Founding Web Automation Engineer")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(2);
}, 60000);
it.concurrent("should return PCI DSS compliance for Fivetran", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["fivetran.com/*"],
prompt: "Does Fivetran have PCI DSS compliance?",
allowExternalLinks: true,
schema: {
type: "object",
properties: {
pciDssCompliance: { type: "boolean" }
}
},
});
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
it.concurrent("should return Azure Data Connectors for Fivetran", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["fivetran.com/*"],
prompt: "What are the Azure Data Connectors they offer?",
schema: {
type: "array",
items: {
type: "object",
properties: {
connector: { type: "string" },
description: { type: "string" },
supportsCaptureDelete: { type: "boolean" }
}
}
}
})
console.log(response.body);
// expect(response.statusCode).toBe(200);
// expect(response.body).toHaveProperty("data");
// expect(response.body.data?.pciDssCompliance).toBe(true);
}, 60000);
it.concurrent("should return Greenhouse Applicant Tracking System for Abnormal Security", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://careers.abnormalsecurity.com/jobs/6119456003?gh_jid=6119456003"],
prompt: "what applicant tracking system is this company using?",
schema: {
type: "object",
properties: {
isGreenhouseATS: { type: "boolean" },
answer: { type: "string" }
}
},
allowExternalLinks: true
})
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.isGreenhouseATS).toBe(true);
}, 60000);
it.concurrent("should return mintlify api components", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://mintlify.com/docs/*"],
prompt: "what are the 4 API components?",
schema: {
type: "array",
items: {
type: "object",
properties: {
component: { type: "string" }
}
},
required: ["items"]
},
allowExternalLinks: true
})
console.log(response.body.data?.items);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.items.length).toBe(4);
let gotItRight = 0;
for (const component of response.body.data?.items) {
if (component.component.toLowerCase().includes("parameter")) gotItRight++;
if (component.component.toLowerCase().includes("response")) gotItRight++;
if (component.component.toLowerCase().includes("expandable")) gotItRight++;
if (component.component.toLowerCase().includes("sticky")) gotItRight++;
if (component.component.toLowerCase().includes("examples")) gotItRight++;
}
expect(gotItRight).toBeGreaterThan(2);
}, 60000);
it.concurrent("should return information about Eric Ciarla", async () => {
const response = await request(TEST_URL)
.post("/v1/extract")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
urls: ["https://ericciarla.com/"],
prompt: "Who is Eric Ciarla? Where does he work? Where did he go to school?",
schema: {
type: "object",
properties: {
name: { type: "string" },
work: { type: "string" },
education: { type: "string" }
},
required: ["name", "work", "education"]
},
allowExternalLinks: true
})
console.log(response.body.data);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("data");
expect(response.body.data?.name).toBe("Eric Ciarla");
expect(response.body.data?.work).toBeDefined();
expect(response.body.data?.education).toBeDefined();
}, 60000);
});

View File

@ -0,0 +1,117 @@
import request from "supertest";
import dotenv from "dotenv";
dotenv.config();
const TEST_URL = "http://127.0.0.1:3002";
describe("E2E Tests for Map API Routes", () => {
it.concurrent(
"(feat-search)should return links containing 'smart-crawl'",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
search: "smart-crawl",
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[0]).toContain("firecrawl.dev/smart-crawl");
},
60000
);
it.concurrent(
"(feat-subdomains) should return mapped links for firecrawl.dev with subdomains included",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
includeSubdomains: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).toContain(
"docs.firecrawl.dev"
);
},
60000
);
it.concurrent(
"(feat-sitemap-only) should return mapped links for firecrawl.dev with sitemap only",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(0);
expect(response.body.links[response.body.links.length - 1]).not.toContain(
"docs.firecrawl.dev"
);
},
60000
);
it.concurrent(
"(feat-limit) should return mapped links for firecrawl.dev with a limit",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://firecrawl.dev",
sitemapOnly: false,
limit: 10,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeLessThanOrEqual(10);
},
60000
);
it.concurrent(
"(feat-sitemap-large) should return more than 1900 links for geekflare sitemap",
async () => {
const response = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send({
url: "https://geekflare.com/sitemap_index.xml",
sitemapOnly: true,
});
console.log(response.body);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("links");
expect(response.body.links.length).toBeGreaterThan(1900);
},
60000
);
});

View File

@ -0,0 +1,265 @@
import { Request, Response } from "express";
import {
// Document,
RequestWithAuth,
ExtractRequest,
extractRequestSchema,
ExtractResponse,
MapDocument,
scrapeOptions,
} from "./types";
import { Document } from "../../lib/entities";
import Redis from "ioredis";
import { configDotenv } from "dotenv";
import { performRanking } from "../../lib/ranker";
import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job";
import { logger } from "../../lib/logger";
import { getScrapeQueue } from "../../services/queue-service";
import { waitForJob } from "../../services/queue-jobs";
import { addScrapeJob } from "../../services/queue-jobs";
import { PlanType } from "../../types";
import { getJobPriority } from "../../lib/job-priority";
import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { getMapResults } from "./map";
import { buildDocument } from "../../lib/extract/build-document";
configDotenv();
const redis = new Redis(process.env.REDIS_URL!);
const MAX_EXTRACT_LIMIT = 100;
const MAX_RANKING_LIMIT = 10;
const INITIAL_SCORE_THRESHOLD = 0.75;
const FALLBACK_SCORE_THRESHOLD = 0.5;
const MIN_REQUIRED_LINKS = 1;
/**
* Extracts data from the provided URLs based on the request parameters.
* Currently in beta.
* @param req - The request object containing authentication and extraction details.
* @param res - The response object to send the extraction results.
* @returns A promise that resolves when the extraction process is complete.
*/
export async function extractController(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
res: Response<ExtractResponse>
) {
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
req.body = extractRequestSchema.parse(req.body);
const id = crypto.randomUUID();
const startTime = new Date().getTime();
let links: string[] = [];
let docs: Document[] = [];
const earlyReturn = false;
// Process all URLs in parallel
const urlPromises = req.body.urls.map(async (url) => {
if (url.includes('/*') || req.body.allowExternalLinks) {
// Handle glob pattern URLs
const baseUrl = url.replace('/*', '');
// const pathPrefix = baseUrl.split('/').slice(3).join('/'); // Get path after domain if any
const allowExternalLinks = req.body.allowExternalLinks ?? true;
let urlWithoutWww = baseUrl.replace("www.", "");
let mapUrl = req.body.prompt && allowExternalLinks
? `${req.body.prompt} ${urlWithoutWww}`
: req.body.prompt ? `${req.body.prompt} site:${urlWithoutWww}`
: `site:${urlWithoutWww}`;
const mapResults = await getMapResults({
url: baseUrl,
search: req.body.prompt,
teamId: req.auth.team_id,
plan: req.auth.plan,
allowExternalLinks,
origin: req.body.origin,
limit: req.body.limit,
// If we're self-hosted, we don't want to ignore the sitemap, due to our fire-engine mapping
ignoreSitemap: !selfHosted,
includeMetadata: true,
includeSubdomains: req.body.includeSubdomains,
});
let mappedLinks = mapResults.links as MapDocument[];
// Limit number of links to MAX_EXTRACT_LIMIT
mappedLinks = mappedLinks.slice(0, MAX_EXTRACT_LIMIT);
let mappedLinksRerank = mappedLinks.map(x => `url: ${x.url}, title: ${x.title}, description: ${x.description}`);
// Filter by path prefix if present
// wrong
// if (pathPrefix) {
// mappedLinks = mappedLinks.filter(x => x.url && x.url.includes(`/${pathPrefix}/`));
// }
if (req.body.prompt) {
// Get similarity scores between the search query and each link's context
const linksAndScores = await performRanking(mappedLinksRerank, mappedLinks.map(l => l.url), mapUrl);
// First try with high threshold
let filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, INITIAL_SCORE_THRESHOLD);
// If we don't have enough high-quality links, try with lower threshold
if (filteredLinks.length < MIN_REQUIRED_LINKS) {
logger.info(`Only found ${filteredLinks.length} links with score > ${INITIAL_SCORE_THRESHOLD}. Trying lower threshold...`);
filteredLinks = filterAndProcessLinks(mappedLinks, linksAndScores, FALLBACK_SCORE_THRESHOLD);
if (filteredLinks.length === 0) {
// If still no results, take top N results regardless of score
logger.warn(`No links found with score > ${FALLBACK_SCORE_THRESHOLD}. Taking top ${MIN_REQUIRED_LINKS} results.`);
filteredLinks = linksAndScores
.sort((a, b) => b.score - a.score)
.slice(0, MIN_REQUIRED_LINKS)
.map(x => mappedLinks.find(link => link.url === x.link))
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}
}
mappedLinks = filteredLinks.slice(0, MAX_RANKING_LIMIT);
}
return mappedLinks.map(x => x.url) as string[];
} else {
// Handle direct URLs without glob pattern
if (!isUrlBlocked(url)) {
return [url];
}
return [];
}
});
// Wait for all URL processing to complete and flatten results
const processedUrls = await Promise.all(urlPromises);
links.push(...processedUrls.flat());
if (links.length === 0) {
return res.status(400).json({
success: false,
error: "No valid URLs found to scrape. Try adjusting your search criteria or including more URLs."
});
}
// Scrape all links in parallel with retries
const scrapePromises = links.map(async (url) => {
const origin = req.body.origin || "api";
const timeout = Math.floor((req.body.timeout || 40000) * 0.7) || 30000; // Use 70% of total timeout for individual scrapes
const jobId = crypto.randomUUID();
const jobPriority = await getJobPriority({
plan: req.auth.plan as PlanType,
team_id: req.auth.team_id,
basePriority: 10,
});
await addScrapeJob(
{
url,
mode: "single_urls",
team_id: req.auth.team_id,
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
plan: req.auth.plan!,
origin,
is_scrape: true,
},
{},
jobId,
jobPriority
);
try {
const doc = await waitForJob<Document>(jobId, timeout);
await getScrapeQueue().remove(jobId);
if (earlyReturn) {
return null;
}
return doc;
} catch (e) {
logger.error(`Error in scrapeController: ${e}`);
if (e instanceof Error && (e.message.startsWith("Job wait") || e.message === "timeout")) {
throw {
status: 408,
error: "Request timed out"
};
} else {
throw {
status: 500,
error: `(Internal server error) - ${(e && e.message) ? e.message : e}`
};
}
}
});
try {
const results = await Promise.all(scrapePromises);
docs.push(...results.filter(doc => doc !== null).map(x => x!));
} catch (e) {
return res.status(e.status).json({
success: false,
error: e.error
});
}
const completions = await generateOpenAICompletions(
logger.child({ method: "extractController/generateOpenAICompletions" }),
{
mode: "llm",
systemPrompt: "Always prioritize using the provided content to answer the question. Do not make up an answer. Be concise and follow the schema if provided.",
prompt: req.body.prompt,
schema: req.body.schema,
},
docs.map(x => buildDocument(x)).join('\n')
);
// TODO: change this later
// While on beta, we're billing 5 credits per link discovered/scraped.
billTeam(req.auth.team_id, req.acuc?.sub_id, links.length * 5).catch(error => {
logger.error(`Failed to bill team ${req.auth.team_id} for ${links.length * 5} credits: ${error}`);
});
let data = completions.extract ?? {};
let warning = completions.warning;
logJob({
job_id: id,
success: true,
message: "Extract completed",
num_docs: 1,
docs: data,
time_taken: (new Date().getTime() - startTime) / 1000,
team_id: req.auth.team_id,
mode: "extract",
url: req.body.urls.join(", "),
scrapeOptions: req.body,
origin: req.body.origin ?? "api",
num_tokens: completions.numTokens ?? 0
});
return res.status(200).json({
success: true,
data: data,
scrape_id: id,
warning: warning
});
}
/**
* Filters links based on their similarity score to the search query.
* @param mappedLinks - The list of mapped links to filter.
* @param linksAndScores - The list of links and their similarity scores.
* @param threshold - The score threshold to filter by.
* @returns The filtered list of links.
*/
function filterAndProcessLinks(
mappedLinks: MapDocument[],
linksAndScores: { link: string, linkWithContext: string, score: number, originalIndex: number }[],
threshold: number
): MapDocument[] {
return linksAndScores
.filter(x => x.score > threshold)
.map(x => mappedLinks.find(link => link.url === x.link))
.filter((x): x is MapDocument => x !== undefined && x.url !== undefined && !isUrlBlocked(x.url));
}
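A condensed, self-contained sketch of the tiered link selection used above: keep links scoring above INITIAL_SCORE_THRESHOLD, fall back to FALLBACK_SCORE_THRESHOLD when too few survive, and finally take the top-scored links regardless of score. The defaults mirror the constants declared at the top of this file; the sample data is illustrative.

type Scored = { link: string; score: number };

function selectLinks(
  scored: Scored[],
  initialThreshold = 0.75,   // INITIAL_SCORE_THRESHOLD
  fallbackThreshold = 0.5,   // FALLBACK_SCORE_THRESHOLD
  minRequired = 1            // MIN_REQUIRED_LINKS
): string[] {
  let picked = scored.filter((x) => x.score > initialThreshold);
  if (picked.length < minRequired) {
    picked = scored.filter((x) => x.score > fallbackThreshold);
  }
  if (picked.length === 0) {
    // Still nothing above the fallback threshold: take the top N regardless of score.
    picked = [...scored].sort((a, b) => b.score - a.score).slice(0, minRequired);
  }
  return picked.map((x) => x.link);
}

// selectLinks([{ link: "https://firecrawl.dev/blog", score: 0.62 }]) => ["https://firecrawl.dev/blog"]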

View File

@ -1,6 +1,6 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import { mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { MapDocument, mapRequestSchema, RequestWithAuth, scrapeOptions } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
import { MapResponse, MapRequest } from "./types";
import { configDotenv } from "dotenv";
@ -25,37 +25,61 @@ const MAX_MAP_LIMIT = 5000;
// Max Links that "Smart /map" can return
const MAX_FIRE_ENGINE_RESULTS = 1000;
export async function mapController(
req: RequestWithAuth<{}, MapResponse, MapRequest>,
res: Response<MapResponse>
) {
const startTime = new Date().getTime();
req.body = mapRequestSchema.parse(req.body);
const limit: number = req.body.limit ?? MAX_MAP_LIMIT;
interface MapResult {
success: boolean;
links: string[] | any[];
scrape_id?: string;
job_id: string;
time_taken: number;
}
export async function getMapResults({
url,
search,
limit = MAX_MAP_LIMIT,
ignoreSitemap = false,
includeSubdomains = true,
crawlerOptions = {},
teamId,
plan,
origin,
includeMetadata = false,
allowExternalLinks
}: {
url: string;
search?: string;
limit?: number;
ignoreSitemap?: boolean;
includeSubdomains?: boolean;
crawlerOptions?: any;
teamId: string;
plan?: string;
origin?: string;
includeMetadata?: boolean;
allowExternalLinks?: boolean;
}): Promise<MapResult> {
const id = uuidv4();
const startTime = new Date().getTime();
let links: string[] = [req.body.url];
let links: string[] = [url];
let mapResults: MapDocument[] = [];
const sc: StoredCrawl = {
originUrl: req.body.url,
originUrl: url,
crawlerOptions: {
...req.body,
limit: req.body.sitemapOnly ? 10000000 : limit,
...crawlerOptions,
limit: crawlerOptions.sitemapOnly ? 10000000 : limit,
scrapeOptions: undefined,
},
scrapeOptions: scrapeOptions.parse({}),
internalOptions: {},
team_id: req.auth.team_id,
team_id: teamId,
createdAt: Date.now(),
plan: req.auth.plan,
plan: plan,
};
const crawler = crawlToCrawler(id, sc);
// If sitemapOnly is true, only get links from sitemap
if (req.body.sitemapOnly) {
if (crawlerOptions.sitemapOnly) {
const sitemap = await crawler.tryGetSitemap(true, true);
if (sitemap !== null) {
sitemap.forEach((x) => {
@ -73,19 +97,18 @@ export async function mapController(
// links = links.slice(1, limit); // don't slice, unnecessary
}
} else {
let urlWithoutWww = req.body.url.replace("www.", "");
let urlWithoutWww = url.replace("www.", "");
let mapUrl = req.body.search
? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;
let mapUrl = search && allowExternalLinks
? `${search} ${urlWithoutWww}`
: search ? `${search} site:${urlWithoutWww}`
: `site:${url}`;
const resultsPerPage = 100;
const maxPages = Math.ceil(
Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage
);
const maxPages = Math.ceil(Math.min(MAX_FIRE_ENGINE_RESULTS, limit) / resultsPerPage);
const cacheKey = `fireEngineMap:${mapUrl}`;
const cachedResult = null;
const cachedResult = await redis.get(cacheKey);
let allResults: any[] = [];
let pagePromises: Promise<any>[] = [];
@ -110,7 +133,7 @@ export async function mapController(
// Parallelize sitemap fetch with serper search
const [sitemap, ...searchResults] = await Promise.all([
req.body.ignoreSitemap ? null : crawler.tryGetSitemap(true),
ignoreSitemap ? null : crawler.tryGetSitemap(true),
...(cachedResult ? [] : pagePromises),
]);
@ -124,7 +147,7 @@ export async function mapController(
});
}
let mapResults = allResults
mapResults = allResults
.flat()
.filter((result) => result !== null && result !== undefined);
@ -134,7 +157,7 @@ export async function mapController(
}
if (mapResults.length > 0) {
if (req.body.search) {
if (search) {
// Ensure all map results are first, maintaining their order
links = [
mapResults[0].url,
@ -149,9 +172,8 @@ export async function mapController(
}
// Perform cosine similarity between the search query and the list of links
if (req.body.search) {
const searchQuery = req.body.search.toLowerCase();
if (search) {
const searchQuery = search.toLowerCase();
links = performCosineSimilarity(links, searchQuery);
}
@ -166,95 +188,75 @@ export async function mapController(
.filter((x) => x !== null) as string[];
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));
links = links.filter((x) => isSameDomain(x, url));
// if includeSubdomains is false, filter out subdomains
if (!req.body.includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, req.body.url));
if (!includeSubdomains) {
links = links.filter((x) => isSameSubdomain(x, url));
}
// remove duplicates that could be due to http/https or www
links = removeDuplicateUrls(links);
links.slice(0, limit);
}
const linksToReturn = crawlerOptions.sitemapOnly ? links : links.slice(0, limit);
return {
success: true,
links: includeMetadata ? mapResults : linksToReturn,
scrape_id: origin?.includes("website") ? id : undefined,
job_id: id,
time_taken: (new Date().getTime() - startTime) / 1000,
};
}
export async function mapController(
req: RequestWithAuth<{}, MapResponse, MapRequest>,
res: Response<MapResponse>
) {
req.body = mapRequestSchema.parse(req.body);
const result = await getMapResults({
url: req.body.url,
search: req.body.search,
limit: req.body.limit,
ignoreSitemap: req.body.ignoreSitemap,
includeSubdomains: req.body.includeSubdomains,
crawlerOptions: req.body,
origin: req.body.origin,
teamId: req.auth.team_id,
plan: req.auth.plan,
});
// Bill the team
billTeam(req.auth.team_id, req.acuc?.sub_id, 1).catch((error) => {
logger.error(
`Failed to bill team ${req.auth.team_id} for 1 credit: ${error}`
);
// Optionally, you could notify an admin or add to a retry queue here
});
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;
// Log the job
logJob({
job_id: id,
success: links.length > 0,
job_id: result.job_id,
success: result.links.length > 0,
message: "Map completed",
num_docs: links.length,
docs: links,
time_taken: timeTakenInSeconds,
num_docs: result.links.length,
docs: result.links,
time_taken: result.time_taken,
team_id: req.auth.team_id,
mode: "map",
mode: "map",
url: req.body.url,
crawlerOptions: {},
scrapeOptions: {},
origin: req.body.origin,
origin: req.body.origin ?? "api",
num_tokens: 0,
});
return res.status(200).json({
success: true,
links: links,
scrape_id: req.body.origin?.includes("website") ? id : undefined,
});
}
const response = {
success: true as const,
links: result.links,
scrape_id: result.scrape_id
};
// Subdomain sitemap url checking
// // For each result, check for subdomains, get their sitemaps and add them to the links
// const processedUrls = new Set();
// const processedSubdomains = new Set();
// for (const result of links) {
// let url;
// let hostParts;
// try {
// url = new URL(result);
// hostParts = url.hostname.split('.');
// } catch (e) {
// continue;
// }
// console.log("hostParts", hostParts);
// // Check if it's a subdomain (more than 2 parts, and not 'www')
// if (hostParts.length > 2 && hostParts[0] !== 'www') {
// const subdomain = hostParts[0];
// console.log("subdomain", subdomain);
// const subdomainUrl = `${url.protocol}//${subdomain}.${hostParts.slice(-2).join('.')}`;
// console.log("subdomainUrl", subdomainUrl);
// if (!processedSubdomains.has(subdomainUrl)) {
// processedSubdomains.add(subdomainUrl);
// const subdomainCrawl = crawlToCrawler(id, {
// originUrl: subdomainUrl,
// crawlerOptions: legacyCrawlerOptions(req.body),
// pageOptions: {},
// team_id: req.auth.team_id,
// createdAt: Date.now(),
// plan: req.auth.plan,
// });
// const subdomainSitemap = await subdomainCrawl.tryGetSitemap();
// if (subdomainSitemap) {
// subdomainSitemap.forEach((x) => {
// if (!processedUrls.has(x.url)) {
// processedUrls.add(x.url);
// links.push(x.url);
// }
// });
// }
// }
// }
// }
return res.status(200).json(response);
}
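The refactor above exports getMapResults so other controllers (such as the new extract controller) can reuse mapping without going through the HTTP handler. A minimal call sketch; the teamId and plan values are placeholders.

import { getMapResults } from "./map"; // same import path the extract controller uses

async function mapExample() {
  const result = await getMapResults({
    url: "https://firecrawl.dev",
    search: "pricing",          // optional query used for the site: search and cosine re-ordering
    teamId: "team_123",         // placeholder
    plan: "standard",           // placeholder
    limit: 100,
    includeMetadata: true,
  });
  // With includeMetadata: true, result.links holds MapDocument objects
  // ({ url, title?, description? }); otherwise it is a plain string[] of URLs.
  console.log(result.links.length, result.job_id);
}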

View File

@ -151,8 +151,25 @@ export const scrapeOptions = z.object({
}).strict(strictMessage)
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const extractV1Options = z.object({
urls: url.array(),
prompt: z.string().optional(),
schema: z.any().optional(),
limit: z.number().int().positive().finite().safe().optional(),
ignoreSitemap: z.boolean().default(false),
includeSubdomains: z.boolean().default(true),
allowExternalLinks: z.boolean().default(false),
origin: z.string().optional().default("api"),
timeout: z.number().int().positive().finite().safe().default(60000)
}).strict(strictMessage)
export type ExtractV1Options = z.infer<typeof extractV1Options>;
export const extractRequestSchema = extractV1Options;
export type ExtractRequest = z.infer<typeof extractRequestSchema>;
export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend({
url,
origin: z.string().optional().default("api"),
@ -173,6 +190,8 @@ export const scrapeRequestSchema = scrapeOptions.omit({ timeout: true }).extend(
return obj;
});
export type ScrapeRequest = z.infer<typeof scrapeRequestSchema>;
export type ScrapeRequestInput = z.input<typeof scrapeRequestSchema>;
@ -340,6 +359,21 @@ export interface ScrapeResponseRequestTest {
error?: string;
}
export type ExtractResponse =
| ErrorResponse
| {
success: true;
warning?: string;
data: z.infer<typeof extractRequestSchema>;
scrape_id?: string;
};
export interface ExtractResponseRequestTest {
statusCode: number;
body: ExtractResponse;
error?: string;
}
export type CrawlResponse =
| ErrorResponse
| {
@ -496,6 +530,13 @@ export function fromLegacyCrawlerOptions(x: any): { crawlOptions: CrawlerOptions
};
}
export interface MapDocument {
url: string;
title?: string;
description?: string;
}
export function fromLegacyScrapeOptions(pageOptions: PageOptions, extractorOptions: ExtractorOptions | undefined, timeout: number | undefined): { scrapeOptions: ScrapeOptions, internalOptions: InternalOptions } {
return {
scrapeOptions: scrapeOptions.parse({

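A hedged parse sketch for the extractV1Options schema added above; the schema fills in its declared defaults for any omitted fields.

const parsed = extractRequestSchema.parse({
  urls: ["https://firecrawl.dev/*"],
  prompt: "Who are the founders of the company?",
  schema: {
    type: "object",
    properties: { founders: { type: "array", items: { type: "string" } } },
  },
});
// Defaults applied by the schema:
// parsed.ignoreSitemap === false, parsed.includeSubdomains === true,
// parsed.allowExternalLinks === false, parsed.origin === "api", parsed.timeout === 60000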
50
apps/api/src/lib/cache.ts Normal file
View File

@ -0,0 +1,50 @@
import IORedis from "ioredis";
import { ScrapeOptions } from "../controllers/v1/types";
import { InternalOptions } from "../scraper/scrapeURL";
import { logger as _logger } from "./logger";
const logger = _logger.child({module: "cache"});
export const cacheRedis = process.env.CACHE_REDIS_URL ? new IORedis(process.env.CACHE_REDIS_URL, {
maxRetriesPerRequest: null,
}) : null;
export function cacheKey(url: string, scrapeOptions: ScrapeOptions, internalOptions: InternalOptions): string | null {
if (!cacheRedis) return null;
// these options disqualify a cache
if (internalOptions.v0CrawlOnlyUrls || internalOptions.forceEngine || internalOptions.v0UseFastMode || internalOptions.atsv
|| (scrapeOptions.actions && scrapeOptions.actions.length > 0)
) {
return null;
}
return "cache:" + url + ":waitFor:" + scrapeOptions.waitFor;
}
export type CacheEntry = {
url: string;
html: string;
statusCode: number;
error?: string;
};
export async function saveEntryToCache(key: string, entry: CacheEntry) {
if (!cacheRedis) return;
try {
await cacheRedis.set(key, JSON.stringify(entry));
} catch (error) {
logger.warn("Failed to save to cache", { key, error });
}
}
export async function getEntryFromCache(key: string): Promise<CacheEntry | null> {
if (!cacheRedis) return null;
try {
return JSON.parse(await cacheRedis.get(key) ?? "null");
} catch (error) {
logger.warn("Failed to get from cache", { key, error });
return null;
}
}
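A small usage sketch of the cache helpers above; the URL and HTML are illustrative, the scrapeOptions import path matches the one used in this file, and nothing is written unless CACHE_REDIS_URL is configured.

import { cacheKey, saveEntryToCache, getEntryFromCache } from "./cache";
import { scrapeOptions } from "../controllers/v1/types";

async function cacheExample() {
  const key = cacheKey("https://example.com", scrapeOptions.parse({}), {});
  if (key === null) return; // cacheRedis not configured, or options disqualify caching

  await saveEntryToCache(key, {
    url: "https://example.com",
    html: "<html>...</html>",
    statusCode: 200,
  });

  const entry = await getEntryFromCache(key); // CacheEntry | null
  console.log(entry?.statusCode);
}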

View File

@ -0,0 +1,15 @@
import { Document } from "../../controllers/v1/types";
export function buildDocument(document: Document): string {
const metadata = document.metadata;
const markdown = document.markdown;
// for each key in the metadata allow up to 250 characters
const metadataString = Object.entries(metadata).map(([key, value]) => {
return `${key}: ${value?.toString().slice(0, 250)}`;
}).join('\n');
const documentMetadataString = `\n- - - - - Page metadata - - - - -\n${metadataString}`;
const documentString = `${markdown}${documentMetadataString}`;
return documentString;
}
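For intuition, a rough illustration (values made up) of the string buildDocument produces, which is what the extract controller joins and feeds to the LLM.

// Illustrative buildDocument output (not real data):
//
// # Firecrawl
// Turn websites into LLM-ready data ...
//
// - - - - - Page metadata - - - - -
// title: Firecrawl
// sourceURL: https://firecrawl.dev
// statusCode: 200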

View File

@ -0,0 +1,124 @@
// use llmExtract.ts instead
// import OpenAI from "openai";
// import { encoding_for_model } from "@dqbd/tiktoken";
// import { TiktokenModel } from "@dqbd/tiktoken";
// import { ExtractOptions } from "../../controllers/v1/types";
// import { Document } from "../entities";
// import { z } from "zod";
// const maxTokens = 32000;
// const modifier = 4;
// export class LLMRefusalError extends Error {
// constructor(refusal: string) {
// super("LLM refused to extract the website's content");
// this.name = "LLMRefusalError";
// }
// }
// interface GenerateCompletionsParams {
// systemPrompt?: string;
// prompt?: string;
// schema?: any;
// pagesContent: string;
// }
// export async function generateBasicCompletion(prompt: string) {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// const completion = await openai.chat.completions.create({
// model,
// messages: [{ role: "user", content: prompt }],
// });
// return completion.choices[0].message.content;
// }
// export async function generateFinalExtraction({
// pagesContent,
// systemPrompt,
// prompt,
// schema,
// }: GenerateCompletionsParams): Promise<{
// content: string;
// metadata: { numTokens: number; warning: string };
// }> {
// const openai = new OpenAI();
// const model: TiktokenModel =
// (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
// let extractionContent = pagesContent;
// let numTokens = 0;
// let warning = "";
// const encoder = encoding_for_model(model);
// try {
// const tokens = encoder.encode(extractionContent);
// numTokens = tokens.length;
// } catch (error) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (${maxTokens}) we support.`;
// } finally {
// encoder.free();
// }
// if (numTokens > maxTokens) {
// extractionContent = extractionContent.slice(0, maxTokens * modifier);
// warning = `The extraction content would have used more tokens (${numTokens}) than the maximum we allow (${maxTokens}). -- the input has been automatically trimmed.`;
// }
// if (schema && (schema.type === "array" || schema._type === "ZodArray")) {
// schema = {
// type: "object",
// properties: {
// items: schema,
// },
// required: ["items"],
// additionalProperties: false,
// };
// } else if (schema) {
// schema.additionalProperties = false;
// schema.required = Object.keys(schema.properties);
// }
// const jsonCompletion = await openai.beta.chat.completions.parse({
// temperature: 0,
// model,
// messages: [
// { role: "system", content: systemPrompt ?? "" },
// { role: "user", content: [{ type: "text", text: extractionContent }] },
// {
// role: "user",
// content: prompt
// ? `Transform the above content into structured JSON output based on the following user request: ${prompt}`
// : "Transform the above content into structured JSON output.",
// },
// ],
// response_format: schema
// ? {
// type: "json_schema",
// json_schema: {
// name: "websiteContent",
// schema: schema,
// strict: true,
// },
// }
// : { type: "json_object" },
// });
// if (jsonCompletion.choices[0].message.refusal !== null) {
// throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
// }
// const extraction = jsonCompletion.choices[0].message.parsed;
// return {
// content: extraction ?? "",
// metadata: {
// numTokens,
// warning,
// },
// };
// }

View File

@ -0,0 +1,22 @@
import { CohereClient } from "cohere-ai";
import { MapDocument } from "../../controllers/v1/types";
const cohere = new CohereClient({
token: process.env.COHERE_API_KEY,
});
export async function rerankDocuments(
documents: (string | Record<string, string>)[],
query: string,
topN = 3,
model = "rerank-english-v3.0"
) {
const rerank = await cohere.v2.rerank({
documents,
query,
topN,
model,
returnDocuments: true,
});
return rerank.results.sort((a, b) => b.relevanceScore - a.relevanceScore).map(x => ({ document: x.document, index: x.index, relevanceScore: x.relevanceScore }));
}
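A usage sketch for the rerankDocuments helper above; it assumes COHERE_API_KEY is set and passes strings in the same url/title/description format used for ranking elsewhere in this PR.

async function rerankExample() {
  const ranked = await rerankDocuments(
    [
      "url: https://firecrawl.dev/pricing, title: Pricing, description: Plans and pricing",
      "url: https://firecrawl.dev/blog, title: Blog, description: Product updates",
    ],
    "how much does firecrawl cost",
    2 // topN
  );
  // ranked: [{ document, index, relevanceScore }, ...] sorted by relevanceScore, highest first
  console.log(ranked[0]?.index);
}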

View File

@ -0,0 +1,68 @@
import { performRanking } from './ranker';
describe('performRanking', () => {
it('should rank links based on similarity to search query', async () => {
const linksWithContext = [
'url: https://example.com/dogs, title: All about dogs, description: Learn about different dog breeds',
'url: https://example.com/cats, title: Cat care guide, description: Everything about cats',
'url: https://example.com/pets, title: General pet care, description: Care for all types of pets'
];
const links = [
'https://example.com/dogs',
'https://example.com/cats',
'https://example.com/pets'
];
const searchQuery = 'cats training';
const result = await performRanking(linksWithContext, links, searchQuery);
// Should return array of objects with link, linkWithContext, score, originalIndex
expect(result).toBeInstanceOf(Array);
expect(result.length).toBe(3);
// First result should be the cats page since the query is about cats
expect(result[0].link).toBe('https://example.com/cats');
// Each result should have required properties
result.forEach(item => {
expect(item).toHaveProperty('link');
expect(item).toHaveProperty('linkWithContext');
expect(item).toHaveProperty('score');
expect(item).toHaveProperty('originalIndex');
expect(typeof item.score).toBe('number');
expect(item.score).toBeGreaterThanOrEqual(0);
expect(item.score).toBeLessThanOrEqual(1);
});
// Scores should be in descending order
for (let i = 1; i < result.length; i++) {
expect(result[i].score).toBeLessThanOrEqual(result[i-1].score);
}
});
it('should handle empty inputs', async () => {
const result = await performRanking([], [], '');
expect(result).toEqual([]);
});
it('should maintain original order for equal scores', async () => {
const linksWithContext = [
'url: https://example.com/1, title: Similar content A, description: test',
'url: https://example.com/2, title: Similar content B, description: test'
];
const links = [
'https://example.com/1',
'https://example.com/2'
];
const searchQuery = 'test';
const result = await performRanking(linksWithContext, links, searchQuery);
// If scores are equal, original order should be maintained
expect(result[0].originalIndex).toBeLessThan(result[1].originalIndex);
});
});

View File

@ -0,0 +1,92 @@
import axios from 'axios';
import { configDotenv } from 'dotenv';
import OpenAI from "openai";
configDotenv();
const openai = new OpenAI({
apiKey: process.env.OPENAI_API_KEY,
});
async function getEmbedding(text: string) {
const embedding = await openai.embeddings.create({
model: "text-embedding-ada-002",
input: text,
encoding_format: "float",
});
return embedding.data[0].embedding;
}
const cosineSimilarity = (vec1: number[], vec2: number[]): number => {
const dotProduct = vec1.reduce((sum, val, i) => sum + val * vec2[i], 0);
const magnitude1 = Math.sqrt(
vec1.reduce((sum, val) => sum + val * val, 0)
);
const magnitude2 = Math.sqrt(
vec2.reduce((sum, val) => sum + val * val, 0)
);
if (magnitude1 === 0 || magnitude2 === 0) return 0;
return dotProduct / (magnitude1 * magnitude2);
};
// Function to convert text to vector
const textToVector = (searchQuery: string, text: string): number[] => {
const words = searchQuery.toLowerCase().split(/\W+/);
return words.map((word) => {
const count = (text.toLowerCase().match(new RegExp(word, "g")) || [])
.length;
return count / text.length;
});
};
async function performRanking(linksWithContext: string[], links: string[], searchQuery: string) {
try {
// Handle invalid inputs
if (!searchQuery || !linksWithContext.length || !links.length) {
return [];
}
// Sanitize search query by removing null characters
const sanitizedQuery = searchQuery.replace(/\0/g, "");
// Generate embeddings for the search query
const queryEmbedding = await getEmbedding(sanitizedQuery);
// Generate embeddings for each link and calculate similarity
const linksAndScores = await Promise.all(linksWithContext.map(async (linkWithContext, index) => {
try {
const linkEmbedding = await getEmbedding(linkWithContext);
const score = cosineSimilarity(queryEmbedding, linkEmbedding);
return {
link: links[index],
linkWithContext,
score,
originalIndex: index
};
} catch (err) {
// If embedding fails for a link, return with score 0
return {
link: links[index],
linkWithContext,
score: 0,
originalIndex: index
};
}
}));
// Sort links based on similarity scores while preserving original order for equal scores
linksAndScores.sort((a, b) => {
const scoreDiff = b.score - a.score;
return scoreDiff === 0 ? a.originalIndex - b.originalIndex : scoreDiff;
});
return linksAndScores;
} catch (error) {
console.error(`Error performing semantic search: ${error}`);
return [];
}
}
export { performRanking };

View File

@ -1 +1 @@
export const axiosTimeout = 3000;
export const axiosTimeout = 5000;

View File

@ -27,4 +27,4 @@ v0Router.post("/v0/search", searchController);
// Health/Probe routes
v0Router.get("/v0/health/liveness", livenessController);
v0Router.get("/v0/health/readiness", readinessController);
v0Router.get("/v0/health/readiness", readinessController);

View File

@ -18,6 +18,7 @@ import { logger } from "../lib/logger";
import { scrapeStatusController } from "../controllers/v1/scrape-status";
import { concurrencyCheckController } from "../controllers/v1/concurrency-check";
import { batchScrapeController } from "../controllers/v1/batch-scrape";
import { extractController } from "../controllers/v1/extract";
// import { crawlPreviewController } from "../../src/controllers/v1/crawlPreview";
// import { crawlJobStatusPreviewController } from "../../src/controllers/v1/status";
// import { searchController } from "../../src/controllers/v1/search";
@ -98,7 +99,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
if (!res.headersSent) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
return res.status(403).json({ success: false, error: "URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
}
next();
@ -178,6 +179,13 @@ v1Router.ws(
crawlStatusWSController
);
v1Router.post(
"/extract",
authMiddleware(RateLimiterMode.Scrape),
checkCreditsMiddleware(1),
wrap(extractController)
);
// v1Router.post("/crawlWebsitePreview", crawlPreviewController);
@ -199,3 +207,4 @@ v1Router.delete(
// Health/Probe routes
// v1Router.get("/health/liveness", livenessController);
// v1Router.get("/health/readiness", readinessController);

View File

@ -0,0 +1,19 @@
import { cacheKey, getEntryFromCache } from "../../../../lib/cache";
import { EngineScrapeResult } from "..";
import { Meta } from "../..";
import { EngineError } from "../../error";
export async function scrapeCache(meta: Meta): Promise<EngineScrapeResult> {
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key === null) throw new EngineError("Scrape not eligible for caching");
const entry = await getEntryFromCache(key);
if (entry === null) throw new EngineError("Cache missed");
return {
url: entry.url,
html: entry.html,
statusCode: entry.statusCode,
error: entry.error,
};
}

View File

@ -103,4 +103,4 @@ export async function fireEngineCheckStatus(logger: Logger, jobId: string): Prom
}
});
}
}
}

View File

@ -6,14 +6,17 @@ import { scrapePDF } from "./pdf";
import { scrapeURLWithScrapingBee } from "./scrapingbee";
import { scrapeURLWithFetch } from "./fetch";
import { scrapeURLWithPlaywright } from "./playwright";
import { scrapeCache } from "./cache";
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx";
export type Engine = "fire-engine;chrome-cdp" | "fire-engine;playwright" | "fire-engine;tlsclient" | "scrapingbee" | "scrapingbeeLoad" | "playwright" | "fetch" | "pdf" | "docx" | "cache";
const useScrapingBee = process.env.SCRAPING_BEE_API_KEY !== '' && process.env.SCRAPING_BEE_API_KEY !== undefined;
const useFireEngine = process.env.FIRE_ENGINE_BETA_URL !== '' && process.env.FIRE_ENGINE_BETA_URL !== undefined;
const usePlaywright = process.env.PLAYWRIGHT_MICROSERVICE_URL !== '' && process.env.PLAYWRIGHT_MICROSERVICE_URL !== undefined;
const useCache = process.env.CACHE_REDIS_URL !== '' && process.env.CACHE_REDIS_URL !== undefined;
export const engines: Engine[] = [
// ...(useCache ? [ "cache" as const ] : []),
...(useFireEngine ? [ "fire-engine;chrome-cdp" as const, "fire-engine;playwright" as const, "fire-engine;tlsclient" as const ] : []),
...(useScrapingBee ? [ "scrapingbee" as const, "scrapingbeeLoad" as const ] : []),
...(usePlaywright ? [ "playwright" as const ] : []),
@ -74,6 +77,7 @@ export type EngineScrapeResult = {
const engineHandlers: {
[E in Engine]: (meta: Meta) => Promise<EngineScrapeResult>
} = {
"cache": scrapeCache,
"fire-engine;chrome-cdp": scrapeURLWithFireEngineChromeCDP,
"fire-engine;playwright": scrapeURLWithFireEnginePlaywright,
"fire-engine;tlsclient": scrapeURLWithFireEngineTLSClient,
@ -95,6 +99,22 @@ export const engineOptions: {
quality: number,
}
} = {
"cache": {
features: {
"actions": false,
"waitFor": true,
"screenshot": false,
"screenshot@fullScreen": false,
"pdf": false, // TODO: figure this out
"docx": false, // TODO: figure this out
"atsv": false,
"location": false,
"mobile": false,
"skipTlsVerification": false,
"useFastMode": false,
},
quality: 1000, // cache should always be tried first
},
"fire-engine;chrome-cdp": {
features: {
"actions": true,

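A hedged sketch of why the cache entry's quality of 1000 matters, assuming the scraper attempts candidate engines in descending quality order (names come from the engines and engineOptions declared above).

// Assumption: candidate engines are attempted in descending quality order.
const ordered = [...engines].sort(
  (a, b) => engineOptions[b].quality - engineOptions[a].quality
);
// With the "cache" spread uncommented and CACHE_REDIS_URL set, "cache" (quality 1000)
// would sort ahead of fire-engine, scrapingbee, playwright, and fetch.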
View File

@ -0,0 +1,26 @@
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache";
export function saveToCache(meta: Meta, document: Document): Document {
if (document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300) return document;
if (document.rawHtml === undefined) {
throw new Error("rawHtml is undefined -- this transformer is being called out of order");
}
const key = cacheKey(meta.url, meta.options, meta.internalOptions);
if (key !== null) {
const entry: CacheEntry = {
html: document.rawHtml!,
statusCode: document.metadata.statusCode!,
url: document.metadata.url ?? document.metadata.sourceURL!,
error: document.metadata.error ?? undefined,
};
saveEntryToCache(key, entry);
}
return document;
}

View File

@ -7,6 +7,7 @@ import { extractMetadata } from "../lib/extractMetadata";
import { performLLMExtract } from "./llmExtract";
import { uploadScreenshot } from "./uploadScreenshot";
import { removeBase64Images } from "./removeBase64Images";
import { saveToCache } from "./cache";
export type Transformer = (meta: Meta, document: Document) => Document | Promise<Document>;
@ -104,6 +105,7 @@ export function coerceFieldsToFormats(meta: Meta, document: Document): Document
// TODO: allow some of these to run in parallel
export const transformerStack: Transformer[] = [
saveToCache,
deriveHTMLFromRawHTML,
deriveMarkdownFromHTML,
deriveLinksFromHTML,

View File

@ -58,32 +58,33 @@ function normalizeSchema(x: any): any {
}
}
async function generateOpenAICompletions(logger: Logger, document: Document, options: ExtractOptions): Promise<Document> {
export async function generateOpenAICompletions(logger: Logger, options: ExtractOptions, markdown?: string, previousWarning?: string): Promise<{ extract: any, numTokens: number, warning: string | undefined }> {
let extract: any;
let warning: string | undefined;
const openai = new OpenAI();
const model: TiktokenModel = (process.env.MODEL_NAME as TiktokenModel) ?? "gpt-4o-mini";
if (document.markdown === undefined) {
if (markdown === undefined) {
throw new Error("document.markdown is undefined -- this is unexpected");
}
let extractionContent = document.markdown;
// count number of tokens
let numTokens = 0;
const encoder = encoding_for_model(model as TiktokenModel);
try {
// Encode the message into tokens
const tokens = encoder.encode(extractionContent);
const tokens = encoder.encode(markdown);
// Return the number of tokens
numTokens = tokens.length;
} catch (error) {
logger.warn("Calculating num tokens of string failed", { error, extractionContent });
logger.warn("Calculating num tokens of string failed", { error, markdown });
extractionContent = extractionContent.slice(0, maxTokens * modifier);
markdown = markdown.slice(0, maxTokens * modifier);
const warning = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
document.warning = document.warning === undefined ? warning : " " + warning;
let w = "Failed to derive number of LLM tokens the extraction might use -- the input has been automatically trimmed to the maximum number of tokens (" + maxTokens + ") we support.";
warning = previousWarning === undefined ? w : w + " " + previousWarning;
} finally {
// Free the encoder resources after use
encoder.free();
@ -91,10 +92,10 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
if (numTokens > maxTokens) {
// trim the document to the maximum number of tokens, tokens != characters
extractionContent = extractionContent.slice(0, maxTokens * modifier);
markdown = markdown.slice(0, maxTokens * modifier);
const warning = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
document.warning = document.warning === undefined ? warning : " " + warning;
const w = "The extraction content would have used more tokens (" + numTokens + ") than the maximum we allow (" + maxTokens + "). -- the input has been automatically trimmed.";
warning = previousWarning === undefined ? w : w + " " + previousWarning;
}
let schema = options.schema;
@ -107,12 +108,22 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
required: ["items"],
additionalProperties: false,
};
} else if (schema && typeof schema === 'object' && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => [key, { type: value }])
),
required: Object.keys(schema),
additionalProperties: false
};
}
schema = normalizeSchema(schema);
const jsonCompletion = await openai.beta.chat.completions.parse({
model,
temperature: 0,
messages: [
{
role: "system",
@ -120,7 +131,7 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
},
{
role: "user",
content: [{ type: "text", text: extractionContent }],
content: [{ type: "text", text: markdown }],
},
{
role: "user",
@ -143,26 +154,35 @@ async function generateOpenAICompletions(logger: Logger, document: Document, opt
throw new LLMRefusalError(jsonCompletion.choices[0].message.refusal);
}
document.extract = jsonCompletion.choices[0].message.parsed;
extract = jsonCompletion.choices[0].message.parsed;
if (document.extract === null && jsonCompletion.choices[0].message.content !== null) {
if (extract === null && jsonCompletion.choices[0].message.content !== null) {
try {
document.extract = JSON.parse(jsonCompletion.choices[0].message.content);
extract = JSON.parse(jsonCompletion.choices[0].message.content);
} catch (e) {
logger.error("Failed to parse returned JSON, no schema specified.", { error: e });
throw new LLMRefusalError("Failed to parse returned JSON. Please specify a schema in the extract object.");
}
}
if (options.schema && options.schema.type === "array") {
document.extract = document.extract?.items;
// If the user actually wants the items object, they can specify it as 'required' in the schema
// otherwise, we just return the items array
if (options.schema && options.schema.type === "array" && !schema?.required?.includes("items")) {
extract = extract?.items;
}
return document;
return { extract, warning, numTokens };
}
export async function performLLMExtract(meta: Meta, document: Document): Promise<Document> {
if (meta.options.formats.includes("extract")) {
document = await generateOpenAICompletions(meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }), document, meta.options.extract!);
const { extract, warning } = await generateOpenAICompletions(
meta.logger.child({ method: "performLLMExtract/generateOpenAICompletions" }),
meta.options.extract!,
document.markdown,
document.warning,
);
document.extract = extract;
document.warning = warning;
}
return document;

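The new branch above accepts the flat { field: "string" } shorthand used in the .http example earlier and expands it into full JSON Schema before normalization; a standalone sketch of that expansion:

const shorthand = { title: "string", description: "string", mainProduct: "string" };

const expanded = {
  type: "object",
  properties: Object.fromEntries(
    Object.entries(shorthand).map(([key, value]) => [key, { type: value }])
  ),
  required: Object.keys(shorthand),
  additionalProperties: false,
};
// expanded.properties.title => { type: "string" }
// expanded.required => ["title", "description", "mainProduct"]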
View File

@ -109,6 +109,6 @@ export function waitForJob<T = unknown>(jobId: string, timeout: number): Promise
}
}
}
}, 500);
}, 250);
})
}

View File

@ -106,6 +106,15 @@ export interface FirecrawlCrawlStatusResponse {
error?: string;
}
export interface FirecrawlExtractResponse {
statusCode: number;
body: {
success: boolean;
data: any[];
};
error?: string;
}
export enum RateLimiterMode {
Crawl = "crawl",
CrawlStatus = "crawlStatus",

View File

@ -1,4 +1,5 @@
import FirecrawlApp from 'firecrawl';
import { z } from 'zod';
const app = new FirecrawlApp({apiKey: "fc-YOUR_API_KEY"});
@ -42,6 +43,18 @@ const main = async () => {
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// Extract information from a website using LLM:
const extractSchema = z.object({
title: z.string(),
description: z.string(),
links: z.array(z.string())
});
const extractResult = await app.extract(['https://firecrawl.dev'], {
prompt: "Extract the title, description, and links from the website",
schema: extractSchema
});
console.log(extractResult);
// Crawl a website with WebSockets:
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});

View File

@ -42,6 +42,19 @@ const main = async () => {
const mapResult = await app.mapUrl('https://firecrawl.dev');
console.log(mapResult)
// // Extract information from a website using LLM:
// const extractSchema = z.object({
// title: z.string(),
// description: z.string(),
// links: z.array(z.string())
// });
// const extractResult = await app.extractUrls(['https://firecrawl.dev'], {
// prompt: "Extract the title, description, and links from the website",
// schema: extractSchema
// });
// console.log(extractResult);
// Crawl a website with WebSockets:
const watch = await app.crawlUrlAndWatch('mendable.ai', { excludePaths: ['blog/*'], limit: 5});

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.8.5",
"version": "1.9.0",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -236,6 +236,27 @@ export interface MapResponse {
error?: string;
}
/**
* Parameters for extracting information from URLs.
* Defines options for extracting information from URLs.
*/
export interface ExtractParams {
prompt: string;
schema?: zt.ZodSchema;
systemPrompt?: string;
allowExternalLinks?: boolean;
}
/**
* Response interface for extracting information from URLs.
* Defines the structure of the response received after extracting information from URLs.
*/
export interface ExtractResponse {
success: true;
data: zt.infer<zt.ZodSchema>;
error?: string;
}
/**
* Error response interface.
* Defines the structure of the response received when an error occurs.
@ -245,7 +266,6 @@ export interface ErrorResponse {
error: string;
}
/**
* Custom error class for Firecrawl.
* Extends the built-in Error class to include a status code.
@ -679,6 +699,44 @@ export default class FirecrawlApp {
return { success: false, error: "Internal server error." };
}
/**
* Extracts information from URLs using the Firecrawl API.
* @param urls - The URLs to extract information from.
* @param params - Additional parameters for the extract request.
* @returns The response from the extract operation.
*/
async extract(urls: string[], params?: ExtractParams): Promise<ExtractResponse | ErrorResponse> {
const headers = this.prepareHeaders();
if (!params?.prompt) {
throw new FirecrawlError("Prompt is required", 400);
}
let jsonData: { urls: string[] } & ExtractParams = { urls, ...params };
let jsonSchema: any;
try {
jsonSchema = params?.schema ? zodToJsonSchema(params.schema) : undefined;
} catch (error: any) {
throw new FirecrawlError("Invalid schema. Use a valid Zod schema.", 400);
}
try {
const response: AxiosResponse = await this.postRequest(
this.apiUrl + `/v1/extract`,
{ ...jsonData, schema: jsonSchema },
headers
);
if (response.status === 200) {
return response.data as ExtractResponse;
} else {
this.handleError(response, "extract");
}
} catch (error: any) {
throw new FirecrawlError(error.message, 500);
}
return { success: false, error: "Internal server error." };
}
/**
* Prepares the headers for an API request.
* @param idempotencyKey - Optional key to ensure idempotency.

View File

@ -2,6 +2,8 @@ import time
import nest_asyncio
import uuid
from firecrawl.firecrawl import FirecrawlApp
from pydantic import BaseModel, Field
from typing import List
app = FirecrawlApp(api_key="fc-")
@ -50,9 +52,6 @@ print(crawl_status)
# LLM Extraction:
# Define schema to extract contents into using pydantic
from pydantic import BaseModel, Field
from typing import List
class ArticleSchema(BaseModel):
title: str
points: int
@ -115,6 +114,22 @@ llm_extraction_result = app2.scrape_url('https://news.ycombinator.com', {
map_result = app.map_url('https://firecrawl.dev', { 'search': 'blog' })
print(map_result)
# Extract URLs:
class ExtractSchema(BaseModel):
title: str
description: str
links: List[str]
# Define the schema using Pydantic
extract_schema = ExtractSchema.schema()
# Perform the extraction
extract_result = app.extract(['https://firecrawl.dev'], {
'prompt': "Extract the title, description, and links from the website",
'schema': extract_schema
})
print(extract_result)
# Crawl a website with WebSockets:
# inside an async function...
nest_asyncio.apply()

View File

@ -13,7 +13,7 @@ import os
from .firecrawl import FirecrawlApp # noqa
__version__ = "1.5.0"
__version__ = "1.6.0"
# Define the logger for the Firecrawl project
logger: logging.Logger = logging.getLogger("firecrawl")

View File

@ -12,15 +12,40 @@ Classes:
import logging
import os
import time
from typing import Any, Dict, Optional, List
from typing import Any, Dict, Optional, List, Union
import json
import requests
import pydantic
import websockets
logger : logging.Logger = logging.getLogger("firecrawl")
class FirecrawlApp:
class ExtractParams(pydantic.BaseModel):
"""
Parameters for the extract operation.
"""
prompt: str
schema: Optional[Any] = None
system_prompt: Optional[str] = None
allow_external_links: Optional[bool] = False
class ExtractResponse(pydantic.BaseModel):
"""
Response from the extract operation.
"""
success: bool
data: Optional[Any] = None
error: Optional[str] = None
class ErrorResponse(pydantic.BaseModel):
"""
Error response.
"""
success: bool
error: str
def __init__(self, api_key: Optional[str] = None, api_url: Optional[str] = None) -> None:
"""
Initialize the FirecrawlApp instance with API key, API URL.
@ -434,6 +459,48 @@ class FirecrawlApp:
else:
self._handle_error(response, 'check batch scrape status')
def extract(self, urls: List[str], params: Optional[ExtractParams] = None) -> Union[ExtractResponse, ErrorResponse]:
"""
Extracts information from a URL using the Firecrawl API.
Args:
urls (List[str]): The URLs to extract information from.
params (Optional[ExtractParams]): Additional parameters for the extract request.
Returns:
Union[ExtractResponse, ErrorResponse]: The response from the extract operation.
"""
headers = self._prepare_headers()
if not params or not params.get('prompt'):
raise ValueError("Prompt is required")
if not params.get('schema'):
raise ValueError("Schema is required for extraction")
jsonData = {'urls': urls, **params}
jsonSchema = params['schema'].schema() if hasattr(params['schema'], 'schema') else None
try:
response = self._post_request(
f'{self.api_url}/v1/extract',
{
**jsonData,
'allowExternalLinks': params.get('allow_external_links', False),
'schema': jsonSchema
},
headers
)
if response.status_code == 200:
return response.json()
else:
self._handle_error(response, "extract")
except Exception as e:
raise ValueError(str(e), 500)
return {'success': False, 'error': "Internal server error."}
def _prepare_headers(self, idempotency_key: Optional[str] = None) -> Dict[str, str]:
"""
Prepare the headers for API requests.