diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
index d10a286f..3bb26e98 100644
--- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
+++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts
@@ -449,4 +449,161 @@ describe("E2E Tests for v1 API Routes", () => {
   });
+
+describe("POST /v1/map", () => {
+  it.concurrent("should require authorization", async () => {
+    const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
+      "/v1/map"
+    );
+    expect(response.statusCode).toBe(401);
+  });
+
+  it.concurrent("should return an error response with an invalid API key", async () => {
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .set("Authorization", `Bearer invalid-api-key`)
+      .set("Content-Type", "application/json")
+      .send({ url: "https://firecrawl.dev" });
+    expect(response.statusCode).toBe(401);
+  });
+
+  it.concurrent("should return a successful response with a valid API key", async () => {
+    const mapRequest = {
+      url: "https://roastmywebsite.ai"
+    };
+
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send(mapRequest);
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("success", true);
+    expect(response.body).toHaveProperty("links");
+    if (!("links" in response.body)) {
+      throw new Error("Expected response body to have 'links' property");
+    }
+    const links = response.body.links as unknown[];
+    expect(Array.isArray(links)).toBe(true);
+    expect(links.length).toBeGreaterThan(0);
+  });
+
+  it.concurrent("should return a successful response with a valid API key and search", async () => {
+    const mapRequest = {
+      url: "https://usemotion.com",
+      search: "pricing"
+    };
+
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send(mapRequest);
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("success", true);
+    expect(response.body).toHaveProperty("links");
+    if (!("links" in response.body)) {
+      throw new Error("Expected response body to have 'links' property");
+    }
+    const links = response.body.links as unknown[];
+    expect(Array.isArray(links)).toBe(true);
+    expect(links.length).toBeGreaterThan(0);
+    expect(links[0]).toContain("usemotion.com/pricing");
+  });
+
+  it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
+    const mapRequest = {
+      url: "https://firecrawl.dev",
+      search: "docs",
+      includeSubdomains: true
+    };
+
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send(mapRequest);
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("success", true);
+    expect(response.body).toHaveProperty("links");
+    if (!("links" in response.body)) {
+      throw new Error("Expected response body to have 'links' property");
+    }
+    const links = response.body.links as unknown[];
+    expect(Array.isArray(links)).toBe(true);
+    expect(links.length).toBeGreaterThan(0);
+    expect(links[0]).toContain("docs.firecrawl.dev");
+  });
+
+  it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
+    const mapRequest = {
+      url: "https://www.firecrawl.dev",
+      search: "docs",
+      includeSubdomains: true
+    };
+
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send(mapRequest);
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("success", true);
+    expect(response.body).toHaveProperty("links");
+    if (!("links" in response.body)) {
+      throw new Error("Expected response body to have 'links' property");
+    }
+    const links = response.body.links as unknown[];
+    expect(Array.isArray(links)).toBe(true);
+    expect(links.length).toBeGreaterThan(0);
+    expect(links[0]).toContain("docs.firecrawl.dev");
+  }, 10000)
+
+  it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
+    const mapRequest = {
+      url: "https://www.firecrawl.dev",
+      search: "docs",
+      includeSubdomains: false
+    };
+
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send(mapRequest);
+
+    expect(response.statusCode).toBe(200);
+    expect(response.body).toHaveProperty("success", true);
+    expect(response.body).toHaveProperty("links");
+    if (!("links" in response.body)) {
+      throw new Error("Expected response body to have 'links' property");
+    }
+    const links = response.body.links as unknown[];
+    expect(Array.isArray(links)).toBe(true);
+    expect(links.length).toBeGreaterThan(0);
+    expect(links[0]).not.toContain("docs.firecrawl.dev");
+  })
+
+  it.concurrent("should return an error for invalid URL", async () => {
+    const mapRequest = {
+      url: "invalid-url",
+      includeSubdomains: true,
+      search: "test",
+    };
+
+    const response: ScrapeResponseRequestTest = await request(TEST_URL)
+      .post("/v1/map")
+      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+      .set("Content-Type", "application/json")
+      .send(mapRequest);
+
+    expect(response.statusCode).toBe(400);
+    expect(response.body).toHaveProperty("success", false);
+    expect(response.body).toHaveProperty("error");
+  });
+});
 });
diff --git a/apps/api/src/controllers/v1/auth.ts b/apps/api/src/controllers/v1/auth.ts
index bc6951c9..d4da3c6b 100644
--- a/apps/api/src/controllers/v1/auth.ts
+++ b/apps/api/src/controllers/v1/auth.ts
@@ -115,6 +115,9 @@ export async function supaAuthenticateUser(
     case RateLimiterMode.CrawlStatus:
       rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
       break;
+    case RateLimiterMode.Map:
+      rateLimiter = getRateLimiter(RateLimiterMode.Map, token);
+      break;
     case RateLimiterMode.Preview:
       rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
@@ -151,7 +154,7 @@ export async function supaAuthenticateUser(
   if (
     token === "this_is_just_a_preview_token" &&
-    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
+    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map)
   ) {
     return { success: true, team_id: "preview" };
     // check the origin of the request and make sure its from firecrawl.dev
diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index f95364a3..43f940b7 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -1,22 +1,45 @@
 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
+import {
+  CrawlRequest,
+  crawlRequestSchema,
+  CrawlResponse,
+  legacyCrawlerOptions,
+  legacyScrapeOptions,
+  RequestWithAuth,
+} from "./types";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { Logger } from "../../lib/logger";
 
-export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, res: Response) {
+export async function crawlController(
+  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
+  res: Response
+) {
   req.body = crawlRequestSchema.parse(req.body);
-  
+
   const id = uuidv4();
 
   await logCrawl(id, req.auth.team_id);
 
-  const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions),
-    pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
+  const { remainingCredits } = req.account;
+  // TODO: Get rid of crawlerOptions
+  const crawlerOptions = legacyCrawlerOptions(req.body);
+  const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
+
+  crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
+
   const sc: StoredCrawl = {
     originUrl: req.body.url,
     crawlerOptions,
@@ -30,15 +53,21 @@ export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, Cr
   try {
     sc.robots = await crawler.getRobotsTxt();
   } catch (e) {
-    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
+    Logger.debug(
+      `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
+        e
+      )}`
+    );
   }
 
   await saveCrawl(id, sc);
 
-  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
+  const sitemap = sc.crawlerOptions.ignoreSitemap
+    ? null
+    : await crawler.tryGetSitemap();
 
   if (sitemap !== null) {
-    const jobs = sitemap.map(x => {
+    const jobs = sitemap.map((x) => {
       const url = x.url;
       const uuid = uuidv4();
       return {
@@ -56,33 +85,42 @@ export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, Cr
         opts: {
           jobId: uuid,
           priority: 20,
-        }
+        },
       };
-    })
+    });
 
-    await lockURLs(id, jobs.map(x => x.data.url));
-    await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+    await lockURLs(
+      id,
+      jobs.map((x) => x.data.url)
+    );
+    await addCrawlJobs(
+      id,
+      jobs.map((x) => x.opts.jobId)
+    );
     await getScrapeQueue().addBulk(jobs);
   } else {
     await lockURL(id, sc, req.body.url);
-    const job = await addScrapeJob({
-      url: req.body.url,
-      mode: "single_urls",
-      crawlerOptions: crawlerOptions,
-      team_id: req.auth.team_id,
-      pageOptions: pageOptions,
-      origin: "api",
-      crawl_id: id,
-      webhook: req.body.webhook,
-    }, {
-      priority: 15,
-    });
+    const job = await addScrapeJob(
+      {
+        url: req.body.url,
+        mode: "single_urls",
+        crawlerOptions: crawlerOptions,
+        team_id: req.auth.team_id,
+        pageOptions: pageOptions,
+        origin: "api",
+        crawl_id: id,
+        webhook: req.body.webhook,
+      },
+      {
+        priority: 15,
+      }
+    );
     await addCrawlJob(id, job.id);
   }
 
   return res.status(200).json({
     success: true,
     id,
-    url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
+    url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
   });
 }
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 78cfda04..76cf1498 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -14,6 +14,7 @@ import {
   isSameSubdomain,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
+import { billTeam } from "../../services/billing/credit_billing";
 
 configDotenv();
 
@@ -26,11 +27,10 @@ export async function mapController(
   const id = uuidv4();
 
   let links: string[] = [req.body.url];
-  const crawlerOptions = legacyCrawlerOptions(req.body);
 
   const sc: StoredCrawl = {
     originUrl: req.body.url,
-    crawlerOptions,
+    crawlerOptions: legacyCrawlerOptions(req.body),
     pageOptions: {},
     team_id: req.auth.team_id,
     createdAt: Date.now(),
@@ -39,7 +39,7 @@ export async function mapController(
   const crawler = crawlToCrawler(id, sc);
 
   const sitemap =
-    sc.crawlerOptions.ignoreSitemap || req.body.search
+    req.body.ignoreSitemap
       ? null
       : await crawler.tryGetSitemap();
 
@@ -49,8 +52,10 @@ export async function mapController(
     });
   }
 
+  let urlWithoutWww = req.body.url.replace("www.", "");
+
   let mapUrl = req.body.search
-    ? `"${req.body.search}" site:${req.body.url}`
+    ? `"${req.body.search}" site:${urlWithoutWww}`
     : `site:${req.body.url}`; // www. seems to exclude subdomains in some cases
 
   const mapResults = await fireEngineMap(mapUrl, {
@@ -58,16 +60,19 @@ export async function mapController(
   });
 
   if (mapResults.length > 0) {
-    mapResults.map((x) => {
-      if (req.body.search) {
-        links.unshift(x.url);
-      } else {
+    if (req.body.search) {
+      // Ensure all map results are first, maintaining their order
+      links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
+    } else {
+      mapResults.map((x) => {
         links.push(x.url);
-      }
-    });
+      });
+    }
   }
 
-  links = links.map((x) => checkAndUpdateURLForMap(x).url);
+  links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
+
+  // allows for subdomains to be included
 
   links = links.filter((x) => isSameDomain(x, req.body.url));
 
@@ -80,6 +85,8 @@ export async function mapController(
   // remove duplicates that could be due to http/https or www
   links = [...new Set(links)];
 
+  await billTeam(req.auth.team_id, 1);
+
   return res.status(200).json({
     success: true,
     links,
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 77a9f2dd..4dcb32fc 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -33,6 +33,8 @@ const url = z.preprocess(
   )
 );
 
+const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
+
 export const scrapeOptions = z.object({
   formats: z
     .enum([
@@ -53,14 +55,14 @@ export const scrapeOptions = z.object({
   timeout: z.number().int().positive().finite().safe().default(30000), // default?
   waitFor: z.number().int().nonnegative().finite().safe().default(0),
   parsePDF: z.boolean().default(true),
-});
+}).strict(strictMessage);
 
 export type ScrapeOptions = z.infer<typeof scrapeOptions>;
 
 export const scrapeRequestSchema = scrapeOptions.extend({
   url,
   origin: z.string().optional().default("api"),
-});
+}).strict(strictMessage);
 
 // export type ScrapeRequest = {
 //   url: string;
@@ -83,7 +85,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
-});
+}).strict(strictMessage);
 
 // export type CrawlerOptions = {
 //   includePaths?: string[];
@@ -97,13 +99,13 @@ const crawlerOptions = z.object({
 
 export type CrawlerOptions = z.infer<typeof crawlerOptions>;
 
-export const crawlRequestSchema = z.object({
+export const crawlRequestSchema = crawlerOptions.extend({
   url,
   origin: z.string().optional().default("api"),
-  crawlerOptions: crawlerOptions.default({}),
   scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
   webhook: z.string().url().optional(),
-});
+  limit: z.number().default(10000),
+}).strict(strictMessage);
 
 // export type CrawlRequest = {
 //   url: string;
@@ -116,9 +118,10 @@ export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
 
 export const mapRequestSchema = crawlerOptions.extend({
   url: z.string().url(),
   origin: z.string().optional().default("api"),
-  includeSubdomains: z.boolean().default(false),
+  includeSubdomains: z.boolean().default(true),
   search: z.string().optional(),
-});
+  ignoreSitemap: z.boolean().default(false),
+}).strict(strictMessage);
 
 // export type MapRequest = {
 //   url: string;
@@ -224,20 +227,26 @@ type AuthObject = {
   plan: string;
 };
 
+type Account = {
+  remainingCredits: number;
+};
+
 export interface RequestWithMaybeAuth<
   ReqParams = {},
   ReqBody = undefined,
   ResBody = undefined
 > extends Request {
   auth?: AuthObject;
+  account?: Account;
 }
 
 export interface RequestWithAuth<
   ReqParams = {},
   ReqBody = undefined,
-  ResBody = undefined
+  ResBody = undefined,
 > extends Request {
   auth: AuthObject;
+  account?: Account;
 }
 
 export function legacyCrawlerOptions(x: CrawlerOptions) {
diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts
index c4f002ef..fa2698e7 100644
--- a/apps/api/src/lib/validateUrl.ts
+++ b/apps/api/src/lib/validateUrl.ts
@@ -113,7 +113,7 @@ export const checkAndUpdateURLForMap = (url: string) => {
   }
 
   // remove any query params
-  url = url.split("?")[0];
+  url = url.split("?")[0].trim();
 
   return { urlObj: typedUrlObj, url: url };
 };
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 0807bc0f..51f07c85 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -24,12 +24,17 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 // import { livenessController } from "../controllers/v1/liveness";
 // import { readinessController } from "../controllers/v1/readiness";
 
-function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
+function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
   return (req, res, next) => {
     (async () => {
-      if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) {
+      if (!minimum && req.body) {
+        minimum = (req.body as any)?.limit ?? 1;
+      }
+      const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
+      if (!success) {
         return res.status(402).json({ success: false, error: "Insufficient credits" });
       }
+      req.account = { remainingCredits }
       next();
     })()
       .catch(err => next(err));
@@ -71,7 +76,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
 }
 
 function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
-  if (isUrlBlocked(req.body.url)) {
+  if (req.body.url && isUrlBlocked(req.body.url)) {
     return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
   }
   next();
 }
@@ -101,14 +106,14 @@ v1Router.post(
   blocklistMiddleware,
   authMiddleware(RateLimiterMode.Crawl),
   idempotencyMiddleware,
-  checkCreditsMiddleware(1),
+  checkCreditsMiddleware(),
   wrap(crawlController)
 );
 
 v1Router.post(
   "/map",
   blocklistMiddleware,
-  authMiddleware(RateLimiterMode.Crawl),
+  authMiddleware(RateLimiterMode.Map),
   checkCreditsMiddleware(1),
   wrap(mapController)
 );
diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts
index d25289b2..2ad07318 100644
--- a/apps/api/src/services/billing/credit_billing.ts
+++ b/apps/api/src/services/billing/credit_billing.ts
@@ -168,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) {
 export async function checkTeamCredits(team_id: string, credits: number) {
   return withAuth(supaCheckTeamCredits)(team_id, credits);
 }
+
 // if team has enough credits for the operation, return true, else return false
 export async function supaCheckTeamCredits(team_id: string, credits: number) {
   if (team_id === "preview") {
-    return { success: true, message: "Preview team, no credits used" };
+    return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
   }
 
   // Retrieve the team's active subscription and check for available coupons concurrently
@@ -202,7 +203,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
   if (subscriptionError || !subscription) {
     // If there is no active subscription but there are available coupons
     if (couponCredits >= credits) {
-      return { success: true, message: "Sufficient credits available" };
+      return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
     }
 
     const { data: creditUsages, error: creditUsageError } =
@@ -252,9 +253,10 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
       return {
         success: false,
         message: "Insufficient credits, please upgrade!",
+        remainingCredits: FREE_CREDITS - totalCreditsUsed
       };
     }
-    return { success: true, message: "Sufficient credits available" };
+    return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed };
   }
 
   let totalCreditsUsed = 0;
@@ -321,7 +323,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
       subscription.current_period_start,
       subscription.current_period_end
     );
-    return { success: false, message: "Insufficient credits, please upgrade!" };
+    return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
   } else if (creditUsagePercentage >= 0.8) {
     // Send email notification for approaching credit limit
     await sendNotification(
@@ -332,7 +334,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     );
   }
 
-  return { success: true, message: "Sufficient credits available" };
+  return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
 }
 
 // Count the total credits used by a team within the current billing period and return the remaining credits.
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index cd4e03a1..afd80f42 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -1,10 +1,10 @@
+import "dotenv/config";
 import { CustomError } from "../lib/custom-error";
 import {
   getScrapeQueue,
   redisConnection,
   scrapeQueueName,
 } from "./queue-service";
-import "dotenv/config";
 import { logtail } from "./logtail";
 import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index f1399b13..2682d0a2 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -42,6 +42,19 @@ const RATE_LIMITS = {
     growth: 500,
     growthdouble: 500,
   },
+  map:{
+    default: 20,
+    free: 5,
+    starter: 20,
+    standard: 40,
+    standardOld: 40,
+    scale: 500,
+    hobby: 10,
+    standardNew: 50,
+    standardnew: 50,
+    growth: 500,
+    growthdouble: 500,
+  },
   preview: {
     free: 5,
     default: 5,
diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts
index 5e63ac78..70a8ab07 100644
--- a/apps/api/src/types.ts
+++ b/apps/api/src/types.ts
@@ -106,6 +106,7 @@ export enum RateLimiterMode {
   Scrape = "scrape",
   Preview = "preview",
   Search = "search",
+  Map = "map",
 }
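For reference, the new POST /v1/map route added in this patch can be exercised the same way the e2e tests above do. A minimal sketch, assuming an API reachable at TEST_URL and a valid key in TEST_API_KEY, mirroring the test setup; the fallback base URL below is illustrative, not part of the patch:

import request from "supertest";

// Base URL and API key mirror the e2e test environment; adjust for your own deployment.
const TEST_URL = process.env.TEST_URL ?? "http://localhost:3002";

async function mapSite() {
  const response = await request(TEST_URL)
    .post("/v1/map")
    .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
    .set("Content-Type", "application/json")
    // Fields validated by mapRequestSchema: url is required; search,
    // includeSubdomains (default true) and ignoreSitemap (default false) are optional.
    // Unknown keys are rejected because the schema is now .strict().
    .send({ url: "https://firecrawl.dev", search: "docs", includeSubdomains: true });

  // On success the controller responds with { success: true, links: string[] }.
  console.log(response.statusCode, response.body.links);
}

mapSite().catch(console.error);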