From 27903247b6a802e9dd807af0b52c0c4a80ecc207 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 12:04:08 -0300 Subject: [PATCH 01/10] Nick: map tests and fixes --- .../__tests__/e2e_v1_withAuth/index.test.ts | 60 +++++++++++++++++++ apps/api/src/controllers/v1/auth.ts | 5 +- apps/api/src/routes/v1.ts | 4 +- apps/api/src/services/rate-limiter.ts | 13 ++++ apps/api/src/types.ts | 1 + 5 files changed, 80 insertions(+), 3 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index d10a286f..af094442 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -449,4 +449,64 @@ describe("E2E Tests for v1 API Routes", () => { }); + +describe("POST /v1/map", () => { + it.concurrent("should require authorization", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL).post( + "/v1/map" + ); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return an error response with an invalid API key", async () => { + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer invalid-api-key`) + .set("Content-Type", "application/json") + .send({ url: "https://firecrawl.dev" }); + expect(response.statusCode).toBe(401); + }); + + it.concurrent("should return a successful response with a valid API key", async () => { + const mapRequest = { + url: "https://roastmywebsite.ai", + includeSubdomains: true, + search: "test", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + }); + + it.concurrent("should return an error for invalid URL", async () => { + const mapRequest = { + url: "invalid-url", + includeSubdomains: true, + search: "test", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(400); + expect(response.body).toHaveProperty("success", false); + expect(response.body).toHaveProperty("error"); + }); +}); }); diff --git a/apps/api/src/controllers/v1/auth.ts b/apps/api/src/controllers/v1/auth.ts index bc6951c9..d4da3c6b 100644 --- a/apps/api/src/controllers/v1/auth.ts +++ b/apps/api/src/controllers/v1/auth.ts @@ -115,6 +115,9 @@ export async function supaAuthenticateUser( case RateLimiterMode.CrawlStatus: rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token); break; + case RateLimiterMode.Map: + rateLimiter = getRateLimiter(RateLimiterMode.Map, token); + break; case RateLimiterMode.Preview: rateLimiter = getRateLimiter(RateLimiterMode.Preview, token); @@ -151,7 +154,7 @@ export async function supaAuthenticateUser( if ( token === "this_is_just_a_preview_token" && - (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search) + (mode 
=== RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map) ) { return { success: true, team_id: "preview" }; // check the origin of the request and make sure its from firecrawl.dev diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 0807bc0f..25e12c63 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -71,7 +71,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction) } function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { - if (isUrlBlocked(req.body.url)) { + if (req.body.url && isUrlBlocked(req.body.url)) { return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." }); } next(); @@ -108,7 +108,7 @@ v1Router.post( v1Router.post( "/map", blocklistMiddleware, - authMiddleware(RateLimiterMode.Crawl), + authMiddleware(RateLimiterMode.Map), checkCreditsMiddleware(1), wrap(mapController) ); diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts index f1399b13..2682d0a2 100644 --- a/apps/api/src/services/rate-limiter.ts +++ b/apps/api/src/services/rate-limiter.ts @@ -42,6 +42,19 @@ const RATE_LIMITS = { growth: 500, growthdouble: 500, }, + map:{ + default: 20, + free: 5, + starter: 20, + standard: 40, + standardOld: 40, + scale: 500, + hobby: 10, + standardNew: 50, + standardnew: 50, + growth: 500, + growthdouble: 500, + }, preview: { free: 5, default: 5, diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 5e63ac78..70a8ab07 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -106,6 +106,7 @@ export enum RateLimiterMode { Scrape = "scrape", Preview = "preview", Search = "search", + Map = "map", } From 55dad82df109ee008806f31b27d170ff6e8c8475 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 12:17:53 -0300 Subject: [PATCH 02/10] Nick: fixed map search --- .../__tests__/e2e_v1_withAuth/index.test.ts | 28 +++++++++++++++++-- apps/api/src/controllers/v1/map.ts | 22 ++++++++------- apps/api/src/controllers/v1/types.ts | 1 + apps/api/src/lib/validateUrl.ts | 2 +- 4 files changed, 39 insertions(+), 14 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index af094442..b58e52ab 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -469,9 +469,7 @@ describe("POST /v1/map", () => { it.concurrent("should return a successful response with a valid API key", async () => { const mapRequest = { - url: "https://roastmywebsite.ai", - includeSubdomains: true, - search: "test", + url: "https://roastmywebsite.ai" }; const response: ScrapeResponseRequestTest = await request(TEST_URL) @@ -491,6 +489,30 @@ describe("POST /v1/map", () => { expect(links.length).toBeGreaterThan(0); }); + it.concurrent("should return a successful response with a valid API key and search", async () => { + const mapRequest = { + url: "https://usemotion.com", + search: "pricing" + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if 
(!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("usemotion.com/pricing"); + }); + it.concurrent("should return an error for invalid URL", async () => { const mapRequest = { url: "invalid-url", diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 78cfda04..a50b7615 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -26,11 +26,10 @@ export async function mapController( const id = uuidv4(); let links: string[] = [req.body.url]; - const crawlerOptions = legacyCrawlerOptions(req.body); const sc: StoredCrawl = { originUrl: req.body.url, - crawlerOptions, + crawlerOptions: legacyCrawlerOptions(req.body), pageOptions: {}, team_id: req.auth.team_id, createdAt: Date.now(), @@ -39,7 +38,7 @@ export async function mapController( const crawler = crawlToCrawler(id, sc); const sitemap = - sc.crawlerOptions.ignoreSitemap || req.body.search + req.body.ignoreSitemap ? null : await crawler.tryGetSitemap(); @@ -58,16 +57,19 @@ export async function mapController( }); if (mapResults.length > 0) { - mapResults.map((x) => { - if (req.body.search) { - links.unshift(x.url); - } else { + if (req.body.search) { + // Ensure all map results are first, maintaining their order + links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links]; + } else { + mapResults.map((x) => { links.push(x.url); - } - }); + }); + } } - links = links.map((x) => checkAndUpdateURLForMap(x).url); + links = links.map((x) => checkAndUpdateURLForMap(x).url.trim()); + + // allows for subdomains to be included links = links.filter((x) => isSameDomain(x, req.body.url)); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 77a9f2dd..333f582e 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -118,6 +118,7 @@ export const mapRequestSchema = crawlerOptions.extend({ origin: z.string().optional().default("api"), includeSubdomains: z.boolean().default(false), search: z.string().optional(), + ignoreSitemap: z.boolean().default(false), }); // export type MapRequest = { diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts index c4f002ef..fa2698e7 100644 --- a/apps/api/src/lib/validateUrl.ts +++ b/apps/api/src/lib/validateUrl.ts @@ -113,7 +113,7 @@ export const checkAndUpdateURLForMap = (url: string) => { } // remove any query params - url = url.split("?")[0]; + url = url.split("?")[0].trim(); return { urlObj: typedUrlObj, url: url }; }; From e200ec9e12176a8e4877041f598f0b1a85aacb9e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 12:24:14 -0300 Subject: [PATCH 03/10] Nick: --- .../__tests__/e2e_v1_withAuth/index.test.ts | 75 +++++++++++++++++++ apps/api/src/controllers/v1/map.ts | 4 +- apps/api/src/controllers/v1/types.ts | 2 +- 3 files changed, 79 insertions(+), 2 deletions(-) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index b58e52ab..3bb26e98 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -513,6 +513,81 @@ describe("POST /v1/map", () => { expect(links[0]).toContain("usemotion.com/pricing"); }); + it.concurrent("should return a successful response with a valid 
API key and search and allowSubdomains", async () => { + const mapRequest = { + url: "https://firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("docs.firecrawl.dev"); + }); + + it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: true + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).toContain("docs.firecrawl.dev"); + }, 10000) + + it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => { + const mapRequest = { + url: "https://www.firecrawl.dev", + search: "docs", + includeSubdomains: false + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/map") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(mapRequest); + + expect(response.statusCode).toBe(200); + expect(response.body).toHaveProperty("success", true); + expect(response.body).toHaveProperty("links"); + if (!("links" in response.body)) { + throw new Error("Expected response body to have 'links' property"); + } + const links = response.body.links as unknown[]; + expect(Array.isArray(links)).toBe(true); + expect(links.length).toBeGreaterThan(0); + expect(links[0]).not.toContain("docs.firecrawl.dev"); + }) + it.concurrent("should return an error for invalid URL", async () => { const mapRequest = { url: "invalid-url", diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index a50b7615..08d5ab61 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -48,8 +48,10 @@ export async function mapController( }); } + let urlWithoutWww = req.body.url.replace("www.", ""); + let mapUrl = req.body.search - ? `"${req.body.search}" site:${req.body.url}` + ? `"${req.body.search}" site:${urlWithoutWww}` : `site:${req.body.url}`; // www. 
seems to exclude subdomains in some cases const mapResults = await fireEngineMap(mapUrl, { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 333f582e..3fe35891 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -116,7 +116,7 @@ export type CrawlRequest = z.infer; export const mapRequestSchema = crawlerOptions.extend({ url: z.string().url(), origin: z.string().optional().default("api"), - includeSubdomains: z.boolean().default(false), + includeSubdomains: z.boolean().default(true), search: z.string().optional(), ignoreSitemap: z.boolean().default(false), }); From de0dc20a02b635b3f6abe241cf26779919b17d70 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 14:18:14 -0300 Subject: [PATCH 04/10] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index d25289b2..2ad07318 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -168,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) { export async function checkTeamCredits(team_id: string, credits: number) { return withAuth(supaCheckTeamCredits)(team_id, credits); } + // if team has enough credits for the operation, return true, else return false export async function supaCheckTeamCredits(team_id: string, credits: number) { if (team_id === "preview") { - return { success: true, message: "Preview team, no credits used" }; + return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity }; } // Retrieve the team's active subscription and check for available coupons concurrently @@ -202,7 +203,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { if (subscriptionError || !subscription) { // If there is no active subscription but there are available coupons if (couponCredits >= credits) { - return { success: true, message: "Sufficient credits available" }; + return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits }; } const { data: creditUsages, error: creditUsageError } = @@ -252,9 +253,10 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { return { success: false, message: "Insufficient credits, please upgrade!", + remainingCredits: FREE_CREDITS - totalCreditsUsed }; } - return { success: true, message: "Sufficient credits available" }; + return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed }; } let totalCreditsUsed = 0; @@ -321,7 +323,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { subscription.current_period_start, subscription.current_period_end ); - return { success: false, message: "Insufficient credits, please upgrade!" 
}; + return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed }; } else if (creditUsagePercentage >= 0.8) { // Send email notification for approaching credit limit await sendNotification( @@ -332,7 +334,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { ); } - return { success: true, message: "Sufficient credits available" }; + return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed }; } // Count the total credits used by a team within the current billing period and return the remaining credits. From c5ad4dedeb19a3b2df49f46c5789197ddba2ff7b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 14:19:20 -0300 Subject: [PATCH 05/10] Update crawl.ts --- apps/api/src/controllers/v1/crawl.ts | 85 +++++++++++++++++++--------- 1 file changed, 59 insertions(+), 26 deletions(-) diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index f95364a3..51055fb5 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -1,21 +1,39 @@ import { Response } from "express"; import { v4 as uuidv4 } from "uuid"; -import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types"; -import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis"; +import { + CrawlRequest, + crawlRequestSchema, + CrawlResponse, + legacyCrawlerOptions, + legacyScrapeOptions, + RequestWithAuth, +} from "./types"; +import { + addCrawlJob, + addCrawlJobs, + crawlToCrawler, + lockURL, + lockURLs, + saveCrawl, + StoredCrawl, +} from "../../lib/crawl-redis"; import { logCrawl } from "../../services/logging/crawl_log"; import { getScrapeQueue } from "../../services/queue-service"; import { addScrapeJob } from "../../services/queue-jobs"; import { Logger } from "../../lib/logger"; -export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, res: Response) { +export async function crawlController( + req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, + res: Response +) { req.body = crawlRequestSchema.parse(req.body); - + const id = uuidv4(); await logCrawl(id, req.auth.team_id); - const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions), - pageOptions = legacyScrapeOptions(req.body.scrapeOptions); + const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions); + const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); const sc: StoredCrawl = { originUrl: req.body.url, @@ -30,15 +48,21 @@ export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, Cr try { sc.robots = await crawler.getRobotsTxt(); } catch (e) { - Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`); + Logger.debug( + `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify( + e + )}` + ); } await saveCrawl(id, sc); - const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap(); + const sitemap = sc.crawlerOptions.ignoreSitemap + ? 
null + : await crawler.tryGetSitemap(); if (sitemap !== null) { - const jobs = sitemap.map(x => { + const jobs = sitemap.map((x) => { const url = x.url; const uuid = uuidv4(); return { @@ -56,33 +80,42 @@ export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, Cr opts: { jobId: uuid, priority: 20, - } + }, }; - }) + }); - await lockURLs(id, jobs.map(x => x.data.url)); - await addCrawlJobs(id, jobs.map(x => x.opts.jobId)); + await lockURLs( + id, + jobs.map((x) => x.data.url) + ); + await addCrawlJobs( + id, + jobs.map((x) => x.opts.jobId) + ); await getScrapeQueue().addBulk(jobs); } else { await lockURL(id, sc, req.body.url); - const job = await addScrapeJob({ - url: req.body.url, - mode: "single_urls", - crawlerOptions: crawlerOptions, - team_id: req.auth.team_id, - pageOptions: pageOptions, - origin: "api", - crawl_id: id, - webhook: req.body.webhook, - }, { - priority: 15, - }); + const job = await addScrapeJob( + { + url: req.body.url, + mode: "single_urls", + crawlerOptions: crawlerOptions, + team_id: req.auth.team_id, + pageOptions: pageOptions, + origin: "api", + crawl_id: id, + webhook: req.body.webhook, + }, + { + priority: 15, + } + ); await addCrawlJob(id, job.id); } return res.status(200).json({ success: true, id, - url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`, + url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`, }); } From 70d50b3640812efceb6e3494b675a333c20ae5ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 20 Aug 2024 19:25:19 +0200 Subject: [PATCH 06/10] fix(queue-worker): move dotenv config up --- apps/api/src/services/queue-worker.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index cd4e03a1..afd80f42 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -1,10 +1,10 @@ +import "dotenv/config"; import { CustomError } from "../lib/custom-error"; import { getScrapeQueue, redisConnection, scrapeQueueName, } from "./queue-service"; -import "dotenv/config"; import { logtail } from "./logtail"; import { startWebScraperPipeline } from "../main/runWebScraper"; import { callWebhook } from "./webhook"; From cf32893c2e3272f0ce737bc7e23b06c5bb1d2142 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 20 Aug 2024 19:31:26 +0200 Subject: [PATCH 07/10] add strict enforcement + move crawlerOptions to top-level in /crawl --- apps/api/src/controllers/v1/types.ts | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 3fe35891..afb8ab18 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -33,6 +33,8 @@ const url = z.preprocess( ) ); +const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes"; + export const scrapeOptions = z.object({ formats: z .enum([ @@ -53,14 +55,14 @@ export const scrapeOptions = z.object({ timeout: z.number().int().positive().finite().safe().default(30000), // default? 
waitFor: z.number().int().nonnegative().finite().safe().default(0), parsePDF: z.boolean().default(true), -}); +}).strict(strictMessage); export type ScrapeOptions = z.infer; export const scrapeRequestSchema = scrapeOptions.extend({ url, origin: z.string().optional().default("api"), -}); +}).strict(strictMessage); // export type ScrapeRequest = { // url: string; @@ -83,7 +85,7 @@ const crawlerOptions = z.object({ allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME??? allowExternalLinks: z.boolean().default(false), ignoreSitemap: z.boolean().default(true), -}); +}).strict(strictMessage); // export type CrawlerOptions = { // includePaths?: string[]; @@ -97,13 +99,12 @@ const crawlerOptions = z.object({ export type CrawlerOptions = z.infer; -export const crawlRequestSchema = z.object({ +export const crawlRequestSchema = crawlerOptions.extend({ url, origin: z.string().optional().default("api"), - crawlerOptions: crawlerOptions.default({}), scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), webhook: z.string().url().optional(), -}); +}).strict(strictMessage); // export type CrawlRequest = { // url: string; @@ -119,7 +120,7 @@ export const mapRequestSchema = crawlerOptions.extend({ includeSubdomains: z.boolean().default(true), search: z.string().optional(), ignoreSitemap: z.boolean().default(false), -}); +}).strict(strictMessage); // export type MapRequest = { // url: string; From b36faeaf540add44f0921bd6ff0d5a4360f5eb5d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 14:39:52 -0300 Subject: [PATCH 08/10] Nick: --- apps/api/src/controllers/v1/crawl.ts | 5 +++++ apps/api/src/controllers/v1/types.ts | 11 +++++++++-- apps/api/src/routes/v1.ts | 11 ++++++++--- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 51055fb5..6166af15 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -32,9 +32,14 @@ export async function crawlController( await logCrawl(id, req.auth.team_id); + const { remainingCredits } = req.account; + + // TODO: Get rid of crawlerOptions const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions); const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); + crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); + const sc: StoredCrawl = { originUrl: req.body.url, crawlerOptions, diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 3fe35891..39f34abe 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -100,9 +100,10 @@ export type CrawlerOptions = z.infer; export const crawlRequestSchema = z.object({ url, origin: z.string().optional().default("api"), - crawlerOptions: crawlerOptions.default({}), + crawlerOptions: crawlerOptions.default({}), // TODO: Get rid of this scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}), webhook: z.string().url().optional(), + limit: z.number().default(10000), // }); // export type CrawlRequest = { @@ -225,20 +226,26 @@ type AuthObject = { plan: string; }; +type Account = { + remainingCredits: number; +}; + export interface RequestWithMaybeAuth< ReqParams = {}, ReqBody = undefined, ResBody = undefined > extends Request { auth?: AuthObject; + account?: Account; } export interface RequestWithAuth< ReqParams = {}, ReqBody = undefined, - ResBody = undefined + ResBody = undefined, > extends Request { auth: AuthObject; + account?: Account; } 
export function legacyCrawlerOptions(x: CrawlerOptions) { diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts index 25e12c63..51f07c85 100644 --- a/apps/api/src/routes/v1.ts +++ b/apps/api/src/routes/v1.ts @@ -24,12 +24,17 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist"; // import { livenessController } from "../controllers/v1/liveness"; // import { readinessController } from "../controllers/v1/readiness"; -function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void { +function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void { return (req, res, next) => { (async () => { - if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) { + if (!minimum && req.body) { + minimum = (req.body as any)?.limit ?? 1; + } + const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum); + if (!success) { return res.status(402).json({ success: false, error: "Insufficient credits" }); } + req.account = { remainingCredits } next(); })() .catch(err => next(err)); @@ -101,7 +106,7 @@ v1Router.post( blocklistMiddleware, authMiddleware(RateLimiterMode.Crawl), idempotencyMiddleware, - checkCreditsMiddleware(1), + checkCreditsMiddleware(), wrap(crawlController) ); From 39388cdc352af1b62690c5dcd1d86ae2344fda23 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 14:41:43 -0300 Subject: [PATCH 09/10] Update crawl.ts --- apps/api/src/controllers/v1/crawl.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 6166af15..43f940b7 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -35,7 +35,7 @@ export async function crawlController( const { remainingCredits } = req.account; // TODO: Get rid of crawlerOptions - const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions); + const crawlerOptions = legacyCrawlerOptions(req.body); const pageOptions = legacyScrapeOptions(req.body.scrapeOptions); crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit); From 0c48c8a4369986ce5093722f372d89f07add0ca3 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 20 Aug 2024 16:43:46 -0300 Subject: [PATCH 10/10] Nick: billing for map --- apps/api/src/controllers/v1/map.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 08d5ab61..76cf1498 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -14,6 +14,7 @@ import { isSameSubdomain, } from "../../lib/validateUrl"; import { fireEngineMap } from "../../search/fireEngine"; +import { billTeam } from "../../services/billing/credit_billing"; configDotenv(); @@ -84,6 +85,8 @@ export async function mapController( // remove duplicates that could be due to http/https or www links = [...new Set(links)]; + await billTeam(req.auth.team_id, 1); + return res.status(200).json({ success: true, links,
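
Taken together, these patches wire the new POST /v1/map endpoint through auth (the new RateLimiterMode.Map and "map" rate-limit bucket from PATCH 01), the blocklist and credit-check middleware (PATCH 01/08), search-aware link ordering (PATCH 02/03), and per-call billing of one credit (PATCH 10). As a reading aid only, here is a minimal client-side sketch that exercises the endpoint the way the e2e tests above do. It is not part of the patches: the FIRECRAWL_API_URL and FIRECRAWL_API_KEY environment variable names and the mapSite helper are placeholder names assumed for illustration; the request and response shapes follow mapRequestSchema and the test assertions in this series.

// map-client-sketch.ts — illustrative only; shapes mirror mapRequestSchema
// and the e2e tests in this series. Requires Node 18+ for global fetch.

interface MapRequest {
  url: string;
  search?: string;             // e.g. "pricing"; matching links are ordered first
  includeSubdomains?: boolean; // server-side default is true as of PATCH 03
  ignoreSitemap?: boolean;     // server-side default is false (sitemap is used)
}

interface MapResponse {
  success: boolean;
  links?: string[]; // present when success === true
  error?: string;   // present on failures (e.g. invalid URL -> 400)
}

async function mapSite(request: MapRequest): Promise<string[]> {
  const response = await fetch(`${process.env.FIRECRAWL_API_URL}/v1/map`, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
    },
    body: JSON.stringify(request),
  });

  // Per the tests and middleware above: 401 = missing/invalid API key,
  // 400 = schema or URL validation error, 402 = insufficient credits.
  const body = (await response.json()) as MapResponse;
  if (!response.ok || !body.success || !body.links) {
    throw new Error(`map failed (${response.status}): ${body.error ?? "unknown error"}`);
  }
  return body.links;
}

// Usage mirroring the "valid API key and search" test: when `search` is set,
// matching links (e.g. usemotion.com/pricing) come first in the result.
// mapSite({ url: "https://usemotion.com", search: "pricing" })
//   .then((links) => console.log(links[0]));

Note that each successful call is billed one credit (billTeam in PATCH 10) and consumed from the "map" rate-limit bucket added in PATCH 01, so a client seeing 402 responses should stop and top up credits rather than retry in a loop.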