Merge remote-tracking branch 'origin/v1-webscraper' into v1/node-sdk

rafaelsideguide 2024-08-20 17:08:07 -03:00
commit 70d81ca69e
11 changed files with 294 additions and 59 deletions

View File

@@ -449,4 +449,161 @@ describe("E2E Tests for v1 API Routes", () => {
});
describe("POST /v1/map", () => {
it.concurrent("should require authorization", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
"/v1/map"
);
expect(response.statusCode).toBe(401);
});
it.concurrent("should return an error response with an invalid API key", async () => {
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer invalid-api-key`)
.set("Content-Type", "application/json")
.send({ url: "https://firecrawl.dev" });
expect(response.statusCode).toBe(401);
});
it.concurrent("should return a successful response with a valid API key", async () => {
const mapRequest = {
url: "https://roastmywebsite.ai"
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
});
it.concurrent("should return a successful response with a valid API key and search", async () => {
const mapRequest = {
url: "https://usemotion.com",
search: "pricing"
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("usemotion.com/pricing");
});
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
const mapRequest = {
url: "https://firecrawl.dev",
search: "docs",
includeSubdomains: true
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");
});
it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
const mapRequest = {
url: "https://www.firecrawl.dev",
search: "docs",
includeSubdomains: true
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).toContain("docs.firecrawl.dev");
}, 10000)
it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
const mapRequest = {
url: "https://www.firecrawl.dev",
search: "docs",
includeSubdomains: false
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(200);
expect(response.body).toHaveProperty("success", true);
expect(response.body).toHaveProperty("links");
if (!("links" in response.body)) {
throw new Error("Expected response body to have 'links' property");
}
const links = response.body.links as unknown[];
expect(Array.isArray(links)).toBe(true);
expect(links.length).toBeGreaterThan(0);
expect(links[0]).not.toContain("docs.firecrawl.dev");
})
it.concurrent("should return an error for invalid URL", async () => {
const mapRequest = {
url: "invalid-url",
includeSubdomains: true,
search: "test",
};
const response: ScrapeResponseRequestTest = await request(TEST_URL)
.post("/v1/map")
.set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
.set("Content-Type", "application/json")
.send(mapRequest);
expect(response.statusCode).toBe(400);
expect(response.body).toHaveProperty("success", false);
expect(response.body).toHaveProperty("error");
});
});
});
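For reference, the happy-path flow these tests exercise can be reduced to a small standalone helper. This is only a sketch: baseUrl and apiKey stand in for the TEST_URL and TEST_API_KEY values the suite uses, and the response shape is inferred from the assertions above.

import request from "supertest";

// Expected shape of a successful POST /v1/map response, inferred from the assertions above.
interface MapSuccessResponse {
  success: true;
  links: string[];
}

// Sketch only: baseUrl and apiKey correspond to TEST_URL and TEST_API_KEY in the suite.
async function mapSite(
  baseUrl: string,
  apiKey: string,
  url: string,
  search?: string
): Promise<MapSuccessResponse> {
  const res = await request(baseUrl)
    .post("/v1/map")
    .set("Authorization", `Bearer ${apiKey}`)
    .set("Content-Type", "application/json")
    .send(search ? { url, search } : { url });
  if (res.statusCode !== 200) {
    throw new Error(`Expected 200 from /v1/map, got ${res.statusCode}`);
  }
  return res.body as MapSuccessResponse;
}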

View File

@@ -115,6 +115,9 @@ export async function supaAuthenticateUser(
case RateLimiterMode.CrawlStatus:
rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
break;
case RateLimiterMode.Map:
rateLimiter = getRateLimiter(RateLimiterMode.Map, token);
break;
case RateLimiterMode.Preview:
rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
@@ -151,7 +154,7 @@ export async function supaAuthenticateUser(
if (
token === "this_is_just_a_preview_token" &&
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
(mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map)
) {
return { success: true, team_id: "preview" };
// check the origin of the request and make sure its from firecrawl.dev
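The preview-token allow-list now covers Map as well. As that list grows, the same check could be expressed with a lookup set; this is a hypothetical refactor, not part of this commit, and the import path for RateLimiterMode is assumed.

import { RateLimiterMode } from "../types"; // path assumed

const PREVIEW_ALLOWED_MODES = new Set<RateLimiterMode>([
  RateLimiterMode.Scrape,
  RateLimiterMode.Preview,
  RateLimiterMode.Search,
  RateLimiterMode.Map,
]);

// Equivalent to the boolean chain above: preview tokens may only hit these modes.
function isPreviewTokenAllowed(token: string, mode: RateLimiterMode): boolean {
  return token === "this_is_just_a_preview_token" && PREVIEW_ALLOWED_MODES.has(mode);
}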

View File

@@ -1,21 +1,44 @@
import { Response } from "express";
import { v4 as uuidv4 } from "uuid";
import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
import {
CrawlRequest,
crawlRequestSchema,
CrawlResponse,
legacyCrawlerOptions,
legacyScrapeOptions,
RequestWithAuth,
} from "./types";
import {
addCrawlJob,
addCrawlJobs,
crawlToCrawler,
lockURL,
lockURLs,
saveCrawl,
StoredCrawl,
} from "../../lib/crawl-redis";
import { logCrawl } from "../../services/logging/crawl_log";
import { getScrapeQueue } from "../../services/queue-service";
import { addScrapeJob } from "../../services/queue-jobs";
import { Logger } from "../../lib/logger";
export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, res: Response<CrawlResponse>) {
export async function crawlController(
req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
res: Response<CrawlResponse>
) {
req.body = crawlRequestSchema.parse(req.body);
const id = uuidv4();
await logCrawl(id, req.auth.team_id);
const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions),
pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
const { remainingCredits } = req.account;
// TODO: Get rid of crawlerOptions
const crawlerOptions = legacyCrawlerOptions(req.body);
const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);
const sc: StoredCrawl = {
originUrl: req.body.url,
@@ -30,15 +53,21 @@ export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, Cr
try {
sc.robots = await crawler.getRobotsTxt();
} catch (e) {
Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
Logger.debug(
`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
e
)}`
);
}
await saveCrawl(id, sc);
const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
const sitemap = sc.crawlerOptions.ignoreSitemap
? null
: await crawler.tryGetSitemap();
if (sitemap !== null) {
const jobs = sitemap.map(x => {
const jobs = sitemap.map((x) => {
const url = x.url;
const uuid = uuidv4();
return {
@@ -56,16 +85,23 @@ export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, Cr
opts: {
jobId: uuid,
priority: 20,
}
},
};
})
});
await lockURLs(id, jobs.map(x => x.data.url));
await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
await lockURLs(
id,
jobs.map((x) => x.data.url)
);
await addCrawlJobs(
id,
jobs.map((x) => x.opts.jobId)
);
await getScrapeQueue().addBulk(jobs);
} else {
await lockURL(id, sc, req.body.url);
const job = await addScrapeJob({
const job = await addScrapeJob(
{
url: req.body.url,
mode: "single_urls",
crawlerOptions: crawlerOptions,
@@ -74,15 +110,17 @@ export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, Cr
origin: "api",
crawl_id: id,
webhook: req.body.webhook,
}, {
},
{
priority: 15,
});
}
);
await addCrawlJob(id, job.id);
}
return res.status(200).json({
success: true,
id,
url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
});
}
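Because crawlRequestSchema now extends crawlerOptions directly (see the types diff below), crawler settings sit at the top level of the request body rather than under a nested crawlerOptions key. A sketch of what a POST /v1/crawl body can look like after this change; field names follow the schema diff, values are illustrative.

const crawlBody = {
  url: "https://firecrawl.dev",
  // capped server-side: crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit)
  limit: 100,
  ignoreSitemap: false,
  scrapeOptions: { formats: ["markdown"] }, // illustrative; see the scrapeOptions schema
  webhook: "https://example.com/firecrawl-webhook", // optional
};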

View File

@@ -14,6 +14,7 @@ import {
isSameSubdomain,
} from "../../lib/validateUrl";
import { fireEngineMap } from "../../search/fireEngine";
import { billTeam } from "../../services/billing/credit_billing";
configDotenv();
@@ -26,11 +27,10 @@ export async function mapController(
const id = uuidv4();
let links: string[] = [req.body.url];
const crawlerOptions = legacyCrawlerOptions(req.body);
const sc: StoredCrawl = {
originUrl: req.body.url,
crawlerOptions,
crawlerOptions: legacyCrawlerOptions(req.body),
pageOptions: {},
team_id: req.auth.team_id,
createdAt: Date.now(),
@@ -39,7 +39,7 @@ export async function mapController(
const crawler = crawlToCrawler(id, sc);
const sitemap =
sc.crawlerOptions.ignoreSitemap || req.body.search
req.body.ignoreSitemap
? null
: await crawler.tryGetSitemap();
@@ -49,8 +49,10 @@ export async function mapController(
});
}
let urlWithoutWww = req.body.url.replace("www.", "");
let mapUrl = req.body.search
? `"${req.body.search}" site:${req.body.url}`
? `"${req.body.search}" site:${urlWithoutWww}`
: `site:${req.body.url}`;
// www. seems to exclude subdomains in some cases
const mapResults = await fireEngineMap(mapUrl, {
@@ -58,16 +60,19 @@ export async function mapController(
});
if (mapResults.length > 0) {
mapResults.map((x) => {
if (req.body.search) {
links.unshift(x.url);
// Ensure all map results are first, maintaining their order
links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
} else {
mapResults.map((x) => {
links.push(x.url);
}
});
}
}
links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());
links = links.map((x) => checkAndUpdateURLForMap(x).url);
// allows for subdomains to be included
links = links.filter((x) => isSameDomain(x, req.body.url));
@@ -80,6 +85,8 @@ export async function mapController(
// remove duplicates that could be due to http/https or www
links = [...new Set(links)];
await billTeam(req.auth.team_id, 1);
return res.status(200).json({
success: true,
links,
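The query construction above drops "www." only for the search variant. Isolated as a standalone sketch (a hypothetical helper, not part of this commit):

function buildMapQuery(url: string, search?: string): string {
  // www. seems to exclude subdomains in some cases, so strip it when searching
  const urlWithoutWww = url.replace("www.", "");
  return search ? `"${search}" site:${urlWithoutWww}` : `site:${url}`;
}

// buildMapQuery("https://www.firecrawl.dev", "docs") => '"docs" site:https://firecrawl.dev'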

View File

@@ -33,6 +33,8 @@ const url = z.preprocess(
)
);
const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
export const scrapeOptions = z.object({
formats: z
.enum([
@@ -53,14 +55,14 @@ export const scrapeOptions = z.object({
timeout: z.number().int().positive().finite().safe().default(30000), // default?
waitFor: z.number().int().nonnegative().finite().safe().default(0),
parsePDF: z.boolean().default(true),
});
}).strict(strictMessage);
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
export const scrapeRequestSchema = scrapeOptions.extend({
url,
origin: z.string().optional().default("api"),
});
}).strict(strictMessage);
// export type ScrapeRequest = {
// url: string;
@@ -83,7 +85,7 @@ const crawlerOptions = z.object({
allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
allowExternalLinks: z.boolean().default(false),
ignoreSitemap: z.boolean().default(true),
});
}).strict(strictMessage);
// export type CrawlerOptions = {
// includePaths?: string[];
@@ -97,13 +99,13 @@ const crawlerOptions = z.object({
export type CrawlerOptions = z.infer<typeof crawlerOptions>;
export const crawlRequestSchema = z.object({
export const crawlRequestSchema = crawlerOptions.extend({
url,
origin: z.string().optional().default("api"),
crawlerOptions: crawlerOptions.default({}),
scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
webhook: z.string().url().optional(),
});
limit: z.number().default(10000),
}).strict(strictMessage);
// export type CrawlRequest = {
// url: string;
@@ -116,9 +118,10 @@ export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
export const mapRequestSchema = crawlerOptions.extend({
url: z.string().url(),
origin: z.string().optional().default("api"),
includeSubdomains: z.boolean().default(false),
includeSubdomains: z.boolean().default(true),
search: z.string().optional(),
});
ignoreSitemap: z.boolean().default(false),
}).strict(strictMessage);
// export type MapRequest = {
// url: string;
@@ -224,20 +227,26 @@ type AuthObject = {
plan: string;
};
type Account = {
remainingCredits: number;
};
export interface RequestWithMaybeAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
> extends Request<ReqParams, ReqBody, ResBody> {
auth?: AuthObject;
account?: Account;
}
export interface RequestWithAuth<
ReqParams = {},
ReqBody = undefined,
ResBody = undefined
ResBody = undefined,
> extends Request<ReqParams, ReqBody, ResBody> {
auth: AuthObject;
account?: Account;
}
export function legacyCrawlerOptions(x: CrawlerOptions) {
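With .strict(strictMessage) applied, unknown keys in a request body now fail validation instead of being silently stripped. A minimal zod illustration of the behaviour (standalone sketch, not the actual schemas):

import { z } from "zod";

const strictMessage =
  "Unrecognized key in body -- please review the v1 API documentation for request body changes";

const exampleSchema = z
  .object({
    url: z.string().url(),
    includeSubdomains: z.boolean().default(true),
  })
  .strict(strictMessage);

// A typo'd key ("includSubdomains") is now rejected with the custom message.
const result = exampleSchema.safeParse({
  url: "https://firecrawl.dev",
  includSubdomains: true,
});
// result.success === false; the issue code is "unrecognized_keys" and its message is strictMessage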

View File

@@ -113,7 +113,7 @@ export const checkAndUpdateURLForMap = (url: string) => {
}
// remove any query params
url = url.split("?")[0];
url = url.split("?")[0].trim();
return { urlObj: typedUrlObj, url: url };
};

View File

@@ -24,12 +24,17 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
// import { livenessController } from "../controllers/v1/liveness";
// import { readinessController } from "../controllers/v1/readiness";
function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
return (req, res, next) => {
(async () => {
if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) {
if (!minimum && req.body) {
minimum = (req.body as any)?.limit ?? 1;
}
const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
if (!success) {
return res.status(402).json({ success: false, error: "Insufficient credits" });
}
req.account = { remainingCredits }
next();
})()
.catch(err => next(err));
@@ -71,7 +76,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
}
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
if (isUrlBlocked(req.body.url)) {
if (req.body.url && isUrlBlocked(req.body.url)) {
return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
}
next();
@@ -101,14 +106,14 @@ v1Router.post(
blocklistMiddleware,
authMiddleware(RateLimiterMode.Crawl),
idempotencyMiddleware,
checkCreditsMiddleware(1),
checkCreditsMiddleware(),
wrap(crawlController)
);
v1Router.post(
"/map",
blocklistMiddleware,
authMiddleware(RateLimiterMode.Crawl),
authMiddleware(RateLimiterMode.Map),
checkCreditsMiddleware(1),
wrap(mapController)
);
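checkCreditsMiddleware now attaches the team's credit headroom to the request, and with no explicit minimum it falls back to req.body.limit (or 1). A sketch of how a downstream controller can read that value; the import path and controller body are illustrative.

import { Response } from "express";
import { RequestWithAuth } from "../controllers/v1/types"; // path assumed

function exampleController(req: RequestWithAuth, res: Response) {
  // populated by checkCreditsMiddleware before the controller runs
  const remainingCredits = req.account?.remainingCredits ?? 0;
  // e.g. never promise more work than the team has credits for
  const limit = Math.min(remainingCredits, 10000);
  res.status(200).json({ success: true, limit });
}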

View File

@@ -168,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) {
export async function checkTeamCredits(team_id: string, credits: number) {
return withAuth(supaCheckTeamCredits)(team_id, credits);
}
// if team has enough credits for the operation, return true, else return false
export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (team_id === "preview") {
return { success: true, message: "Preview team, no credits used" };
return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
}
// Retrieve the team's active subscription and check for available coupons concurrently
@@ -202,7 +203,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
if (subscriptionError || !subscription) {
// If there is no active subscription but there are available coupons
if (couponCredits >= credits) {
return { success: true, message: "Sufficient credits available" };
return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
}
const { data: creditUsages, error: creditUsageError } =
@@ -252,9 +253,10 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
return {
success: false,
message: "Insufficient credits, please upgrade!",
remainingCredits: FREE_CREDITS - totalCreditsUsed
};
}
return { success: true, message: "Sufficient credits available" };
return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed };
}
let totalCreditsUsed = 0;
@@ -321,7 +323,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
subscription.current_period_start,
subscription.current_period_end
);
return { success: false, message: "Insufficient credits, please upgrade!" };
return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
} else if (creditUsagePercentage >= 0.8) {
// Send email notification for approaching credit limit
await sendNotification(
@@ -332,7 +334,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
);
}
return { success: true, message: "Sufficient credits available" };
return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
}
// Count the total credits used by a team within the current billing period and return the remaining credits.
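Every return path of supaCheckTeamCredits now reports remainingCredits alongside success and message, which is what checkCreditsMiddleware and crawlController consume. The implied return shape, written out as a sketch (not an exported type in this diff):

type CheckTeamCreditsResult = {
  success: boolean;
  message: string;
  remainingCredits: number; // Infinity for the preview team
};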

View File

@@ -1,10 +1,10 @@
import "dotenv/config";
import { CustomError } from "../lib/custom-error";
import {
getScrapeQueue,
redisConnection,
scrapeQueueName,
} from "./queue-service";
import "dotenv/config";
import { logtail } from "./logtail";
import { startWebScraperPipeline } from "../main/runWebScraper";
import { callWebhook } from "./webhook";

View File

@@ -42,6 +42,19 @@ const RATE_LIMITS = {
growth: 500,
growthdouble: 500,
},
map:{
default: 20,
free: 5,
starter: 20,
standard: 40,
standardOld: 40,
scale: 500,
hobby: 10,
standardNew: 50,
standardnew: 50,
growth: 500,
growthdouble: 500,
},
preview: {
free: 5,
default: 5,

View File

@@ -106,6 +106,7 @@ export enum RateLimiterMode {
Scrape = "scrape",
Preview = "preview",
Search = "search",
Map = "map",
}