Merge remote-tracking branch 'origin/v1-webscraper' into v1/node-sdk
Commit 70d81ca69e
@@ -449,4 +449,161 @@ describe("E2E Tests for v1 API Routes", () => {
   });
+
+  describe("POST /v1/map", () => {
+    it.concurrent("should require authorization", async () => {
+      const response: ScrapeResponseRequestTest = await request(TEST_URL).post(
+        "/v1/map"
+      );
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return an error response with an invalid API key", async () => {
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/map")
+        .set("Authorization", `Bearer invalid-api-key`)
+        .set("Content-Type", "application/json")
+        .send({ url: "https://firecrawl.dev" });
+      expect(response.statusCode).toBe(401);
+    });
+
+    it.concurrent("should return a successful response with a valid API key", async () => {
+      const mapRequest = {
+        url: "https://roastmywebsite.ai"
+      };
+
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/map")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(mapRequest);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success", true);
+      expect(response.body).toHaveProperty("links");
+      if (!("links" in response.body)) {
+        throw new Error("Expected response body to have 'links' property");
+      }
+      const links = response.body.links as unknown[];
+      expect(Array.isArray(links)).toBe(true);
+      expect(links.length).toBeGreaterThan(0);
+    });
+
+    it.concurrent("should return a successful response with a valid API key and search", async () => {
+      const mapRequest = {
+        url: "https://usemotion.com",
+        search: "pricing"
+      };
+
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/map")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(mapRequest);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success", true);
+      expect(response.body).toHaveProperty("links");
+      if (!("links" in response.body)) {
+        throw new Error("Expected response body to have 'links' property");
+      }
+      const links = response.body.links as unknown[];
+      expect(Array.isArray(links)).toBe(true);
+      expect(links.length).toBeGreaterThan(0);
+      expect(links[0]).toContain("usemotion.com/pricing");
+    });
+
+    it.concurrent("should return a successful response with a valid API key and search and allowSubdomains", async () => {
+      const mapRequest = {
+        url: "https://firecrawl.dev",
+        search: "docs",
+        includeSubdomains: true
+      };
+
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/map")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(mapRequest);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success", true);
+      expect(response.body).toHaveProperty("links");
+      if (!("links" in response.body)) {
+        throw new Error("Expected response body to have 'links' property");
+      }
+      const links = response.body.links as unknown[];
+      expect(Array.isArray(links)).toBe(true);
+      expect(links.length).toBeGreaterThan(0);
+      expect(links[0]).toContain("docs.firecrawl.dev");
+    });
+
+    it.concurrent("should return a successful response with a valid API key and search and allowSubdomains and www", async () => {
+      const mapRequest = {
+        url: "https://www.firecrawl.dev",
+        search: "docs",
+        includeSubdomains: true
+      };
+
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/map")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(mapRequest);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success", true);
+      expect(response.body).toHaveProperty("links");
+      if (!("links" in response.body)) {
+        throw new Error("Expected response body to have 'links' property");
+      }
+      const links = response.body.links as unknown[];
+      expect(Array.isArray(links)).toBe(true);
+      expect(links.length).toBeGreaterThan(0);
+      expect(links[0]).toContain("docs.firecrawl.dev");
+    }, 10000)
+
+    it.concurrent("should return a successful response with a valid API key and search and not allowSubdomains and www", async () => {
+      const mapRequest = {
+        url: "https://www.firecrawl.dev",
+        search: "docs",
+        includeSubdomains: false
+      };
+
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/map")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(mapRequest);
+
+      expect(response.statusCode).toBe(200);
+      expect(response.body).toHaveProperty("success", true);
+      expect(response.body).toHaveProperty("links");
+      if (!("links" in response.body)) {
+        throw new Error("Expected response body to have 'links' property");
+      }
+      const links = response.body.links as unknown[];
+      expect(Array.isArray(links)).toBe(true);
+      expect(links.length).toBeGreaterThan(0);
+      expect(links[0]).not.toContain("docs.firecrawl.dev");
+    })
+
+    it.concurrent("should return an error for invalid URL", async () => {
+      const mapRequest = {
+        url: "invalid-url",
+        includeSubdomains: true,
+        search: "test",
+      };
+
+      const response: ScrapeResponseRequestTest = await request(TEST_URL)
+        .post("/v1/map")
+        .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
+        .set("Content-Type", "application/json")
+        .send(mapRequest);
+
+      expect(response.statusCode).toBe(400);
+      expect(response.body).toHaveProperty("success", false);
+      expect(response.body).toHaveProperty("error");
+    });
+  });
 });
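Taken together, these tests pin down the basic contract of the new endpoint. Below is a minimal client sketch that assumes only what the assertions show (the { success, links } response shape and the search/includeSubdomains body fields); the FIRECRAWL_API_URL and FIRECRAWL_API_KEY environment variables are placeholders, not part of this commit.

// Hypothetical client sketch for POST /v1/map, inferred from the tests above.
async function mapSite(url: string, search?: string): Promise<string[]> {
  const res = await fetch(`${process.env.FIRECRAWL_API_URL}/v1/map`, {
    method: "POST",
    headers: {
      Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
      "Content-Type": "application/json",
    },
    body: JSON.stringify({ url, search, includeSubdomains: true }),
  });
  if (res.status !== 200) {
    throw new Error(`Map request failed with status ${res.status}`);
  }
  const body = (await res.json()) as { success: boolean; links: string[] };
  return body.links;
}

// Example: mapSite("https://firecrawl.dev", "docs") should surface docs.firecrawl.dev links first.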
@@ -115,6 +115,9 @@ export async function supaAuthenticateUser(
     case RateLimiterMode.CrawlStatus:
       rateLimiter = getRateLimiter(RateLimiterMode.CrawlStatus, token);
       break;
+    case RateLimiterMode.Map:
+      rateLimiter = getRateLimiter(RateLimiterMode.Map, token);
+      break;
     case RateLimiterMode.Preview:
       rateLimiter = getRateLimiter(RateLimiterMode.Preview, token);
@@ -151,7 +154,7 @@ export async function supaAuthenticateUser(
   if (
     token === "this_is_just_a_preview_token" &&
-    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search)
+    (mode === RateLimiterMode.Scrape || mode === RateLimiterMode.Preview || mode === RateLimiterMode.Search || mode === RateLimiterMode.Map)
   ) {
     return { success: true, team_id: "preview" };
     // check the origin of the request and make sure its from firecrawl.dev
@@ -1,21 +1,44 @@
 import { Response } from "express";
 import { v4 as uuidv4 } from "uuid";
-import { CrawlRequest, crawlRequestSchema, CrawlResponse, legacyCrawlerOptions, legacyScrapeOptions, RequestWithAuth } from "./types";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../lib/crawl-redis";
+import {
+  CrawlRequest,
+  crawlRequestSchema,
+  CrawlResponse,
+  legacyCrawlerOptions,
+  legacyScrapeOptions,
+  RequestWithAuth,
+} from "./types";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
 import { addScrapeJob } from "../../services/queue-jobs";
 import { Logger } from "../../lib/logger";

-export async function crawlController(req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>, res: Response<CrawlResponse>) {
+export async function crawlController(
+  req: RequestWithAuth<{}, CrawlResponse, CrawlRequest>,
+  res: Response<CrawlResponse>
+) {
   req.body = crawlRequestSchema.parse(req.body);

   const id = uuidv4();

   await logCrawl(id, req.auth.team_id);

-  const crawlerOptions = legacyCrawlerOptions(req.body.crawlerOptions),
-    pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
+  const { remainingCredits } = req.account;
+
+  // TODO: Get rid of crawlerOptions
+  const crawlerOptions = legacyCrawlerOptions(req.body);
+  const pageOptions = legacyScrapeOptions(req.body.scrapeOptions);
+
+  crawlerOptions.limit = Math.min(remainingCredits, crawlerOptions.limit);

   const sc: StoredCrawl = {
     originUrl: req.body.url,
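The controller now reads crawler options from the top level of the request body and caps the crawl limit at the team's remaining credits. An illustrative sketch of the clamp, with made-up numbers:

// Illustrative only: how the limit clamp behaves for a team with 250 remaining credits.
const remainingCredits = 250;                                      // from req.account, set by checkCreditsMiddleware
const requestedLimit = 10000;                                      // crawlRequestSchema default for `limit`
const effectiveLimit = Math.min(remainingCredits, requestedLimit); // 250
console.log(effectiveLimit);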
@@ -30,15 +53,21 @@ export async function crawlController(
   try {
     sc.robots = await crawler.getRobotsTxt();
   } catch (e) {
-    Logger.debug(`[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(e)}`);
+    Logger.debug(
+      `[Crawl] Failed to get robots.txt (this is probably fine!): ${JSON.stringify(
+        e
+      )}`
+    );
   }

   await saveCrawl(id, sc);

-  const sitemap = sc.crawlerOptions.ignoreSitemap ? null : await crawler.tryGetSitemap();
+  const sitemap = sc.crawlerOptions.ignoreSitemap
+    ? null
+    : await crawler.tryGetSitemap();

   if (sitemap !== null) {
-    const jobs = sitemap.map(x => {
+    const jobs = sitemap.map((x) => {
       const url = x.url;
       const uuid = uuidv4();
       return {
@@ -56,16 +85,23 @@ export async function crawlController(
         opts: {
           jobId: uuid,
           priority: 20,
-        }
+        },
       };
-    })
+    });

-    await lockURLs(id, jobs.map(x => x.data.url));
-    await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+    await lockURLs(
+      id,
+      jobs.map((x) => x.data.url)
+    );
+    await addCrawlJobs(
+      id,
+      jobs.map((x) => x.opts.jobId)
+    );
     await getScrapeQueue().addBulk(jobs);
   } else {
     await lockURL(id, sc, req.body.url);
-    const job = await addScrapeJob({
+    const job = await addScrapeJob(
+      {
         url: req.body.url,
         mode: "single_urls",
         crawlerOptions: crawlerOptions,
@@ -74,15 +110,17 @@ export async function crawlController(
         origin: "api",
         crawl_id: id,
         webhook: req.body.webhook,
-      }, {
+      },
+      {
         priority: 15,
-    });
+      }
+    );
     await addCrawlJob(id, job.id);
   }

   return res.status(200).json({
     success: true,
     id,
-    url: `${req.protocol}://${req.get('host')}/v1/crawl/${id}`,
+    url: `${req.protocol}://${req.get("host")}/v1/crawl/${id}`,
   });
 }
@@ -14,6 +14,7 @@ import {
   isSameSubdomain,
 } from "../../lib/validateUrl";
 import { fireEngineMap } from "../../search/fireEngine";
+import { billTeam } from "../../services/billing/credit_billing";

 configDotenv();

@@ -26,11 +27,10 @@ export async function mapController(
   const id = uuidv4();
   let links: string[] = [req.body.url];

-  const crawlerOptions = legacyCrawlerOptions(req.body);

   const sc: StoredCrawl = {
     originUrl: req.body.url,
-    crawlerOptions,
+    crawlerOptions: legacyCrawlerOptions(req.body),
     pageOptions: {},
     team_id: req.auth.team_id,
     createdAt: Date.now(),
@@ -39,7 +39,7 @@ export async function mapController(
   const crawler = crawlToCrawler(id, sc);

   const sitemap =
-    sc.crawlerOptions.ignoreSitemap || req.body.search
+    req.body.ignoreSitemap
       ? null
       : await crawler.tryGetSitemap();

@@ -49,8 +49,10 @@ export async function mapController(
     });
   }

+  let urlWithoutWww = req.body.url.replace("www.", "");
+
   let mapUrl = req.body.search
-    ? `"${req.body.search}" site:${req.body.url}`
+    ? `"${req.body.search}" site:${urlWithoutWww}`
     : `site:${req.body.url}`;
   // www. seems to exclude subdomains in some cases
   const mapResults = await fireEngineMap(mapUrl, {
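When search is provided, the controller builds a site-restricted query for fireEngineMap, stripping "www." from the host first. A standalone sketch of the string it produces (plain string manipulation, no external calls; the request values are examples):

// Reproduces the query-building shown above for a hypothetical request.
const reqBody = { url: "https://www.usemotion.com", search: "pricing" };
const urlWithoutWww = reqBody.url.replace("www.", "");
const mapUrl = reqBody.search
  ? `"${reqBody.search}" site:${urlWithoutWww}`
  : `site:${reqBody.url}`;
console.log(mapUrl); // "pricing" site:https://usemotion.com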
@@ -58,16 +60,19 @@ export async function mapController(
   });

   if (mapResults.length > 0) {
-    mapResults.map((x) => {
-      if (req.body.search) {
-        links.unshift(x.url);
-      } else {
-        links.push(x.url);
-      }
-    });
-  }
+    if (req.body.search) {
+      // Ensure all map results are first, maintaining their order
+      links = [mapResults[0].url, ...mapResults.slice(1).map(x => x.url), ...links];
+    } else {
+      mapResults.map((x) => {
+        links.push(x.url);
+      });
+    }
+  }

-  links = links.map((x) => checkAndUpdateURLForMap(x).url);
+  links = links.map((x) => checkAndUpdateURLForMap(x).url.trim());

   // allows for subdomains to be included
   links = links.filter((x) => isSameDomain(x, req.body.url));
@@ -80,6 +85,8 @@ export async function mapController(
   // remove duplicates that could be due to http/https or www
   links = [...new Set(links)];

+  await billTeam(req.auth.team_id, 1);
+
   return res.status(200).json({
     success: true,
     links,
@@ -33,6 +33,8 @@ const url = z.preprocess(
   )
 );

+const strictMessage = "Unrecognized key in body -- please review the v1 API documentation for request body changes";
+
 export const scrapeOptions = z.object({
   formats: z
     .enum([
|
|||||||
timeout: z.number().int().positive().finite().safe().default(30000), // default?
|
timeout: z.number().int().positive().finite().safe().default(30000), // default?
|
||||||
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
waitFor: z.number().int().nonnegative().finite().safe().default(0),
|
||||||
parsePDF: z.boolean().default(true),
|
parsePDF: z.boolean().default(true),
|
||||||
});
|
}).strict(strictMessage);
|
||||||
|
|
||||||
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
export type ScrapeOptions = z.infer<typeof scrapeOptions>;
|
||||||
|
|
||||||
export const scrapeRequestSchema = scrapeOptions.extend({
|
export const scrapeRequestSchema = scrapeOptions.extend({
|
||||||
url,
|
url,
|
||||||
origin: z.string().optional().default("api"),
|
origin: z.string().optional().default("api"),
|
||||||
});
|
}).strict(strictMessage);
|
||||||
|
|
||||||
// export type ScrapeRequest = {
|
// export type ScrapeRequest = {
|
||||||
// url: string;
|
// url: string;
|
||||||
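Appending .strict(strictMessage) makes these schemas reject request bodies carrying unrecognized keys instead of silently stripping them. A minimal zod sketch of that behavior; the schema below is a stand-in, not the real scrapeOptions:

import { z } from "zod";

const strictMessage =
  "Unrecognized key in body -- please review the v1 API documentation for request body changes";

// Stand-in schema; the real scrapeOptions has many more fields.
const options = z.object({ timeout: z.number().default(30000) }).strict(strictMessage);

options.parse({ timeout: 1000 });            // ok
options.parse({ timeout: 1000, foo: true }); // throws ZodError carrying strictMessage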
@@ -83,7 +85,7 @@ const crawlerOptions = z.object({
   allowBackwardLinks: z.boolean().default(false), // >> TODO: CHANGE THIS NAME???
   allowExternalLinks: z.boolean().default(false),
   ignoreSitemap: z.boolean().default(true),
-});
+}).strict(strictMessage);

 // export type CrawlerOptions = {
 //   includePaths?: string[];
@@ -97,13 +99,13 @@ const crawlerOptions = z.object({

 export type CrawlerOptions = z.infer<typeof crawlerOptions>;

-export const crawlRequestSchema = z.object({
+export const crawlRequestSchema = crawlerOptions.extend({
   url,
   origin: z.string().optional().default("api"),
-  crawlerOptions: crawlerOptions.default({}),
   scrapeOptions: scrapeOptions.omit({ timeout: true }).default({}),
   webhook: z.string().url().optional(),
-});
+  limit: z.number().default(10000),
+}).strict(strictMessage);

 // export type CrawlRequest = {
 //   url: string;
@@ -116,9 +118,10 @@ export type CrawlRequest = z.infer<typeof crawlRequestSchema>;
 export const mapRequestSchema = crawlerOptions.extend({
   url: z.string().url(),
   origin: z.string().optional().default("api"),
-  includeSubdomains: z.boolean().default(false),
+  includeSubdomains: z.boolean().default(true),
   search: z.string().optional(),
-});
+  ignoreSitemap: z.boolean().default(false),
+}).strict(strictMessage);

 // export type MapRequest = {
 //   url: string;
@@ -224,20 +227,26 @@ type AuthObject = {
   plan: string;
 };

+type Account = {
+  remainingCredits: number;
+};
+
 export interface RequestWithMaybeAuth<
   ReqParams = {},
   ReqBody = undefined,
   ResBody = undefined
 > extends Request<ReqParams, ReqBody, ResBody> {
   auth?: AuthObject;
+  account?: Account;
 }

 export interface RequestWithAuth<
   ReqParams = {},
   ReqBody = undefined,
-  ResBody = undefined
+  ResBody = undefined,
 > extends Request<ReqParams, ReqBody, ResBody> {
   auth: AuthObject;
+  account?: Account;
 }

 export function legacyCrawlerOptions(x: CrawlerOptions) {
@@ -113,7 +113,7 @@ export const checkAndUpdateURLForMap = (url: string) => {
   }

   // remove any query params
-  url = url.split("?")[0];
+  url = url.split("?")[0].trim();

   return { urlObj: typedUrlObj, url: url };
 };
@@ -24,12 +24,17 @@ import { isUrlBlocked } from "../scraper/WebScraper/utils/blocklist";
 // import { livenessController } from "../controllers/v1/liveness";
 // import { readinessController } from "../controllers/v1/readiness";

-function checkCreditsMiddleware(minimum: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
+function checkCreditsMiddleware(minimum?: number): (req: RequestWithAuth, res: Response, next: NextFunction) => void {
   return (req, res, next) => {
     (async () => {
-      if (!(await checkTeamCredits(req.auth.team_id, minimum)).success) {
+      if (!minimum && req.body) {
+        minimum = (req.body as any)?.limit ?? 1;
+      }
+      const { success, message, remainingCredits } = await checkTeamCredits(req.auth.team_id, minimum);
+      if (!success) {
         return res.status(402).json({ success: false, error: "Insufficient credits" });
       }
+      req.account = { remainingCredits }
       next();
     })()
       .catch(err => next(err));
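A standalone sketch of the middleware's decision logic, under the assumption that checkTeamCredits resolves to the { success, message, remainingCredits } shape introduced in this commit; the stub below stands in for the real billing call:

// Stubbed credit check; shape mirrors this commit, thresholds are invented for the sketch.
type CreditCheck = { success: boolean; message: string; remainingCredits: number };
const checkTeamCredits = async (_teamId: string, credits: number): Promise<CreditCheck> =>
  ({ success: credits <= 500, message: "stub", remainingCredits: 500 - credits });

async function requireCredits(teamId: string, body: { limit?: number }, minimum?: number) {
  if (!minimum && body) {
    minimum = body.limit ?? 1;                     // fall back to the requested crawl limit
  }
  const { success, remainingCredits } = await checkTeamCredits(teamId, minimum ?? 1);
  if (!success) {
    return { status: 402 as const, error: "Insufficient credits" };
  }
  return { status: 200 as const, account: { remainingCredits } }; // later read by crawlController
}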
@@ -71,7 +76,7 @@ function idempotencyMiddleware(req: Request, res: Response, next: NextFunction)
 }

 function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
-  if (isUrlBlocked(req.body.url)) {
+  if (req.body.url && isUrlBlocked(req.body.url)) {
     return res.status(403).json({ success: false, error: "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions." });
   }
   next();
@@ -101,14 +106,14 @@ v1Router.post(
   blocklistMiddleware,
   authMiddleware(RateLimiterMode.Crawl),
   idempotencyMiddleware,
-  checkCreditsMiddleware(1),
+  checkCreditsMiddleware(),
   wrap(crawlController)
 );

 v1Router.post(
   "/map",
   blocklistMiddleware,
-  authMiddleware(RateLimiterMode.Crawl),
+  authMiddleware(RateLimiterMode.Map),
   checkCreditsMiddleware(1),
   wrap(mapController)
 );
@@ -168,10 +168,11 @@ export async function supaBillTeam(team_id: string, credits: number) {
 export async function checkTeamCredits(team_id: string, credits: number) {
   return withAuth(supaCheckTeamCredits)(team_id, credits);
 }

 // if team has enough credits for the operation, return true, else return false
 export async function supaCheckTeamCredits(team_id: string, credits: number) {
   if (team_id === "preview") {
-    return { success: true, message: "Preview team, no credits used" };
+    return { success: true, message: "Preview team, no credits used", remainingCredits: Infinity };
   }

   // Retrieve the team's active subscription and check for available coupons concurrently
@@ -202,7 +203,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
   if (subscriptionError || !subscription) {
     // If there is no active subscription but there are available coupons
     if (couponCredits >= credits) {
-      return { success: true, message: "Sufficient credits available" };
+      return { success: true, message: "Sufficient credits available", remainingCredits: couponCredits };
     }

     const { data: creditUsages, error: creditUsageError } =
@@ -252,9 +253,10 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
       return {
         success: false,
         message: "Insufficient credits, please upgrade!",
+        remainingCredits: FREE_CREDITS - totalCreditsUsed
       };
     }
-    return { success: true, message: "Sufficient credits available" };
+    return { success: true, message: "Sufficient credits available", remainingCredits: FREE_CREDITS - totalCreditsUsed };
   }

   let totalCreditsUsed = 0;
@@ -321,7 +323,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
       subscription.current_period_start,
       subscription.current_period_end
     );
-    return { success: false, message: "Insufficient credits, please upgrade!" };
+    return { success: false, message: "Insufficient credits, please upgrade!", remainingCredits: creditLimit - adjustedCreditsUsed };
   } else if (creditUsagePercentage >= 0.8) {
     // Send email notification for approaching credit limit
     await sendNotification(
@@ -332,7 +334,7 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) {
     );
   }

-  return { success: true, message: "Sufficient credits available" };
+  return { success: true, message: "Sufficient credits available", remainingCredits: creditLimit - adjustedCreditsUsed };
 }

 // Count the total credits used by a team within the current billing period and return the remaining credits.
@@ -1,10 +1,10 @@
+import "dotenv/config";
 import { CustomError } from "../lib/custom-error";
 import {
   getScrapeQueue,
   redisConnection,
   scrapeQueueName,
 } from "./queue-service";
-import "dotenv/config";
 import { logtail } from "./logtail";
 import { startWebScraperPipeline } from "../main/runWebScraper";
 import { callWebhook } from "./webhook";
@@ -42,6 +42,19 @@ const RATE_LIMITS = {
     growth: 500,
     growthdouble: 500,
   },
+  map:{
+    default: 20,
+    free: 5,
+    starter: 20,
+    standard: 40,
+    standardOld: 40,
+    scale: 500,
+    hobby: 10,
+    standardNew: 50,
+    standardnew: 50,
+    growth: 500,
+    growthdouble: 500,
+  },
   preview: {
     free: 5,
     default: 5,
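A hedged sketch of how a per-plan limit could be read out of the new map bucket; the lookup helper below is illustrative only and is not the real getRateLimiter:

// Illustrative lookup; the values mirror the `map` bucket added above.
const MAP_RATE_LIMITS: Record<string, number> = {
  default: 20, free: 5, starter: 20, standard: 40, standardOld: 40,
  scale: 500, hobby: 10, standardNew: 50, standardnew: 50,
  growth: 500, growthdouble: 500,
};

function mapRateLimitFor(plan?: string): number {
  return MAP_RATE_LIMITS[plan ?? "default"] ?? MAP_RATE_LIMITS.default;
}

console.log(mapRateLimitFor("hobby")); // 10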
@@ -106,6 +106,7 @@ export enum RateLimiterMode {
   Scrape = "scrape",
   Preview = "preview",
   Search = "search",
+  Map = "map",

 }