Merge pull request #554 from mendableai/nsc/check-team-credits-limit

Check team credits based on the crawl limit
commit 5a44191344
Nicolas, 2024-08-19 11:54:47 -03:00 (committed by GitHub)

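The substance of the change: instead of always reserving a single credit when a crawl request comes in, the controller now merges the request's crawlerOptions with the defaults first, takes crawlerOptions.limit (falling back to 1), and passes that to checkTeamCredits, so under-funded teams get a 402 before any jobs are queued. Below is a minimal TypeScript sketch of that pre-check; the checkTeamCredits stub and its 50-credit balance are illustrative stand-ins for Firecrawl's billing service, not the real implementation.

type CreditCheck = { success: boolean; message?: string };

// Hypothetical stub: the real checkTeamCredits consults the team's actual
// remaining credit balance in the billing service.
async function checkTeamCredits(team_id: string, credits: number): Promise<CreditCheck> {
  const remaining = 50; // stubbed balance for illustration
  return credits <= remaining
    ? { success: true }
    : { success: false, message: "Insufficient credits" };
}

// Mirrors the controller logic added in this commit: check credits against the
// requested crawl limit instead of a flat 1.
async function creditPreflight(team_id: string, crawlerOptions: { limit?: number }) {
  const limitCheck = crawlerOptions?.limit ?? 1;
  const { success } = await checkTeamCredits(team_id, limitCheck);
  if (!success) {
    return { status: 402, error: "Insufficient credits" };
  }
  return { status: 200 };
}

Assuming checkTeamCredits compares the requested amount against the team's remaining balance, a crawl asking for limit: 500 from a team with fewer than 500 credits now fails fast with "Insufficient credits" instead of being accepted and queued.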

@@ -7,10 +7,22 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
-import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+import {
+  defaultCrawlPageOptions,
+  defaultCrawlerOptions,
+  defaultOrigin,
+} from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../src/lib/validateUrl";
@@ -38,8 +50,16 @@ export async function crawlController(req: Request, res: Response) {
       }
     }

+    const crawlerOptions = {
+      ...defaultCrawlerOptions,
+      ...req.body.crawlerOptions,
+    };
+    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
+
+    const limitCheck = crawlerOptions?.limit ?? 1;
+
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-      await checkTeamCredits(team_id, 1);
+      await checkTeamCredits(team_id, limitCheck);
     if (!creditsCheckSuccess) {
       return res.status(402).json({ error: "Insufficient credits" });
     }
@@ -57,9 +77,7 @@ export async function crawlController(req: Request, res: Response) {
     }

     if (isUrlBlocked(url)) {
-      return res
-        .status(403)
-        .json({
+      return res.status(403).json({
         error:
           "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
       });
@@ -67,9 +85,6 @@ export async function crawlController(req: Request, res: Response) {
     const mode = req.body.mode ?? "crawl";

-    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
-    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
-
     // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
     //   try {
     //     const a = new WebScraperDataProvider();
@@ -119,10 +134,12 @@ export async function crawlController(req: Request, res: Response) {
     await saveCrawl(id, sc);

-    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+    const sitemap = sc.crawlerOptions?.ignoreSitemap
+      ? null
+      : await crawler.tryGetSitemap();

     if (sitemap !== null) {
-      const jobs = sitemap.map(x => {
+      const jobs = sitemap.map((x) => {
         const url = x.url;
         const uuid = uuidv4();
         return {
@@ -140,16 +157,23 @@ export async function crawlController(req: Request, res: Response) {
           opts: {
             jobId: uuid,
             priority: 20,
-          }
+          },
         };
-      })
+      });

-      await lockURLs(id, jobs.map(x => x.data.url));
-      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await lockURLs(
+        id,
+        jobs.map((x) => x.data.url)
+      );
+      await addCrawlJobs(
+        id,
+        jobs.map((x) => x.opts.jobId)
+      );
       await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
-      const job = await addScrapeJob({
+      const job = await addScrapeJob(
+        {
           url,
           mode: "single_urls",
           crawlerOptions: crawlerOptions,
@@ -157,9 +181,11 @@ export async function crawlController(req: Request, res: Response) {
           pageOptions: pageOptions,
           origin: req.body.origin ?? defaultOrigin,
           crawl_id: id,
-        }, {
+        },
+        {
           priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
-      });
+        }
+      );

       await addCrawlJob(id, job.id);
     }