Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-12 07:49:00 +08:00)
Merge pull request #554 from mendableai/nsc/check-team-credits-limit
Check team credits based on the crawl limit
Commit 5a44191344
@@ -7,10 +7,22 @@ import { isUrlBlocked } from "../../src/scraper/WebScraper/utils/blocklist";
 import { logCrawl } from "../../src/services/logging/crawl_log";
 import { validateIdempotencyKey } from "../../src/services/idempotency/validate";
 import { createIdempotencyKey } from "../../src/services/idempotency/create";
-import { defaultCrawlPageOptions, defaultCrawlerOptions, defaultOrigin } from "../../src/lib/default-values";
+import {
+  defaultCrawlPageOptions,
+  defaultCrawlerOptions,
+  defaultOrigin,
+} from "../../src/lib/default-values";
 import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
-import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
+import {
+  addCrawlJob,
+  addCrawlJobs,
+  crawlToCrawler,
+  lockURL,
+  lockURLs,
+  saveCrawl,
+  StoredCrawl,
+} from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
 import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
@@ -38,8 +50,16 @@ export async function crawlController(req: Request, res: Response) {
       }
     }
+
+    const crawlerOptions = {
+      ...defaultCrawlerOptions,
+      ...req.body.crawlerOptions,
+    };
+    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
+
+    const limitCheck = crawlerOptions?.limit ?? 1;
     const { success: creditsCheckSuccess, message: creditsCheckMessage } =
-      await checkTeamCredits(team_id, 1);
+      await checkTeamCredits(team_id, limitCheck);
 
     if (!creditsCheckSuccess) {
       return res.status(402).json({ error: "Insufficient credits" });
     }
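The hunk above is the substantive change of this PR: instead of always checking for a single credit, the controller now checks the team's credits against the requested crawl limit (falling back to 1 when no limit is set). A minimal sketch of that behavior, with a stubbed `checkTeamCredits` standing in for Firecrawl's real credit service, which this diff calls but does not show:

```ts
// Illustrative sketch only. checkTeamCredits is stubbed; the real helper lives
// in Firecrawl's services layer and returns { success, message } as the diff's
// destructuring shows.
type CreditCheck = { success: boolean; message?: string };

async function checkTeamCredits(team_id: string, credits: number): Promise<CreditCheck> {
  const remainingCredits = 500; // pretend balance for the example
  return remainingCredits >= credits
    ? { success: true }
    : { success: false, message: "Insufficient credits" };
}

async function canStartCrawl(
  team_id: string,
  crawlerOptions: { limit?: number } | undefined
): Promise<boolean> {
  // Before this PR: await checkTeamCredits(team_id, 1) — a large crawl only
  // needed one remaining credit to start. After: the check scales with the
  // requested page limit.
  const limitCheck = crawlerOptions?.limit ?? 1;
  const { success } = await checkTeamCredits(team_id, limitCheck);
  return success;
}
```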
@@ -57,19 +77,14 @@ export async function crawlController(req: Request, res: Response) {
     }
 
     if (isUrlBlocked(url)) {
-      return res
-        .status(403)
-        .json({
-          error:
-            "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
-        });
+      return res.status(403).json({
+        error:
+          "Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it.",
+      });
     }
 
     const mode = req.body.mode ?? "crawl";
 
-    const crawlerOptions = { ...defaultCrawlerOptions, ...req.body.crawlerOptions };
-    const pageOptions = { ...defaultCrawlPageOptions, ...req.body.pageOptions };
-
     // if (mode === "single_urls" && !url.includes(",")) { // NOTE: do we need this?
     //   try {
     //     const a = new WebScraperDataProvider();
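The crawlerOptions / pageOptions declarations removed here are not gone: they were moved above the credit check (second hunk) so that crawlerOptions.limit is known before credits are verified. The merge itself is ordinary object spreading, where request values override the defaults. A tiny illustration with made-up default values (the real ones come from src/lib/default-values and are not part of this diff):

```ts
// Hypothetical defaults for illustration only; the real values are defined in
// "../../src/lib/default-values" and may differ.
const defaultCrawlerOptions = { limit: 10000, includes: [] as string[] };

// Simulated request body from the caller.
const reqBody = { crawlerOptions: { limit: 50 } };

// Later spreads win, so a caller-supplied limit overrides the default.
const crawlerOptions = { ...defaultCrawlerOptions, ...reqBody.crawlerOptions };
console.log(crawlerOptions.limit); // 50
```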
@@ -119,10 +134,12 @@ export async function crawlController(req: Request, res: Response) {
 
     await saveCrawl(id, sc);
 
-    const sitemap = sc.crawlerOptions?.ignoreSitemap ? null : await crawler.tryGetSitemap();
+    const sitemap = sc.crawlerOptions?.ignoreSitemap
+      ? null
+      : await crawler.tryGetSitemap();
 
     if (sitemap !== null) {
-      const jobs = sitemap.map(x => {
+      const jobs = sitemap.map((x) => {
         const url = x.url;
         const uuid = uuidv4();
         return {
@@ -140,26 +157,35 @@ export async function crawlController(req: Request, res: Response) {
           opts: {
             jobId: uuid,
             priority: 20,
-          }
+          },
         };
-      })
+      });
 
-      await lockURLs(id, jobs.map(x => x.data.url));
-      await addCrawlJobs(id, jobs.map(x => x.opts.jobId));
+      await lockURLs(
+        id,
+        jobs.map((x) => x.data.url)
+      );
+      await addCrawlJobs(
+        id,
+        jobs.map((x) => x.opts.jobId)
+      );
       await getScrapeQueue().addBulk(jobs);
     } else {
       await lockURL(id, sc, url);
-      const job = await addScrapeJob({
-        url,
-        mode: "single_urls",
-        crawlerOptions: crawlerOptions,
-        team_id: team_id,
-        pageOptions: pageOptions,
-        origin: req.body.origin ?? defaultOrigin,
-        crawl_id: id,
-      }, {
-        priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
-      });
+      const job = await addScrapeJob(
+        {
+          url,
+          mode: "single_urls",
+          crawlerOptions: crawlerOptions,
+          team_id: team_id,
+          pageOptions: pageOptions,
+          origin: req.body.origin ?? defaultOrigin,
+          crawl_id: id,
+        },
+        {
+          priority: 15, // prioritize request 0 of crawl jobs same as scrape jobs
+        }
+      );
       await addCrawlJob(id, job.id);
     }
 
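For context on the bulk path reformatted above: each sitemap entry becomes one queue job whose opts carry an explicit jobId and priority 20, while the single-URL fallback enqueues one job at priority 15 (in a BullMQ-style queue a lower number means higher priority, so the first request of a crawl is serviced ahead of the bulk sitemap jobs). A sketch of the job shape implied by the diff; only data.url, opts.jobId, and priority appear in the hunks, and the remaining fields are illustrative:

```ts
// Sketch of the bulk job entries, assuming a BullMQ-style queue behind
// getScrapeQueue(). Only data.url, opts.jobId and priority are visible in the
// diff; the name and extra data fields are illustrative assumptions.
import { v4 as uuidv4 } from "uuid";

interface ScrapeJobEntry {
  name: string;
  data: { url: string; mode: string; crawl_id: string };
  opts: { jobId: string; priority: number };
}

function buildSitemapJobs(urls: string[], crawlId: string): ScrapeJobEntry[] {
  return urls.map((url) => {
    const uuid = uuidv4();
    return {
      name: uuid,
      data: { url, mode: "single_urls", crawl_id: crawlId },
      opts: {
        jobId: uuid,  // explicit id so the crawl can track the job later
        priority: 20, // bulk sitemap jobs run behind the priority-15 first request
      },
    };
  });
}

// The controller then locks the URLs, records the job ids, and enqueues in bulk:
//   await lockURLs(id, jobs.map((x) => x.data.url));
//   await addCrawlJobs(id, jobs.map((x) => x.opts.jobId));
//   await getScrapeQueue().addBulk(jobs);
```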