fix(queue-jobs): fix concurrency limit
parent 72d6a8179e
commit 2de659d810
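Bulk-added crawl jobs previously bypassed the per-team concurrency limit: crawlController pushed them straight into BullMQ with getScrapeQueue().addBulk(jobs), and addScrapeJobs enqueued every job unconditionally. addScrapeJobs now computes how many jobs fit under the plan's concurrency limit, sends only that many directly to BullMQ, and parks the rest on the Redis-backed concurrency queue.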
@@ -18,7 +18,7 @@ import {
 } from "../../lib/crawl-redis";
 import { logCrawl } from "../../services/logging/crawl_log";
 import { getScrapeQueue } from "../../services/queue-service";
-import { addScrapeJob } from "../../services/queue-jobs";
+import { addScrapeJob, addScrapeJobs } from "../../services/queue-jobs";
 import { logger as _logger } from "../../lib/logger";
 import { getJobPriority } from "../../lib/job-priority";
 import { callWebhook } from "../../services/webhook";
@@ -139,9 +139,9 @@ export async function crawlController(
       name: uuid,
       data: {
         url,
-        mode: "single_urls",
+        mode: "single_urls" as const,
         team_id: req.auth.team_id,
-        plan: req.auth.plan,
+        plan: req.auth.plan!,
         crawlerOptions,
         scrapeOptions,
         internalOptions: sc.internalOptions,
@@ -170,7 +170,7 @@ export async function crawlController(
       jobs.map((x) => x.opts.jobId),
     );
     logger.debug("Adding scrape jobs to BullMQ...");
-    await getScrapeQueue().addBulk(jobs);
+    await addScrapeJobs(jobs);
   } else {
     logger.debug("Sitemap not found or ignored.", {
       ignoreSitemap: sc.crawlerOptions.ignoreSitemap,
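The controller-side change above swaps the direct getScrapeQueue().addBulk(jobs) call for addScrapeJobs(jobs), so sitemap-derived jobs now pass through the concurrency accounting instead of landing in BullMQ unchecked. For reference, here is a sketch of the shape of one entry in jobs as the controller builds it and addScrapeJobs consumes it; the field names come from the hunks above, while the concrete values are hypothetical placeholders:

```ts
// One entry of `jobs` (BullMQ bulk-add shape: name, data, opts).
// All values below are made-up examples, not from the commit.
const job = {
  name: "00000000-0000-0000-0000-000000000000", // uuid, reused as opts.jobId
  data: {
    url: "https://example.com/",
    mode: "single_urls" as const,
    team_id: "team_123",        // hypothetical
    plan: "standard",           // hypothetical plan name
    crawlerOptions: {},
    scrapeOptions: {},
    internalOptions: {},        // sc.internalOptions in the controller
  },
  opts: {
    jobId: "00000000-0000-0000-0000-000000000000",
    priority: 20,               // hypothetical; produced via getJobPriority
  },
};
```

The remaining hunks are in the queue-jobs module that the controller imports from ../../services/queue-jobs.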
@@ -11,11 +11,50 @@ import {
   pushConcurrencyLimitedJob,
 } from "../lib/concurrency-limit";
 
+async function _addScrapeJobToConcurrencyQueue(
+  webScraperOptions: any,
+  options: any,
+  jobId: string,
+  jobPriority: number,
+) {
+  await pushConcurrencyLimitedJob(webScraperOptions.team_id, {
+    id: jobId,
+    data: webScraperOptions,
+    opts: {
+      ...options,
+      priority: jobPriority,
+      jobId: jobId,
+    },
+    priority: jobPriority,
+  });
+}
+
+async function _addScrapeJobToBullMQ(
+  webScraperOptions: any,
+  options: any,
+  jobId: string,
+  jobPriority: number,
+) {
+  if (
+    webScraperOptions &&
+    webScraperOptions.team_id &&
+    webScraperOptions.plan
+  ) {
+    await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId);
+  }
+
+  await getScrapeQueue().add(jobId, webScraperOptions, {
+    ...options,
+    priority: jobPriority,
+    jobId,
+  });
+}
+
 async function addScrapeJobRaw(
   webScraperOptions: any,
   options: any,
   jobId: string,
-  jobPriority: number = 10,
+  jobPriority: number,
 ) {
   let concurrencyLimited = false;
 
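The two new helpers split the former body of addScrapeJobRaw in half: _addScrapeJobToConcurrencyQueue parks a serialized job (id, data, opts, priority) on the Redis-backed concurrency queue, while _addScrapeJobToBullMQ registers the job as active for the team and enqueues it into BullMQ. A condensed sketch of how they pair up follows; the committed concurrencyLimited flag is computed in lines elided from this diff, so the active-versus-limit check below is an assumption modeled on the bulk path in the last hunk:

```ts
// Sketch only, not the committed code. Assumes getConcurrencyLimitMax and
// getConcurrencyLimitActiveJobs imported from ../lib/concurrency-limit,
// as in the addScrapeJobs hunk further down.
async function dispatchScrapeJob(
  webScraperOptions: any,
  options: any,
  jobId: string,
  jobPriority: number,
) {
  const now = Date.now();
  const limit = await getConcurrencyLimitMax(webScraperOptions.plan);
  const active = await getConcurrencyLimitActiveJobs(webScraperOptions.team_id, now);

  if (active.length >= limit) {
    // Over the cap: park the job on the Redis-backed concurrency queue.
    await _addScrapeJobToConcurrencyQueue(webScraperOptions, options, jobId, jobPriority);
  } else {
    // Under the cap: mark it active and enqueue straight into BullMQ.
    await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
  }
}
```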
@@ -33,30 +72,9 @@ async function addScrapeJobRaw(
   }
 
   if (concurrencyLimited) {
-    await pushConcurrencyLimitedJob(webScraperOptions.team_id, {
-      id: jobId,
-      data: webScraperOptions,
-      opts: {
-        ...options,
-        priority: jobPriority,
-        jobId: jobId,
-      },
-      priority: jobPriority,
-    });
+    await _addScrapeJobToConcurrencyQueue(webScraperOptions, options, jobId, jobPriority);
   } else {
-    if (
-      webScraperOptions &&
-      webScraperOptions.team_id &&
-      webScraperOptions.plan
-    ) {
-      await pushConcurrencyLimitActiveJob(webScraperOptions.team_id, jobId);
-    }
-
-    await getScrapeQueue().add(jobId, webScraperOptions, {
-      ...options,
-      priority: jobPriority,
-      jobId,
-    });
+    await _addScrapeJobToBullMQ(webScraperOptions, options, jobId, jobPriority);
   }
 }
 
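With the helpers in place, addScrapeJobRaw collapses to a two-way dispatch between them. Note also that jobPriority lost its = 10 default in the earlier hunk, so callers of the raw function must now pass a priority explicitly.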
@@ -109,11 +127,87 @@ export async function addScrapeJobs(
   }[],
 ) {
   if (jobs.length === 0) return true;
-  // TODO: better
+
+  let countCanBeDirectlyAdded = Infinity;
+
+  if (
+    jobs[0].data &&
+    jobs[0].data.team_id &&
+    jobs[0].data.plan
+  ) {
+    const now = Date.now();
+    const limit = await getConcurrencyLimitMax(jobs[0].data.plan);
+    console.log("CC limit", limit);
+    cleanOldConcurrencyLimitEntries(jobs[0].data.team_id, now);
+
+    countCanBeDirectlyAdded = Math.max(limit - (await getConcurrencyLimitActiveJobs(jobs[0].data.team_id, now)).length, 0);
+  }
+
+  const addToBull = jobs.slice(0, countCanBeDirectlyAdded);
+  const addToCQ = jobs.slice(countCanBeDirectlyAdded);
+
   await Promise.all(
-    jobs.map((job) =>
-      addScrapeJob(job.data, job.opts, job.opts.jobId, job.opts.priority),
-    ),
+    addToBull.map(async (job) => {
+      const size = JSON.stringify(job.data).length;
+      return await Sentry.startSpan(
+        {
+          name: "Add scrape job",
+          op: "queue.publish",
+          attributes: {
+            "messaging.message.id": job.opts.jobId,
+            "messaging.destination.name": getScrapeQueue().name,
+            "messaging.message.body.size": size,
+          },
+        },
+        async (span) => {
+          await _addScrapeJobToBullMQ(
+            {
+              ...job.data,
+              sentry: {
+                trace: Sentry.spanToTraceHeader(span),
+                baggage: Sentry.spanToBaggageHeader(span),
+                size,
+              },
+            },
+            job.opts,
+            job.opts.jobId,
+            job.opts.priority,
+          );
+        },
+      );
+    }),
+  );
+
+  await Promise.all(
+    addToCQ.map(async (job) => {
+      const size = JSON.stringify(job.data).length;
+      return await Sentry.startSpan(
+        {
+          name: "Add scrape job",
+          op: "queue.publish",
+          attributes: {
+            "messaging.message.id": job.opts.jobId,
+            "messaging.destination.name": getScrapeQueue().name,
+            "messaging.message.body.size": size,
+          },
+        },
+        async (span) => {
+          await _addScrapeJobToConcurrencyQueue(
+            {
+              ...job.data,
+              sentry: {
+                trace: Sentry.spanToTraceHeader(span),
+                baggage: Sentry.spanToBaggageHeader(span),
+                size,
+              },
+            },
+            job.opts,
+            job.opts.jobId,
+            job.opts.priority,
+          );
+        },
+      );
+    }),
+  );
 }
 
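The core of the fix is the slice arithmetic in this last hunk: only as many jobs as fit under the plan's limit go straight to BullMQ, and the overflow goes to the concurrency queue. A worked example with hypothetical numbers (plan limit 15, 12 jobs already active, 10 new jobs):

```ts
// Stand-ins for the awaited Redis lookups in the hunk above; values are made up.
const limit = 15;        // await getConcurrencyLimitMax(plan)
const activeCount = 12;  // (await getConcurrencyLimitActiveJobs(team_id, now)).length
const jobs = Array.from({ length: 10 }, (_, i) => ({ id: `job-${i}` }));

const countCanBeDirectlyAdded = Math.max(limit - activeCount, 0); // 15 - 12 = 3
const addToBull = jobs.slice(0, countCanBeDirectlyAdded); // job-0..job-2 → straight to BullMQ
const addToCQ = jobs.slice(countCanBeDirectlyAdded);      // job-3..job-9 → concurrency queue

console.log(addToBull.length, addToCQ.length); // 3 7
```

If the team is already at or over its limit, countCanBeDirectlyAdded clamps to 0 and every job is queued; if it has headroom for all of them (or team_id/plan are missing, leaving the count at Infinity), everything goes to BullMQ as before.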