mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-11 21:09:00 +08:00
fix(crawl-status): consider concurrency limited jobs as prioritized (#1184)
This commit is contained in:
parent
7ac2b99210
commit
fd8b38902a
@ -2,27 +2,15 @@ import { Response } from "express";
|
||||
import {
|
||||
CrawlErrorsResponse,
|
||||
CrawlStatusParams,
|
||||
CrawlStatusResponse,
|
||||
ErrorResponse,
|
||||
RequestWithAuth,
|
||||
} from "./types";
|
||||
import {
|
||||
getCrawl,
|
||||
getCrawlExpiry,
|
||||
getCrawlJobs,
|
||||
getDoneJobsOrdered,
|
||||
getDoneJobsOrderedLength,
|
||||
getThrottledJobs,
|
||||
isCrawlFinished,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
|
||||
import {
|
||||
supabaseGetJobById,
|
||||
supabaseGetJobsById,
|
||||
} from "../../lib/supabase-jobs";
|
||||
import { configDotenv } from "dotenv";
|
||||
import { Job, JobState } from "bullmq";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { Job } from "bullmq";
|
||||
configDotenv();
|
||||
|
||||
export async function getJob(id: string) {
|
||||
|
@ -17,7 +17,6 @@ import {
|
||||
getCrawlJobs,
|
||||
getDoneJobsOrdered,
|
||||
getDoneJobsOrderedLength,
|
||||
getThrottledJobs,
|
||||
isCrawlFinished,
|
||||
isCrawlFinishedLocked,
|
||||
} from "../../lib/crawl-redis";
|
||||
@ -25,6 +24,7 @@ import { getScrapeQueue } from "../../services/queue-service";
|
||||
import { getJob, getJobs } from "./crawl-status";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { Job, JobState } from "bullmq";
|
||||
import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
|
||||
|
||||
type ErrorMessage = {
|
||||
type: "error";
|
||||
@ -127,16 +127,16 @@ async function crawlStatusWS(
|
||||
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
|
||||
),
|
||||
);
|
||||
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
||||
|
||||
const throttledJobsSet = new Set(throttledJobs);
|
||||
|
||||
const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
|
||||
|
||||
const validJobStatuses: [string, JobState | "unknown"][] = [];
|
||||
const validJobIDs: string[] = [];
|
||||
|
||||
for (const [id, status] of jobStatuses) {
|
||||
if (
|
||||
!throttledJobsSet.has(id) &&
|
||||
if (throttledJobsSet.has(id)) {
|
||||
validJobStatuses.push([id, "prioritized"]);
|
||||
validJobIDs.push(id);
|
||||
} else if (
|
||||
status !== "failed" &&
|
||||
status !== "unknown"
|
||||
) {
|
||||
|
@ -11,7 +11,6 @@ import {
|
||||
getCrawlJobs,
|
||||
getDoneJobsOrdered,
|
||||
getDoneJobsOrderedLength,
|
||||
getThrottledJobs,
|
||||
isCrawlKickoffFinished,
|
||||
} from "../../lib/crawl-redis";
|
||||
import { getScrapeQueue } from "../../services/queue-service";
|
||||
@ -23,6 +22,7 @@ import { configDotenv } from "dotenv";
|
||||
import type { Job, JobState } from "bullmq";
|
||||
import { logger } from "../../lib/logger";
|
||||
import { supabase_service } from "../../services/supabase";
|
||||
import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
|
||||
configDotenv();
|
||||
|
||||
export type PseudoJob<T> = {
|
||||
@ -137,16 +137,17 @@ export async function crawlStatusController(
|
||||
async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
|
||||
),
|
||||
);
|
||||
const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
|
||||
|
||||
const throttledJobsSet = new Set(throttledJobs);
|
||||
const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
|
||||
|
||||
const validJobStatuses: [string, JobState | "unknown"][] = [];
|
||||
const validJobIDs: string[] = [];
|
||||
|
||||
for (const [id, status] of jobStatuses) {
|
||||
if (
|
||||
!throttledJobsSet.has(id) &&
|
||||
if (throttledJobsSet.has(id)) {
|
||||
validJobStatuses.push([id, "prioritized"]);
|
||||
validJobIDs.push(id);
|
||||
} else if (
|
||||
status !== "failed" &&
|
||||
status !== "unknown"
|
||||
) {
|
||||
|
@ -100,6 +100,11 @@ export async function pushConcurrencyLimitedJob(
|
||||
);
|
||||
}
|
||||
|
||||
export async function getConcurrencyLimitedJobs(
|
||||
team_id: string,
|
||||
) {
|
||||
return new Set((await redisConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
|
||||
}
|
||||
|
||||
export async function getConcurrencyQueueJobsCount(team_id: string): Promise<number> {
|
||||
const count = await redisConnection.zcard(constructQueueKey(team_id));
|
||||
|
@ -184,14 +184,6 @@ export async function getCrawlJobCount(id: string): Promise<number> {
|
||||
return await redisConnection.scard("crawl:" + id + ":jobs");
|
||||
}
|
||||
|
||||
export async function getThrottledJobs(teamId: string): Promise<string[]> {
|
||||
return await redisConnection.zrangebyscore(
|
||||
"concurrency-limiter:" + teamId + ":throttled",
|
||||
Date.now(),
|
||||
Infinity,
|
||||
);
|
||||
}
|
||||
|
||||
export function normalizeURL(url: string, sc: StoredCrawl): string {
|
||||
const urlO = new URL(url);
|
||||
if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user