fix(crawl-status): consider concurrency limited jobs as prioritized (#1184)

Gergő Móricz 2025-02-16 15:52:17 +01:00 committed by GitHub
parent 7ac2b99210
commit fd8b38902a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 19 additions and 33 deletions


@@ -2,27 +2,15 @@ import { Response } from "express";
 import {
   CrawlErrorsResponse,
   CrawlStatusParams,
-  CrawlStatusResponse,
-  ErrorResponse,
   RequestWithAuth,
 } from "./types";
 import {
   getCrawl,
-  getCrawlExpiry,
   getCrawlJobs,
-  getDoneJobsOrdered,
-  getDoneJobsOrderedLength,
-  getThrottledJobs,
-  isCrawlFinished,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue, redisConnection } from "../../services/queue-service";
-import {
-  supabaseGetJobById,
-  supabaseGetJobsById,
-} from "../../lib/supabase-jobs";
 import { configDotenv } from "dotenv";
-import { Job, JobState } from "bullmq";
-import { logger } from "../../lib/logger";
+import { Job } from "bullmq";
 configDotenv();
 
 export async function getJob(id: string) {


@@ -17,7 +17,6 @@ import {
   getCrawlJobs,
   getDoneJobsOrdered,
   getDoneJobsOrderedLength,
-  getThrottledJobs,
   isCrawlFinished,
   isCrawlFinishedLocked,
 } from "../../lib/crawl-redis";
@@ -25,6 +24,7 @@ import { getScrapeQueue } from "../../services/queue-service";
 import { getJob, getJobs } from "./crawl-status";
 import * as Sentry from "@sentry/node";
 import { Job, JobState } from "bullmq";
+import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
 
 type ErrorMessage = {
   type: "error";
@@ -127,16 +127,16 @@ async function crawlStatusWS(
       async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
     ),
   );
-  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
-
-  const throttledJobsSet = new Set(throttledJobs);
+  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
 
   const validJobStatuses: [string, JobState | "unknown"][] = [];
   const validJobIDs: string[] = [];
 
   for (const [id, status] of jobStatuses) {
-    if (
-      !throttledJobsSet.has(id) &&
+    if (throttledJobsSet.has(id)) {
+      validJobStatuses.push([id, "prioritized"]);
+      validJobIDs.push(id);
+    } else if (
       status !== "failed" &&
       status !== "unknown"
     ) {

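The classification logic is now the same in the WebSocket controller above and in the HTTP controller below: a job ID that is still parked in the team's concurrency-limit queue is reported as "prioritized" instead of being silently dropped, while jobs whose BullMQ state is "failed" or "unknown" are filtered out as before. A minimal standalone sketch of that loop follows; the helper name classifyJobs and the free-function framing are illustrative, the loop body mirrors the diff.

    import type { JobState } from "bullmq";

    // Sketch of the new classification step, assuming jobStatuses is the
    // [id, state] list the controller builds from getScrapeQueue().getJobState().
    function classifyJobs(
      jobStatuses: [string, JobState | "unknown"][],
      throttledJobsSet: Set<string>,
    ) {
      const validJobStatuses: [string, JobState | "unknown"][] = [];
      const validJobIDs: string[] = [];

      for (const [id, status] of jobStatuses) {
        if (throttledJobsSet.has(id)) {
          // Concurrency-limited jobs are still queued for this team, so
          // surface them as "prioritized" rather than dropping them.
          validJobStatuses.push([id, "prioritized"]);
          validJobIDs.push(id);
        } else if (status !== "failed" && status !== "unknown") {
          validJobStatuses.push([id, status]);
          validJobIDs.push(id);
        }
      }

      return { validJobStatuses, validJobIDs };
    }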

@@ -11,7 +11,6 @@ import {
   getCrawlJobs,
   getDoneJobsOrdered,
   getDoneJobsOrderedLength,
-  getThrottledJobs,
   isCrawlKickoffFinished,
 } from "../../lib/crawl-redis";
 import { getScrapeQueue } from "../../services/queue-service";
@@ -23,6 +22,7 @@ import { configDotenv } from "dotenv";
 import type { Job, JobState } from "bullmq";
 import { logger } from "../../lib/logger";
 import { supabase_service } from "../../services/supabase";
+import { getConcurrencyLimitedJobs } from "../../lib/concurrency-limit";
 configDotenv();
 
 export type PseudoJob<T> = {
@@ -137,16 +137,17 @@ export async function crawlStatusController(
       async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
     ),
   );
 
-  const throttledJobs = new Set(...(await getThrottledJobs(req.auth.team_id)));
-  const throttledJobsSet = new Set(throttledJobs);
+  const throttledJobsSet = await getConcurrencyLimitedJobs(req.auth.team_id);
 
   const validJobStatuses: [string, JobState | "unknown"][] = [];
   const validJobIDs: string[] = [];
 
   for (const [id, status] of jobStatuses) {
-    if (
-      !throttledJobsSet.has(id) &&
+    if (throttledJobsSet.has(id)) {
+      validJobStatuses.push([id, "prioritized"]);
+      validJobIDs.push(id);
+    } else if (
       status !== "failed" &&
       status !== "unknown"
     ) {

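Note that the array keeps its [string, JobState | "unknown"] element type even though "prioritized" is pushed explicitly: recent BullMQ versions include "prioritized" in the JobState union (the state of a waiting job that has a priority), so no widening of the tuple type is needed. A one-line check, assuming BullMQ 5.x:

    import type { JobState } from "bullmq";

    // "prioritized" is a member of BullMQ's JobState union in recent versions,
    // so it can be assigned to a JobState-typed value without a cast.
    const state: JobState = "prioritized";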

@@ -100,6 +100,11 @@ export async function pushConcurrencyLimitedJob(
   );
 }
 
+export async function getConcurrencyLimitedJobs(
+  team_id: string,
+) {
+  return new Set((await redisConnection.zrange(constructQueueKey(team_id), 0, -1)).map(x => JSON.parse(x).id));
+}
+
 export async function getConcurrencyQueueJobsCount(team_id: string): Promise<number> {
   const count = await redisConnection.zcard(constructQueueKey(team_id));

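The new helper reads every member of the team's concurrency queue (a Redis sorted set keyed by constructQueueKey(team_id)) and extracts the id field from each JSON-serialized entry; the controllers then probe the resulting Set with has(id) for every job in the crawl. A standalone sketch of the same idea with ioredis is below; the key format and the entry shape are illustrative assumptions, only the id field is implied by the diff.

    import Redis from "ioredis";

    const redis = new Redis(process.env.REDIS_URL ?? "redis://localhost:6379");

    // Hypothetical key format for illustration; the real one comes from
    // constructQueueKey(team_id) in concurrency-limit.ts.
    const queueKey = (teamId: string) => `concurrency-limit-queue:${teamId}`;

    // Return the IDs of every job currently parked in the team's
    // concurrency-limit queue. zrange(key, 0, -1) walks the whole sorted set,
    // and each member is assumed to be a JSON blob carrying at least an `id`.
    async function getConcurrencyLimitedJobIds(teamId: string): Promise<Set<string>> {
      const members = await redis.zrange(queueKey(teamId), 0, -1);
      return new Set(members.map((m) => JSON.parse(m).id as string));
    }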

@@ -184,14 +184,6 @@ export async function getCrawlJobCount(id: string): Promise<number> {
   return await redisConnection.scard("crawl:" + id + ":jobs");
 }
 
-export async function getThrottledJobs(teamId: string): Promise<string[]> {
-  return await redisConnection.zrangebyscore(
-    "concurrency-limiter:" + teamId + ":throttled",
-    Date.now(),
-    Infinity,
-  );
-}
-
 export function normalizeURL(url: string, sc: StoredCrawl): string {
   const urlO = new URL(url);
   if (!sc.crawlerOptions || sc.crawlerOptions.ignoreQueryParameters) {