feat(v1): crawl/batch scrape errors route

Gergő Móricz 2025-01-17 17:12:04 +01:00
parent dcd3d6d98d
commit e6531278f6
4 changed files with 117 additions and 0 deletions
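For reference, a minimal sketch of calling the new endpoint. The route paths come from this commit; the base URL and Bearer-token header are assumptions about a typical deployment, not part of the diff:

// Sketch: list the failures of a crawl (or batch scrape) job.
const res = await fetch(
  "http://localhost:3002/v1/crawl/<jobId>/errors", // or /v1/batch/scrape/<jobId>/errors
  { headers: { Authorization: "Bearer <api-key>" } },
);
const body = await res.json();
// On success: { errors: [{ id, timestamp?, url, error }], robotsBlocked: string[] }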

View File

@@ -0,0 +1,81 @@
import { Response } from "express";
import {
  CrawlErrorsResponse,
  CrawlStatusParams,
  RequestWithAuth,
} from "./types";
import { getCrawl, getCrawlJobs } from "../../lib/crawl-redis";
import { getScrapeQueue, redisConnection } from "../../services/queue-service";
import { configDotenv } from "dotenv";
import { Job } from "bullmq";
configDotenv();

// Fetch a single scrape job from the queue; resolves to undefined
// if the job no longer exists.
export async function getJob(id: string) {
  return getScrapeQueue().getJob(id);
}

// Resolve job IDs to their live BullMQ jobs, dropping any that have
// already been removed from the queue.
export async function getJobs(ids: string[]) {
  const jobs: (Job & { id: string })[] = (
    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
  ).filter((x) => x) as (Job & { id: string })[];
  return jobs;
}
export async function crawlErrorsController(
req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
res: Response<CrawlErrorsResponse>,
) {
const sc = await getCrawl(req.params.jobId);
if (!sc) {
return res.status(404).json({ success: false, error: "Job not found" });
}
if (sc.team_id !== req.auth.team_id) {
return res.status(403).json({ success: false, error: "Forbidden" });
}
  // Pair every job in this crawl with its current queue state.
  const jobStatuses = await Promise.all(
    (await getCrawlJobs(req.params.jobId)).map(
      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
    ),
  );

  const failedJobIDs: string[] = [];
  for (const [id, status] of jobStatuses) {
    if (status === "failed") {
      failedJobIDs.push(id);
    }
  }
  res.status(200).json({
    errors: (await getJobs(failedJobIDs)).map((x) => ({
      id: x.id,
      timestamp:
        x.finishedOn !== undefined
          ? new Date(x.finishedOn).toISOString()
          : undefined,
      url: x.data.url,
      error: x.failedReason,
    })),
    robotsBlocked: await redisConnection.smembers(
      "crawl:" + req.params.jobId + ":robots_blocked",
    ),
  });
}
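
For context, the "failed" state and failedReason that this controller reads are standard BullMQ behavior: when a processor throws, the job moves to the failed state and the error message is stored on the job. A minimal sketch (queue name and error text are illustrative, not from this commit):

import { Queue, Worker } from "bullmq";

const connection = { host: "localhost", port: 6379 };
new Queue("scrape", { connection });
new Worker(
  "scrape",
  async () => {
    // Throwing moves the job to the "failed" state; the message
    // becomes job.failedReason, which the controller returns as `error`.
    throw new Error("net::ERR_TIMED_OUT");
  },
  { connection },
);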

View File

@@ -568,6 +568,19 @@ export type CrawlStatusResponse =
data: Document[];
};
export type CrawlErrorsResponse =
  | ErrorResponse
  | {
      errors: {
        id: string;
        timestamp?: string;
        url: string;
        error: string;
      }[];
      robotsBlocked: string[];
    };
type AuthObject = {
team_id: string;
plan: PlanType | undefined;
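
For illustration, a value that satisfies the success branch of CrawlErrorsResponse (all data below is made up):

const example: CrawlErrorsResponse = {
  errors: [
    {
      id: "3f1c9d2e",                        // failed scrape job id
      timestamp: "2025-01-17T16:12:04.000Z", // job.finishedOn as ISO string, if set
      url: "https://example.com/page",
      error: "Request timed out",            // BullMQ failedReason
    },
  ],
  robotsBlocked: ["https://example.com/private"],
};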

View File

@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
import { creditUsageController } from "../controllers/v1/credit-usage";
import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
import { searchController } from "../controllers/v1/search";
import { crawlErrorsController } from "../controllers/v1/crawl-errors";
function checkCreditsMiddleware(
minimum?: number,
@@ -192,6 +193,18 @@
wrap((req: any, res): any => crawlStatusController(req, res, true)),
);
v1Router.get(
"/crawl/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/batch/scrape/:jobId/errors",
authMiddleware(RateLimiterMode.CrawlStatus),
wrap(crawlErrorsController),
);
v1Router.get(
"/scrape/:jobId",
authMiddleware(RateLimiterMode.CrawlStatus),
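
Both new routes reuse the CrawlStatus rate-limiter mode and share crawlErrorsController, so /crawl/:jobId/errors and /batch/scrape/:jobId/errors behave identically. wrap itself is not shown in this diff; a typical shape for such a helper (a hypothetical sketch, not this repo's exact code) forwards async rejections to Express error handling:

import { NextFunction, Request, Response } from "express";

// Hypothetical: without this, a rejected controller promise would
// leave the request hanging instead of reaching error middleware.
const wrapSketch =
  (controller: (req: Request, res: Response) => Promise<unknown>) =>
  (req: Request, res: Response, next: NextFunction) => {
    controller(req, res).catch(next);
  };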

View File

@@ -299,6 +299,16 @@ export class WebCrawler {
this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
) {
return fullUrl;
      } else if (
        this.isInternalLink(fullUrl) &&
        this.noSections(fullUrl) &&
        !this.matchesExcludes(path) &&
        !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
      ) {
        // Same filters as the branch above, but robots.txt disallows the
        // URL: record it (fire-and-forget) for /crawl/:jobId/errors.
        (async () => {
          await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
          await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
        })().catch(() => {}); // losing an entry beats an unhandled rejection
      }
} else {
// EXTERNAL LINKS
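
A note on the NX flag used above: since Redis 7.0, EXPIRE key ttl NX sets the TTL only if the key has none yet, so the robots_blocked set expires 24 hours after the first blocked URL instead of having its clock reset by every later sadd. A standalone illustration with ioredis (the key follows the same pattern the crawler uses):

import Redis from "ioredis";

const redis = new Redis(); // assumes Redis >= 7.0 on localhost:6379

async function recordBlocked(jobId: string, url: string) {
  const key = "crawl:" + jobId + ":robots_blocked";
  await redis.sadd(key, url);
  // NX: only set a TTL if none exists, so the 24h window
  // starts at the first blocked URL for this crawl.
  await redis.expire(key, 24 * 60 * 60, "NX");
}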