From e6531278f65494de8ad9afeeecafeb6d2553fd7d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Fri, 17 Jan 2025 17:12:04 +0100
Subject: [PATCH] feat(v1): crawl/batch scrape errors route

---
 apps/api/src/controllers/v1/crawl-errors.ts | 81 +++++++++++++++++++++
 apps/api/src/controllers/v1/types.ts        | 13 ++++
 apps/api/src/routes/v1.ts                   | 13 ++++
 apps/api/src/scraper/WebScraper/crawler.ts  | 10 +++
 4 files changed, 117 insertions(+)
 create mode 100644 apps/api/src/controllers/v1/crawl-errors.ts

diff --git a/apps/api/src/controllers/v1/crawl-errors.ts b/apps/api/src/controllers/v1/crawl-errors.ts
new file mode 100644
index 00000000..b64d02fa
--- /dev/null
+++ b/apps/api/src/controllers/v1/crawl-errors.ts
@@ -0,0 +1,81 @@
+import { Response } from "express";
+import {
+  CrawlErrorsResponse,
+  CrawlStatusParams,
+  CrawlStatusResponse,
+  ErrorResponse,
+  RequestWithAuth,
+} from "./types";
+import {
+  getCrawl,
+  getCrawlExpiry,
+  getCrawlJobs,
+  getDoneJobsOrdered,
+  getDoneJobsOrderedLength,
+  getThrottledJobs,
+  isCrawlFinished,
+} from "../../lib/crawl-redis";
+import { getScrapeQueue, redisConnection } from "../../services/queue-service";
+import {
+  supabaseGetJobById,
+  supabaseGetJobsById,
+} from "../../lib/supabase-jobs";
+import { configDotenv } from "dotenv";
+import { Job, JobState } from "bullmq";
+import { logger } from "../../lib/logger";
+configDotenv();
+
+export async function getJob(id: string) {
+  const job = await getScrapeQueue().getJob(id);
+  if (!job) return job;
+
+  return job;
+}
+
+export async function getJobs(ids: string[]) {
+  const jobs: (Job & { id: string })[] = (
+    await Promise.all(ids.map((x) => getScrapeQueue().getJob(x)))
+  ).filter((x) => x) as (Job & { id: string })[];
+
+  return jobs;
+}
+
+export async function crawlErrorsController(
+  req: RequestWithAuth<CrawlStatusParams, undefined, CrawlErrorsResponse>,
+  res: Response<CrawlErrorsResponse>,
+) {
+  const sc = await getCrawl(req.params.jobId);
+  if (!sc) {
+    return res.status(404).json({ success: false, error: "Job not found" });
+  }
+
+  if (sc.team_id !== req.auth.team_id) {
+    return res.status(403).json({ success: false, error: "Forbidden" });
+  }
+
+  let jobStatuses = await Promise.all(
+    (await getCrawlJobs(req.params.jobId)).map(
+      async (x) => [x, await getScrapeQueue().getJobState(x)] as const,
+    ),
+  );
+
+  const failedJobIDs: string[] = [];
+
+  for (const [id, status] of jobStatuses) {
+    if (
+      status === "failed"
+    ) {
+      failedJobIDs.push(id);
+    }
+  }
+
+  res.status(200).json({
+    errors: (await getJobs(failedJobIDs)).map(x => ({
+      id: x.id,
+      timestamp: x.finishedOn !== undefined ? (new Date(x.finishedOn).toISOString()) : undefined,
+      url: x.data.url,
+      error: x.failedReason,
+    })),
+    robotsBlocked: await redisConnection.smembers("crawl:" + req.params.jobId + ":robots_blocked"),
+  });
+}
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index ac3743e8..ee141625 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -568,6 +568,19 @@ export type CrawlStatusResponse =
       data: Document[];
     };
 
+
+export type CrawlErrorsResponse =
+  | ErrorResponse
+  | {
+      errors: {
+        id: string,
+        timestamp?: string,
+        url: string,
+        error: string,
+      }[];
+      robotsBlocked: string[];
+    };
+
 type AuthObject = {
   team_id: string;
   plan: PlanType | undefined;
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index a916dd40..4aacfe18 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -28,6 +28,7 @@ import { extractStatusController } from "../controllers/v1/extract-status";
 import { creditUsageController } from "../controllers/v1/credit-usage";
 import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
 import { searchController } from "../controllers/v1/search";
+import { crawlErrorsController } from "../controllers/v1/crawl-errors";
 
 function checkCreditsMiddleware(
   minimum?: number,
@@ -192,6 +193,18 @@ v1Router.get(
   wrap((req: any, res): any => crawlStatusController(req, res, true)),
 );
 
+v1Router.get(
+  "/crawl/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
+v1Router.get(
+  "/batch/scrape/:jobId/errors",
+  authMiddleware(RateLimiterMode.CrawlStatus),
+  wrap(crawlErrorsController),
+);
+
 v1Router.get(
   "/scrape/:jobId",
   authMiddleware(RateLimiterMode.CrawlStatus),
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index a6645152..7d4be97b 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -299,6 +299,16 @@ export class WebCrawler {
         this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
       ) {
         return fullUrl;
+      } else if (
+        this.isInternalLink(fullUrl) &&
+        this.noSections(fullUrl) &&
+        !this.matchesExcludes(path) &&
+        !this.isRobotsAllowed(fullUrl, this.ignoreRobotsTxt)
+      ) {
+        (async() => {
+          await redisConnection.sadd("crawl:" + this.jobId + ":robots_blocked", fullUrl);
+          await redisConnection.expire("crawl:" + this.jobId + ":robots_blocked", 24 * 60 * 60, "NX");
+        })();
       }
     } else {
       // EXTERNAL LINKS
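
Usage sketch (not applied by git am; everything after the diff body is ignored):
once this lands, a client can fetch failed-page details for a crawl or batch
scrape from the new GET routes. The sketch below assumes the v1 router is
mounted under /v1 with bearer-token auth, as with the other v1 routes; the
base URL, API key, and job ID are placeholders, and the success shape mirrors
the CrawlErrorsResponse type added in types.ts.

    // Minimal TypeScript client sketch for the new errors routes.
    // Assumptions (not part of the patch): /v1 mount point, bearer auth,
    // and a runtime with global fetch (Node 18+ or a browser).
    type CrawlErrors = {
      errors: {
        id: string;          // BullMQ job id of the failed scrape job
        timestamp?: string;  // ISO 8601, derived from the job's finishedOn
        url: string;         // page that failed to scrape
        error: string;       // the job's failedReason
      }[];
      robotsBlocked: string[]; // URLs skipped because robots.txt disallowed them
    };

    async function getCrawlErrors(
      baseUrl: string, // e.g. "https://api.example.com" (placeholder)
      apiKey: string,  // placeholder credential
      jobId: string,   // crawl job id; batch scrape jobs use
                       // `${baseUrl}/v1/batch/scrape/${jobId}/errors` instead
    ): Promise<CrawlErrors> {
      const res = await fetch(`${baseUrl}/v1/crawl/${jobId}/errors`, {
        headers: { Authorization: `Bearer ${apiKey}` },
      });
      // Non-2xx responses carry the ErrorResponse variant ({ success: false, error }).
      if (!res.ok) throw new Error(`errors route returned HTTP ${res.status}`);
      return (await res.json()) as CrawlErrors;
    }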
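Design note on the crawler.ts hunk: the Redis writes run inside a
fire-and-forget async IIFE, so recording a robots-blocked URL never blocks
filterURL's synchronous return path. The "NX" flag on expire (an EXPIRE
option available since Redis 7.0) applies the 24-hour TTL only when the key
has no TTL yet, so repeated sadd calls during a long crawl don't keep
extending the set's lifetime. The hunk doesn't add an import for
redisConnection, so it presumably is already in scope in crawler.ts.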