From e74e4bcefc5ebf97ef8fbe726c21e924bfef7b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 13 Dec 2024 23:46:33 +0100 Subject: [PATCH] feat(runWebScraper): retry a scrape max 3 times in a crawl if the status code is failure --- apps/api/logview.js | 16 +- apps/api/src/controllers/v0/scrape.ts | 16 +- apps/api/src/controllers/v1/extract.ts | 4 +- apps/api/src/controllers/v1/types.ts | 2 +- apps/api/src/main/runWebScraper.ts | 139 ++++++++++-------- .../scraper/scrapeURL/lib/extractMetadata.ts | 2 +- apps/api/src/types.ts | 1 + 7 files changed, 108 insertions(+), 72 deletions(-) diff --git a/apps/api/logview.js b/apps/api/logview.js index 232d2cda..3c0db523 100644 --- a/apps/api/logview.js +++ b/apps/api/logview.js @@ -1,7 +1,19 @@ const fs = require("fs"); -const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") - .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); +// METHOD: Winston log file +// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") +// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); + +// METHOD: GCloud export +const logs = [ + "downloaded-logs-20241213-225607.json", + "downloaded-logs-20241213-225654.json", + "downloaded-logs-20241213-225720.json", + "downloaded-logs-20241213-225758.json", + "downloaded-logs-20241213-225825.json", + "downloaded-logs-20241213-225843.json", +].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload); + const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))]; diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 8501e502..96e6ea4f 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -8,7 +8,6 @@ import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { - Document, fromLegacyCombo, toLegacyDocument, url as urlSchema, @@ -29,6 +28,7 @@ import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; import { fromLegacyScrapeOptions } from "../v1/types"; import { ZodError } from "zod"; +import { Document as V0Document } from "./../../lib/entities"; export async function scrapeHelper( jobId: string, @@ -42,7 +42,7 @@ export async function scrapeHelper( ): Promise<{ success: boolean; error?: string; - data?: Document | { url: string }; + data?: V0Document | { url: string }; returnCode: number; }> { const url = urlSchema.parse(req.body.url); @@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = - result.data && (result.data as Document).markdown + result.data && (result.data as V0Document).markdown ? 
numTokensFromString( - (result.data as Document).markdown!, + (result.data as V0Document).markdown!, "gpt-3.5-turbo", ) : 0; @@ -276,14 +276,14 @@ export async function scrapeController(req: Request, res: Response) { let doc = result.data; if (!pageOptions || !pageOptions.includeRawHtml) { - if (doc && (doc as Document).rawHtml) { - delete (doc as Document).rawHtml; + if (doc && (doc as V0Document).rawHtml) { + delete (doc as V0Document).rawHtml; } } if (pageOptions && pageOptions.includeExtract) { - if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { - delete (doc as Document).markdown; + if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) { + delete (doc as V0Document).markdown; } } diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 0c286253..d05dbf6e 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { - // Document, + Document, RequestWithAuth, ExtractRequest, extractRequestSchema, @@ -8,7 +8,7 @@ import { MapDocument, scrapeOptions, } from "./types"; -import { Document } from "../../lib/entities"; +// import { Document } from "../../lib/entities"; import Redis from "ioredis"; import { configDotenv } from "dotenv"; import { performRanking } from "../../lib/ranker"; diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 076d8b0b..d3f110c8 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -396,7 +396,7 @@ export type Document = { articleSection?: string; url?: string; sourceURL?: string; - statusCode?: number; + statusCode: number; error?: string; [key: string]: string | string[] | number | undefined; }; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index dc907371..411acfe6 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -49,6 +49,7 @@ export async function startWebScraperPipeline({ bull_job_id: job.id.toString(), priority: job.opts.priority, is_scrape: job.data.is_scrape ?? false, + is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null), }); } @@ -63,73 +64,63 @@ export async function runWebScraper({ bull_job_id, priority, is_scrape = false, + is_crawl = false, }: RunWebScraperParams): Promise { + const tries = is_crawl ? 3 : 1; + let response: ScrapeUrlResponse | undefined = undefined; let engines: EngineResultsTracker = {}; - try { - response = await scrapeURL(bull_job_id, url, scrapeOptions, { - priority, - ...internalOptions, - }); - if (!response.success) { - if (response.error instanceof Error) { - throw response.error; - } else { - throw new Error( - "scrapeURL error: " + - (Array.isArray(response.error) - ? JSON.stringify(response.error) - : typeof response.error === "object" - ? 
JSON.stringify({ ...response.error }) - : response.error), - ); - } + let error: any = undefined; + + for (let i = 0; i < tries; i++) { + if (i > 0) { + logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error }); } - if (is_scrape === false) { - let creditsToBeBilled = 1; // Assuming 1 credit per document - if (scrapeOptions.extract) { - creditsToBeBilled = 5; - } + response = undefined; + engines = {}; + error = undefined; - billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { - logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, - ); - // Optionally, you could notify an admin or add to a retry queue here + try { + response = await scrapeURL(bull_job_id, url, scrapeOptions, { + priority, + ...internalOptions, }); + if (!response.success) { + if (response.error instanceof Error) { + throw response.error; + } else { + throw new Error( + "scrapeURL error: " + + (Array.isArray(response.error) + ? JSON.stringify(response.error) + : typeof response.error === "object" + ? JSON.stringify({ ...response.error }) + : response.error), + ); + } + } + + // This is where the returnvalue from the job is set + // onSuccess(response.document, mode); + + engines = response.engines; + + if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) { + // status code is good -- do not attempt retry + break; + } + } catch (error) { + engines = + response !== undefined + ? response.engines + : typeof error === "object" && error !== null + ? ((error as any).results ?? {}) + : {}; } + } - // This is where the returnvalue from the job is set - // onSuccess(response.document, mode); - - engines = response.engines; - return response; - } catch (error) { - engines = - response !== undefined - ? response.engines - : typeof error === "object" && error !== null - ? ((error as any).results ?? 
{}) - : {}; - - if (response !== undefined) { - return { - ...response, - success: false, - error, - }; - } else { - return { - success: false, - error, - logs: ["no logs -- error coming from runWebScraper"], - engines, - }; - } - // onError(error); - } finally { - const engineOrder = Object.entries(engines) + const engineOrder = Object.entries(engines) .sort((a, b) => a[1].startedAt - b[1].startedAt) .map((x) => x[0]) as Engine[]; @@ -158,6 +149,38 @@ export async function runWebScraper({ }, }); } + + if (error === undefined && response?.success) { + if (is_scrape === false) { + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (scrapeOptions.extract) { + creditsToBeBilled = 5; + } + + billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { + logger.error( + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, + ); + // Optionally, you could notify an admin or add to a retry queue here + }); + } + + return response; + } else { + if (response !== undefined) { + return { + ...response, + success: false, + error, + }; + } else { + return { + success: false, + error, + logs: ["no logs -- error coming from runWebScraper"], + engines, + }; + } } } diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index 040bf0ee..c67f9cbd 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -5,7 +5,7 @@ import { Meta } from ".."; export function extractMetadata( meta: Meta, html: string, -): Document["metadata"] { +): Partial { let title: string | undefined = undefined; let description: string | undefined = undefined; let language: string | undefined = undefined; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 5325a0ad..9db79bc5 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -55,6 +55,7 @@ export interface RunWebScraperParams { bull_job_id: string; priority?: number; is_scrape?: boolean; + is_crawl?: boolean; } export type RunWebScraperResult =
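
The heart of this change is the bounded retry loop in apps/api/src/main/runWebScraper.ts: crawl jobs (is_crawl) get up to 3 attempts, single scrapes keep 1, and the loop exits early as soon as the scraped document reports a 2xx or 304 status code. Below is a minimal standalone sketch of that pattern for reference only. The names scrapeOnce, ScrapeOutcome, and scrapeWithRetry are hypothetical stand-ins for scrapeURL and its ScrapeUrlResponse, and the fetch-based attempt is an assumption for illustration -- it is not the actual firecrawl implementation.

// retry-sketch.ts -- illustrative only; scrapeOnce / ScrapeOutcome / scrapeWithRetry
// are hypothetical stand-ins for scrapeURL and ScrapeUrlResponse.

interface ScrapeOutcome {
  success: boolean;
  statusCode: number;
  body?: string;
  error?: unknown;
}

// Hypothetical single-attempt scraper standing in for scrapeURL.
async function scrapeOnce(url: string): Promise<ScrapeOutcome> {
  const res = await fetch(url); // assumes Node 18+ global fetch
  return { success: true, statusCode: res.status, body: await res.text() };
}

// Retry up to `tries` times (3 for crawl jobs, 1 for single scrapes) and stop
// early as soon as the status code is 2xx or 304, mirroring the loop in the patch.
async function scrapeWithRetry(url: string, isCrawl: boolean): Promise<ScrapeOutcome> {
  const tries = isCrawl ? 3 : 1;
  let last: ScrapeOutcome = { success: false, statusCode: 0, error: "not attempted" };

  for (let i = 0; i < tries; i++) {
    try {
      last = await scrapeOnce(url);
      const code = last.statusCode;
      if ((code >= 200 && code < 300) || code === 304) {
        break; // status code is good -- do not attempt retry
      }
    } catch (err) {
      // keep the failure and fall through to the next attempt, if any remain
      last = { success: false, statusCode: 0, error: err };
    }
  }
  return last;
}

Limiting retries to crawl jobs (tries = is_crawl ? 3 : 1) keeps single-scrape latency unchanged while letting bulk crawl work absorb transient failure status codes; billing in the patch happens only after a successful final response, so failed retry attempts are not charged.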