Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-14 13:45:57 +08:00)
feat(runWebScraper): retry a scrape up to 3 times in a crawl if the status code indicates failure
parent 6b17a53d4b
commit e74e4bcefc
@@ -1,7 +1,19 @@
 const fs = require("fs");
 
-const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
-    .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
+// METHOD: Winston log file
+// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8")
+//     .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x));
+
+// METHOD: GCloud export
+const logs = [
+    "downloaded-logs-20241213-225607.json",
+    "downloaded-logs-20241213-225654.json",
+    "downloaded-logs-20241213-225720.json",
+    "downloaded-logs-20241213-225758.json",
+    "downloaded-logs-20241213-225825.json",
+    "downloaded-logs-20241213-225843.json",
+].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload);
 
 const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))];
 
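The hunk above switches the ad-hoc debugging script from reading a single Winston log file to aggregating several GCloud log export files into one `logs` array (keeping only each entry's `jsonPayload`) before collecting the distinct crawl IDs. A minimal sketch of how the aggregated entries could then be grouped per crawl for inspection; the grouping code below is illustrative only and not part of the commit:

// Illustrative follow-on: bucket the aggregated log entries by crawlId so each
// crawl can be inspected on its own. Assumes the same `logs` and `crawlIds`
// values built by the script above.
const logsByCrawl = new Map();
for (const entry of logs) {
  if (!entry.crawlId) continue;
  const bucket = logsByCrawl.get(entry.crawlId) ?? [];
  bucket.push(entry);
  logsByCrawl.set(entry.crawlId, bucket);
}
for (const crawlId of crawlIds) {
  console.log(crawlId, (logsByCrawl.get(crawlId) ?? []).length, "entries");
}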
@@ -8,7 +8,6 @@ import { authenticateUser } from "../auth";
 import { PlanType, RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import {
-  Document,
   fromLegacyCombo,
   toLegacyDocument,
   url as urlSchema,
@@ -29,6 +28,7 @@ import * as Sentry from "@sentry/node";
 import { getJobPriority } from "../../lib/job-priority";
 import { fromLegacyScrapeOptions } from "../v1/types";
 import { ZodError } from "zod";
+import { Document as V0Document } from "./../../lib/entities";
 
 export async function scrapeHelper(
   jobId: string,
@@ -42,7 +42,7 @@ export async function scrapeHelper(
 ): Promise<{
   success: boolean;
   error?: string;
-  data?: Document | { url: string };
+  data?: V0Document | { url: string };
   returnCode: number;
 }> {
   const url = urlSchema.parse(req.body.url);
@@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) {
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
   const numTokens =
-    result.data && (result.data as Document).markdown
+    result.data && (result.data as V0Document).markdown
       ? numTokensFromString(
-          (result.data as Document).markdown!,
+          (result.data as V0Document).markdown!,
           "gpt-3.5-turbo",
         )
       : 0;
@@ -276,14 +276,14 @@ export async function scrapeController(req: Request, res: Response) {
 
   let doc = result.data;
   if (!pageOptions || !pageOptions.includeRawHtml) {
-    if (doc && (doc as Document).rawHtml) {
-      delete (doc as Document).rawHtml;
+    if (doc && (doc as V0Document).rawHtml) {
+      delete (doc as V0Document).rawHtml;
     }
   }
 
   if (pageOptions && pageOptions.includeExtract) {
-    if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) {
-      delete (doc as Document).markdown;
+    if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) {
+      delete (doc as V0Document).markdown;
     }
   }
 
@@ -1,6 +1,6 @@
 import { Request, Response } from "express";
 import {
-  // Document,
+  Document,
   RequestWithAuth,
   ExtractRequest,
   extractRequestSchema,
@@ -8,7 +8,7 @@ import {
   MapDocument,
   scrapeOptions,
 } from "./types";
-import { Document } from "../../lib/entities";
+// import { Document } from "../../lib/entities";
 import Redis from "ioredis";
 import { configDotenv } from "dotenv";
 import { performRanking } from "../../lib/ranker";
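Taken together, these import hunks make the v1 controllers use the `Document` type exported from `./types` directly, while the v0 scrape controller keeps the legacy entity under the alias `V0Document`. A self-contained sketch of the aliasing pattern; the type bodies below are illustrative stand-ins, not the real Firecrawl types:

// When two modules export the same identifier, an import alias keeps both
// usable in one file, e.g. `import { Document as V0Document } from ...`.
// Illustrative shapes only:
type V1DocumentShape = { markdown?: string; metadata: { statusCode: number } };
type V0DocumentShape = { markdown?: string; rawHtml?: string };

function hasMarkdown(doc: V1DocumentShape | V0DocumentShape): boolean {
  // Both shapes expose an optional markdown field, so one helper can accept either.
  return typeof doc.markdown === "string";
}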
@@ -396,7 +396,7 @@ export type Document = {
   articleSection?: string;
   url?: string;
   sourceURL?: string;
-  statusCode?: number;
+  statusCode: number;
   error?: string;
   [key: string]: string | string[] | number | undefined;
 };
@@ -49,6 +49,7 @@ export async function startWebScraperPipeline({
     bull_job_id: job.id.toString(),
     priority: job.opts.priority,
     is_scrape: job.data.is_scrape ?? false,
+    is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null),
   });
 }
 
@@ -63,9 +64,23 @@ export async function runWebScraper({
   bull_job_id,
   priority,
   is_scrape = false,
+  is_crawl = false,
 }: RunWebScraperParams): Promise<ScrapeUrlResponse> {
+  const tries = is_crawl ? 3 : 1;
+
   let response: ScrapeUrlResponse | undefined = undefined;
   let engines: EngineResultsTracker = {};
+  let error: any = undefined;
+
+  for (let i = 0; i < tries; i++) {
+    if (i > 0) {
+      logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error });
+    }
+
+    response = undefined;
+    engines = {};
+    error = undefined;
+
     try {
       response = await scrapeURL(bull_job_id, url, scrapeOptions, {
         priority,
@@ -86,25 +101,15 @@ export async function runWebScraper({
       }
     }
 
-    if (is_scrape === false) {
-      let creditsToBeBilled = 1; // Assuming 1 credit per document
-      if (scrapeOptions.extract) {
-        creditsToBeBilled = 5;
-      }
-
-      billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
-        logger.error(
-          `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
-        );
-        // Optionally, you could notify an admin or add to a retry queue here
-      });
-    }
-
     // This is where the returnvalue from the job is set
     // onSuccess(response.document, mode);
 
     engines = response.engines;
-    return response;
+
+    if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) {
+      // status code is good -- do not attempt retry
+      break;
+    }
   } catch (error) {
     engines =
       response !== undefined
@@ -112,23 +117,9 @@ export async function runWebScraper({
         : typeof error === "object" && error !== null
           ? ((error as any).results ?? {})
           : {};
-
-    if (response !== undefined) {
-      return {
-        ...response,
-        success: false,
-        error,
-      };
-    } else {
-      return {
-        success: false,
-        error,
-        logs: ["no logs -- error coming from runWebScraper"],
-        engines,
-      };
-    }
     }
-    // onError(error);
-  } finally {
+  }
+
   const engineOrder = Object.entries(engines)
     .sort((a, b) => a[1].startedAt - b[1].startedAt)
     .map((x) => x[0]) as Engine[];
@@ -158,6 +149,38 @@ export async function runWebScraper({
       },
     });
   }
 
+  if (error === undefined && response?.success) {
+    if (is_scrape === false) {
+      let creditsToBeBilled = 1; // Assuming 1 credit per document
+      if (scrapeOptions.extract) {
+        creditsToBeBilled = 5;
+      }
+
+      billTeam(team_id, undefined, creditsToBeBilled).catch((error) => {
+        logger.error(
+          `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`,
+        );
+        // Optionally, you could notify an admin or add to a retry queue here
+      });
+    }
+
+    return response;
+  } else {
+    if (response !== undefined) {
+      return {
+        ...response,
+        success: false,
+        error,
+      };
+    } else {
+      return {
+        success: false,
+        error,
+        logs: ["no logs -- error coming from runWebScraper"],
+        engines,
+      };
+    }
+  }
 }
 
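The net effect of these hunks: the existing try/catch around `scrapeURL` is wrapped in a bounded loop (up to 3 attempts when the job belongs to a crawl, 1 otherwise), a 2xx or 304 status breaks out early, and billing plus the final success/failure return are deferred until after the loop so only the last attempt is billed and reported. A standalone sketch of that control flow; `attemptScrape` and `Attempt` are illustrative stand-ins, not part of the Firecrawl API:

// Standalone sketch of the retry control flow added in this commit.
type Attempt = { success: boolean; statusCode: number };

async function scrapeWithRetry(
  attemptScrape: () => Promise<Attempt>,
  isCrawl: boolean,
): Promise<Attempt | undefined> {
  const tries = isCrawl ? 3 : 1;
  let last: Attempt | undefined;

  for (let i = 0; i < tries; i++) {
    try {
      last = await attemptScrape();
      // 2xx and 304 are treated as good: stop retrying.
      if ((last.statusCode >= 200 && last.statusCode < 300) || last.statusCode === 304) {
        break;
      }
    } catch {
      last = undefined; // record the failed attempt and let the next iteration retry
    }
  }
  return last;
}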
@@ -5,7 +5,7 @@ import { Meta } from "..";
 export function extractMetadata(
   meta: Meta,
   html: string,
-): Document["metadata"] {
+): Partial<Document["metadata"]> {
   let title: string | undefined = undefined;
   let description: string | undefined = undefined;
   let language: string | undefined = undefined;
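Widening the return type of `extractMetadata` to `Partial<Document["metadata"]>` is the counterpart to making `statusCode` required on `Document` above: the HTML parser cannot supply the status code, which comes from the HTTP response, so the complete metadata object is only assembled once both sources are merged. A minimal sketch of that composition, with illustrative names only:

// Illustrative only: page-derived fields are Partial, and the required
// statusCode is filled in where the HTTP response is known.
type PageMetadata = { title?: string; statusCode: number };

function metadataFromHtml(html: string): Partial<PageMetadata> {
  const match = html.match(/<title>(.*?)<\/title>/i);
  return match ? { title: match[1] } : {};
}

function buildMetadata(html: string, statusCode: number): PageMetadata {
  return { ...metadataFromHtml(html), statusCode };
}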
@@ -55,6 +55,7 @@ export interface RunWebScraperParams {
   bull_job_id: string;
   priority?: number;
   is_scrape?: boolean;
+  is_crawl?: boolean;
 }
 
 export type RunWebScraperResult =