From e74e4bcefc5ebf97ef8fbe726c21e924bfef7b2b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 13 Dec 2024 23:46:33 +0100 Subject: [PATCH] feat(runWebScraper): retry a scrape max 3 times in a crawl if the status code is failure --- apps/api/logview.js | 16 +- apps/api/src/controllers/v0/scrape.ts | 16 +- apps/api/src/controllers/v1/extract.ts | 4 +- apps/api/src/controllers/v1/types.ts | 2 +- apps/api/src/main/runWebScraper.ts | 139 ++++++++++-------- .../scraper/scrapeURL/lib/extractMetadata.ts | 2 +- apps/api/src/types.ts | 1 + 7 files changed, 108 insertions(+), 72 deletions(-) diff --git a/apps/api/logview.js b/apps/api/logview.js index 232d2cda..3c0db523 100644 --- a/apps/api/logview.js +++ b/apps/api/logview.js @@ -1,7 +1,19 @@ const fs = require("fs"); -const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") - .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); +// METHOD: Winston log file +// const logs = fs.readFileSync("7a373219-0eb4-4e47-b2df-e90e12afd5c1.log", "utf8") +// .split("\n").filter(x => x.trim().length > 0).map(x => JSON.parse(x)); + +// METHOD: GCloud export +const logs = [ + "downloaded-logs-20241213-225607.json", + "downloaded-logs-20241213-225654.json", + "downloaded-logs-20241213-225720.json", + "downloaded-logs-20241213-225758.json", + "downloaded-logs-20241213-225825.json", + "downloaded-logs-20241213-225843.json", +].flatMap(x => JSON.parse(fs.readFileSync(x, "utf8"))).map(x => x.jsonPayload); + const crawlIds = [...new Set(logs.map(x => x.crawlId).filter(x => x))]; diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts index 8501e502..96e6ea4f 100644 --- a/apps/api/src/controllers/v0/scrape.ts +++ b/apps/api/src/controllers/v0/scrape.ts @@ -8,7 +8,6 @@ import { authenticateUser } from "../auth"; import { PlanType, RateLimiterMode } from "../../types"; import { logJob } from "../../services/logging/log_job"; import { - Document, fromLegacyCombo, toLegacyDocument, url as urlSchema, @@ -29,6 +28,7 @@ import * as Sentry from "@sentry/node"; import { getJobPriority } from "../../lib/job-priority"; import { fromLegacyScrapeOptions } from "../v1/types"; import { ZodError } from "zod"; +import { Document as V0Document } from "./../../lib/entities"; export async function scrapeHelper( jobId: string, @@ -42,7 +42,7 @@ export async function scrapeHelper( ): Promise<{ success: boolean; error?: string; - data?: Document | { url: string }; + data?: V0Document | { url: string }; returnCode: number; }> { const url = urlSchema.parse(req.body.url); @@ -241,9 +241,9 @@ export async function scrapeController(req: Request, res: Response) { const endTime = new Date().getTime(); const timeTakenInSeconds = (endTime - startTime) / 1000; const numTokens = - result.data && (result.data as Document).markdown + result.data && (result.data as V0Document).markdown ? 
numTokensFromString( - (result.data as Document).markdown!, + (result.data as V0Document).markdown!, "gpt-3.5-turbo", ) : 0; @@ -276,14 +276,14 @@ export async function scrapeController(req: Request, res: Response) { let doc = result.data; if (!pageOptions || !pageOptions.includeRawHtml) { - if (doc && (doc as Document).rawHtml) { - delete (doc as Document).rawHtml; + if (doc && (doc as V0Document).rawHtml) { + delete (doc as V0Document).rawHtml; } } if (pageOptions && pageOptions.includeExtract) { - if (!pageOptions.includeMarkdown && doc && (doc as Document).markdown) { - delete (doc as Document).markdown; + if (!pageOptions.includeMarkdown && doc && (doc as V0Document).markdown) { + delete (doc as V0Document).markdown; } } diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts index 0c286253..d05dbf6e 100644 --- a/apps/api/src/controllers/v1/extract.ts +++ b/apps/api/src/controllers/v1/extract.ts @@ -1,6 +1,6 @@ import { Request, Response } from "express"; import { - // Document, + Document, RequestWithAuth, ExtractRequest, extractRequestSchema, @@ -8,7 +8,7 @@ import { MapDocument, scrapeOptions, } from "./types"; -import { Document } from "../../lib/entities"; +// import { Document } from "../../lib/entities"; import Redis from "ioredis"; import { configDotenv } from "dotenv"; import { performRanking } from "../../lib/ranker"; diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 076d8b0b..d3f110c8 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -396,7 +396,7 @@ export type Document = { articleSection?: string; url?: string; sourceURL?: string; - statusCode?: number; + statusCode: number; error?: string; [key: string]: string | string[] | number | undefined; }; diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index dc907371..411acfe6 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -49,6 +49,7 @@ export async function startWebScraperPipeline({ bull_job_id: job.id.toString(), priority: job.opts.priority, is_scrape: job.data.is_scrape ?? false, + is_crawl: !!(job.data.crawl_id && job.data.crawlerOptions !== null), }); } @@ -63,73 +64,63 @@ export async function runWebScraper({ bull_job_id, priority, is_scrape = false, + is_crawl = false, }: RunWebScraperParams): Promise { + const tries = is_crawl ? 3 : 1; + let response: ScrapeUrlResponse | undefined = undefined; let engines: EngineResultsTracker = {}; - try { - response = await scrapeURL(bull_job_id, url, scrapeOptions, { - priority, - ...internalOptions, - }); - if (!response.success) { - if (response.error instanceof Error) { - throw response.error; - } else { - throw new Error( - "scrapeURL error: " + - (Array.isArray(response.error) - ? JSON.stringify(response.error) - : typeof response.error === "object" - ? 
JSON.stringify({ ...response.error }) - : response.error), - ); - } + let error: any = undefined; + + for (let i = 0; i < tries; i++) { + if (i > 0) { + logger.debug("Retrying scrape...", { scrapeId: bull_job_id, jobId: bull_job_id, method: "runWebScraper", module: "runWebScraper", tries, i, previousStatusCode: (response as any)?.document?.metadata?.statusCode, previousError: error }); } - if (is_scrape === false) { - let creditsToBeBilled = 1; // Assuming 1 credit per document - if (scrapeOptions.extract) { - creditsToBeBilled = 5; - } + response = undefined; + engines = {}; + error = undefined; - billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { - logger.error( - `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, - ); - // Optionally, you could notify an admin or add to a retry queue here + try { + response = await scrapeURL(bull_job_id, url, scrapeOptions, { + priority, + ...internalOptions, }); + if (!response.success) { + if (response.error instanceof Error) { + throw response.error; + } else { + throw new Error( + "scrapeURL error: " + + (Array.isArray(response.error) + ? JSON.stringify(response.error) + : typeof response.error === "object" + ? JSON.stringify({ ...response.error }) + : response.error), + ); + } + } + + // This is where the returnvalue from the job is set + // onSuccess(response.document, mode); + + engines = response.engines; + + if ((response.document.metadata.statusCode >= 200 && response.document.metadata.statusCode < 300) || response.document.metadata.statusCode === 304) { + // status code is good -- do not attempt retry + break; + } + } catch (error) { + engines = + response !== undefined + ? response.engines + : typeof error === "object" && error !== null + ? ((error as any).results ?? {}) + : {}; } + } - // This is where the returnvalue from the job is set - // onSuccess(response.document, mode); - - engines = response.engines; - return response; - } catch (error) { - engines = - response !== undefined - ? response.engines - : typeof error === "object" && error !== null - ? ((error as any).results ?? 
{}) - : {}; - - if (response !== undefined) { - return { - ...response, - success: false, - error, - }; - } else { - return { - success: false, - error, - logs: ["no logs -- error coming from runWebScraper"], - engines, - }; - } - // onError(error); - } finally { - const engineOrder = Object.entries(engines) + const engineOrder = Object.entries(engines) .sort((a, b) => a[1].startedAt - b[1].startedAt) .map((x) => x[0]) as Engine[]; @@ -158,6 +149,38 @@ export async function runWebScraper({ }, }); } + + if (error === undefined && response?.success) { + if (is_scrape === false) { + let creditsToBeBilled = 1; // Assuming 1 credit per document + if (scrapeOptions.extract) { + creditsToBeBilled = 5; + } + + billTeam(team_id, undefined, creditsToBeBilled).catch((error) => { + logger.error( + `Failed to bill team ${team_id} for ${creditsToBeBilled} credits: ${error}`, + ); + // Optionally, you could notify an admin or add to a retry queue here + }); + } + + return response; + } else { + if (response !== undefined) { + return { + ...response, + success: false, + error, + }; + } else { + return { + success: false, + error, + logs: ["no logs -- error coming from runWebScraper"], + engines, + }; + } } } diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts index 040bf0ee..c67f9cbd 100644 --- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts +++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts @@ -5,7 +5,7 @@ import { Meta } from ".."; export function extractMetadata( meta: Meta, html: string, -): Document["metadata"] { +): Partial { let title: string | undefined = undefined; let description: string | undefined = undefined; let language: string | undefined = undefined; diff --git a/apps/api/src/types.ts b/apps/api/src/types.ts index 5325a0ad..9db79bc5 100644 --- a/apps/api/src/types.ts +++ b/apps/api/src/types.ts @@ -55,6 +55,7 @@ export interface RunWebScraperParams { bull_job_id: string; priority?: number; is_scrape?: boolean; + is_crawl?: boolean; } export type RunWebScraperResult =
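
The heart of this change is the bounded retry loop in apps/api/src/main/runWebScraper.ts: crawl jobs (is_crawl) get up to 3 attempts, single scrapes keep 1, and the loop exits early as soon as the scraped document reports a 2xx or 304 status code. Below is a minimal standalone sketch of that pattern for reference only. The names scrapeOnce, ScrapeOutcome, and scrapeWithRetry are hypothetical stand-ins for scrapeURL and its ScrapeUrlResponse, and the fetch-based attempt is an assumption for illustration -- it is not the actual firecrawl implementation.

// retry-sketch.ts -- illustrative only; scrapeOnce / ScrapeOutcome / scrapeWithRetry
// are hypothetical stand-ins for scrapeURL and ScrapeUrlResponse.

interface ScrapeOutcome {
  success: boolean;
  statusCode: number;
  body?: string;
  error?: unknown;
}

// Hypothetical single-attempt scraper standing in for scrapeURL.
async function scrapeOnce(url: string): Promise<ScrapeOutcome> {
  const res = await fetch(url); // assumes Node 18+ global fetch
  return { success: true, statusCode: res.status, body: await res.text() };
}

// Retry up to `tries` times (3 for crawl jobs, 1 for single scrapes) and stop
// early as soon as the status code is 2xx or 304, mirroring the loop in the patch.
async function scrapeWithRetry(url: string, isCrawl: boolean): Promise<ScrapeOutcome> {
  const tries = isCrawl ? 3 : 1;
  let last: ScrapeOutcome = { success: false, statusCode: 0, error: "not attempted" };

  for (let i = 0; i < tries; i++) {
    try {
      last = await scrapeOnce(url);
      const code = last.statusCode;
      if ((code >= 200 && code < 300) || code === 304) {
        break; // status code is good -- do not attempt retry
      }
    } catch (err) {
      // keep the failure and fall through to the next attempt, if any remain
      last = { success: false, statusCode: 0, error: err };
    }
  }
  return last;
}

Limiting retries to crawl jobs (tries = is_crawl ? 3 : 1) keeps single-scrape latency unchanged while letting bulk crawl work absorb transient failure status codes; billing in the patch happens only after a successful final response, so failed retry attempts are not charged.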