diff --git a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts index e7d9efe8..04b30a0a 100644 --- a/apps/api/src/scraper/scrapeURL/engines/docx/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/docx/index.ts @@ -3,9 +3,10 @@ import { EngineScrapeResult } from ".."; import { downloadFile } from "../utils/downloadFile"; import mammoth from "mammoth"; -export async function scrapeDOCX(meta: Meta): Promise { +export async function scrapeDOCX(meta: Meta, timeToRun: number | undefined): Promise { const { response, tempFilePath } = await downloadFile(meta.id, meta.url, { headers: meta.options.headers, + signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeToRun ?? 300000), }); return { diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index 687c8232..40c34399 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -59,7 +59,7 @@ export async function scrapeURLWithFetch( dispatcher: await makeSecureDispatcher(meta.url), redirect: "follow", headers: meta.options.headers, - signal: meta.internalOptions.abort, + signal: meta.internalOptions.abort ?? AbortSignal.timeout(timeout), }), (async () => { await new Promise((resolve) => diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 4e64f74e..29938016 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -142,6 +142,7 @@ export async function fireEngineCheckStatus( : {}), }, mock, + abort, }); }, ); diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts index d046b738..0ea28292 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/delete.ts @@ -9,6 +9,7 @@ export async function fireEngineDelete( logger: Logger, jobId: string, mock: MockState | null, + abort?: AbortSignal, ) { await Sentry.startSpan( { @@ -33,6 +34,7 @@ export async function fireEngineDelete( ignoreFailure: true, logger: logger.child({ method: "fireEngineDelete/robustFetch", jobId }), mock, + abort, }); }, ); diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 1463104f..e69df48b 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -48,6 +48,7 @@ async function performFireEngineScrape< logger.child({ method: "fireEngineScrape" }), request, mock, + abort, ); const startTime = Date.now(); @@ -56,6 +57,7 @@ async function performFireEngineScrape< let status: FireEngineCheckStatusSuccess | undefined = undefined; while (status === undefined) { + abort?.throwIfAborted(); if (errors.length >= errorLimit) { logger.error("Error limit hit.", { errors }); fireEngineDelete( @@ -236,7 +238,7 @@ export async function scrapeURLWithFireEngineChromeCDP( request, timeout, meta.mock, - meta.internalOptions.abort, + meta.internalOptions.abort ?? AbortSignal.timeout(timeout), ); if ( @@ -317,7 +319,7 @@ export async function scrapeURLWithFireEnginePlaywright( request, timeout, meta.mock, - meta.internalOptions.abort, + meta.internalOptions.abort ?? AbortSignal.timeout(timeout), ); if (!response.url) { @@ -373,7 +375,7 @@ export async function scrapeURLWithFireEngineTLSClient( request, timeout, meta.mock, - meta.internalOptions.abort, + meta.internalOptions.abort ?? AbortSignal.timeout(timeout), ); if (!response.url) { diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 733b06df..ca95cb32 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -45,6 +45,8 @@ async function scrapePDFWithRunPodMU( }); } + const timeout = timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined; + const result = await robustFetch({ url: "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync", @@ -56,7 +58,7 @@ async function scrapePDFWithRunPodMU( input: { file_content: base64Content, filename: path.basename(tempFilePath) + ".pdf", - timeout: timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined, + timeout, created_at: Date.now(), }, }, @@ -69,6 +71,7 @@ async function scrapePDFWithRunPodMU( }), }), mock: meta.mock, + abort: timeout ? AbortSignal.timeout(timeout) : undefined, }); const processorResult = { diff --git a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts index 123a1c68..259f4938 100644 --- a/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/playwright/index.ts @@ -30,6 +30,7 @@ export async function scrapeURLWithPlaywright( pageError: z.string().optional(), }), mock: meta.mock, + abort: AbortSignal.timeout(timeout), }), (async () => { await new Promise((resolve) => setTimeout(() => resolve(null), timeout)); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index d9da9ea1..18203f81 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -85,6 +85,7 @@ import Express from "express"; import http from "http"; import https from "https"; import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup"; +import { robustFetch } from "../scraper/scrapeURL/lib/fetch"; configDotenv(); @@ -1546,10 +1547,25 @@ async function processJob(job: Job & { id: string }, token: string) { const app = Express(); app.get("/liveness", (req, res) => { + // stalled check if (isWorkerStalled) { res.status(500).json({ ok: false }); } else { - res.status(200).json({ ok: true }); + // networking check + robustFetch({ + url: "http://firecrawl-app-service:3002", + method: "GET", + mock: null, + logger: _logger, + abort: AbortSignal.timeout(5000), + ignoreResponse: true, + }) + .then(() => { + res.status(200).json({ ok: true }); + }).catch(e => { + _logger.error("WORKER NETWORKING CHECK FAILED", { error: e }); + res.status(500).json({ ok: false }); + }); } });