Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl (synced 2025-08-15 01:36:16 +08:00)
Merge branch 'main' into python-sdk/e2e-tests-all-params

This commit is contained in:
commit 94220a772b
2  .github/workflows/test-server.yml  vendored
@@ -35,6 +35,7 @@ env:
  ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
  VERTEX_CREDENTIALS: ${{ secrets.VERTEX_CREDENTIALS }}
  USE_GO_MARKDOWN_PARSER: true
  SENTRY_ENVIRONMENT: dev

jobs:
  test:
@@ -53,6 +54,7 @@ jobs:
          oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
          oauth-secret: ${{ secrets.TS_OAUTH_SECRET }}
          tags: tag:ci
          use-cache: 'true'
      - name: Install pnpm
        uses: pnpm/action-setup@v4
        with:

106  apps/api/src/__tests__/snips/pdf-cache.test.ts  Normal file
@@ -0,0 +1,106 @@
import { createPdfCacheKey, savePdfResultToCache, getPdfResultFromCache } from '../../lib/gcs-pdf-cache';

jest.mock('@google-cloud/storage', () => {
  const mockSave = jest.fn().mockResolvedValue(undefined);
  const mockExists = jest.fn().mockResolvedValue([true]);
  const mockDownload = jest.fn().mockResolvedValue([Buffer.from(JSON.stringify({
    markdown: 'cached markdown',
    html: 'cached html'
  }))]);
  const mockFile = jest.fn().mockImplementation((path) => ({
    save: mockSave,
    exists: mockExists,
    download: mockDownload
  }));

  return {
    Storage: jest.fn().mockImplementation(() => ({
      bucket: jest.fn().mockImplementation(() => ({
        file: mockFile
      }))
    })),
    _getMockFile: () => mockFile,
    _getMockSave: () => mockSave
  };
});

process.env.GCS_BUCKET_NAME = 'test-bucket';

describe('PDF Caching', () => {
  beforeEach(() => {
    jest.clearAllMocks();
  });

  test('createPdfCacheKey generates consistent keys', () => {
    const pdfContent1 = 'test-pdf-content';
    const pdfContent2 = 'test-pdf-content';
    const pdfContent3 = 'different-pdf-content';

    const key1 = createPdfCacheKey(pdfContent1);
    const key2 = createPdfCacheKey(pdfContent2);
    const key3 = createPdfCacheKey(pdfContent3);

    expect(key1).toBe(key2); // Same content should generate same key
    expect(key1).not.toBe(key3); // Different content should generate different key

    expect(key1).toMatch(/^[a-f0-9]{64}$/);
  });

  test('createPdfCacheKey works directly with base64 content', () => {
    const base64Content = 'JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAFLy';

    const key = createPdfCacheKey(base64Content);

    expect(key).toMatch(/^[a-f0-9]{64}$/);

    expect(createPdfCacheKey(base64Content)).toBe(key);

  });

  test('savePdfResultToCache saves results to GCS', async () => {
    const pdfContent = 'test-pdf-content';
    const result = { markdown: 'test markdown', html: 'test html' };

    const { _getMockFile, _getMockSave } = require('@google-cloud/storage');
    const mockFile = _getMockFile();
    const mockSave = _getMockSave();

    mockFile.mockClear();
    mockSave.mockClear();

    const cacheKey = await savePdfResultToCache(pdfContent, result);

    expect(cacheKey).not.toBeNull();

    expect(mockFile).toHaveBeenCalledWith(expect.stringContaining('pdf-cache/'));

    expect(mockSave).toHaveBeenCalledWith(JSON.stringify(result), {
      contentType: 'application/json',
      metadata: expect.objectContaining({
        source: 'runpod_pdf_conversion',
        cache_type: 'pdf_markdown',
        created_at: expect.any(String)
      })
    });
  });

  test('getPdfResultFromCache retrieves results from GCS', async () => {
    const pdfContent = 'test-pdf-content';

    const result = await getPdfResultFromCache(pdfContent);

    expect(result).not.toBeNull();
    expect(result?.markdown).toBe('cached markdown');
    expect(result?.html).toBe('cached html');
  });

  test('getPdfResultFromCache returns null when cache miss', async () => {
    const { Storage } = require('@google-cloud/storage');
    const mockExists = Storage().bucket().file().exists;
    mockExists.mockResolvedValueOnce([false]);

    const result = await getPdfResultFromCache('uncached-content');

    expect(result).toBeNull();
  });
});

@@ -89,7 +89,7 @@ export async function crawlController(
    createdAt: Date.now(),
  };

-  const crawler = crawlToCrawler(id, sc, req.acuc.flags ?? null);
+  const crawler = crawlToCrawler(id, sc, req.acuc?.flags ?? null);

  try {
    sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);

@@ -325,7 +325,7 @@ export async function mapController(
      abort: abort.signal,
      mock: req.body.useMock,
      filterByPath: req.body.filterByPath !== false,
-     flags: req.acuc.flags ?? null,
+     flags: req.acuc?.flags ?? null,
    }),
    ...(req.body.timeout !== undefined ? [
      new Promise((resolve, reject) => setTimeout(() => {

@@ -18,7 +18,6 @@ import { logger } from "./lib/logger";
import { adminRouter } from "./routes/admin";
import http from "node:http";
import https from "node:https";
- import CacheableLookup from "cacheable-lookup";
import { v1Router } from "./routes/v1";
import expressWs from "express-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
@@ -26,6 +25,7 @@ import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid";
import { RateLimiterMode } from "./types";
import { attachWsProxy } from "./services/agentLivecastWS";
import { cacheableLookup } from "./scraper/scrapeURL/lib/cacheableLookup";

const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
@@ -34,11 +34,9 @@ const { ExpressAdapter } = require("@bull-board/express");
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
logger.info(`Number of CPUs: ${numCPUs} available`);

- const cacheable = new CacheableLookup();

// Install cacheable lookup for all other requests
- cacheable.install(http.globalAgent);
- cacheable.install(https.globalAgent);
+ cacheableLookup.install(http.globalAgent);
+ cacheableLookup.install(https.globalAgent);

// Initialize Express with WebSocket support
const expressApp = express();

@@ -106,6 +106,22 @@ import { getACUCTeam } from "../../../controllers/auth";
      logger.error("No search results found", {
        query: request.prompt,
      });
      logJob({
        job_id: extractId,
        success: false,
        message: "No search results found",
        num_docs: 1,
        docs: [],
        time_taken: (new Date().getTime() - Date.now()) / 1000,
        team_id: teamId,
        mode: "extract",
        url: request.urls?.join(", ") || "",
        scrapeOptions: request,
        origin: request.origin ?? "api",
        num_tokens: 0,
        tokens_billed: 0,
        sources,
      });
      return {
        success: false,
        error: "No search results found",
@@ -191,6 +207,22 @@ import { getACUCTeam } from "../../../controllers/auth";
      logger.error("0 links! Bailing.", {
        linkCount: links.length,
      });
      logJob({
        job_id: extractId,
        success: false,
        message: "No valid URLs found to scrape",
        num_docs: 1,
        docs: [],
        time_taken: (new Date().getTime() - Date.now()) / 1000,
        team_id: teamId,
        mode: "extract",
        url: request.urls?.join(", ") || "",
        scrapeOptions: request,
        origin: request.origin ?? "api",
        num_tokens: 0,
        tokens_billed: 0,
        sources,
      });
      return {
        success: false,
        error:
@@ -524,6 +556,22 @@ import { getACUCTeam } from "../../../controllers/auth";

    } catch (error) {
      logger.error(`Failed to transform array to object`, { error });
      logJob({
        job_id: extractId,
        success: false,
        message: "Failed to transform array to object",
        num_docs: 1,
        docs: [],
        time_taken: (new Date().getTime() - Date.now()) / 1000,
        team_id: teamId,
        mode: "extract",
        url: request.urls?.join(", ") || "",
        scrapeOptions: request,
        origin: request.origin ?? "api",
        num_tokens: 0,
        tokens_billed: 0,
        sources,
      });
      return {
        success: false,
        error:
@@ -602,6 +650,23 @@ import { getACUCTeam } from "../../../controllers/auth";

      logger.debug("Scrapes finished.", { docCount: validResults.length });
    } catch (error) {
      logger.error("Failed to scrape documents", { error });
      logJob({
        job_id: extractId,
        success: false,
        message: "Failed to scrape documents",
        num_docs: 1,
        docs: [],
        time_taken: (new Date().getTime() - Date.now()) / 1000,
        team_id: teamId,
        mode: "extract",
        url: request.urls?.join(", ") || "",
        scrapeOptions: request,
        origin: request.origin ?? "api",
        num_tokens: 0,
        tokens_billed: 0,
        sources,
      });
      return {
        success: false,
        error: error.message,
@@ -614,6 +679,22 @@ import { getACUCTeam } from "../../../controllers/auth";
    if (docsMap.size == 0) {
      // All urls are invalid
      logger.error("All provided URLs are invalid!");
      logJob({
        job_id: extractId,
        success: false,
        message: "All provided URLs are invalid",
        num_docs: 1,
        docs: [],
        time_taken: (new Date().getTime() - Date.now()) / 1000,
        team_id: teamId,
        mode: "extract",
        url: request.urls?.join(", ") || "",
        scrapeOptions: request,
        origin: request.origin ?? "api",
        num_tokens: 0,
        tokens_billed: 0,
        sources,
      });
      return {
        success: false,
        error:

112  apps/api/src/lib/gcs-pdf-cache.ts  Normal file
@@ -0,0 +1,112 @@
import { Storage } from "@google-cloud/storage";
import { logger } from "./logger";
import crypto from "crypto";

const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined;
const PDF_CACHE_PREFIX = "pdf-cache/";

/**
 * Creates a SHA-256 hash of the PDF content to use as a cache key
 * Directly hashes the content without any conversion
 */
export function createPdfCacheKey(pdfContent: string | Buffer): string {
  return crypto
    .createHash('sha256')
    .update(pdfContent)
    .digest('hex');
}

/**
 * Save RunPod markdown results to GCS cache
 */
export async function savePdfResultToCache(
  pdfContent: string,
  result: { markdown: string; html: string }
): Promise<string | null> {
  try {
    if (!process.env.GCS_BUCKET_NAME) {
      return null;
    }

    const cacheKey = createPdfCacheKey(pdfContent);
    const storage = new Storage({ credentials });
    const bucket = storage.bucket(process.env.GCS_BUCKET_NAME);
    const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`);

    for (let i = 0; i < 3; i++) {
      try {
        await blob.save(JSON.stringify(result), {
          contentType: "application/json",
          metadata: {
            source: "runpod_pdf_conversion",
            cache_type: "pdf_markdown",
            created_at: new Date().toISOString(),
          }
        });

        logger.info(`Saved PDF RunPod result to GCS cache`, {
          cacheKey,
        });

        return cacheKey;
      } catch (error) {
        if (i === 2) {
          throw error;
        } else {
          logger.error(`Error saving PDF RunPod result to GCS cache, retrying`, {
            error,
            cacheKey,
            i,
          });
        }
      }
    }

    return cacheKey;
  } catch (error) {
    logger.error(`Error saving PDF RunPod result to GCS cache`, {
      error,
    });
    return null;
  }
}

/**
 * Get cached RunPod markdown results from GCS
 */
export async function getPdfResultFromCache(
  pdfContent: string
): Promise<{ markdown: string; html: string } | null> {
  try {
    if (!process.env.GCS_BUCKET_NAME) {
      return null;
    }

    const cacheKey = createPdfCacheKey(pdfContent);
    const storage = new Storage({ credentials });
    const bucket = storage.bucket(process.env.GCS_BUCKET_NAME);
    const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`);

    const [exists] = await blob.exists();
    if (!exists) {
      logger.debug(`PDF RunPod result not found in GCS cache`, {
        cacheKey,
      });
      return null;
    }

    const [content] = await blob.download();
    const result = JSON.parse(content.toString());

    logger.info(`Retrieved PDF RunPod result from GCS cache`, {
      cacheKey,
    });

    return result;
  } catch (error) {
    logger.error(`Error retrieving PDF RunPod result from GCS cache`, {
      error,
    });
    return null;
  }
}

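For orientation, here is a minimal sketch of how a caller might wrap a conversion step with this cache. The convertPdfToMarkdown() parameter is a hypothetical placeholder, not part of this commit; the real integration is the RunPod MU path in pdf.ts further down.

// Sketch only — convertPdfToMarkdown() is illustrative.
import { getPdfResultFromCache, savePdfResultToCache } from "./gcs-pdf-cache";

async function markdownForPdf(
  base64Pdf: string,
  convertPdfToMarkdown: (pdf: string) => Promise<{ markdown: string; html: string }>,
): Promise<{ markdown: string; html: string }> {
  // Cache hit: skip the expensive conversion entirely.
  const cached = await getPdfResultFromCache(base64Pdf);
  if (cached) return cached;

  // Cache miss: convert, then store the result keyed by the content hash.
  const result = await convertPdfToMarkdown(base64Pdf);
  await savePdfResultToCache(base64Pdf, result).catch(() => {
    // Caching is best-effort; a failed write should not fail the scrape.
  });
  return result;
}
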
@@ -7,6 +7,7 @@ import {
  ActionError,
  EngineError,
  SiteError,
  SSLError,
  UnsupportedFileError,
} from "../../error";
import { MockState } from "../../lib/mock";
@@ -169,7 +170,13 @@ export async function fireEngineCheckStatus(
    typeof status.error === "string" &&
    status.error.includes("Chrome error: ")
  ) {
-   throw new SiteError(status.error.split("Chrome error: ")[1]);
+   const code = status.error.split("Chrome error: ")[1];
+
+   if (code.includes("ERR_CERT_") || code.includes("ERR_SSL_") || code.includes("ERR_BAD_SSL_")) {
+     throw new SSLError();
+   } else {
+     throw new SiteError(code);
+   }
  } else if (
    typeof status.error === "string" &&
    status.error.includes("File size exceeds")

@@ -17,6 +17,7 @@ import {
  ActionError,
  EngineError,
  SiteError,
  SSLError,
  TimeoutError,
  UnsupportedFileError,
} from "../../error";
@@ -94,6 +95,7 @@ async function performFireEngineScrape<
  } else if (
    error instanceof EngineError ||
    error instanceof SiteError ||
    error instanceof SSLError ||
    error instanceof ActionError ||
    error instanceof UnsupportedFileError
  ) {

@@ -11,6 +11,7 @@ import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../..
import { readFile, unlink } from "node:fs/promises";
import path from "node:path";
import type { Response } from "undici";
import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";

type PDFProcessorResult = { html: string; markdown?: string };

@@ -26,6 +27,22 @@ async function scrapePDFWithRunPodMU(
    tempFilePath,
  });

  try {
    const cachedResult = await getPdfResultFromCache(base64Content);

    if (cachedResult) {
      meta.logger.info("Using cached RunPod MU result for PDF", {
        tempFilePath,
      });
      return cachedResult;
    }
  } catch (error) {
    meta.logger.warn("Error checking PDF cache, proceeding with RunPod MU", {
      error,
      tempFilePath,
    });
  }

  const result = await robustFetch({
    url:
      "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
@@ -50,10 +67,21 @@ async function scrapePDFWithRunPodMU(
    mock: meta.mock,
  });

- return {
+ const processorResult = {
    markdown: result.output.markdown,
    html: await marked.parse(result.output.markdown, { async: true }),
  };

  try {
    await savePdfResultToCache(base64Content, processorResult);
  } catch (error) {
    meta.logger.warn("Error saving PDF to cache", {
      error,
      tempFilePath,
    });
  }

  return processorResult;
}

async function scrapePDFWithParsePDF(

@@ -2,6 +2,7 @@ import type { Socket } from "net";
import type { TLSSocket } from "tls";
import * as undici from "undici";
import { Address6 } from "ip-address";
import { cacheableLookup } from "../../lib/cacheableLookup";

export class InsecureConnectionError extends Error {
  constructor() {
@@ -46,7 +47,7 @@ export function makeSecureDispatcher(
  const agentOpts: undici.Agent.Options = {
    connect: {
      rejectUnauthorized: false, // bypass SSL failures -- this is fine
-     // lookup: secureLookup,
+     lookup: cacheableLookup.lookup,
    },
    maxRedirections: 5000,
    ...options,

@@ -49,6 +49,12 @@ export class RemoveFeatureError extends Error {
  }
}

export class SSLError extends Error {
  constructor() {
    super("An SSL error occurred while scraping the URL. If you're not inputting any sensitive data, try scraping with `skipTlsVerification: true`.");
  }
}

export class SiteError extends Error {
  public code: string;
  constructor(code: string) {

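The error message above points callers at the skipTlsVerification option. A minimal sketch of how client code might react to this error class; the scrapeUrl() wrapper and its options object are illustrative, not part of this diff:

import { SSLError } from "./error";

async function scrapeWithTlsFallback(
  url: string,
  scrapeUrl: (url: string, opts: { skipTlsVerification?: boolean }) => Promise<unknown>,
) {
  try {
    return await scrapeUrl(url, {});
  } catch (error) {
    if (error instanceof SSLError) {
      // Certificate problem: retry once without TLS verification,
      // which is what the error message suggests for non-sensitive pages.
      return await scrapeUrl(url, { skipTlsVerification: true });
    }
    throw error;
  }
}
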
@@ -21,6 +21,7 @@ import {
  SiteError,
  TimeoutError,
  UnsupportedFileError,
  SSLError,
} from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
@@ -323,6 +324,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
    throw error;
  } else if (error instanceof SiteError) {
    throw error;
  } else if (error instanceof SSLError) {
    throw error;
  } else if (error instanceof ActionError) {
    throw error;
  } else if (error instanceof UnsupportedFileError) {
@@ -470,6 +473,8 @@ export async function scrapeURL(
    // TODO: results?
  } else if (error instanceof SiteError) {
    meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
  } else if (error instanceof SSLError) {
    meta.logger.warn("scrapeURL: SSL error", { error });
  } else if (error instanceof ActionError) {
    meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
  } else if (error instanceof UnsupportedFileError) {

4  apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts  Normal file
@@ -0,0 +1,4 @@
import CacheableLookup from 'cacheable-lookup';
import dns from 'dns';

export const cacheableLookup = (process.env.SENTRY_ENVIRONMENT === "dev" ? { lookup: dns.lookup, install: () => {} } : new CacheableLookup({}));

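In dev (SENTRY_ENVIRONMENT === "dev") the export is a stub whose install() is a no-op and whose lookup falls back to dns.lookup; elsewhere it is a real CacheableLookup instance. The rest of this commit uses it in two ways, condensed here as a sketch (import path illustrative):

import http from "node:http";
import https from "node:https";
import { Agent } from "undici";
import { cacheableLookup } from "./cacheableLookup";

// Plain http/https requests reuse cached DNS answers once installed.
cacheableLookup.install(http.globalAgent);
cacheableLookup.install(https.globalAgent);

// undici-based fetches get the same resolver via the connect options.
const dispatcher = new Agent({ connect: { lookup: cacheableLookup.lookup } });
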
@@ -5,6 +5,7 @@ import { MockState, saveMock } from "./mock";
import { TimeoutSignal } from "../../../controllers/v1/types";
import { fireEngineURL } from "../engines/fire-engine/scrape";
import { fetch, RequestInit, Response, FormData, Agent } from "undici";
import { cacheableLookup } from "./cacheableLookup";

export type RobustFetchParams<Schema extends z.Schema<any>> = {
  url: string;
@@ -82,6 +83,9 @@ export async function robustFetch<
    dispatcher: new Agent({
      headersTimeout: 0,
      bodyTimeout: 0,
      connect: {
        lookup: cacheableLookup.lookup,
      },
    }),
    ...(body instanceof FormData
      ? {

@@ -81,6 +81,10 @@ import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt
import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
import { CostTracking } from "../lib/extract/extraction-service";
import { getACUCTeam } from "../controllers/auth";
import Express from "express";
import http from "http";
import https from "https";
import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup";

configDotenv();

@@ -108,6 +112,10 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;

const runningJobs: Set<string> = new Set();

// Install cacheable lookup for all other requests
cacheableLookup.install(http.globalAgent);
cacheableLookup.install(https.globalAgent);

async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
  const logger = _logger.child({
    module: "queue-worker",
@@ -139,19 +147,23 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
        "crawl:" + job.data.crawl_id + ":visited_unique",
      ),
    );


    logger.info("Visited URLs", {
      visitedUrls: visitedUrls.size,
    });

-   const lastUrls: string[] = (
-     (
-       await supabase_service.rpc("diff_get_last_crawl_urls", {
-         i_team_id: job.data.team_id,
-         i_url: sc.originUrl!,
-       })
-     ).data ?? []
-   ).map((x) => x.url);
+   let lastUrls: string[] = [];
+   const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true";
+   if (useDbAuthentication) {
+     lastUrls = (
+       (
+         await supabase_service.rpc("diff_get_last_crawl_urls", {
+           i_team_id: job.data.team_id,
+           i_url: sc.originUrl!,
+         })
+       ).data ?? []
+     ).map((x) => x.url);
+   }

    const lastUrlsSet = new Set(lastUrls);

@@ -249,7 +261,8 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
    if (
      visitedUrls.length > 0 &&
      job.data.crawlerOptions !== null &&
-     originUrl
+     originUrl &&
+     process.env.USE_DB_AUTHENTICATION === "true"
    ) {
      // Queue the indexing job instead of doing it directly
      await getIndexQueue().add(
@@ -688,6 +701,7 @@ const processGenerateLlmsTxtJobInternal = async (
};

let isShuttingDown = false;
let isWorkerStalled = false;

process.on("SIGINT", () => {
  console.log("Received SIGTERM. Shutting down gracefully...");
@@ -731,7 +745,9 @@ const workerFun = async (
      logger.info("Can't accept connection due to RAM/CPU load");
      cantAcceptConnectionCount++;

-     if (cantAcceptConnectionCount >= 25) {
+     isWorkerStalled = cantAcceptConnectionCount >= 25;
+
+     if (isWorkerStalled) {
        logger.error("WORKER STALLED", {
          cpuUsage: await monitor.checkCpuUsage(),
          memoryUsage: await monitor.checkMemoryUsage(),
@@ -1526,6 +1542,20 @@ async function processJob(job: Job & { id: string }, token: string) {
// wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed"));

// Start all workers
const app = Express();

app.get("/liveness", (req, res) => {
  if (isWorkerStalled) {
    res.status(500).json({ ok: false });
  } else {
    res.status(200).json({ ok: true });
  }
});

app.listen(3005, () => {
  _logger.info("Liveness endpoint is running on port 3005");
});

(async () => {
  await Promise.all([
    workerFun(getScrapeQueue(), processJobInternal),
@@ -1542,4 +1572,4 @@ async function processJob(job: Job & { id: string }, token: string) {

  console.log("All jobs finished. Worker out!");
  process.exit(0);
- })();
+ })();

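The new /liveness route gives orchestrators a cheap health probe for the worker process. A minimal sketch of a probe against it, using port 3005 as in the code above (the probe itself is illustrative, not part of this commit):

// Returns true while the worker has not flagged itself as stalled.
async function workerIsLive(baseUrl = "http://localhost:3005"): Promise<boolean> {
  try {
    const res = await fetch(`${baseUrl}/liveness`);
    return res.ok; // 200 when healthy, 500 once isWorkerStalled is set
  } catch {
    return false; // process down or port unreachable
  }
}
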