From 3db2294b97129dafa4965623efffde10a2c0fe85 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 14 May 2025 23:34:59 +0200 Subject: [PATCH 01/10] feat(scrapeURL): better error for SSL failures (#1552) --- .../scraper/scrapeURL/engines/fire-engine/checkStatus.ts | 9 ++++++++- apps/api/src/scraper/scrapeURL/error.ts | 6 ++++++ apps/api/src/scraper/scrapeURL/index.ts | 5 +++++ 3 files changed, 19 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index fadce779..4e64f74e 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -7,6 +7,7 @@ import { ActionError, EngineError, SiteError, + SSLError, UnsupportedFileError, } from "../../error"; import { MockState } from "../../lib/mock"; @@ -169,7 +170,13 @@ export async function fireEngineCheckStatus( typeof status.error === "string" && status.error.includes("Chrome error: ") ) { - throw new SiteError(status.error.split("Chrome error: ")[1]); + const code = status.error.split("Chrome error: ")[1]; + + if (code.includes("ERR_CERT_") || code.includes("ERR_SSL_") || code.includes("ERR_BAD_SSL_")) { + throw new SSLError(); + } else { + throw new SiteError(code); + } } else if ( typeof status.error === "string" && status.error.includes("File size exceeds") diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index ff445f8d..29b8970e 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -49,6 +49,12 @@ export class RemoveFeatureError extends Error { } } +export class SSLError extends Error { + constructor() { + super("An SSL error occurred while scraping the URL. If you're not inputting any sensitive data, try scraping with `skipTlsVerification: true`."); + } +} + export class SiteError extends Error { public code: string; constructor(code: string) { diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 7dfc821f..baf29752 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -21,6 +21,7 @@ import { SiteError, TimeoutError, UnsupportedFileError, + SSLError, } from "./error"; import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; @@ -323,6 +324,8 @@ async function scrapeURLLoop(meta: Meta): Promise { throw error; } else if (error instanceof SiteError) { throw error; + } else if (error instanceof SSLError) { + throw error; } else if (error instanceof ActionError) { throw error; } else if (error instanceof UnsupportedFileError) { @@ -470,6 +473,8 @@ export async function scrapeURL( // TODO: results? } else if (error instanceof SiteError) { meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); + } else if (error instanceof SSLError) { + meta.logger.warn("scrapeURL: SSL error", { error }); } else if (error instanceof ActionError) { meta.logger.warn("scrapeURL: Action(s) failed to complete", { error }); } else if (error instanceof UnsupportedFileError) { From cee481a3a9ce917285214f23a95e4b746e935282 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Wed, 14 May 2025 23:50:57 +0200 Subject: [PATCH 02/10] fix(fire-engine): sslerror passthrough --- apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 1c2c7bd3..37d3c870 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, + SSLError, TimeoutError, UnsupportedFileError, } from "../../error"; @@ -94,6 +95,7 @@ async function performFireEngineScrape< } else if ( error instanceof EngineError || error instanceof SiteError || + error instanceof SSLError || error instanceof ActionError || error instanceof UnsupportedFileError ) { From b0c203e512baf51c019330c80aea98150a1a1d9c Mon Sep 17 00:00:00 2001 From: Will Date: Wed, 14 May 2025 18:04:04 -0400 Subject: [PATCH 03/10] Fix/optional chaining operators missing (#1549) * fix: missing optional chaining operator in req.acuc.flags * fix: missing optional chaining operator in req.acuc.flags --- apps/api/src/controllers/v1/crawl.ts | 2 +- apps/api/src/controllers/v1/map.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts index 3948566f..fe9d6afd 100644 --- a/apps/api/src/controllers/v1/crawl.ts +++ b/apps/api/src/controllers/v1/crawl.ts @@ -89,7 +89,7 @@ export async function crawlController( createdAt: Date.now(), }; - const crawler = crawlToCrawler(id, sc, req.acuc.flags ?? null); + const crawler = crawlToCrawler(id, sc, req.acuc?.flags ?? null); try { sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification); diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index 33e23fa1..67dd3f7c 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -325,7 +325,7 @@ export async function mapController( abort: abort.signal, mock: req.body.useMock, filterByPath: req.body.filterByPath !== false, - flags: req.acuc.flags ?? null, + flags: req.acuc?.flags ?? null, }), ...(req.body.timeout !== undefined ? [ new Promise((resolve, reject) => setTimeout(() => { From b5b612c35b70c42b5f05e23fed12e71f46412ca0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 15 May 2025 16:32:59 +0200 Subject: [PATCH 04/10] feat(api/extract/fire-0): error logging (#1556) --- .../extract/fire-0/extraction-service-f0.ts | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts b/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts index fcf133e5..72cc8e2e 100644 --- a/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts +++ b/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts @@ -106,6 +106,22 @@ import { getACUCTeam } from "../../../controllers/auth"; logger.error("No search results found", { query: request.prompt, }); + logJob({ + job_id: extractId, + success: false, + message: "No search results found", + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + }); return { success: false, error: "No search results found", @@ -191,6 +207,22 @@ import { getACUCTeam } from "../../../controllers/auth"; logger.error("0 links! Bailing.", { linkCount: links.length, }); + logJob({ + job_id: extractId, + success: false, + message: "No valid URLs found to scrape", + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + }); return { success: false, error: @@ -524,6 +556,22 @@ import { getACUCTeam } from "../../../controllers/auth"; } catch (error) { logger.error(`Failed to transform array to object`, { error }); + logJob({ + job_id: extractId, + success: false, + message: "Failed to transform array to object", + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + }); return { success: false, error: @@ -602,6 +650,23 @@ import { getACUCTeam } from "../../../controllers/auth"; logger.debug("Scrapes finished.", { docCount: validResults.length }); } catch (error) { + logger.error("Failed to scrape documents", { error }); + logJob({ + job_id: extractId, + success: false, + message: "Failed to scrape documents", + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + }); return { success: false, error: error.message, @@ -614,6 +679,22 @@ import { getACUCTeam } from "../../../controllers/auth"; if (docsMap.size == 0) { // All urls are invalid logger.error("All provided URLs are invalid!"); + logJob({ + job_id: extractId, + success: false, + message: "All provided URLs are invalid", + num_docs: 1, + docs: [], + time_taken: (new Date().getTime() - Date.now()) / 1000, + team_id: teamId, + mode: "extract", + url: request.urls?.join(", ") || "", + scrapeOptions: request, + origin: request.origin ?? "api", + num_tokens: 0, + tokens_billed: 0, + sources, + }); return { success: false, error: From f936befcdb9f9410c71c33a3ac0bd70235f5e8eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 16 May 2025 14:15:48 +0200 Subject: [PATCH 05/10] feat(queue-worker): liveness check endpoint --- apps/api/src/services/queue-worker.ts | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 2a1e47f7..04181c9e 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -81,6 +81,7 @@ import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0"; import { CostTracking } from "../lib/extract/extraction-service"; import { getACUCTeam } from "../controllers/auth"; +import Express from "express"; configDotenv(); @@ -688,6 +689,7 @@ const processGenerateLlmsTxtJobInternal = async ( }; let isShuttingDown = false; +let isWorkerStalled = false; process.on("SIGINT", () => { console.log("Received SIGTERM. Shutting down gracefully..."); @@ -731,7 +733,9 @@ const workerFun = async ( logger.info("Can't accept connection due to RAM/CPU load"); cantAcceptConnectionCount++; - if (cantAcceptConnectionCount >= 25) { + isWorkerStalled = cantAcceptConnectionCount >= 25; + + if (isWorkerStalled) { logger.error("WORKER STALLED", { cpuUsage: await monitor.checkCpuUsage(), memoryUsage: await monitor.checkMemoryUsage(), @@ -1526,6 +1530,20 @@ async function processJob(job: Job & { id: string }, token: string) { // wsq.on("removed", j => ScrapeEvents.logJobEvent(j, "removed")); // Start all workers +const app = Express(); + +app.get("/liveness", (req, res) => { + if (isWorkerStalled) { + res.status(500).json({ ok: false }); + } else { + res.status(200).json({ ok: true }); + } +}); + +app.listen(3005, () => { + _logger.info("Liveness endpoint is running on port 3005"); +}); + (async () => { await Promise.all([ workerFun(getScrapeQueue(), processJobInternal), @@ -1542,4 +1560,4 @@ async function processJob(job: Job & { id: string }, token: string) { console.log("All jobs finished. Worker out!"); process.exit(0); -})(); +})(); \ No newline at end of file From b8703b2a720765b92f5c4cab94cc90ea624198a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 16 May 2025 15:27:24 +0200 Subject: [PATCH 06/10] feat: use cacheable lookup everywhere (#1559) * feat(scrapeURL): use cacheableLookup * feat(queue-worker): add cacheablelookup * fix(cacheable-lookup): make it work with tailscale on local * add devenv * try again * allow querying all * log * fixes * asd * fix: * fix(lookup): --- .github/workflows/test-server.yml | 2 ++ apps/api/src/index.ts | 8 +++----- apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts | 3 ++- apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts | 4 ++++ apps/api/src/scraper/scrapeURL/lib/fetch.ts | 4 ++++ apps/api/src/services/queue-worker.ts | 7 +++++++ 6 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml index febad61a..cdde350b 100644 --- a/.github/workflows/test-server.yml +++ b/.github/workflows/test-server.yml @@ -35,6 +35,7 @@ env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} VERTEX_CREDENTIALS: ${{ secrets.VERTEX_CREDENTIALS }} USE_GO_MARKDOWN_PARSER: true + SENTRY_ENVIRONMENT: dev jobs: test: @@ -53,6 +54,7 @@ jobs: oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} tags: tag:ci + use-cache: 'true' - name: Install pnpm uses: pnpm/action-setup@v4 with: diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index efca0479..933a980f 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -18,7 +18,6 @@ import { logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import http from "node:http"; import https from "node:https"; -import CacheableLookup from "cacheable-lookup"; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; @@ -26,6 +25,7 @@ import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; import { RateLimiterMode } from "./types"; import { attachWsProxy } from "./services/agentLivecastWS"; +import { cacheableLookup } from "./scraper/scrapeURL/lib/cacheableLookup"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -34,11 +34,9 @@ const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; logger.info(`Number of CPUs: ${numCPUs} available`); -const cacheable = new CacheableLookup(); - // Install cacheable lookup for all other requests -cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent); +cacheableLookup.install(http.globalAgent); +cacheableLookup.install(https.globalAgent); // Initialize Express with WebSocket support const expressApp = express(); diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts index 351a7742..1a959224 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts @@ -2,6 +2,7 @@ import type { Socket } from "net"; import type { TLSSocket } from "tls"; import * as undici from "undici"; import { Address6 } from "ip-address"; +import { cacheableLookup } from "../../lib/cacheableLookup"; export class InsecureConnectionError extends Error { constructor() { @@ -46,7 +47,7 @@ export function makeSecureDispatcher( const agentOpts: undici.Agent.Options = { connect: { rejectUnauthorized: false, // bypass SSL failures -- this is fine - // lookup: secureLookup, + lookup: cacheableLookup.lookup, }, maxRedirections: 5000, ...options, diff --git a/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts b/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts new file mode 100644 index 00000000..ef4f3fef --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts @@ -0,0 +1,4 @@ +import CacheableLookup from 'cacheable-lookup'; +import dns from 'dns'; + +export const cacheableLookup = (process.env.SENTRY_ENVIRONMENT === "dev" ? { lookup: dns.lookup, install: () => {} } : new CacheableLookup({ lookup: false })); diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index c64e7953..24822f41 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -5,6 +5,7 @@ import { MockState, saveMock } from "./mock"; import { TimeoutSignal } from "../../../controllers/v1/types"; import { fireEngineURL } from "../engines/fire-engine/scrape"; import { fetch, RequestInit, Response, FormData, Agent } from "undici"; +import { cacheableLookup } from "./cacheableLookup"; export type RobustFetchParams> = { url: string; @@ -82,6 +83,9 @@ export async function robustFetch< dispatcher: new Agent({ headersTimeout: 0, bodyTimeout: 0, + connect: { + lookup: cacheableLookup.lookup, + }, }), ...(body instanceof FormData ? { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 04181c9e..553b944f 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -82,6 +82,9 @@ import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f import { CostTracking } from "../lib/extract/extraction-service"; import { getACUCTeam } from "../controllers/auth"; import Express from "express"; +import http from "http"; +import https from "https"; +import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup"; configDotenv(); @@ -109,6 +112,10 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; const runningJobs: Set = new Set(); +// Install cacheable lookup for all other requests +cacheableLookup.install(http.globalAgent); +cacheableLookup.install(https.globalAgent); + async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { const logger = _logger.child({ module: "queue-worker", From d46ba959247a5f6a69f8fde80b64170900a574b3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 16 May 2025 15:31:06 +0200 Subject: [PATCH 07/10] Revert "feat: use cacheable lookup everywhere (#1559)" This reverts commit b8703b2a720765b92f5c4cab94cc90ea624198a8. --- .github/workflows/test-server.yml | 2 -- apps/api/src/index.ts | 8 +++++--- apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts | 3 +-- apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts | 4 ---- apps/api/src/scraper/scrapeURL/lib/fetch.ts | 4 ---- apps/api/src/services/queue-worker.ts | 7 ------- 6 files changed, 6 insertions(+), 22 deletions(-) delete mode 100644 apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml index cdde350b..febad61a 100644 --- a/.github/workflows/test-server.yml +++ b/.github/workflows/test-server.yml @@ -35,7 +35,6 @@ env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} VERTEX_CREDENTIALS: ${{ secrets.VERTEX_CREDENTIALS }} USE_GO_MARKDOWN_PARSER: true - SENTRY_ENVIRONMENT: dev jobs: test: @@ -54,7 +53,6 @@ jobs: oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} tags: tag:ci - use-cache: 'true' - name: Install pnpm uses: pnpm/action-setup@v4 with: diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index 933a980f..efca0479 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -18,6 +18,7 @@ import { logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import http from "node:http"; import https from "node:https"; +import CacheableLookup from "cacheable-lookup"; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; @@ -25,7 +26,6 @@ import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; import { RateLimiterMode } from "./types"; import { attachWsProxy } from "./services/agentLivecastWS"; -import { cacheableLookup } from "./scraper/scrapeURL/lib/cacheableLookup"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -34,9 +34,11 @@ const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; logger.info(`Number of CPUs: ${numCPUs} available`); +const cacheable = new CacheableLookup(); + // Install cacheable lookup for all other requests -cacheableLookup.install(http.globalAgent); -cacheableLookup.install(https.globalAgent); +cacheable.install(http.globalAgent); +cacheable.install(https.globalAgent); // Initialize Express with WebSocket support const expressApp = express(); diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts index 1a959224..351a7742 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts @@ -2,7 +2,6 @@ import type { Socket } from "net"; import type { TLSSocket } from "tls"; import * as undici from "undici"; import { Address6 } from "ip-address"; -import { cacheableLookup } from "../../lib/cacheableLookup"; export class InsecureConnectionError extends Error { constructor() { @@ -47,7 +46,7 @@ export function makeSecureDispatcher( const agentOpts: undici.Agent.Options = { connect: { rejectUnauthorized: false, // bypass SSL failures -- this is fine - lookup: cacheableLookup.lookup, + // lookup: secureLookup, }, maxRedirections: 5000, ...options, diff --git a/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts b/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts deleted file mode 100644 index ef4f3fef..00000000 --- a/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts +++ /dev/null @@ -1,4 +0,0 @@ -import CacheableLookup from 'cacheable-lookup'; -import dns from 'dns'; - -export const cacheableLookup = (process.env.SENTRY_ENVIRONMENT === "dev" ? { lookup: dns.lookup, install: () => {} } : new CacheableLookup({ lookup: false })); diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index 24822f41..c64e7953 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -5,7 +5,6 @@ import { MockState, saveMock } from "./mock"; import { TimeoutSignal } from "../../../controllers/v1/types"; import { fireEngineURL } from "../engines/fire-engine/scrape"; import { fetch, RequestInit, Response, FormData, Agent } from "undici"; -import { cacheableLookup } from "./cacheableLookup"; export type RobustFetchParams> = { url: string; @@ -83,9 +82,6 @@ export async function robustFetch< dispatcher: new Agent({ headersTimeout: 0, bodyTimeout: 0, - connect: { - lookup: cacheableLookup.lookup, - }, }), ...(body instanceof FormData ? { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 553b944f..04181c9e 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -82,9 +82,6 @@ import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f import { CostTracking } from "../lib/extract/extraction-service"; import { getACUCTeam } from "../controllers/auth"; import Express from "express"; -import http from "http"; -import https from "https"; -import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup"; configDotenv(); @@ -112,10 +109,6 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; const runningJobs: Set = new Set(); -// Install cacheable lookup for all other requests -cacheableLookup.install(http.globalAgent); -cacheableLookup.install(https.globalAgent); - async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { const logger = _logger.child({ module: "queue-worker", From bd9673e104caeb6082dfccd0d69ef6cfecae0e09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 16 May 2025 15:44:52 +0200 Subject: [PATCH 08/10] Mog/cachable lookup (#1560) * feat(scrapeURL): use cacheableLookup * feat(queue-worker): add cacheablelookup * fix(cacheable-lookup): make it work with tailscale on local * add devenv * try again * allow querying all * log * fixes * asd * fix: * fix(lookup): * lookup --- .github/workflows/test-server.yml | 2 ++ apps/api/src/index.ts | 8 +++----- apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts | 3 ++- apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts | 4 ++++ apps/api/src/scraper/scrapeURL/lib/fetch.ts | 4 ++++ apps/api/src/services/queue-worker.ts | 7 +++++++ 6 files changed, 22 insertions(+), 6 deletions(-) create mode 100644 apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml index febad61a..cdde350b 100644 --- a/.github/workflows/test-server.yml +++ b/.github/workflows/test-server.yml @@ -35,6 +35,7 @@ env: ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} VERTEX_CREDENTIALS: ${{ secrets.VERTEX_CREDENTIALS }} USE_GO_MARKDOWN_PARSER: true + SENTRY_ENVIRONMENT: dev jobs: test: @@ -53,6 +54,7 @@ jobs: oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }} oauth-secret: ${{ secrets.TS_OAUTH_SECRET }} tags: tag:ci + use-cache: 'true' - name: Install pnpm uses: pnpm/action-setup@v4 with: diff --git a/apps/api/src/index.ts b/apps/api/src/index.ts index efca0479..933a980f 100644 --- a/apps/api/src/index.ts +++ b/apps/api/src/index.ts @@ -18,7 +18,6 @@ import { logger } from "./lib/logger"; import { adminRouter } from "./routes/admin"; import http from "node:http"; import https from "node:https"; -import CacheableLookup from "cacheable-lookup"; import { v1Router } from "./routes/v1"; import expressWs from "express-ws"; import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types"; @@ -26,6 +25,7 @@ import { ZodError } from "zod"; import { v4 as uuidv4 } from "uuid"; import { RateLimiterMode } from "./types"; import { attachWsProxy } from "./services/agentLivecastWS"; +import { cacheableLookup } from "./scraper/scrapeURL/lib/cacheableLookup"; const { createBullBoard } = require("@bull-board/api"); const { BullAdapter } = require("@bull-board/api/bullAdapter"); @@ -34,11 +34,9 @@ const { ExpressAdapter } = require("@bull-board/express"); const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length; logger.info(`Number of CPUs: ${numCPUs} available`); -const cacheable = new CacheableLookup(); - // Install cacheable lookup for all other requests -cacheable.install(http.globalAgent); -cacheable.install(https.globalAgent); +cacheableLookup.install(http.globalAgent); +cacheableLookup.install(https.globalAgent); // Initialize Express with WebSocket support const expressApp = express(); diff --git a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts index 351a7742..1a959224 100644 --- a/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts +++ b/apps/api/src/scraper/scrapeURL/engines/utils/safeFetch.ts @@ -2,6 +2,7 @@ import type { Socket } from "net"; import type { TLSSocket } from "tls"; import * as undici from "undici"; import { Address6 } from "ip-address"; +import { cacheableLookup } from "../../lib/cacheableLookup"; export class InsecureConnectionError extends Error { constructor() { @@ -46,7 +47,7 @@ export function makeSecureDispatcher( const agentOpts: undici.Agent.Options = { connect: { rejectUnauthorized: false, // bypass SSL failures -- this is fine - // lookup: secureLookup, + lookup: cacheableLookup.lookup, }, maxRedirections: 5000, ...options, diff --git a/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts b/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts new file mode 100644 index 00000000..b561967c --- /dev/null +++ b/apps/api/src/scraper/scrapeURL/lib/cacheableLookup.ts @@ -0,0 +1,4 @@ +import CacheableLookup from 'cacheable-lookup'; +import dns from 'dns'; + +export const cacheableLookup = (process.env.SENTRY_ENVIRONMENT === "dev" ? { lookup: dns.lookup, install: () => {} } : new CacheableLookup({})); diff --git a/apps/api/src/scraper/scrapeURL/lib/fetch.ts b/apps/api/src/scraper/scrapeURL/lib/fetch.ts index c64e7953..24822f41 100644 --- a/apps/api/src/scraper/scrapeURL/lib/fetch.ts +++ b/apps/api/src/scraper/scrapeURL/lib/fetch.ts @@ -5,6 +5,7 @@ import { MockState, saveMock } from "./mock"; import { TimeoutSignal } from "../../../controllers/v1/types"; import { fireEngineURL } from "../engines/fire-engine/scrape"; import { fetch, RequestInit, Response, FormData, Agent } from "undici"; +import { cacheableLookup } from "./cacheableLookup"; export type RobustFetchParams> = { url: string; @@ -82,6 +83,9 @@ export async function robustFetch< dispatcher: new Agent({ headersTimeout: 0, bodyTimeout: 0, + connect: { + lookup: cacheableLookup.lookup, + }, }), ...(body instanceof FormData ? { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 04181c9e..553b944f 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -82,6 +82,9 @@ import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f import { CostTracking } from "../lib/extract/extraction-service"; import { getACUCTeam } from "../controllers/auth"; import Express from "express"; +import http from "http"; +import https from "https"; +import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup"; configDotenv(); @@ -109,6 +112,10 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; const runningJobs: Set = new Set(); +// Install cacheable lookup for all other requests +cacheableLookup.install(http.globalAgent); +cacheableLookup.install(https.globalAgent); + async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { const logger = _logger.child({ module: "queue-worker", From 526165e1b9e3d67f522dab793580921bfa44a73a Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 12:04:38 -0300 Subject: [PATCH 09/10] Add caching for RunPod PDF markdown results in GCS (#1561) * Add caching for RunPod PDF markdown results in GCS Co-Authored-By: thomas@sideguide.dev * Update PDF caching to hash base64 directly and add metadata Co-Authored-By: thomas@sideguide.dev * Fix PDF caching to directly hash content and fix test expectations Co-Authored-By: thomas@sideguide.dev --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: thomas@sideguide.dev --- .../api/src/__tests__/snips/pdf-cache.test.ts | 106 +++++++++++++++++ apps/api/src/lib/gcs-pdf-cache.ts | 112 ++++++++++++++++++ .../scraper/scrapeURL/engines/pdf/index.ts | 30 ++++- 3 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 apps/api/src/__tests__/snips/pdf-cache.test.ts create mode 100644 apps/api/src/lib/gcs-pdf-cache.ts diff --git a/apps/api/src/__tests__/snips/pdf-cache.test.ts b/apps/api/src/__tests__/snips/pdf-cache.test.ts new file mode 100644 index 00000000..e60b5f07 --- /dev/null +++ b/apps/api/src/__tests__/snips/pdf-cache.test.ts @@ -0,0 +1,106 @@ +import { createPdfCacheKey, savePdfResultToCache, getPdfResultFromCache } from '../../lib/gcs-pdf-cache'; + +jest.mock('@google-cloud/storage', () => { + const mockSave = jest.fn().mockResolvedValue(undefined); + const mockExists = jest.fn().mockResolvedValue([true]); + const mockDownload = jest.fn().mockResolvedValue([Buffer.from(JSON.stringify({ + markdown: 'cached markdown', + html: 'cached html' + }))]); + const mockFile = jest.fn().mockImplementation((path) => ({ + save: mockSave, + exists: mockExists, + download: mockDownload + })); + + return { + Storage: jest.fn().mockImplementation(() => ({ + bucket: jest.fn().mockImplementation(() => ({ + file: mockFile + })) + })), + _getMockFile: () => mockFile, + _getMockSave: () => mockSave + }; +}); + +process.env.GCS_BUCKET_NAME = 'test-bucket'; + +describe('PDF Caching', () => { + beforeEach(() => { + jest.clearAllMocks(); + }); + + test('createPdfCacheKey generates consistent keys', () => { + const pdfContent1 = 'test-pdf-content'; + const pdfContent2 = 'test-pdf-content'; + const pdfContent3 = 'different-pdf-content'; + + const key1 = createPdfCacheKey(pdfContent1); + const key2 = createPdfCacheKey(pdfContent2); + const key3 = createPdfCacheKey(pdfContent3); + + expect(key1).toBe(key2); // Same content should generate same key + expect(key1).not.toBe(key3); // Different content should generate different key + + expect(key1).toMatch(/^[a-f0-9]{64}$/); + }); + + test('createPdfCacheKey works directly with base64 content', () => { + const base64Content = 'JVBERi0xLjMKJcTl8uXrp/Og0MTGCjQgMCBvYmoKPDwgL0xlbmd0aCA1IDAgUiAvRmlsdGVyIC9GbGF0ZURlY29kZSA+PgpzdHJlYW0KeAFLy'; + + const key = createPdfCacheKey(base64Content); + + expect(key).toMatch(/^[a-f0-9]{64}$/); + + expect(createPdfCacheKey(base64Content)).toBe(key); + + }); + + test('savePdfResultToCache saves results to GCS', async () => { + const pdfContent = 'test-pdf-content'; + const result = { markdown: 'test markdown', html: 'test html' }; + + const { _getMockFile, _getMockSave } = require('@google-cloud/storage'); + const mockFile = _getMockFile(); + const mockSave = _getMockSave(); + + mockFile.mockClear(); + mockSave.mockClear(); + + const cacheKey = await savePdfResultToCache(pdfContent, result); + + expect(cacheKey).not.toBeNull(); + + expect(mockFile).toHaveBeenCalledWith(expect.stringContaining('pdf-cache/')); + + expect(mockSave).toHaveBeenCalledWith(JSON.stringify(result), { + contentType: 'application/json', + metadata: expect.objectContaining({ + source: 'runpod_pdf_conversion', + cache_type: 'pdf_markdown', + created_at: expect.any(String) + }) + }); + }); + + test('getPdfResultFromCache retrieves results from GCS', async () => { + const pdfContent = 'test-pdf-content'; + + const result = await getPdfResultFromCache(pdfContent); + + expect(result).not.toBeNull(); + expect(result?.markdown).toBe('cached markdown'); + expect(result?.html).toBe('cached html'); + }); + + test('getPdfResultFromCache returns null when cache miss', async () => { + const { Storage } = require('@google-cloud/storage'); + const mockExists = Storage().bucket().file().exists; + mockExists.mockResolvedValueOnce([false]); + + const result = await getPdfResultFromCache('uncached-content'); + + expect(result).toBeNull(); + }); +}); diff --git a/apps/api/src/lib/gcs-pdf-cache.ts b/apps/api/src/lib/gcs-pdf-cache.ts new file mode 100644 index 00000000..90348974 --- /dev/null +++ b/apps/api/src/lib/gcs-pdf-cache.ts @@ -0,0 +1,112 @@ +import { Storage } from "@google-cloud/storage"; +import { logger } from "./logger"; +import crypto from "crypto"; + +const credentials = process.env.GCS_CREDENTIALS ? JSON.parse(atob(process.env.GCS_CREDENTIALS)) : undefined; +const PDF_CACHE_PREFIX = "pdf-cache/"; + +/** + * Creates a SHA-256 hash of the PDF content to use as a cache key + * Directly hashes the content without any conversion + */ +export function createPdfCacheKey(pdfContent: string | Buffer): string { + return crypto + .createHash('sha256') + .update(pdfContent) + .digest('hex'); +} + +/** + * Save RunPod markdown results to GCS cache + */ +export async function savePdfResultToCache( + pdfContent: string, + result: { markdown: string; html: string } +): Promise { + try { + if (!process.env.GCS_BUCKET_NAME) { + return null; + } + + const cacheKey = createPdfCacheKey(pdfContent); + const storage = new Storage({ credentials }); + const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); + const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`); + + for (let i = 0; i < 3; i++) { + try { + await blob.save(JSON.stringify(result), { + contentType: "application/json", + metadata: { + source: "runpod_pdf_conversion", + cache_type: "pdf_markdown", + created_at: new Date().toISOString(), + } + }); + + logger.info(`Saved PDF RunPod result to GCS cache`, { + cacheKey, + }); + + return cacheKey; + } catch (error) { + if (i === 2) { + throw error; + } else { + logger.error(`Error saving PDF RunPod result to GCS cache, retrying`, { + error, + cacheKey, + i, + }); + } + } + } + + return cacheKey; + } catch (error) { + logger.error(`Error saving PDF RunPod result to GCS cache`, { + error, + }); + return null; + } +} + +/** + * Get cached RunPod markdown results from GCS + */ +export async function getPdfResultFromCache( + pdfContent: string +): Promise<{ markdown: string; html: string } | null> { + try { + if (!process.env.GCS_BUCKET_NAME) { + return null; + } + + const cacheKey = createPdfCacheKey(pdfContent); + const storage = new Storage({ credentials }); + const bucket = storage.bucket(process.env.GCS_BUCKET_NAME); + const blob = bucket.file(`${PDF_CACHE_PREFIX}${cacheKey}.json`); + + const [exists] = await blob.exists(); + if (!exists) { + logger.debug(`PDF RunPod result not found in GCS cache`, { + cacheKey, + }); + return null; + } + + const [content] = await blob.download(); + const result = JSON.parse(content.toString()); + + logger.info(`Retrieved PDF RunPod result from GCS cache`, { + cacheKey, + }); + + return result; + } catch (error) { + logger.error(`Error retrieving PDF RunPod result from GCS cache`, { + error, + }); + return null; + } +} diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 727e12c9..29a07a5d 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -11,6 +11,7 @@ import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../.. import { readFile, unlink } from "node:fs/promises"; import path from "node:path"; import type { Response } from "undici"; +import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache"; type PDFProcessorResult = { html: string; markdown?: string }; @@ -26,6 +27,22 @@ async function scrapePDFWithRunPodMU( tempFilePath, }); + try { + const cachedResult = await getPdfResultFromCache(base64Content); + + if (cachedResult) { + meta.logger.info("Using cached RunPod MU result for PDF", { + tempFilePath, + }); + return cachedResult; + } + } catch (error) { + meta.logger.warn("Error checking PDF cache, proceeding with RunPod MU", { + error, + tempFilePath, + }); + } + const result = await robustFetch({ url: "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync", @@ -50,10 +67,21 @@ async function scrapePDFWithRunPodMU( mock: meta.mock, }); - return { + const processorResult = { markdown: result.output.markdown, html: await marked.parse(result.output.markdown, { async: true }), }; + + try { + await savePdfResultToCache(base64Content, processorResult); + } catch (error) { + meta.logger.warn("Error saving PDF to cache", { + error, + tempFilePath, + }); + } + + return processorResult; } async function scrapePDFWithParsePDF( From ab30c8e4acf6283cd3624de4055e3b163a090892 Mon Sep 17 00:00:00 2001 From: "devin-ai-integration[bot]" <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Fri, 16 May 2025 12:56:33 -0300 Subject: [PATCH 10/10] Fix Supabase client configuration errors when USE_DB_AUTHENTICATION is false (#1534) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Fix Supabase client configuration errors when USE_DB_AUTHENTICATION is false Co-Authored-By: hello@sideguide.dev * Add USE_DB_AUTHENTICATION checks to map and search controllers Add test for USE_DB_AUTHENTICATION=false Add USE_DB_AUTHENTICATION checks to billing services Add USE_DB_AUTHENTICATION checks to batch_billing.ts Add USE_DB_AUTHENTICATION checks to cached-docs.ts Add USE_DB_AUTHENTICATION checks to supabase-jobs.ts Add USE_DB_AUTHENTICATION checks to team-id-sync.ts Add USE_DB_AUTHENTICATION checks to test-suite log.ts Add USE_DB_AUTHENTICATION checks to idempotency services Co-Authored-By: hello@sideguide.dev * Revert "Add USE_DB_AUTHENTICATION checks to map and search controllers" This reverts commit 834a5d51a68c74ada67800fa3a0aa45bde22d745. --------- Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: hello@sideguide.dev Co-authored-by: Nicolas Co-authored-by: Gergő Móricz --- apps/api/src/services/queue-worker.ts | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 553b944f..622d5cfb 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -147,19 +147,23 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { "crawl:" + job.data.crawl_id + ":visited_unique", ), ); - + logger.info("Visited URLs", { visitedUrls: visitedUrls.size, }); - const lastUrls: string[] = ( - ( - await supabase_service.rpc("diff_get_last_crawl_urls", { - i_team_id: job.data.team_id, - i_url: sc.originUrl!, - }) - ).data ?? [] - ).map((x) => x.url); + let lastUrls: string[] = []; + const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === "true"; + if (useDbAuthentication) { + lastUrls = ( + ( + await supabase_service.rpc("diff_get_last_crawl_urls", { + i_team_id: job.data.team_id, + i_url: sc.originUrl!, + }) + ).data ?? [] + ).map((x) => x.url); + } const lastUrlsSet = new Set(lastUrls); @@ -257,7 +261,8 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if ( visitedUrls.length > 0 && job.data.crawlerOptions !== null && - originUrl + originUrl && + process.env.USE_DB_AUTHENTICATION === "true" ) { // Queue the indexing job instead of doing it directly await getIndexQueue().add(