Mog/cachable lookup (#1560)

* feat(scrapeURL): use cacheableLookup

* feat(queue-worker): add cacheablelookup

* fix(cacheable-lookup): make it work with tailscale on local

* add devenv

* try again

* allow querying all

* log

* fixes

* asd

* fix:

* fix(lookup):

* lookup
This commit is contained in:
Gergő Móricz 2025-05-16 15:44:52 +02:00 committed by GitHub
parent d46ba95924
commit bd9673e104
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 22 additions and 6 deletions

View File

@ -35,6 +35,7 @@ env:
ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
VERTEX_CREDENTIALS: ${{ secrets.VERTEX_CREDENTIALS }}
USE_GO_MARKDOWN_PARSER: true
SENTRY_ENVIRONMENT: dev
jobs:
test:
@ -53,6 +54,7 @@ jobs:
oauth-client-id: ${{ secrets.TS_OAUTH_CLIENT_ID }}
oauth-secret: ${{ secrets.TS_OAUTH_SECRET }}
tags: tag:ci
use-cache: 'true'
- name: Install pnpm
uses: pnpm/action-setup@v4
with:

View File

@ -18,7 +18,6 @@ import { logger } from "./lib/logger";
import { adminRouter } from "./routes/admin";
import http from "node:http";
import https from "node:https";
import CacheableLookup from "cacheable-lookup";
import { v1Router } from "./routes/v1";
import expressWs from "express-ws";
import { ErrorResponse, ResponseWithSentry } from "./controllers/v1/types";
@ -26,6 +25,7 @@ import { ZodError } from "zod";
import { v4 as uuidv4 } from "uuid";
import { RateLimiterMode } from "./types";
import { attachWsProxy } from "./services/agentLivecastWS";
import { cacheableLookup } from "./scraper/scrapeURL/lib/cacheableLookup";
const { createBullBoard } = require("@bull-board/api");
const { BullAdapter } = require("@bull-board/api/bullAdapter");
@ -34,11 +34,9 @@ const { ExpressAdapter } = require("@bull-board/express");
const numCPUs = process.env.ENV === "local" ? 2 : os.cpus().length;
logger.info(`Number of CPUs: ${numCPUs} available`);
const cacheable = new CacheableLookup();
// Install cacheable lookup for all other requests
cacheable.install(http.globalAgent);
cacheable.install(https.globalAgent);
cacheableLookup.install(http.globalAgent);
cacheableLookup.install(https.globalAgent);
// Initialize Express with WebSocket support
const expressApp = express();

View File

@ -2,6 +2,7 @@ import type { Socket } from "net";
import type { TLSSocket } from "tls";
import * as undici from "undici";
import { Address6 } from "ip-address";
import { cacheableLookup } from "../../lib/cacheableLookup";
export class InsecureConnectionError extends Error {
constructor() {
@ -46,7 +47,7 @@ export function makeSecureDispatcher(
const agentOpts: undici.Agent.Options = {
connect: {
rejectUnauthorized: false, // bypass SSL failures -- this is fine
// lookup: secureLookup,
lookup: cacheableLookup.lookup,
},
maxRedirections: 5000,
...options,

View File

@ -0,0 +1,4 @@
import CacheableLookup from 'cacheable-lookup';
import dns from 'dns';
/**
 * Stand-in used when DNS caching is disabled: resolves through Node's
 * built-in `dns.lookup` and treats `install` as a no-op, so call sites can
 * use it through the same `lookup` / `install` members as a real
 * `CacheableLookup` instance.
 */
const passthroughLookup = { lookup: dns.lookup, install: () => {} };

// NOTE(review): keyed off SENTRY_ENVIRONMENT rather than a dedicated flag;
// presumably this exists so resolution keeps working with Tailscale on
// local setups — confirm before changing the condition.
const dnsCachingDisabled = process.env.SENTRY_ENVIRONMENT === "dev";

/**
 * Process-wide DNS lookup helper.
 *
 * Evaluates to the pass-through stand-in when `SENTRY_ENVIRONMENT` is
 * `"dev"`, and to a `CacheableLookup` instance constructed with empty
 * options otherwise. The choice is made once, at module load.
 */
export const cacheableLookup = dnsCachingDisabled
  ? passthroughLookup
  : new CacheableLookup({});

View File

@ -5,6 +5,7 @@ import { MockState, saveMock } from "./mock";
import { TimeoutSignal } from "../../../controllers/v1/types";
import { fireEngineURL } from "../engines/fire-engine/scrape";
import { fetch, RequestInit, Response, FormData, Agent } from "undici";
import { cacheableLookup } from "./cacheableLookup";
export type RobustFetchParams<Schema extends z.Schema<any>> = {
url: string;
@ -82,6 +83,9 @@ export async function robustFetch<
dispatcher: new Agent({
headersTimeout: 0,
bodyTimeout: 0,
connect: {
lookup: cacheableLookup.lookup,
},
}),
...(body instanceof FormData
? {

View File

@ -82,6 +82,9 @@ import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f
import { CostTracking } from "../lib/extract/extraction-service";
import { getACUCTeam } from "../controllers/auth";
import Express from "express";
import http from "http";
import https from "https";
import { cacheableLookup } from "../scraper/scrapeURL/lib/cacheableLookup";
configDotenv();
@ -109,6 +112,10 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
const runningJobs: Set<string> = new Set();
// Install cacheable lookup for all other requests
cacheableLookup.install(http.globalAgent);
cacheableLookup.install(https.globalAgent);
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const logger = _logger.child({
module: "queue-worker",