From e6da214aebcdb01484cb11357d913554fb035072 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 30 Dec 2024 21:42:01 -0300
Subject: [PATCH] Nick: async background index

---
 .../api/src/lib/extract/extraction-service.ts | 70 +++++++++++++++++++
 apps/api/src/main/runWebScraper.ts            |  7 ++
 apps/api/src/scraper/scrapeURL/index.ts       |  1 +
 apps/api/src/services/rate-limiter.ts         |  1 +
 4 files changed, 79 insertions(+)

diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts
index f84a1f34..e266c2e9 100644
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@@ -7,6 +7,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
 import { buildDocument } from "./build-document";
 import { billTeam } from "../../services/billing/credit_billing";
 import { logJob } from "../../services/logging/log_job";
+import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
+import { saveCrawl, StoredCrawl } from "../crawl-redis";
 
 interface ExtractServiceOptions {
   request: ExtractRequest;
@@ -24,6 +26,18 @@ interface ExtractResult {
   error?: string;
 }
 
+function getRootDomain(url: string): string {
+  try {
+    if(url.endsWith("/*")) {
+      url = url.slice(0, -2);
+    }
+    const urlObj = new URL(url);
+    return `${urlObj.protocol}//${urlObj.hostname}`;
+  } catch (e) {
+    return url;
+  }
+}
+
 export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
   const { request, teamId, plan, subId } = options;
   const scrapeId = crypto.randomUUID();
@@ -112,6 +126,62 @@ export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
     });
   }
 
+  // Kickoff background crawl for indexing root domains
+  const rootDomains = new Set(request.urls.map(getRootDomain));
+  rootDomains.forEach(async url => {
+    const crawlId = crypto.randomUUID();
+
+    // Create and save crawl configuration first
+    const sc: StoredCrawl = {
+      originUrl: url,
+      crawlerOptions: {
+        maxDepth: 15,
+        limit: 5000,
+        includePaths: [],
+        excludePaths: [],
+        ignoreSitemap: false,
+        includeSubdomains: true,
+        allowExternalLinks: false,
+        allowBackwardLinks: true
+      },
+      scrapeOptions: {
+        formats: ["markdown"],
+        onlyMainContent: true,
+        waitFor: 0,
+        mobile: false,
+        removeBase64Images: true,
+        fastMode: false,
+        parsePDF: true,
+        skipTlsVerification: false,
+      },
+      internalOptions: {
+        disableSmartWaitCache: true,
+        isBackgroundIndex: true
+      },
+      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+      createdAt: Date.now(),
+      plan: "hobby", // make it a low concurrency
+    };
+
+    // Save the crawl configuration
+    await saveCrawl(crawlId, sc);
+
+    // Then kick off the job
+    await _addScrapeJobToBullMQ({
+      url,
+      mode: "kickoff" as const,
+      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+      plan: "hobby", // make it a low concurrency
+      crawlerOptions: sc.crawlerOptions,
+      scrapeOptions: sc.scrapeOptions,
+      internalOptions: sc.internalOptions,
+      origin: "index",
+      crawl_id: crawlId,
+      webhook: null,
+      v1: true,
+    }, {}, crypto.randomUUID(), 50);
+  });
+
   // Bill team for usage
   billTeam(teamId, subId, links.length * 5).catch((error) => {
     logger.error(
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index 0f3b8524..6bb8b04e 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -16,6 +16,7 @@ import {
   ScrapeUrlResponse,
 } from "../scraper/scrapeURL";
 import { Engine } from "../scraper/scrapeURL/engines";
+import { indexPage } from "../lib/extract/index/pinecone";
 configDotenv();
 
 export async function startWebScraperPipeline({
@@ -173,6 +174,12 @@ export async function runWebScraper({
     creditsToBeBilled = 5;
   }
 
+  // If the team is the background index team, return the response
+  if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
+    return response;
+  }
+
+
 billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
     logger.error(
       `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 130ef9ee..549ce9d1 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -153,6 +153,7 @@ export type InternalOptions = {
   v0DisableJsDom?: boolean;
   disableSmartWaitCache?: boolean; // Passed along to fire-engine
+  isBackgroundIndex?: boolean;
 };
 
 export type EngineResultsTracker = {
diff --git a/apps/api/src/services/rate-limiter.ts b/apps/api/src/services/rate-limiter.ts
index 21025589..304a9fc4 100644
--- a/apps/api/src/services/rate-limiter.ts
+++ b/apps/api/src/services/rate-limiter.ts
@@ -166,6 +166,7 @@ const testSuiteTokens = [
   "4c2638d",
   "cbb3462", // don't remove (s-ai)
   "824abcd", // don't remove (s-ai)
+  "0966288",
 ];
 
 const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];