Nick: async background index

This commit is contained in:
Nicolas 2024-12-30 21:42:01 -03:00
parent 7a31306be5
commit e6da214aeb
4 changed files with 79 additions and 0 deletions

View File

@ -7,6 +7,8 @@ import { generateOpenAICompletions } from "../../scraper/scrapeURL/transformers/
import { buildDocument } from "./build-document"; import { buildDocument } from "./build-document";
import { billTeam } from "../../services/billing/credit_billing"; import { billTeam } from "../../services/billing/credit_billing";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { _addScrapeJobToBullMQ } from "../../services/queue-jobs";
import { saveCrawl, StoredCrawl } from "../crawl-redis";
interface ExtractServiceOptions { interface ExtractServiceOptions {
request: ExtractRequest; request: ExtractRequest;
@ -24,6 +26,18 @@ interface ExtractResult {
error?: string; error?: string;
} }
/**
 * Reduces a URL to `<protocol>//<hostname>`.
 *
 * A trailing `/*` wildcard suffix is removed before parsing. Because only the
 * hostname is kept, any port, path, query string and fragment are dropped.
 *
 * @param url - URL to reduce; may end with a `/*` wildcard suffix.
 * @returns The protocol and hostname of `url`, or — when the text cannot be
 *   parsed as a URL — the input itself with the `/*` suffix removed.
 */
function getRootDomain(url: string): string {
  let candidate = url;
  try {
    candidate = url.endsWith("/*") ? url.slice(0, -2) : url;
    const { protocol, hostname } = new URL(candidate);
    return protocol + "//" + hostname;
  } catch {
    // Unparseable input: hand back whatever we had at the point of failure.
    return candidate;
  }
}
export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> { export async function performExtraction(options: ExtractServiceOptions): Promise<ExtractResult> {
const { request, teamId, plan, subId } = options; const { request, teamId, plan, subId } = options;
const scrapeId = crypto.randomUUID(); const scrapeId = crypto.randomUUID();
@ -112,6 +126,62 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
}); });
} }
// Kick off a background crawl for each distinct root domain of the requested
// URLs so those sites get indexed. The crawls are deliberately not awaited
// (fire-and-forget), but every task catches its own failure: a rejected
// saveCrawl or _addScrapeJobToBullMQ call must not surface as an unhandled
// promise rejection.
const rootDomains = new Set(request.urls.map(getRootDomain));
for (const url of rootDomains) {
  void (async () => {
    try {
      const crawlId = crypto.randomUUID();

      // Create and save the crawl configuration first
      const sc: StoredCrawl = {
        originUrl: url,
        crawlerOptions: {
          maxDepth: 15,
          limit: 5000,
          includePaths: [],
          excludePaths: [],
          ignoreSitemap: false,
          includeSubdomains: true,
          allowExternalLinks: false,
          allowBackwardLinks: true
        },
        scrapeOptions: {
          formats: ["markdown"],
          onlyMainContent: true,
          waitFor: 0,
          mobile: false,
          removeBase64Images: true,
          fastMode: false,
          parsePDF: true,
          skipTlsVerification: false,
        },
        internalOptions: {
          disableSmartWaitCache: true,
          isBackgroundIndex: true
        },
        team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
        createdAt: Date.now(),
        plan: "hobby", // make it a low concurrency
      };

      // Save the crawl configuration
      await saveCrawl(crawlId, sc);

      // Then kick off the job, reusing the exact options that were just stored
      await _addScrapeJobToBullMQ({
        url,
        mode: "kickoff" as const,
        team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
        plan: "hobby", // make it a low concurrency
        crawlerOptions: sc.crawlerOptions,
        scrapeOptions: sc.scrapeOptions,
        internalOptions: sc.internalOptions,
        origin: "index",
        crawl_id: crawlId,
        webhook: null,
        v1: true,
      }, {}, crypto.randomUUID(), 50);
    } catch (error) {
      // Background indexing is best-effort: report the failure and carry on.
      // NOTE(review): route this through the module's logger if one is in scope here.
      console.error(`Failed to kick off background index crawl for ${url}`, error);
    }
  })();
}
// Bill team for usage // Bill team for usage
billTeam(teamId, subId, links.length * 5).catch((error) => { billTeam(teamId, subId, links.length * 5).catch((error) => {
logger.error( logger.error(

View File

@ -16,6 +16,7 @@ import {
ScrapeUrlResponse, ScrapeUrlResponse,
} from "../scraper/scrapeURL"; } from "../scraper/scrapeURL";
import { Engine } from "../scraper/scrapeURL/engines"; import { Engine } from "../scraper/scrapeURL/engines";
import { indexPage } from "../lib/extract/index/pinecone";
configDotenv(); configDotenv();
export async function startWebScraperPipeline({ export async function startWebScraperPipeline({
@ -173,6 +174,12 @@ export async function runWebScraper({
creditsToBeBilled = 5; creditsToBeBilled = 5;
} }
// The background index team is not billed: return the response before the billing call below.
if(team_id === process.env.BACKGROUND_INDEX_TEAM_ID!) {
return response;
}
billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => { billTeam(team_id, undefined, creditsToBeBilled, logger).catch((error) => {
logger.error( logger.error(
`Failed to bill team ${team_id} for ${creditsToBeBilled} credits`, `Failed to bill team ${team_id} for ${creditsToBeBilled} credits`,

View File

@ -153,6 +153,7 @@ export type InternalOptions = {
v0DisableJsDom?: boolean; v0DisableJsDom?: boolean;
disableSmartWaitCache?: boolean; // Passed along to fire-engine disableSmartWaitCache?: boolean; // Passed along to fire-engine
isBackgroundIndex?: boolean;
}; };
export type EngineResultsTracker = { export type EngineResultsTracker = {

View File

@ -166,6 +166,7 @@ const testSuiteTokens = [
"4c2638d", "4c2638d",
"cbb3462", // don't remove (s-ai) "cbb3462", // don't remove (s-ai)
"824abcd", // don't remove (s-ai) "824abcd", // don't remove (s-ai)
"0966288",
]; ];
const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"];