diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts index e266c2e9..0ca6a3de 100644 --- a/apps/api/src/lib/extract/extraction-service.ts +++ b/apps/api/src/lib/extract/extraction-service.ts @@ -127,60 +127,60 @@ export async function performExtraction(options: ExtractServiceOptions): Promise } // Kickoff background crawl for indexing root domains - const rootDomains = new Set(request.urls.map(getRootDomain)); - rootDomains.forEach(async url => { - const crawlId = crypto.randomUUID(); + // const rootDomains = new Set(request.urls.map(getRootDomain)); + // rootDomains.forEach(async url => { + // const crawlId = crypto.randomUUID(); - // Create and save crawl configuration first - const sc: StoredCrawl = { - originUrl: url, - crawlerOptions: { - maxDepth: 15, - limit: 5000, - includePaths: [], - excludePaths: [], - ignoreSitemap: false, - includeSubdomains: true, - allowExternalLinks: false, - allowBackwardLinks: true - }, - scrapeOptions: { - formats: ["markdown"], - onlyMainContent: true, - waitFor: 0, - mobile: false, - removeBase64Images: true, - fastMode: false, - parsePDF: true, - skipTlsVerification: false, - }, - internalOptions: { - disableSmartWaitCache: true, - isBackgroundIndex: true - }, - team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, - createdAt: Date.now(), - plan: "hobby", // make it a low concurrency - }; + // // Create and save crawl configuration first + // const sc: StoredCrawl = { + // originUrl: url, + // crawlerOptions: { + // maxDepth: 15, + // limit: 5000, + // includePaths: [], + // excludePaths: [], + // ignoreSitemap: false, + // includeSubdomains: true, + // allowExternalLinks: false, + // allowBackwardLinks: true + // }, + // scrapeOptions: { + // formats: ["markdown"], + // onlyMainContent: true, + // waitFor: 0, + // mobile: false, + // removeBase64Images: true, + // fastMode: false, + // parsePDF: true, + // skipTlsVerification: false, + // }, + // internalOptions: { + // disableSmartWaitCache: true, + // isBackgroundIndex: true + // }, + // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + // createdAt: Date.now(), + // plan: "hobby", // make it a low concurrency + // }; - // Save the crawl configuration - await saveCrawl(crawlId, sc); + // // Save the crawl configuration + // await saveCrawl(crawlId, sc); - // Then kick off the job - await _addScrapeJobToBullMQ({ - url, - mode: "kickoff" as const, - team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, - plan: "hobby", // make it a low concurrency - crawlerOptions: sc.crawlerOptions, - scrapeOptions: sc.scrapeOptions, - internalOptions: sc.internalOptions, - origin: "index", - crawl_id: crawlId, - webhook: null, - v1: true, - }, {}, crypto.randomUUID(), 50); - }); + // // Then kick off the job + // await _addScrapeJobToBullMQ({ + // url, + // mode: "kickoff" as const, + // team_id: process.env.BACKGROUND_INDEX_TEAM_ID!, + // plan: "hobby", // make it a low concurrency + // crawlerOptions: sc.crawlerOptions, + // scrapeOptions: sc.scrapeOptions, + // internalOptions: sc.internalOptions, + // origin: "index", + // crawl_id: crawlId, + // webhook: null, + // v1: true, + // }, {}, crypto.randomUUID(), 50); + // }); // Bill team for usage billTeam(teamId, subId, links.length * 5).catch((error) => {