Update extraction-service.ts

Nicolas 2024-12-31 15:22:50 -03:00
parent bd81b41d5f
commit 33632d2fe3

@@ -127,60 +127,60 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
   }

   // Kickoff background crawl for indexing root domains
-  const rootDomains = new Set(request.urls.map(getRootDomain));
-  rootDomains.forEach(async url => {
-    const crawlId = crypto.randomUUID();
-    // Create and save crawl configuration first
-    const sc: StoredCrawl = {
-      originUrl: url,
-      crawlerOptions: {
-        maxDepth: 15,
-        limit: 5000,
-        includePaths: [],
-        excludePaths: [],
-        ignoreSitemap: false,
-        includeSubdomains: true,
-        allowExternalLinks: false,
-        allowBackwardLinks: true
-      },
-      scrapeOptions: {
-        formats: ["markdown"],
-        onlyMainContent: true,
-        waitFor: 0,
-        mobile: false,
-        removeBase64Images: true,
-        fastMode: false,
-        parsePDF: true,
-        skipTlsVerification: false,
-      },
-      internalOptions: {
-        disableSmartWaitCache: true,
-        isBackgroundIndex: true
-      },
-      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
-      createdAt: Date.now(),
-      plan: "hobby", // make it a low concurrency
-    };
-    // Save the crawl configuration
-    await saveCrawl(crawlId, sc);
-    // Then kick off the job
-    await _addScrapeJobToBullMQ({
-      url,
-      mode: "kickoff" as const,
-      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
-      plan: "hobby", // make it a low concurrency
-      crawlerOptions: sc.crawlerOptions,
-      scrapeOptions: sc.scrapeOptions,
-      internalOptions: sc.internalOptions,
-      origin: "index",
-      crawl_id: crawlId,
-      webhook: null,
-      v1: true,
-    }, {}, crypto.randomUUID(), 50);
-  });
+  // const rootDomains = new Set(request.urls.map(getRootDomain));
+  // rootDomains.forEach(async url => {
+  //   const crawlId = crypto.randomUUID();
+  //   // Create and save crawl configuration first
+  //   const sc: StoredCrawl = {
+  //     originUrl: url,
+  //     crawlerOptions: {
+  //       maxDepth: 15,
+  //       limit: 5000,
+  //       includePaths: [],
+  //       excludePaths: [],
+  //       ignoreSitemap: false,
+  //       includeSubdomains: true,
+  //       allowExternalLinks: false,
+  //       allowBackwardLinks: true
+  //     },
+  //     scrapeOptions: {
+  //       formats: ["markdown"],
+  //       onlyMainContent: true,
+  //       waitFor: 0,
+  //       mobile: false,
+  //       removeBase64Images: true,
+  //       fastMode: false,
+  //       parsePDF: true,
+  //       skipTlsVerification: false,
+  //     },
+  //     internalOptions: {
+  //       disableSmartWaitCache: true,
+  //       isBackgroundIndex: true
+  //     },
+  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+  //     createdAt: Date.now(),
+  //     plan: "hobby", // make it a low concurrency
+  //   };
+  //   // Save the crawl configuration
+  //   await saveCrawl(crawlId, sc);
+  //   // Then kick off the job
+  //   await _addScrapeJobToBullMQ({
+  //     url,
+  //     mode: "kickoff" as const,
+  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+  //     plan: "hobby", // make it a low concurrency
+  //     crawlerOptions: sc.crawlerOptions,
+  //     scrapeOptions: sc.scrapeOptions,
+  //     internalOptions: sc.internalOptions,
+  //     origin: "index",
+  //     crawl_id: crawlId,
+  //     webhook: null,
+  //     v1: true,
+  //   }, {}, crypto.randomUUID(), 50);
+  // });
   // Bill team for usage
   billTeam(teamId, subId, links.length * 5).catch((error) => {
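
For reference, below is a minimal, self-contained sketch of the background-index kickoff flow that this commit disables, reconstructed from the removed lines above. The identifiers (StoredCrawl, saveCrawl, _addScrapeJobToBullMQ) and field names are taken from the diff; the type shapes and the stub helper bodies are assumptions for illustration, not the service's real API.

// Minimal sketch of the background-index kickoff disabled by this commit.
// Field names and call signatures come from the diff above; the type shapes
// and the stub bodies below are assumptions, not the real implementations.
import crypto from "node:crypto";

interface StoredCrawl {
  originUrl: string;
  crawlerOptions: Record<string, unknown>;
  scrapeOptions: Record<string, unknown>;
  internalOptions: Record<string, unknown>;
  team_id: string;
  createdAt: number;
  plan: string;
}

// Stand-in: the real helper persists the crawl configuration.
async function saveCrawl(id: string, sc: StoredCrawl): Promise<void> {}

// Stand-in: the real helper enqueues the job on the BullMQ scrape queue.
async function _addScrapeJobToBullMQ(
  job: Record<string, unknown>,
  options: Record<string, unknown>,
  jobId: string,
  priority: number,
): Promise<void> {}

async function kickoffBackgroundIndex(url: string): Promise<void> {
  const crawlId = crypto.randomUUID();
  const sc: StoredCrawl = {
    originUrl: url,
    crawlerOptions: { maxDepth: 15, limit: 5000, includeSubdomains: true },
    scrapeOptions: { formats: ["markdown"], onlyMainContent: true },
    internalOptions: { disableSmartWaitCache: true, isBackgroundIndex: true },
    team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
    createdAt: Date.now(),
    plan: "hobby", // low-concurrency plan for background work
  };
  // Persist the crawl configuration before enqueueing the job.
  await saveCrawl(crawlId, sc);
  // Enqueue a "kickoff" job at priority 50, mirroring the original call.
  await _addScrapeJobToBullMQ(
    { url, mode: "kickoff", crawl_id: crawlId, origin: "index", v1: true },
    {},
    crypto.randomUUID(),
    50,
  );
}

Note that in the removed code this ran inside rootDomains.forEach(async url => ...), so the async callbacks were fire-and-forget: performExtraction did not await them before proceeding to billing.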