Update extraction-service.ts

Nicolas 2024-12-31 15:22:50 -03:00
parent bd81b41d5f
commit 33632d2fe3


@@ -127,60 +127,60 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
   }
   // Kickoff background crawl for indexing root domains
-  const rootDomains = new Set(request.urls.map(getRootDomain));
-  rootDomains.forEach(async url => {
-    const crawlId = crypto.randomUUID();
+  // const rootDomains = new Set(request.urls.map(getRootDomain));
+  // rootDomains.forEach(async url => {
+  //   const crawlId = crypto.randomUUID();
-    // Create and save crawl configuration first
-    const sc: StoredCrawl = {
-      originUrl: url,
-      crawlerOptions: {
-        maxDepth: 15,
-        limit: 5000,
-        includePaths: [],
-        excludePaths: [],
-        ignoreSitemap: false,
-        includeSubdomains: true,
-        allowExternalLinks: false,
-        allowBackwardLinks: true
-      },
-      scrapeOptions: {
-        formats: ["markdown"],
-        onlyMainContent: true,
-        waitFor: 0,
-        mobile: false,
-        removeBase64Images: true,
-        fastMode: false,
-        parsePDF: true,
-        skipTlsVerification: false,
-      },
-      internalOptions: {
-        disableSmartWaitCache: true,
-        isBackgroundIndex: true
-      },
-      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
-      createdAt: Date.now(),
-      plan: "hobby", // make it a low concurrency
-    };
+  //   // Create and save crawl configuration first
+  //   const sc: StoredCrawl = {
+  //     originUrl: url,
+  //     crawlerOptions: {
+  //       maxDepth: 15,
+  //       limit: 5000,
+  //       includePaths: [],
+  //       excludePaths: [],
+  //       ignoreSitemap: false,
+  //       includeSubdomains: true,
+  //       allowExternalLinks: false,
+  //       allowBackwardLinks: true
+  //     },
+  //     scrapeOptions: {
+  //       formats: ["markdown"],
+  //       onlyMainContent: true,
+  //       waitFor: 0,
+  //       mobile: false,
+  //       removeBase64Images: true,
+  //       fastMode: false,
+  //       parsePDF: true,
+  //       skipTlsVerification: false,
+  //     },
+  //     internalOptions: {
+  //       disableSmartWaitCache: true,
+  //       isBackgroundIndex: true
+  //     },
+  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+  //     createdAt: Date.now(),
+  //     plan: "hobby", // make it a low concurrency
+  //   };
-    // Save the crawl configuration
-    await saveCrawl(crawlId, sc);
+  //   // Save the crawl configuration
+  //   await saveCrawl(crawlId, sc);
-    // Then kick off the job
-    await _addScrapeJobToBullMQ({
-      url,
-      mode: "kickoff" as const,
-      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
-      plan: "hobby", // make it a low concurrency
-      crawlerOptions: sc.crawlerOptions,
-      scrapeOptions: sc.scrapeOptions,
-      internalOptions: sc.internalOptions,
-      origin: "index",
-      crawl_id: crawlId,
-      webhook: null,
-      v1: true,
-    }, {}, crypto.randomUUID(), 50);
-  });
+  //   // Then kick off the job
+  //   await _addScrapeJobToBullMQ({
+  //     url,
+  //     mode: "kickoff" as const,
+  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+  //     plan: "hobby", // make it a low concurrency
+  //     crawlerOptions: sc.crawlerOptions,
+  //     scrapeOptions: sc.scrapeOptions,
+  //     internalOptions: sc.internalOptions,
+  //     origin: "index",
+  //     crawl_id: crawlId,
+  //     webhook: null,
+  //     v1: true,
+  //   }, {}, crypto.randomUUID(), 50);
+  // });
   // Bill team for usage
   billTeam(teamId, subId, links.length * 5).catch((error) => {
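
The disabled block deduplicates kickoffs per root domain via getRootDomain, a helper this diff references but does not show. A minimal sketch of what such a helper might look like, assuming a naive "last two labels" heuristic (the repo's actual implementation may differ, e.g. by handling multi-part TLDs):

// Hypothetical sketch; not the repo's actual getRootDomain.
function getRootDomain(url: string): string {
  try {
    const hostname = new URL(url).hostname;
    // Naive heuristic: "docs.example.com" -> "example.com".
    // A production version would consult a public-suffix list
    // to handle TLDs like "co.uk" correctly.
    const parts = hostname.split(".");
    return parts.length <= 2 ? hostname : parts.slice(-2).join(".");
  } catch {
    // Fall back to the raw input when it is not a parseable URL.
    return url;
  }
}

console.log(getRootDomain("https://docs.example.com/page")); // "example.com"
console.log(getRootDomain("https://example.com/"));          // "example.com"

Because both example URLs share a root domain, the Set in the diff would hold a single entry, so only one background crawl would be kicked off for them.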