Update extraction-service.ts

Nicolas 2024-12-31 15:22:50 -03:00
parent bd81b41d5f
commit 33632d2fe3

@@ -127,60 +127,60 @@ export async function performExtraction(options: ExtractServiceOptions): Promise
   }

   // Kickoff background crawl for indexing root domains
-  const rootDomains = new Set(request.urls.map(getRootDomain));
-  rootDomains.forEach(async url => {
-    const crawlId = crypto.randomUUID();
-    // Create and save crawl configuration first
-    const sc: StoredCrawl = {
-      originUrl: url,
-      crawlerOptions: {
-        maxDepth: 15,
-        limit: 5000,
-        includePaths: [],
-        excludePaths: [],
-        ignoreSitemap: false,
-        includeSubdomains: true,
-        allowExternalLinks: false,
-        allowBackwardLinks: true
-      },
-      scrapeOptions: {
-        formats: ["markdown"],
-        onlyMainContent: true,
-        waitFor: 0,
-        mobile: false,
-        removeBase64Images: true,
-        fastMode: false,
-        parsePDF: true,
-        skipTlsVerification: false,
-      },
-      internalOptions: {
-        disableSmartWaitCache: true,
-        isBackgroundIndex: true
-      },
-      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
-      createdAt: Date.now(),
-      plan: "hobby", // make it a low concurrency
-    };
-    // Save the crawl configuration
-    await saveCrawl(crawlId, sc);
-    // Then kick off the job
-    await _addScrapeJobToBullMQ({
-      url,
-      mode: "kickoff" as const,
-      team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
-      plan: "hobby", // make it a low concurrency
-      crawlerOptions: sc.crawlerOptions,
-      scrapeOptions: sc.scrapeOptions,
-      internalOptions: sc.internalOptions,
-      origin: "index",
-      crawl_id: crawlId,
-      webhook: null,
-      v1: true,
-    }, {}, crypto.randomUUID(), 50);
-  });
+  // const rootDomains = new Set(request.urls.map(getRootDomain));
+  // rootDomains.forEach(async url => {
+  //   const crawlId = crypto.randomUUID();
+  //   // Create and save crawl configuration first
+  //   const sc: StoredCrawl = {
+  //     originUrl: url,
+  //     crawlerOptions: {
+  //       maxDepth: 15,
+  //       limit: 5000,
+  //       includePaths: [],
+  //       excludePaths: [],
+  //       ignoreSitemap: false,
+  //       includeSubdomains: true,
+  //       allowExternalLinks: false,
+  //       allowBackwardLinks: true
+  //     },
+  //     scrapeOptions: {
+  //       formats: ["markdown"],
+  //       onlyMainContent: true,
+  //       waitFor: 0,
+  //       mobile: false,
+  //       removeBase64Images: true,
+  //       fastMode: false,
+  //       parsePDF: true,
+  //       skipTlsVerification: false,
+  //     },
+  //     internalOptions: {
+  //       disableSmartWaitCache: true,
+  //       isBackgroundIndex: true
+  //     },
+  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+  //     createdAt: Date.now(),
+  //     plan: "hobby", // make it a low concurrency
+  //   };
+  //   // Save the crawl configuration
+  //   await saveCrawl(crawlId, sc);
+  //   // Then kick off the job
+  //   await _addScrapeJobToBullMQ({
+  //     url,
+  //     mode: "kickoff" as const,
+  //     team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
+  //     plan: "hobby", // make it a low concurrency
+  //     crawlerOptions: sc.crawlerOptions,
+  //     scrapeOptions: sc.scrapeOptions,
+  //     internalOptions: sc.internalOptions,
+  //     origin: "index",
+  //     crawl_id: crawlId,
+  //     webhook: null,
+  //     v1: true,
+  //   }, {}, crypto.randomUUID(), 50);
+  // });
   // Bill team for usage
   billTeam(teamId, subId, links.length * 5).catch((error) => {
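
For reference, below is a minimal, self-contained sketch of the background-index kickoff flow that this commit disables, reconstructed from the removed lines above. The identifiers (StoredCrawl, saveCrawl, _addScrapeJobToBullMQ) and field names are taken from the diff; the type shapes and the stub helper bodies are assumptions for illustration, not the service's real API.

// Minimal sketch of the background-index kickoff disabled by this commit.
// Field names and call signatures come from the diff above; the type shapes
// and the stub bodies below are assumptions, not the real implementations.
import crypto from "node:crypto";

interface StoredCrawl {
  originUrl: string;
  crawlerOptions: Record<string, unknown>;
  scrapeOptions: Record<string, unknown>;
  internalOptions: Record<string, unknown>;
  team_id: string;
  createdAt: number;
  plan: string;
}

// Stand-in: the real helper persists the crawl configuration.
async function saveCrawl(id: string, sc: StoredCrawl): Promise<void> {}

// Stand-in: the real helper enqueues the job on the BullMQ scrape queue.
async function _addScrapeJobToBullMQ(
  job: Record<string, unknown>,
  options: Record<string, unknown>,
  jobId: string,
  priority: number,
): Promise<void> {}

async function kickoffBackgroundIndex(url: string): Promise<void> {
  const crawlId = crypto.randomUUID();
  const sc: StoredCrawl = {
    originUrl: url,
    crawlerOptions: { maxDepth: 15, limit: 5000, includeSubdomains: true },
    scrapeOptions: { formats: ["markdown"], onlyMainContent: true },
    internalOptions: { disableSmartWaitCache: true, isBackgroundIndex: true },
    team_id: process.env.BACKGROUND_INDEX_TEAM_ID!,
    createdAt: Date.now(),
    plan: "hobby", // low-concurrency plan for background work
  };
  // Persist the crawl configuration before enqueueing the job.
  await saveCrawl(crawlId, sc);
  // Enqueue a "kickoff" job at priority 50, mirroring the original call.
  await _addScrapeJobToBullMQ(
    { url, mode: "kickoff", crawl_id: crawlId, origin: "index", v1: true },
    {},
    crypto.randomUUID(),
    50,
  );
}

Note that in the removed code this ran inside rootDomains.forEach(async url => ...), so the async callbacks were fire-and-forget: performExtraction did not await them before proceeding to billing.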