From 2e5785d8d9b98cadfa1075bf11996a3953d9ca50 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Sun, 19 Jan 2025 11:40:13 -0300 Subject: [PATCH] Nick: fetch sitemap timeout param --- apps/api/src/controllers/v1/map.ts | 1 + apps/api/src/scraper/WebScraper/crawler.ts | 59 +++++++++++++++------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts index c8f7dd96..e1e4b4d8 100644 --- a/apps/api/src/controllers/v1/map.ts +++ b/apps/api/src/controllers/v1/map.ts @@ -100,6 +100,7 @@ export async function getMapResults({ }, true, true, + 30000 ); if (sitemap > 0) { links = links diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 7d4be97b..2003e448 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -202,6 +202,7 @@ export class WebCrawler { urlsHandler: (urls: string[]) => unknown, fromMap: boolean = false, onlySitemap: boolean = false, + timeout: number = 120000, ): Promise { this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { method: "tryGetSitemap", @@ -250,27 +251,49 @@ export class WebCrawler { } }; - let count = (await Promise.all([ - this.tryFetchSitemapLinks( - this.initialUrl, - _urlsHandler, - ), - ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)), - ])).reduce((a,x) => a+x, 0); + const timeoutPromise = new Promise((_, reject) => { + setTimeout(() => reject(new Error('Sitemap fetch timeout')), timeout); + }); - if (count > 0) { - if ( - await redisConnection.sadd( - "sitemap:" + this.jobId + ":links", - normalizeUrl(this.initialUrl), - ) - ) { - urlsHandler([this.initialUrl]); + try { + let count = await Promise.race([ + Promise.all([ + this.tryFetchSitemapLinks( + this.initialUrl, + _urlsHandler, + ), + ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)), + ]).then(results => results.reduce((a,x) => a+x, 0)), + timeoutPromise + ]) as number; + + if (count > 0) { + if ( + await redisConnection.sadd( + "sitemap:" + this.jobId + ":links", + normalizeUrl(this.initialUrl), + ) + ) { + urlsHandler([this.initialUrl]); + } + count++; } - count++; - } - return count; + return count; + } catch (error) { + if (error.message === 'Sitemap fetch timeout') { + this.logger.warn('Sitemap fetch timed out', { + method: "tryGetSitemap", + timeout, + }); + return 0; + } + this.logger.error('Error fetching sitemap', { + method: "tryGetSitemap", + error, + }); + return 0; + } } public filterURL(href: string, url: string): string | null {