Nick: fetch sitemap timeout param

This commit is contained in:
Nicolas 2025-01-19 11:40:13 -03:00
parent 24ddcd4a6d
commit 2e5785d8d9
2 changed files with 42 additions and 18 deletions

View File

@@ -100,6 +100,7 @@ export async function getMapResults({
}, },
true, true,
true, true,
30000
); );
if (sitemap > 0) { if (sitemap > 0) {
links = links links = links

View File

@@ -202,6 +202,7 @@ export class WebCrawler {
urlsHandler: (urls: string[]) => unknown, urlsHandler: (urls: string[]) => unknown,
fromMap: boolean = false, fromMap: boolean = false,
onlySitemap: boolean = false, onlySitemap: boolean = false,
timeout: number = 120000,
): Promise<number> { ): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap", method: "tryGetSitemap",
@@ -250,13 +251,21 @@ export class WebCrawler {
} }
}; };
let count = (await Promise.all([ const timeoutPromise = new Promise((_, reject) => {
setTimeout(() => reject(new Error('Sitemap fetch timeout')), timeout);
});
try {
let count = await Promise.race([
Promise.all([
this.tryFetchSitemapLinks( this.tryFetchSitemapLinks(
this.initialUrl, this.initialUrl,
_urlsHandler, _urlsHandler,
), ),
...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)), ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
])).reduce((a,x) => a+x, 0); ]).then(results => results.reduce((a,x) => a+x, 0)),
timeoutPromise
]) as number;
if (count > 0) { if (count > 0) {
if ( if (
@@ -271,6 +280,20 @@ export class WebCrawler {
} }
return count; return count;
} catch (error) {
if (error.message === 'Sitemap fetch timeout') {
this.logger.warn('Sitemap fetch timed out', {
method: "tryGetSitemap",
timeout,
});
return 0;
}
this.logger.error('Error fetching sitemap', {
method: "tryGetSitemap",
error,
});
return 0;
}
} }
public filterURL(href: string, url: string): string | null { public filterURL(href: string, url: string): string | null {