Nick: fetch sitemap timeout param

This commit is contained in:
Nicolas 2025-01-19 11:40:13 -03:00
parent 24ddcd4a6d
commit 2e5785d8d9
2 changed files with 42 additions and 18 deletions

View File

@ -100,6 +100,7 @@ export async function getMapResults({
}, },
true, true,
true, true,
30000
); );
if (sitemap > 0) { if (sitemap > 0) {
links = links links = links

View File

@ -202,6 +202,7 @@ export class WebCrawler {
urlsHandler: (urls: string[]) => unknown, urlsHandler: (urls: string[]) => unknown,
fromMap: boolean = false, fromMap: boolean = false,
onlySitemap: boolean = false, onlySitemap: boolean = false,
timeout: number = 120000,
): Promise<number> { ): Promise<number> {
this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, { this.logger.debug(`Fetching sitemap links from ${this.initialUrl}`, {
method: "tryGetSitemap", method: "tryGetSitemap",
@ -250,27 +251,49 @@ export class WebCrawler {
} }
}; };
let count = (await Promise.all([ const timeoutPromise = new Promise((_, reject) => {
this.tryFetchSitemapLinks( setTimeout(() => reject(new Error('Sitemap fetch timeout')), timeout);
this.initialUrl, });
_urlsHandler,
),
...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
])).reduce((a,x) => a+x, 0);
if (count > 0) { try {
if ( let count = await Promise.race([
await redisConnection.sadd( Promise.all([
"sitemap:" + this.jobId + ":links", this.tryFetchSitemapLinks(
normalizeUrl(this.initialUrl), this.initialUrl,
) _urlsHandler,
) { ),
urlsHandler([this.initialUrl]); ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
]).then(results => results.reduce((a,x) => a+x, 0)),
timeoutPromise
]) as number;
if (count > 0) {
if (
await redisConnection.sadd(
"sitemap:" + this.jobId + ":links",
normalizeUrl(this.initialUrl),
)
) {
urlsHandler([this.initialUrl]);
}
count++;
} }
count++;
}
return count; return count;
} catch (error) {
if (error.message === 'Sitemap fetch timeout') {
this.logger.warn('Sitemap fetch timed out', {
method: "tryGetSitemap",
timeout,
});
return 0;
}
this.logger.error('Error fetching sitemap', {
method: "tryGetSitemap",
error,
});
return 0;
}
} }
public filterURL(href: string, url: string): string | null { public filterURL(href: string, url: string): string | null {