From 23bb172592b40738df71b81873e9ad285bd06fd5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Fri, 17 Jan 2025 15:45:52 +0100
Subject: [PATCH] fix(crawler): recognize sitemaps in robots.txt

---
 apps/api/src/controllers/v1/map.ts         |  5 ++
 apps/api/src/scraper/WebScraper/crawler.ts | 75 +++++++++++-----------
 2 files changed, 43 insertions(+), 37 deletions(-)

diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 2afae0d4..c8f7dd96 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -85,6 +85,11 @@ export async function getMapResults({
 
   const crawler = crawlToCrawler(id, sc);
 
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
+
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 5662fff9..1cca7a18 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
@@ -63,7 +63,7 @@ export class WebCrawler {
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
@@ -217,45 +217,46 @@ export class WebCrawler {
     };
 
     const _urlsHandler = async (urls: string[]) => {
-      let uniqueURLs: string[] = [];
-      for (const url of urls) {
-        if (
-          await redisConnection.sadd(
-            "sitemap:" + this.jobId + ":links",
-            normalizeUrl(url),
-          )
-        ) {
-          uniqueURLs.push(url);
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        let uniqueURLs: string[] = [];
+        for (const url of urls) {
+          if (
+            await redisConnection.sadd(
+              "sitemap:" + this.jobId + ":links",
+              normalizeUrl(url),
+            )
+          ) {
+            uniqueURLs.push(url);
+          }
         }
-      }
-      await redisConnection.expire(
-        "sitemap:" + this.jobId + ":links",
-        3600,
-        "NX",
-      );
-      if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
+        await redisConnection.expire(
+          "sitemap:" + this.jobId + ":links",
+          3600,
+          "NX",
+        );
+        if (uniqueURLs.length > 0) {
+          return urlsHandler(uniqueURLs);
+        }
       }
     };
-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);
 
     if (count > 0) {
       if (
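
Note: the new aggregation relies on robots-parser exposing the Sitemap: directives via getSitemaps(). A minimal standalone sketch of that behavior follows; the URLs and robots.txt content are made up for illustration and are not taken from the patched files.

import robotsParser from "robots-parser";

// Hypothetical robots.txt body; only the Sitemap: lines matter here.
const robotsTxt = [
  "User-agent: *",
  "Disallow: /admin/",
  "Sitemap: https://example.com/sitemap.xml",
  "Sitemap: https://example.com/blog/sitemap.xml",
].join("\n");

const robots = robotsParser("https://example.com/robots.txt", robotsTxt);

// getSitemaps() returns every declared Sitemap: URL; the patched
// tryGetSitemap() fans out over these in addition to the initial URL's
// sitemap and sums the per-sitemap link counts via Promise.all + reduce.
console.log(robots.getSitemaps());
// => ["https://example.com/sitemap.xml", "https://example.com/blog/sitemap.xml"]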