fix(crawler): recognize sitemaps in robots.txt

Gergő Móricz 2025-01-17 15:45:52 +01:00
parent faf58dfca7
commit 23bb172592
2 changed files with 43 additions and 37 deletions
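Note: before this change the crawler only tried the sitemap derived from the
initial URL; "Sitemap:" directives declared in robots.txt were ignored. The
commit wires robots.txt parsing (via robots-parser) into sitemap discovery.
A minimal sketch of the robots-parser behaviour the fix relies on (the
robots.txt content below is made up for illustration):

    import robotsParser from "robots-parser";

    // A robots.txt file may declare one or more sitemap locations.
    const robotsTxt = [
      "User-agent: *",
      "Disallow: /admin",
      "Sitemap: https://example.com/sitemap.xml",
      "Sitemap: https://example.com/blog/sitemap.xml",
    ].join("\n");

    const robots = robotsParser("https://example.com/robots.txt", robotsTxt);
    // getSitemaps() returns every URL listed in a "Sitemap:" directive, which
    // the crawler can now feed into its sitemap fetcher.
    console.log(robots.getSitemaps());
    // => ["https://example.com/sitemap.xml", "https://example.com/blog/sitemap.xml"]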

View File

@@ -85,6 +85,11 @@ export async function getMapResults({
   const crawler = crawlToCrawler(id, sc);
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
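Note: the map controller now downloads robots.txt and imports it into the
crawler before any sitemap lookup runs, so the robots-parser instance holds
real data when getSitemaps() is consulted later; the empty catch keeps a
missing or unreachable robots.txt from failing the whole map request. A
sketch of the assumed call order (the helper names come from the hunk above;
their implementations are not part of this diff):

    async function loadRobotsBeforeSitemaps(crawler: {
      getRobotsTxt(): Promise<string>;
      importRobotsTxt(txt: string): void | Promise<void>;
    }) {
      try {
        const robotsTxt = await crawler.getRobotsTxt(); // GET <base>/robots.txt
        await crawler.importRobotsTxt(robotsTxt);       // parse it, including Sitemap: entries
      } catch (_) {
        // robots.txt is optional; sitemap discovery proceeds without it.
      }
    }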

View File

@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
@@ -63,7 +63,7 @@ export class WebCrawler {
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
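Note: the crawler's robots field is now typed as robots-parser's Robot
interface instead of any, and the robots.txt URL no longer gets a doubled
slash when the base URL already ends in "/". For example (hypothetical base
URL):

    const baseUrl = "https://example.com/"; // trailing slash
    const before = `${baseUrl}/robots.txt`;
    // => "https://example.com//robots.txt" (doubled slash)
    const after = `${baseUrl}${baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
    // => "https://example.com/robots.txt"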
@@ -217,45 +217,46 @@
     };
     const _urlsHandler = async (urls: string[]) => {
-      let uniqueURLs: string[] = [];
-      for (const url of urls) {
-        if (
-          await redisConnection.sadd(
-            "sitemap:" + this.jobId + ":links",
-            normalizeUrl(url),
-          )
-        ) {
-          uniqueURLs.push(url);
-        }
-      }
-      await redisConnection.expire(
-        "sitemap:" + this.jobId + ":links",
-        3600,
-        "NX",
-      );
-      if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
-      }
-    };
-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        let uniqueURLs: string[] = [];
+        for (const url of urls) {
+          if (
+            await redisConnection.sadd(
+              "sitemap:" + this.jobId + ":links",
+              normalizeUrl(url),
+            )
+          ) {
+            uniqueURLs.push(url);
+          }
+        }
+        await redisConnection.expire(
+          "sitemap:" + this.jobId + ":links",
+          3600,
+          "NX",
+        );
+        if (uniqueURLs.length > 0) {
+          return urlsHandler(uniqueURLs);
+        }
+      }
+    };
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);
     if (count > 0) {
       if (
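Note: the per-URL handling that used to live in an inline callback passed to
tryFetchSitemapLinks (filtering, Redis-backed dedup, the expire call) is
folded into _urlsHandler so it can be shared, and the link count is now the
sum of the initial URL's sitemap plus every sitemap declared in robots.txt,
fetched concurrently. A self-contained sketch of that aggregation pattern,
with a hypothetical fetchSitemapLinkCount() standing in for
tryFetchSitemapLinks():

    // Hypothetical stand-in: resolves with how many links a sitemap yielded.
    async function fetchSitemapLinkCount(sitemapUrl: string): Promise<number> {
      return 0;
    }

    async function countAllSitemaps(initialUrl: string, robotsSitemaps: string[]) {
      const counts = await Promise.all(
        [initialUrl, ...robotsSitemaps].map((url) => fetchSitemapLinkCount(url)),
      );
      // Same shape as the diff: concurrent fetches, totals summed with reduce().
      return counts.reduce((total, n) => total + n, 0);
    }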