fix(crawler): recognize sitemaps in robots.txt

Gergő Móricz 2025-01-17 15:45:52 +01:00
parent faf58dfca7
commit 23bb172592
2 changed files with 43 additions and 37 deletions
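Note: before this change the crawler only tried the sitemap derived from the
initial URL; "Sitemap:" directives declared in robots.txt were ignored. The
commit wires robots.txt parsing (via robots-parser) into sitemap discovery.
A minimal sketch of the robots-parser behaviour the fix relies on (the
robots.txt content below is made up for illustration):

    import robotsParser from "robots-parser";

    // A robots.txt file may declare one or more sitemap locations.
    const robotsTxt = [
      "User-agent: *",
      "Disallow: /admin",
      "Sitemap: https://example.com/sitemap.xml",
      "Sitemap: https://example.com/blog/sitemap.xml",
    ].join("\n");

    const robots = robotsParser("https://example.com/robots.txt", robotsTxt);
    // getSitemaps() returns every URL listed in a "Sitemap:" directive, which
    // the crawler can now feed into its sitemap fetcher.
    console.log(robots.getSitemaps());
    // => ["https://example.com/sitemap.xml", "https://example.com/blog/sitemap.xml"]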

View File

@@ -85,6 +85,11 @@ export async function getMapResults({
   const crawler = crawlToCrawler(id, sc);
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
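Note: the map controller now downloads robots.txt and imports it into the
crawler before any sitemap lookup runs, so the robots-parser instance holds
real data when getSitemaps() is consulted later; the empty catch keeps a
missing or unreachable robots.txt from failing the whole map request. A
sketch of the assumed call order (the helper names come from the hunk above;
their implementations are not part of this diff):

    async function loadRobotsBeforeSitemaps(crawler: {
      getRobotsTxt(): Promise<string>;
      importRobotsTxt(txt: string): void | Promise<void>;
    }) {
      try {
        const robotsTxt = await crawler.getRobotsTxt(); // GET <base>/robots.txt
        await crawler.importRobotsTxt(robotsTxt);       // parse it, including Sitemap: entries
      } catch (_) {
        // robots.txt is optional; sitemap discovery proceeds without it.
      }
    }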

View File

@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
@@ -63,7 +63,7 @@ export class WebCrawler {
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
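Note: the crawler's robots field is now typed as robots-parser's Robot
interface instead of any, and the robots.txt URL no longer gets a doubled
slash when the base URL already ends in "/". For example (hypothetical base
URL):

    const baseUrl = "https://example.com/"; // trailing slash
    const before = `${baseUrl}/robots.txt`;
    // => "https://example.com//robots.txt" (doubled slash)
    const after = `${baseUrl}${baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
    // => "https://example.com/robots.txt"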
@@ -217,45 +217,46 @@
     };
     const _urlsHandler = async (urls: string[]) => {
-      let uniqueURLs: string[] = [];
-      for (const url of urls) {
-        if (
-          await redisConnection.sadd(
-            "sitemap:" + this.jobId + ":links",
-            normalizeUrl(url),
-          )
-        ) {
-          uniqueURLs.push(url);
-        }
-      }
-      await redisConnection.expire(
-        "sitemap:" + this.jobId + ":links",
-        3600,
-        "NX",
-      );
-      if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
-      }
-    };
-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        let uniqueURLs: string[] = [];
+        for (const url of urls) {
+          if (
+            await redisConnection.sadd(
+              "sitemap:" + this.jobId + ":links",
+              normalizeUrl(url),
+            )
+          ) {
+            uniqueURLs.push(url);
+          }
+        }
+        await redisConnection.expire(
+          "sitemap:" + this.jobId + ":links",
+          3600,
+          "NX",
+        );
+        if (uniqueURLs.length > 0) {
+          return urlsHandler(uniqueURLs);
+        }
+      }
+    };
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);
     if (count > 0) {
       if (
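Note: the per-URL handling that used to live in an inline callback passed to
tryFetchSitemapLinks (filtering, Redis-backed dedup, the expire call) is
folded into _urlsHandler so it can be shared, and the link count is now the
sum of the initial URL's sitemap plus every sitemap declared in robots.txt,
fetched concurrently. A self-contained sketch of that aggregation pattern,
with a hypothetical fetchSitemapLinkCount() standing in for
tryFetchSitemapLinks():

    // Hypothetical stand-in: resolves with how many links a sitemap yielded.
    async function fetchSitemapLinkCount(sitemapUrl: string): Promise<number> {
      return 0;
    }

    async function countAllSitemaps(initialUrl: string, robotsSitemaps: string[]) {
      const counts = await Promise.all(
        [initialUrl, ...robotsSitemaps].map((url) => fetchSitemapLinkCount(url)),
      );
      // Same shape as the diff: concurrent fetches, totals summed with reduce().
      return counts.reduce((total, n) => total + n, 0);
    }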