fix(crawler): recognize sitemaps in robots.txt

Gergő Móricz 2025-01-17 15:45:52 +01:00
parent faf58dfca7
commit 23bb172592
2 changed files with 43 additions and 37 deletions

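The premise of the change: a robots.txt file may advertise sitemap locations through "Sitemap:" directives, and the robots-parser library used below surfaces them via getSitemaps(). A minimal sketch of that behavior, with an illustrative domain and file body:

import robotsParser from "robots-parser";

// Hypothetical robots.txt body; real files may list several sitemaps,
// anywhere in the file.
const robotsTxt = [
  "User-agent: *",
  "Disallow: /admin",
  "Sitemap: https://example.com/sitemap.xml",
  "Sitemap: https://example.com/news/sitemap.xml",
].join("\n");

const robots = robotsParser("https://example.com/robots.txt", robotsTxt);
console.log(robots.getSitemaps());
// ["https://example.com/sitemap.xml", "https://example.com/news/sitemap.xml"]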

@@ -85,6 +85,11 @@ export async function getMapResults({
   const crawler = crawlToCrawler(id, sc);

+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
+
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(

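The try/catch above swallows fetch failures deliberately: a missing or unreachable robots.txt should not fail the map request, it just means no extra sitemaps get discovered. A rough sketch of what the two crawler methods amount to (names match the calls above; the real implementations live in the WebCrawler class patched below and may differ):

import axios from "axios";
import robotsParser from "robots-parser";

class CrawlerSketch {
  public robots: ReturnType<typeof robotsParser>;

  constructor(private robotsTxtUrl: string) {
    // Starts out empty; getSitemaps() returns [] until a real file is imported.
    this.robots = robotsParser(robotsTxtUrl, "");
  }

  // Fetch the raw robots.txt body; throws on network errors.
  async getRobotsTxt(): Promise<string> {
    const res = await axios.get(this.robotsTxtUrl, { timeout: 10000 });
    return res.data;
  }

  // Re-parse with the fetched body so getSitemaps() has data to report.
  importRobotsTxt(txt: string) {
    this.robots = robotsParser(this.robotsTxtUrl, txt);
  }
}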

@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
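Typing robots as Robot rather than any is what lets the getSitemaps() call later in this file type-check. The slice of the interface the commit relies on, sketched as an assumption (the package's own typings are authoritative):

interface RobotSurface {
  // undefined when the URL is not covered by this robots.txt
  isAllowed(url: string, userAgent?: string): boolean | undefined;
  // URLs from "Sitemap:" directives, in file order
  getSitemaps(): string[];
}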
@@ -63,7 +63,7 @@ export class WebCrawler {
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
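The new ternary guards against a doubled slash: the old template produced "https://example.com//robots.txt" whenever baseUrl already carried a trailing slash. For illustration:

const robotsUrlFor = (baseUrl: string) =>
  `${baseUrl}${baseUrl.endsWith("/") ? "" : "/"}robots.txt`;

robotsUrlFor("https://example.com");  // "https://example.com/robots.txt"
robotsUrlFor("https://example.com/"); // "https://example.com/robots.txt", not "//robots.txt"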
@@ -217,6 +217,16 @@
     };

     const _urlsHandler = async (urls: string[]) => {
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
       let uniqueURLs: string[] = [];
       for (const url of urls) {
         if (
@@ -235,27 +245,18 @@
         "NX",
       );
       if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
+        return urlsHandler(uniqueURLs);
+      }
       }
     };

-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);

     if (count > 0) {
       if (
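The net effect of the last hunk: instead of fetching only the conventional sitemap for the initial URL, the crawler now also fans out to every sitemap listed in robots.txt, concurrently, and sums the per-sitemap link counts. Simplified to its shape, with tryFetchSitemapLinks stubbed out:

declare function tryFetchSitemapLinks(
  url: string,
  urlsHandler: (urls: string[]) => unknown,
): Promise<number>;

async function countAllSitemapLinks(
  initialUrl: string,
  robotsSitemaps: string[], // this.robots.getSitemaps() in the diff
  urlsHandler: (urls: string[]) => unknown,
): Promise<number> {
  // One fetch per sitemap, all in flight at once.
  const counts = await Promise.all([
    tryFetchSitemapLinks(initialUrl, urlsHandler),
    ...robotsSitemaps.map((x) => tryFetchSitemapLinks(x, urlsHandler)),
  ]);
  // Total number of links discovered across every sitemap.
  return counts.reduce((a, x) => a + x, 0);
}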