fix(crawler): recognize sitemaps in robots.txt

Gergő Móricz 2025-01-17 15:45:52 +01:00
parent faf58dfca7
commit 23bb172592
2 changed files with 43 additions and 37 deletions

@@ -85,6 +85,11 @@ export async function getMapResults({
   const crawler = crawlToCrawler(id, sc);
 
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
+
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
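The map path now fetches robots.txt and feeds it back into the crawler, so the crawler's parsed robots instance can report any Sitemap: directives the site advertises. The hunks that follow are from the WebCrawler class file. As a rough sketch, importRobotsTxt presumably just re-parses the fetched body with robots-parser (its actual implementation is not shown in this diff, and the URL below is made up):

    import robotsParser, { Robot } from "robots-parser";

    class CrawlerSketch {
      private robotsTxtUrl = "https://example.com/robots.txt"; // hypothetical
      public robots: Robot = robotsParser(this.robotsTxtUrl, "");

      // Assumed shape: replace the empty parser created at construction time
      // with one built from the fetched robots.txt body, so that isAllowed()
      // and getSitemaps() reflect the real file.
      importRobotsTxt(txt: string) {
        this.robots = robotsParser(this.robotsTxtUrl, txt);
      }
    }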

@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
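Typing robots as Robot rather than any gives the compiler a real signature for getSitemaps(), which the new aggregation further down relies on. robots-parser collects every Sitemap: line it finds in the file; a small standalone example with made-up URLs:

    import robotsParser from "robots-parser";

    const robotsTxt = [
      "User-agent: *",
      "Disallow: /private/",
      "Sitemap: https://example.com/sitemap.xml",
      "Sitemap: https://example.com/news-sitemap.xml",
    ].join("\n");

    const robots = robotsParser("https://example.com/robots.txt", robotsTxt);

    // ["https://example.com/sitemap.xml", "https://example.com/news-sitemap.xml"]
    console.log(robots.getSitemaps());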
@@ -63,7 +63,7 @@
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
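The robotsTxtUrl change only avoids a doubled slash when the configured base URL already ends in "/". With hypothetical inputs:

    const withSlash = "https://example.com/";
    const withoutSlash = "https://example.com";

    // Both now yield "https://example.com/robots.txt"; previously the first
    // produced "https://example.com//robots.txt".
    const a = `${withSlash}${withSlash.endsWith("/") ? "" : "/"}robots.txt`;
    const b = `${withoutSlash}${withoutSlash.endsWith("/") ? "" : "/"}robots.txt`;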
@@ -217,45 +217,46 @@
     };
 
     const _urlsHandler = async (urls: string[]) => {
-      let uniqueURLs: string[] = [];
-      for (const url of urls) {
-        if (
-          await redisConnection.sadd(
-            "sitemap:" + this.jobId + ":links",
-            normalizeUrl(url),
-          )
-        ) {
-          uniqueURLs.push(url);
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        let uniqueURLs: string[] = [];
+        for (const url of urls) {
+          if (
+            await redisConnection.sadd(
+              "sitemap:" + this.jobId + ":links",
+              normalizeUrl(url),
+            )
+          ) {
+            uniqueURLs.push(url);
+          }
         }
-      }
 
-      await redisConnection.expire(
-        "sitemap:" + this.jobId + ":links",
-        3600,
-        "NX",
-      );
-      if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
+        await redisConnection.expire(
+          "sitemap:" + this.jobId + ":links",
+          3600,
+          "NX",
+        );
+        if (uniqueURLs.length > 0) {
+          return urlsHandler(uniqueURLs);
+        }
       }
     };
 
-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);
 
     if (count > 0) {
       if (
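The rewritten count aggregation fans out over the crawl's initial URL plus every sitemap advertised in robots.txt and sums how many links each fetch produced. A condensed sketch of that pattern with a stubbed tryFetchSitemapLinks (the real method's internals are not part of this diff, and the URLs are made up):

    // Hypothetical stand-in: resolves to the number of links found at `url`
    // after streaming them to `handler`, mirroring the call shape in the diff.
    async function tryFetchSitemapLinks(
      url: string,
      handler: (urls: string[]) => void | Promise<void>,
    ): Promise<number> {
      const found = [`${url}/page-1`, `${url}/page-2`]; // made-up data
      await handler(found);
      return found.length;
    }

    async function countAllSitemapLinks(initialUrl: string, robotsSitemaps: string[]) {
      const handler = (urls: string[]) => { console.log("found", urls.length, "links"); };
      // One fetch for the crawl root, one per robots.txt sitemap, all in
      // parallel; the counts are then summed, as in the new `let count = ...`.
      const counts = await Promise.all([
        tryFetchSitemapLinks(initialUrl, handler),
        ...robotsSitemaps.map((x) => tryFetchSitemapLinks(x, handler)),
      ]);
      return counts.reduce((a, x) => a + x, 0);
    }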