Mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl, synced 2025-08-12 17:39:06 +08:00
fix(crawler): recognize sitemaps in robots.txt
commit 23bb172592
parent faf58dfca7
@@ -85,6 +85,11 @@ export async function getMapResults({
 
   const crawler = crawlToCrawler(id, sc);
 
+  try {
+    sc.robots = await crawler.getRobotsTxt();
+    await crawler.importRobotsTxt(sc.robots);
+  } catch (_) {}
+
   // If sitemapOnly is true, only get links from sitemap
   if (crawlerOptions.sitemapOnly) {
     const sitemap = await crawler.tryGetSitemap(
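The hunk above makes the map path load and import robots.txt before any sitemap work, so "Sitemap:" directives found there can feed the crawl. The internals of getRobotsTxt and importRobotsTxt are not part of this diff; as a rough sketch of the same idea (the function name fetchAndImportRobots and the timeout value are illustrative, not from the codebase), fetching and parsing could look like:

import axios from "axios";
import robotsParser, { Robot } from "robots-parser";

// Sketch only: fetch a site's robots.txt and parse it so that any
// "Sitemap:" directives become available via robots.getSitemaps().
async function fetchAndImportRobots(baseUrl: string): Promise<Robot> {
  const robotsTxtUrl = `${baseUrl}${baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
  let body = "";
  try {
    // A missing or unreachable robots.txt is treated as empty,
    // mirroring the empty catch block in the hunk above.
    const response = await axios.get(robotsTxtUrl, { timeout: 10000 });
    body = typeof response.data === "string" ? response.data : "";
  } catch (_) {}
  return robotsParser(robotsTxtUrl, body);
}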
@@ -2,7 +2,7 @@ import axios, { AxiosError } from "axios";
 import cheerio, { load } from "cheerio";
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
-import robotsParser from "robots-parser";
+import robotsParser, { Robot } from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
@@ -20,7 +20,7 @@ export class WebCrawler {
   private crawledUrls: Map<string, string> = new Map();
   private limit: number;
   private robotsTxtUrl: string;
-  public robots: any;
+  public robots: Robot;
   private generateImgAltText: boolean;
   private allowBackwardCrawling: boolean;
   private allowExternalContentLinks: boolean;
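Typing the field as the Robot type exported by robots-parser, instead of any, lets the compiler check the calls made on it. A minimal sketch of the typed usage (the robots.txt contents and user-agent string here are placeholders, not from the repo):

import robotsParser, { Robot } from "robots-parser";

// Sketch: with the field typed as Robot, getSitemaps() and isAllowed()
// are type-checked instead of going through any.
const robots: Robot = robotsParser(
  "https://example.com/robots.txt",
  "User-agent: *\nDisallow: /private/\nSitemap: https://example.com/sitemap.xml",
);

const sitemaps: string[] = robots.getSitemaps(); // ["https://example.com/sitemap.xml"]
const allowed = robots.isAllowed("https://example.com/private/page", "ExampleBot");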
@@ -63,7 +63,7 @@ export class WebCrawler {
     this.includes = Array.isArray(includes) ? includes : [];
     this.excludes = Array.isArray(excludes) ? excludes : [];
     this.limit = limit;
-    this.robotsTxtUrl = `${this.baseUrl}/robots.txt`;
+    this.robotsTxtUrl = `${this.baseUrl}${this.baseUrl.endsWith("/") ? "" : "/"}robots.txt`;
     this.robots = robotsParser(this.robotsTxtUrl, "");
     // Deprecated, use limit instead
     this.maxCrawledLinks = maxCrawledLinks ?? limit;
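The endsWith() guard above avoids producing a double slash when baseUrl already ends with "/". A small illustration (URLs are examples only):

// Sketch: the guard keeps the robots.txt URL clean for both forms of baseUrl.
const buildRobotsTxtUrl = (baseUrl: string) =>
  `${baseUrl}${baseUrl.endsWith("/") ? "" : "/"}robots.txt`;

buildRobotsTxtUrl("https://example.com");  // "https://example.com/robots.txt"
buildRobotsTxtUrl("https://example.com/"); // "https://example.com/robots.txt" (not "...//robots.txt")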
@@ -217,45 +217,46 @@ export class WebCrawler {
     };
 
     const _urlsHandler = async (urls: string[]) => {
-      let uniqueURLs: string[] = [];
-      for (const url of urls) {
-        if (
-          await redisConnection.sadd(
-            "sitemap:" + this.jobId + ":links",
-            normalizeUrl(url),
-          )
-        ) {
-          uniqueURLs.push(url);
-        }
-      }
+      if (fromMap && onlySitemap) {
+        return urlsHandler(urls);
+      } else {
+        let filteredLinks = this.filterLinks(
+          [...new Set(urls)],
+          leftOfLimit,
+          this.maxCrawledDepth,
+          fromMap,
+        );
+        leftOfLimit -= filteredLinks.length;
+        let uniqueURLs: string[] = [];
+        for (const url of urls) {
+          if (
+            await redisConnection.sadd(
+              "sitemap:" + this.jobId + ":links",
+              normalizeUrl(url),
+            )
+          ) {
+            uniqueURLs.push(url);
+          }
+        }
 
-      await redisConnection.expire(
-        "sitemap:" + this.jobId + ":links",
-        3600,
-        "NX",
-      );
-      if (uniqueURLs.length > 0) {
-        urlsHandler(uniqueURLs);
+        await redisConnection.expire(
+          "sitemap:" + this.jobId + ":links",
+          3600,
+          "NX",
+        );
+        if (uniqueURLs.length > 0) {
+          return urlsHandler(uniqueURLs);
+        }
       }
     };
 
-    let count = await this.tryFetchSitemapLinks(
-      this.initialUrl,
-      (urls: string[]) => {
-        if (fromMap && onlySitemap) {
-          return urlsHandler(urls);
-        } else {
-          let filteredLinks = this.filterLinks(
-            [...new Set(urls)],
-            leftOfLimit,
-            this.maxCrawledDepth,
-            fromMap,
-          );
-          leftOfLimit -= filteredLinks.length;
-          return _urlsHandler(filteredLinks);
-        }
-      },
-    );
+    let count = (await Promise.all([
+      this.tryFetchSitemapLinks(
+        this.initialUrl,
+        _urlsHandler,
+      ),
+      ...this.robots.getSitemaps().map(x => this.tryFetchSitemapLinks(x, _urlsHandler)),
+    ])).reduce((a,x) => a+x, 0);
 
     if (count > 0) {
       if (
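The single fetch of the initial URL's sitemap is replaced above by a Promise.all over the initial URL plus every sitemap listed in robots.txt, with the per-sitemap link counts summed. A sketch of that fan-out pattern in isolation (fetchSitemap and handleUrls are hypothetical stand-ins for tryFetchSitemapLinks and _urlsHandler):

// Sketch of the concurrent fan-out and sum used in the hunk above.
async function countAllSitemapLinks(
  initialUrl: string,
  robotsSitemaps: string[],
  fetchSitemap: (url: string, handler: (urls: string[]) => unknown) => Promise<number>,
  handleUrls: (urls: string[]) => unknown,
): Promise<number> {
  const counts = await Promise.all([
    fetchSitemap(initialUrl, handleUrls),
    // Every "Sitemap:" entry from robots.txt is fetched concurrently...
    ...robotsSitemaps.map((sitemapUrl) => fetchSitemap(sitemapUrl, handleUrls)),
  ]);
  // ...and the per-sitemap link counts are summed, as in reduce((a, x) => a + x, 0).
  return counts.reduce((a, x) => a + x, 0);
}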