mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-15 19:46:00 +08:00
added fire-engine fallback for getting sitemaps
This commit is contained in:
parent
fcc67a3c9e
commit
9ad06fdf56
@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url";
|
||||
import robotsParser from "robots-parser";
|
||||
import { getURLDepth } from "./utils/maxDepthUtils";
|
||||
import { axiosTimeout } from "../../../src/lib/timeout";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
|
||||
export class WebCrawler {
|
||||
private initialUrl: string;
|
||||
@ -448,10 +449,14 @@ export class WebCrawler {
|
||||
try {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
if (response.status === 200) {
|
||||
sitemapLinks = await getLinksFromSitemap(sitemapUrl);
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
|
||||
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
|
||||
if (response) {
|
||||
sitemapLinks = response;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
|
||||
}
|
||||
|
||||
if (sitemapLinks.length === 0) {
|
||||
@ -459,10 +464,11 @@ export class WebCrawler {
|
||||
try {
|
||||
const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
|
||||
if (response.status === 200) {
|
||||
sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
|
||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -218,7 +218,7 @@ export class WebScraperDataProvider {
|
||||
private async handleSitemapMode(
|
||||
inProgress?: (progress: Progress) => void
|
||||
): Promise<Document[]> {
|
||||
let links = await getLinksFromSitemap(this.urls[0]);
|
||||
let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
|
||||
links = await this.cleanIrrelevantPath(links);
|
||||
|
||||
if (this.returnOnlyUrls) {
|
||||
|
@ -1,16 +1,29 @@
|
||||
import axios from "axios";
|
||||
import { axiosTimeout } from "../../lib/timeout";
|
||||
import { parseStringPromise } from "xml2js";
|
||||
import { scrapWithFireEngine } from "./scrapers/fireEngine";
|
||||
|
||||
export async function getLinksFromSitemap(
|
||||
sitemapUrl: string,
|
||||
allUrls: string[] = []
|
||||
{
|
||||
sitemapUrl,
|
||||
allUrls = [],
|
||||
mode = 'axios'
|
||||
}: {
|
||||
sitemapUrl: string,
|
||||
allUrls?: string[],
|
||||
mode?: 'axios' | 'fire-engine'
|
||||
}
|
||||
): Promise<string[]> {
|
||||
try {
|
||||
let content: string;
|
||||
try {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
if (mode === 'axios') {
|
||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||
content = response.data;
|
||||
} else if (mode === 'fire-engine') {
|
||||
const response = await scrapWithFireEngine({ url: sitemapUrl });
|
||||
content = response.html;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Request failed for ${sitemapUrl}: ${error}`);
|
||||
|
||||
@ -23,7 +36,7 @@ export async function getLinksFromSitemap(
|
||||
if (root && root.sitemap) {
|
||||
for (const sitemap of root.sitemap) {
|
||||
if (sitemap.loc && sitemap.loc.length > 0) {
|
||||
await getLinksFromSitemap(sitemap.loc[0], allUrls);
|
||||
await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode });
|
||||
}
|
||||
}
|
||||
} else if (root && root.url) {
|
||||
|
Loading…
x
Reference in New Issue
Block a user