mirror of https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 11:05:59 +08:00
Merge pull request #386 from mendableai/feat/fire-engine-fallback-for-sitemap
[Feat] Added fire-engine fallback for getting sitemaps
This commit is contained in: commit 961b27811d
@@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
+import { scrapWithFireEngine } from "./scrapers/fireEngine";

 export class WebCrawler {
   private initialUrl: string;
@@ -448,10 +449,14 @@ export class WebCrawler {
     try {
       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
-        sitemapLinks = await getLinksFromSitemap(sitemapUrl);
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
       }
     } catch (error) {
-      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+      console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
+      const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
+      if (response) {
+        sitemapLinks = response;
+      }
     }

     if (sitemapLinks.length === 0) {
@@ -459,10 +464,11 @@ export class WebCrawler {
       try {
         const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
-          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
+          sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
         }
       } catch (error) {
         console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
       }
     }

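Taken together, the two crawler hunks above turn sitemap fetching into a two-stage attempt: a plain axios request first, with fire-engine used only when that request throws. A minimal sketch of that flow, assuming the object-style getLinksFromSitemap signature introduced later in this PR; the wrapper function name and the import paths are illustrative, not code from the repository:

import axios from "axios";
import { axiosTimeout } from "../../lib/timeout"; // assumed path, as in the sitemap hunk below
import { getLinksFromSitemap } from "./sitemap";   // assumed module path

// Sketch: try the cheap HTTP fetch first, fall back to fire-engine on failure.
async function fetchSitemapLinksWithFallback(sitemapUrl: string): Promise<string[]> {
  let sitemapLinks: string[] = [];
  try {
    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
    if (response.status === 200) {
      sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
    }
  } catch (error) {
    console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
    // Fallback: let fire-engine fetch the sitemap instead of axios.
    const fallback = await getLinksFromSitemap({ sitemapUrl, mode: "fire-engine" });
    if (fallback) {
      sitemapLinks = fallback;
    }
  }
  return sitemapLinks;
}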
@@ -218,7 +218,7 @@ export class WebScraperDataProvider {
   private async handleSitemapMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-    let links = await getLinksFromSitemap(this.urls[0]);
+    let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
     links = await this.cleanIrrelevantPath(links);

     if (this.returnOnlyUrls) {
@@ -1,16 +1,29 @@
 import axios from "axios";
 import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
+import { scrapWithFireEngine } from "./scrapers/fireEngine";

 export async function getLinksFromSitemap(
-  sitemapUrl: string,
-  allUrls: string[] = []
+  {
+    sitemapUrl,
+    allUrls = [],
+    mode = 'axios'
+  }: {
+    sitemapUrl: string,
+    allUrls?: string[],
+    mode?: 'axios' | 'fire-engine'
+  }
 ): Promise<string[]> {
   try {
     let content: string;
     try {
+      if (mode === 'axios') {
        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
        content = response.data;
+      } else if (mode === 'fire-engine') {
+        const response = await scrapWithFireEngine({ url: sitemapUrl });
+        content = response.html;
+      }
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);

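With this hunk, getLinksFromSitemap takes a single options object instead of positional arguments, and mode defaults to 'axios'. A short usage sketch under those assumptions (the example URL and the surrounding function are illustrative):

import { getLinksFromSitemap } from "./sitemap"; // assumed module path

async function sitemapExample() {
  // Default mode: fetch the sitemap XML with axios.
  const links = await getLinksFromSitemap({
    sitemapUrl: "https://example.com/sitemap.xml",
  });

  // Explicit fallback mode: fetch the same sitemap through fire-engine.
  const fallbackLinks = await getLinksFromSitemap({
    sitemapUrl: "https://example.com/sitemap.xml",
    mode: "fire-engine",
  });

  console.log(links.length, fallbackLinks.length);
}

One consequence of the two-branch dispatch is that content stays unassigned if any other mode value were ever passed; the 'axios' | 'fire-engine' union type is what keeps callers on the two supported paths.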
@@ -23,7 +36,7 @@ export async function getLinksFromSitemap(
     if (root && root.sitemap) {
       for (const sitemap of root.sitemap) {
         if (sitemap.loc && sitemap.loc.length > 0) {
-          await getLinksFromSitemap(sitemap.loc[0], allUrls);
+          await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode });
         }
       }
     } else if (root && root.url) {
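The recursive call now forwards mode as well, so nested sitemap indexes are fetched with the same transport as the top-level sitemap. A compressed sketch of that traversal, assuming xml2js's default array-per-element output and a generic fetcher standing in for the axios / fire-engine branch above (the helper name and the fetchContent parameter are illustrative):

import { parseStringPromise } from "xml2js";

// Sketch: walk a sitemap (or sitemap index) recursively, accumulating <loc> entries.
// fetchContent stands in for the mode-dependent axios / fire-engine fetch shown above.
async function collectSitemapUrls(
  sitemapUrl: string,
  fetchContent: (url: string) => Promise<string>,
  allUrls: string[] = []
): Promise<string[]> {
  const xml = await fetchContent(sitemapUrl);
  const parsed = await parseStringPromise(xml);
  const root = parsed.urlset || parsed.sitemapindex; // assumed shape, matching the root checks above

  if (root && root.sitemap) {
    // Sitemap index: recurse into each child sitemap with the same fetcher.
    for (const sitemap of root.sitemap) {
      if (sitemap.loc && sitemap.loc.length > 0) {
        await collectSitemapUrls(sitemap.loc[0], fetchContent, allUrls);
      }
    }
  } else if (root && root.url) {
    // Leaf sitemap: collect the page URLs themselves.
    for (const url of root.url) {
      if (url.loc && url.loc.length > 0) {
        allUrls.push(url.loc[0]);
      }
    }
  }

  return allUrls;
}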