Merge pull request #386 from mendableai/feat/fire-engine-fallback-for-sitemap

[Feat] Added fire-engine fallback for getting sitemaps
This commit is contained in:
Nicolas 2024-07-11 20:38:01 -04:00 committed by GitHub
commit 961b27811d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 29 additions and 10 deletions

View File

@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser"; import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils"; import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout"; import { axiosTimeout } from "../../../src/lib/timeout";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
export class WebCrawler { export class WebCrawler {
private initialUrl: string; private initialUrl: string;
@ -448,10 +449,14 @@ export class WebCrawler {
try { try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
if (response.status === 200) { if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(sitemapUrl); sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
} }
} catch (error) { } catch (error) {
console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`); console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) {
sitemapLinks = response;
}
} }
if (sitemapLinks.length === 0) { if (sitemapLinks.length === 0) {
@ -459,10 +464,11 @@ export class WebCrawler {
try { try {
const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
if (response.status === 200) { if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(baseUrlSitemap); sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
} }
} catch (error) { } catch (error) {
console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
} }
} }

View File

@ -218,7 +218,7 @@ export class WebScraperDataProvider {
private async handleSitemapMode( private async handleSitemapMode(
inProgress?: (progress: Progress) => void inProgress?: (progress: Progress) => void
): Promise<Document[]> { ): Promise<Document[]> {
let links = await getLinksFromSitemap(this.urls[0]); let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
links = await this.cleanIrrelevantPath(links); links = await this.cleanIrrelevantPath(links);
if (this.returnOnlyUrls) { if (this.returnOnlyUrls) {

View File

@ -1,16 +1,29 @@
import axios from "axios"; import axios from "axios";
import { axiosTimeout } from "../../lib/timeout"; import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js"; import { parseStringPromise } from "xml2js";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
export async function getLinksFromSitemap( export async function getLinksFromSitemap(
{
sitemapUrl,
allUrls = [],
mode = 'axios'
}: {
sitemapUrl: string, sitemapUrl: string,
allUrls: string[] = [] allUrls?: string[],
mode?: 'axios' | 'fire-engine'
}
): Promise<string[]> { ): Promise<string[]> {
try { try {
let content: string; let content: string;
try { try {
if (mode === 'axios') {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data; content = response.data;
} else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl });
content = response.html;
}
} catch (error) { } catch (error) {
console.error(`Request failed for ${sitemapUrl}: ${error}`); console.error(`Request failed for ${sitemapUrl}: ${error}`);
@ -23,7 +36,7 @@ export async function getLinksFromSitemap(
if (root && root.sitemap) { if (root && root.sitemap) {
for (const sitemap of root.sitemap) { for (const sitemap of root.sitemap) {
if (sitemap.loc && sitemap.loc.length > 0) { if (sitemap.loc && sitemap.loc.length > 0) {
await getLinksFromSitemap(sitemap.loc[0], allUrls); await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode });
} }
} }
} else if (root && root.url) { } else if (root && root.url) {