Add fire-engine fallback for fetching sitemaps when the axios request fails

This commit is contained in:
rafaelsideguide 2024-07-09 16:07:53 -03:00
parent fcc67a3c9e
commit 9ad06fdf56
3 changed files with 29 additions and 10 deletions

View File

@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url";
import robotsParser from "robots-parser";
import { getURLDepth } from "./utils/maxDepthUtils";
import { axiosTimeout } from "../../../src/lib/timeout";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
export class WebCrawler {
private initialUrl: string;
@ -448,10 +449,14 @@ export class WebCrawler {
try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(sitemapUrl);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
}
} catch (error) {
console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
if (response) {
sitemapLinks = response;
}
} catch (error) {
console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
}
if (sitemapLinks.length === 0) {
@ -459,10 +464,11 @@ export class WebCrawler {
try {
const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
if (response.status === 200) {
sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
}
} catch (error) {
console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
}
}

View File

@ -218,7 +218,7 @@ export class WebScraperDataProvider {
private async handleSitemapMode(
inProgress?: (progress: Progress) => void
): Promise<Document[]> {
let links = await getLinksFromSitemap(this.urls[0]);
let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
links = await this.cleanIrrelevantPath(links);
if (this.returnOnlyUrls) {

View File

@ -1,16 +1,29 @@
import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js";
import { scrapWithFireEngine } from "./scrapers/fireEngine";
export async function getLinksFromSitemap(
sitemapUrl: string,
allUrls: string[] = []
{
sitemapUrl,
allUrls = [],
mode = 'axios'
}: {
sitemapUrl: string,
allUrls?: string[],
mode?: 'axios' | 'fire-engine'
}
): Promise<string[]> {
try {
let content: string;
try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
if (mode === 'axios') {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === 'fire-engine') {
const response = await scrapWithFireEngine({ url: sitemapUrl });
content = response.html;
}
} catch (error) {
console.error(`Request failed for ${sitemapUrl}: ${error}`);
@ -23,7 +36,7 @@ export async function getLinksFromSitemap(
if (root && root.sitemap) {
for (const sitemap of root.sitemap) {
if (sitemap.loc && sitemap.loc.length > 0) {
await getLinksFromSitemap(sitemap.loc[0], allUrls);
await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode });
}
}
} else if (root && root.url) {