diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 99fff9e4..acaa432e 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -8,6 +8,7 @@ import { scrapSingleUrl } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
 import { axiosTimeout } from "../../../src/lib/timeout";
+import { scrapWithFireEngine } from "./scrapers/fireEngine";
 
 export class WebCrawler {
   private initialUrl: string;
@@ -448,10 +449,14 @@ export class WebCrawler {
     try {
       const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
-        sitemapLinks = await getLinksFromSitemap(sitemapUrl);
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl });
+      }
+    } catch (error) {
+      console.error(`Failed to fetch sitemap with axios from ${sitemapUrl}: ${error}`);
+      const response = await getLinksFromSitemap({ sitemapUrl, mode: 'fire-engine' });
+      if (response) {
+        sitemapLinks = response;
       }
-    } catch (error) {
-      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }
 
     if (sitemapLinks.length === 0) {
@@ -459,10 +464,11 @@ export class WebCrawler {
       try {
         const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
-          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
+          sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap });
         }
       } catch (error) {
         console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' });
       }
     }
 
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 8b7de28a..87b5aee7 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -218,7 +218,7 @@ export class WebScraperDataProvider {
   private async handleSitemapMode(
     inProgress?: (progress: Progress) => void
   ): Promise<Document[]> {
-    let links = await getLinksFromSitemap(this.urls[0]);
+    let links = await getLinksFromSitemap({ sitemapUrl: this.urls[0] });
     links = await this.cleanIrrelevantPath(links);
 
     if (this.returnOnlyUrls) {
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 3f563471..6d1d28be 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -1,16 +1,29 @@
 import axios from "axios";
 import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
+import { scrapWithFireEngine } from "./scrapers/fireEngine";
 
 export async function getLinksFromSitemap(
-  sitemapUrl: string,
-  allUrls: string[] = []
+  {
+    sitemapUrl,
+    allUrls = [],
+    mode = 'axios'
+  }: {
+    sitemapUrl: string,
+    allUrls?: string[],
+    mode?: 'axios' | 'fire-engine'
+  }
 ): Promise<string[]> {
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
-      content = response.data;
+      if (mode === 'axios') {
+        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
+        content = response.data;
+      } else if (mode === 'fire-engine') {
+        const response = await scrapWithFireEngine({ url: sitemapUrl });
+        content = response.html;
+      }
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
 
@@ -23,7 +36,7 @@ export async function getLinksFromSitemap(
     if (root && root.sitemap) {
       for (const sitemap of root.sitemap) {
         if (sitemap.loc && sitemap.loc.length > 0) {
-          await getLinksFromSitemap(sitemap.loc[0], allUrls);
+          await getLinksFromSitemap({ sitemapUrl: sitemap.loc[0], allUrls, mode });
         }
       }
     } else if (root && root.url) {