diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 0cb50808..643b274e 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -536,8 +536,6 @@ export class WebCrawler { ? url : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`; - this.sitemapsHit.add(sitemapUrl); - let sitemapCount: number = 0; // Try to get sitemap from the provided URL first diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index bb989077..b6b132aa 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -36,86 +36,41 @@ export async function getLinksFromSitemap( try { let content: string = ""; try { - if (mode === "fire-engine" && useFireEngine) { - const fetchResponse = await scrapeURL( - "sitemap;" + crawlId, - sitemapUrl, - scrapeOptions.parse({ formats: ["rawHtml"] }), - { forceEngine: "fetch" }, - ); + const response = await scrapeURL( + "sitemap;" + crawlId, + sitemapUrl, + scrapeOptions.parse({ formats: ["rawHtml"] }), + { + forceEngine: [ + "fetch", + ...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []), + ], + v0DisableJsDom: true + }, + ); - if ( - fetchResponse.success && - fetchResponse.document.metadata.statusCode >= 200 && - fetchResponse.document.metadata.statusCode < 300 - ) { - content = fetchResponse.document.rawHtml!; - } else { - logger.debug( - "Failed to scrape sitemap via fetch, falling back to TLSClient...", - { - error: fetchResponse.success - ? fetchResponse.document - : fetchResponse.error, - }, - ); - - const tlsResponse = await scrapeURL( - "sitemap", - sitemapUrl, - scrapeOptions.parse({ formats: ["rawHtml"] }), - { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, - ); - - if ( - tlsResponse.success && - tlsResponse.document.metadata.statusCode >= 200 && - tlsResponse.document.metadata.statusCode < 300 - ) { - content = tlsResponse.document.rawHtml!; - } else { - logger.error( - `Request failed for ${sitemapUrl}, ran out of engines!`, - { - method: "getLinksFromSitemap", - mode, - sitemapUrl, - error: tlsResponse.success - ? tlsResponse.document - : tlsResponse.error, - }, - ); - return 0; - } - } + if ( + response.success && + response.document.metadata.statusCode >= 200 && + response.document.metadata.statusCode < 300 + ) { + content = response.document.rawHtml!; } else { - const fetchResponse = await scrapeURL( - "sitemap;" + crawlId, - sitemapUrl, - scrapeOptions.parse({ formats: ["rawHtml"] }), - { forceEngine: "fetch" }, + logger.error( + `Request failed for sitemap fetch`, + { + method: "getLinksFromSitemap", + mode, + sitemapUrl, + error: response.success + ? response.document + : response.error, + }, ); - - if ( - fetchResponse.success && - fetchResponse.document.metadata.statusCode >= 200 && - fetchResponse.document.metadata.statusCode < 300 - ) { - content = fetchResponse.document.rawHtml!; - } else { - logger.error( - `Request failed for ${sitemapUrl}, ran out of engines!`, - { - method: "getLinksFromSitemap", - mode, - sitemapUrl, - }, - ); - return 0; - } + return 0; } } catch (error) { - logger.error(`Request failed for ${sitemapUrl}`, { + logger.error(`Request failed for sitemap fetch`, { method: "getLinksFromSitemap", mode, sitemapUrl, diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 12a5e6e4..896e177b 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -298,10 +298,12 @@ export function buildFallbackList(meta: Meta): { engine: Engine; unsupportedFeatures: Set; }[] { + const _engines = [...engines]; + if (meta.internalOptions.useCache !== true) { - const cacheIndex = engines.indexOf("cache"); + const cacheIndex = _engines.indexOf("cache"); if (cacheIndex !== -1) { - engines.splice(cacheIndex, 1); + _engines.splice(cacheIndex, 1); } } else { meta.logger.debug("Cache engine enabled by useCache option"); @@ -319,8 +321,8 @@ export function buildFallbackList(meta: Meta): { const currentEngines = meta.internalOptions.forceEngine !== undefined - ? [meta.internalOptions.forceEngine] - : engines; + ? (Array.isArray(meta.internalOptions.forceEngine) ? meta.internalOptions.forceEngine : [meta.internalOptions.forceEngine]) + : _engines; for (const engine of currentEngines) { const supportedFlags = new Set([ @@ -371,11 +373,13 @@ export function buildFallbackList(meta: Meta): { ); } - selectedEngines.sort( - (a, b) => - b.supportScore - a.supportScore || - engineOptions[b.engine].quality - engineOptions[a.engine].quality, - ); + if (meta.internalOptions.forceEngine === undefined) { // retain force engine order + selectedEngines.sort( + (a, b) => + b.supportScore - a.supportScore || + engineOptions[b.engine].quality - engineOptions[a.engine].quality, + ); + } return selectedEngines; } diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index a657c4c4..f95d199f 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -152,7 +152,7 @@ async function buildMetaObject( export type InternalOptions = { priority?: number; // Passed along to fire-engine - forceEngine?: Engine; + forceEngine?: Engine | Engine[]; atsv?: boolean; // anti-bot solver, beta v0CrawlOnlyUrls?: boolean;