From 3761eb17a7e59d86fbe5bde59570dd02e9e6ec5d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 23 Jan 2025 08:43:13 +0100 Subject: [PATCH] feat(sitemap): reenable fallback to tlsclient --- apps/api/src/scraper/WebScraper/sitemap.ts | 53 ++++++++++------------ 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 7cb9e274..bb989077 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -51,45 +51,42 @@ export async function getLinksFromSitemap( ) { content = fetchResponse.document.rawHtml!; } else { - // logger.debug( - // "Failed to scrape sitemap via fetch, falling back to TLSClient...", - // { - // error: fetchResponse.success - // ? fetchResponse.document - // : fetchResponse.error, - // }, - // ); + logger.debug( + "Failed to scrape sitemap via fetch, falling back to TLSClient...", + { + error: fetchResponse.success + ? fetchResponse.document + : fetchResponse.error, + }, + ); - // const tlsResponse = await scrapeURL( - // "sitemap", - // sitemapUrl, - // scrapeOptions.parse({ formats: ["rawHtml"] }), - // { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, - // ); + const tlsResponse = await scrapeURL( + "sitemap", + sitemapUrl, + scrapeOptions.parse({ formats: ["rawHtml"] }), + { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, + ); - // if ( - // tlsResponse.success && - // tlsResponse.document.metadata.statusCode >= 200 && - // tlsResponse.document.metadata.statusCode < 300 - // ) { - // content = tlsResponse.document.rawHtml!; - // } else { + if ( + tlsResponse.success && + tlsResponse.document.metadata.statusCode >= 200 && + tlsResponse.document.metadata.statusCode < 300 + ) { + content = tlsResponse.document.rawHtml!; + } else { logger.error( `Request failed for ${sitemapUrl}, ran out of engines!`, { method: "getLinksFromSitemap", mode, sitemapUrl, - // error: tlsResponse.success - // ? tlsResponse.document - // : tlsResponse.error, - error: fetchResponse.success - ? fetchResponse.document - : fetchResponse.error, + error: tlsResponse.success + ? tlsResponse.document + : tlsResponse.error, }, ); return 0; - // } + } } } else { const fetchResponse = await scrapeURL(