feat(sitemap): switch around engine order

This commit is contained in:
Gergő Móricz 2025-01-22 19:10:27 +01:00
parent 5a039e7b64
commit ed929221ab

View File

@@ -1,4 +1,3 @@
import axios from "axios";
import { axiosTimeout } from "../../lib/timeout"; import { axiosTimeout } from "../../lib/timeout";
import { parseStringPromise } from "xml2js"; import { parseStringPromise } from "xml2js";
import { WebCrawler } from "./crawler"; import { WebCrawler } from "./crawler";
@@ -24,44 +23,58 @@ export async function getLinksFromSitemap(
let content: string = ""; let content: string = "";
try { try {
if (mode === "fire-engine" && useFireEngine) { if (mode === "fire-engine" && useFireEngine) {
// Try TLS client first const fetchResponse = await scrapeURL(
const tlsResponse = await scrapeURL(
"sitemap", "sitemap",
sitemapUrl, sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }), scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true }, { forceEngine: "fetch" },
); );
if (tlsResponse.success) { if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
content = tlsResponse.document.rawHtml!; content = fetchResponse.document.rawHtml!;
} else { } else {
logger.debug( logger.debug(
"Failed to scrape sitemap via TLSClient, trying Chrome CDP...", "Failed to scrape sitemap via fetch, falling back to TLSClient...",
{ error: tlsResponse.error }, { error: fetchResponse.success ? fetchResponse.document : fetchResponse.error },
); );
// Try Chrome CDP next const tlsResponse = await scrapeURL(
const cdpResponse = await scrapeURL(
"sitemap", "sitemap",
sitemapUrl, sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }), scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fire-engine;chrome-cdp" }, { forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
); );
if (cdpResponse.success) { if (tlsResponse.success && (tlsResponse.document.metadata.statusCode >= 200 && tlsResponse.document.metadata.statusCode < 300)) {
content = cdpResponse.document.rawHtml!; content = tlsResponse.document.rawHtml!;
} else { } else {
logger.debug( logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
"Failed to scrape sitemap via Chrome CDP, falling back to axios...", method: "getLinksFromSitemap",
{ error: cdpResponse.error }, mode,
); sitemapUrl,
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout }); error: tlsResponse.success ? tlsResponse.document : tlsResponse.error,
content = ar.data; });
return 0;
} }
} }
} else { } else {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); const fetchResponse = await scrapeURL(
content = response.data; "sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fetch" },
);
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
content = fetchResponse.document.rawHtml!;
} else {
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
method: "getLinksFromSitemap",
mode,
sitemapUrl,
});
return 0;
}
} }
} catch (error) { } catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, { logger.error(`Request failed for ${sitemapUrl}`, {
@@ -151,11 +164,15 @@ export const fetchSitemapData = async (
): Promise<SitemapEntry[] | null> => { ): Promise<SitemapEntry[] | null> => {
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`; const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
try { try {
const response = await axios.get(sitemapUrl, { const fetchResponse = await scrapeURL(
timeout: timeout || axiosTimeout, "sitemap",
}); sitemapUrl,
if (response.status === 200) { scrapeOptions.parse({ formats: ["rawHtml"], timeout: timeout || axiosTimeout }),
const xml = response.data; { forceEngine: "fetch" },
);
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
const xml = fetchResponse.document.rawHtml!;
const parsedXml = await parseStringPromise(xml); const parsedXml = await parseStringPromise(xml);
const sitemapData: SitemapEntry[] = []; const sitemapData: SitemapEntry[] = [];