fix(sitemap): better ordering

This commit is contained in:
Móricz Gergő 2025-01-23 08:58:18 +01:00
parent 3761eb17a7
commit bee2b2873e
4 changed files with 44 additions and 87 deletions

View File

@ -536,8 +536,6 @@ export class WebCrawler {
? url ? url
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`; : `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
this.sitemapsHit.add(sitemapUrl);
let sitemapCount: number = 0; let sitemapCount: number = 0;
// Try to get sitemap from the provided URL first // Try to get sitemap from the provided URL first

View File

@ -36,86 +36,41 @@ export async function getLinksFromSitemap(
try { try {
let content: string = ""; let content: string = "";
try { try {
if (mode === "fire-engine" && useFireEngine) { const response = await scrapeURL(
const fetchResponse = await scrapeURL( "sitemap;" + crawlId,
"sitemap;" + crawlId, sitemapUrl,
sitemapUrl, scrapeOptions.parse({ formats: ["rawHtml"] }),
scrapeOptions.parse({ formats: ["rawHtml"] }), {
{ forceEngine: "fetch" }, forceEngine: [
); "fetch",
...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []),
],
v0DisableJsDom: true
},
);
if ( if (
fetchResponse.success && response.success &&
fetchResponse.document.metadata.statusCode >= 200 && response.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300 response.document.metadata.statusCode < 300
) { ) {
content = fetchResponse.document.rawHtml!; content = response.document.rawHtml!;
} else {
logger.debug(
"Failed to scrape sitemap via fetch, falling back to TLSClient...",
{
error: fetchResponse.success
? fetchResponse.document
: fetchResponse.error,
},
);
const tlsResponse = await scrapeURL(
"sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (
tlsResponse.success &&
tlsResponse.document.metadata.statusCode >= 200 &&
tlsResponse.document.metadata.statusCode < 300
) {
content = tlsResponse.document.rawHtml!;
} else {
logger.error(
`Request failed for ${sitemapUrl}, ran out of engines!`,
{
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error: tlsResponse.success
? tlsResponse.document
: tlsResponse.error,
},
);
return 0;
}
}
} else { } else {
const fetchResponse = await scrapeURL( logger.error(
"sitemap;" + crawlId, `Request failed for sitemap fetch`,
sitemapUrl, {
scrapeOptions.parse({ formats: ["rawHtml"] }), method: "getLinksFromSitemap",
{ forceEngine: "fetch" }, mode,
sitemapUrl,
error: response.success
? response.document
: response.error,
},
); );
return 0;
if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
content = fetchResponse.document.rawHtml!;
} else {
logger.error(
`Request failed for ${sitemapUrl}, ran out of engines!`,
{
method: "getLinksFromSitemap",
mode,
sitemapUrl,
},
);
return 0;
}
} }
} catch (error) { } catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, { logger.error(`Request failed for sitemap fetch`, {
method: "getLinksFromSitemap", method: "getLinksFromSitemap",
mode, mode,
sitemapUrl, sitemapUrl,

View File

@ -298,10 +298,12 @@ export function buildFallbackList(meta: Meta): {
engine: Engine; engine: Engine;
unsupportedFeatures: Set<FeatureFlag>; unsupportedFeatures: Set<FeatureFlag>;
}[] { }[] {
const _engines = [...engines];
if (meta.internalOptions.useCache !== true) { if (meta.internalOptions.useCache !== true) {
const cacheIndex = engines.indexOf("cache"); const cacheIndex = _engines.indexOf("cache");
if (cacheIndex !== -1) { if (cacheIndex !== -1) {
engines.splice(cacheIndex, 1); _engines.splice(cacheIndex, 1);
} }
} else { } else {
meta.logger.debug("Cache engine enabled by useCache option"); meta.logger.debug("Cache engine enabled by useCache option");
@ -319,8 +321,8 @@ export function buildFallbackList(meta: Meta): {
const currentEngines = const currentEngines =
meta.internalOptions.forceEngine !== undefined meta.internalOptions.forceEngine !== undefined
? [meta.internalOptions.forceEngine] ? (Array.isArray(meta.internalOptions.forceEngine) ? meta.internalOptions.forceEngine : [meta.internalOptions.forceEngine])
: engines; : _engines;
for (const engine of currentEngines) { for (const engine of currentEngines) {
const supportedFlags = new Set([ const supportedFlags = new Set([
@ -371,11 +373,13 @@ export function buildFallbackList(meta: Meta): {
); );
} }
selectedEngines.sort( if (meta.internalOptions.forceEngine === undefined) { // retain force engine order
(a, b) => selectedEngines.sort(
b.supportScore - a.supportScore || (a, b) =>
engineOptions[b.engine].quality - engineOptions[a.engine].quality, b.supportScore - a.supportScore ||
); engineOptions[b.engine].quality - engineOptions[a.engine].quality,
);
}
return selectedEngines; return selectedEngines;
} }

View File

@ -152,7 +152,7 @@ async function buildMetaObject(
export type InternalOptions = { export type InternalOptions = {
priority?: number; // Passed along to fire-engine priority?: number; // Passed along to fire-engine
forceEngine?: Engine; forceEngine?: Engine | Engine[];
atsv?: boolean; // anti-bot solver, beta atsv?: boolean; // anti-bot solver, beta
v0CrawlOnlyUrls?: boolean; v0CrawlOnlyUrls?: boolean;