fix(sitemap): better ordering

This commit is contained in:
Móricz Gergő 2025-01-23 08:58:18 +01:00
parent 3761eb17a7
commit bee2b2873e
4 changed files with 44 additions and 87 deletions

View File

@ -536,8 +536,6 @@ export class WebCrawler {
? url
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
this.sitemapsHit.add(sitemapUrl);
let sitemapCount: number = 0;
// Try to get sitemap from the provided URL first

View File

@ -36,86 +36,41 @@ export async function getLinksFromSitemap(
try {
let content: string = "";
try {
if (mode === "fire-engine" && useFireEngine) {
const fetchResponse = await scrapeURL(
"sitemap;" + crawlId,
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fetch" },
);
const response = await scrapeURL(
"sitemap;" + crawlId,
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{
forceEngine: [
"fetch",
...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []),
],
v0DisableJsDom: true
},
);
if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
content = fetchResponse.document.rawHtml!;
} else {
logger.debug(
"Failed to scrape sitemap via fetch, falling back to TLSClient...",
{
error: fetchResponse.success
? fetchResponse.document
: fetchResponse.error,
},
);
const tlsResponse = await scrapeURL(
"sitemap",
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
);
if (
tlsResponse.success &&
tlsResponse.document.metadata.statusCode >= 200 &&
tlsResponse.document.metadata.statusCode < 300
) {
content = tlsResponse.document.rawHtml!;
} else {
logger.error(
`Request failed for ${sitemapUrl}, ran out of engines!`,
{
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error: tlsResponse.success
? tlsResponse.document
: tlsResponse.error,
},
);
return 0;
}
}
if (
response.success &&
response.document.metadata.statusCode >= 200 &&
response.document.metadata.statusCode < 300
) {
content = response.document.rawHtml!;
} else {
const fetchResponse = await scrapeURL(
"sitemap;" + crawlId,
sitemapUrl,
scrapeOptions.parse({ formats: ["rawHtml"] }),
{ forceEngine: "fetch" },
logger.error(
`Request failed for sitemap fetch`,
{
method: "getLinksFromSitemap",
mode,
sitemapUrl,
error: response.success
? response.document
: response.error,
},
);
if (
fetchResponse.success &&
fetchResponse.document.metadata.statusCode >= 200 &&
fetchResponse.document.metadata.statusCode < 300
) {
content = fetchResponse.document.rawHtml!;
} else {
logger.error(
`Request failed for ${sitemapUrl}, ran out of engines!`,
{
method: "getLinksFromSitemap",
mode,
sitemapUrl,
},
);
return 0;
}
return 0;
}
} catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, {
logger.error(`Request failed for sitemap fetch`, {
method: "getLinksFromSitemap",
mode,
sitemapUrl,

View File

@ -298,10 +298,12 @@ export function buildFallbackList(meta: Meta): {
engine: Engine;
unsupportedFeatures: Set<FeatureFlag>;
}[] {
const _engines = [...engines];
if (meta.internalOptions.useCache !== true) {
const cacheIndex = engines.indexOf("cache");
const cacheIndex = _engines.indexOf("cache");
if (cacheIndex !== -1) {
engines.splice(cacheIndex, 1);
_engines.splice(cacheIndex, 1);
}
} else {
meta.logger.debug("Cache engine enabled by useCache option");
@ -319,8 +321,8 @@ export function buildFallbackList(meta: Meta): {
const currentEngines =
meta.internalOptions.forceEngine !== undefined
? [meta.internalOptions.forceEngine]
: engines;
? (Array.isArray(meta.internalOptions.forceEngine) ? meta.internalOptions.forceEngine : [meta.internalOptions.forceEngine])
: _engines;
for (const engine of currentEngines) {
const supportedFlags = new Set([
@ -371,11 +373,13 @@ export function buildFallbackList(meta: Meta): {
);
}
selectedEngines.sort(
(a, b) =>
b.supportScore - a.supportScore ||
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
);
if (meta.internalOptions.forceEngine === undefined) { // retain force engine order
selectedEngines.sort(
(a, b) =>
b.supportScore - a.supportScore ||
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
);
}
return selectedEngines;
}

View File

@ -152,7 +152,7 @@ async function buildMetaObject(
export type InternalOptions = {
priority?: number; // Passed along to fire-engine
forceEngine?: Engine;
forceEngine?: Engine | Engine[];
atsv?: boolean; // anti-bot solver, beta
v0CrawlOnlyUrls?: boolean;