mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:19:03 +08:00
fix(sitemap): better ordering
This commit is contained in:
parent
3761eb17a7
commit
bee2b2873e
@@ -536,8 +536,6 @@ export class WebCrawler {
|
||||
? url
|
||||
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
||||
|
||||
this.sitemapsHit.add(sitemapUrl);
|
||||
|
||||
let sitemapCount: number = 0;
|
||||
|
||||
// Try to get sitemap from the provided URL first
|
||||
|
@@ -36,86 +36,41 @@ export async function getLinksFromSitemap(
|
||||
try {
|
||||
let content: string = "";
|
||||
try {
|
||||
if (mode === "fire-engine" && useFireEngine) {
|
||||
const fetchResponse = await scrapeURL(
|
||||
"sitemap;" + crawlId,
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{ forceEngine: "fetch" },
|
||||
);
|
||||
const response = await scrapeURL(
|
||||
"sitemap;" + crawlId,
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{
|
||||
forceEngine: [
|
||||
"fetch",
|
||||
...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []),
|
||||
],
|
||||
v0DisableJsDom: true
|
||||
},
|
||||
);
|
||||
|
||||
if (
|
||||
fetchResponse.success &&
|
||||
fetchResponse.document.metadata.statusCode >= 200 &&
|
||||
fetchResponse.document.metadata.statusCode < 300
|
||||
) {
|
||||
content = fetchResponse.document.rawHtml!;
|
||||
} else {
|
||||
logger.debug(
|
||||
"Failed to scrape sitemap via fetch, falling back to TLSClient...",
|
||||
{
|
||||
error: fetchResponse.success
|
||||
? fetchResponse.document
|
||||
: fetchResponse.error,
|
||||
},
|
||||
);
|
||||
|
||||
const tlsResponse = await scrapeURL(
|
||||
"sitemap",
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
||||
);
|
||||
|
||||
if (
|
||||
tlsResponse.success &&
|
||||
tlsResponse.document.metadata.statusCode >= 200 &&
|
||||
tlsResponse.document.metadata.statusCode < 300
|
||||
) {
|
||||
content = tlsResponse.document.rawHtml!;
|
||||
} else {
|
||||
logger.error(
|
||||
`Request failed for ${sitemapUrl}, ran out of engines!`,
|
||||
{
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
error: tlsResponse.success
|
||||
? tlsResponse.document
|
||||
: tlsResponse.error,
|
||||
},
|
||||
);
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
if (
|
||||
response.success &&
|
||||
response.document.metadata.statusCode >= 200 &&
|
||||
response.document.metadata.statusCode < 300
|
||||
) {
|
||||
content = response.document.rawHtml!;
|
||||
} else {
|
||||
const fetchResponse = await scrapeURL(
|
||||
"sitemap;" + crawlId,
|
||||
sitemapUrl,
|
||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||
{ forceEngine: "fetch" },
|
||||
logger.error(
|
||||
`Request failed for sitemap fetch`,
|
||||
{
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
error: response.success
|
||||
? response.document
|
||||
: response.error,
|
||||
},
|
||||
);
|
||||
|
||||
if (
|
||||
fetchResponse.success &&
|
||||
fetchResponse.document.metadata.statusCode >= 200 &&
|
||||
fetchResponse.document.metadata.statusCode < 300
|
||||
) {
|
||||
content = fetchResponse.document.rawHtml!;
|
||||
} else {
|
||||
logger.error(
|
||||
`Request failed for ${sitemapUrl}, ran out of engines!`,
|
||||
{
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
},
|
||||
);
|
||||
return 0;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
} catch (error) {
|
||||
logger.error(`Request failed for ${sitemapUrl}`, {
|
||||
logger.error(`Request failed for sitemap fetch`, {
|
||||
method: "getLinksFromSitemap",
|
||||
mode,
|
||||
sitemapUrl,
|
||||
|
@@ -298,10 +298,12 @@ export function buildFallbackList(meta: Meta): {
|
||||
engine: Engine;
|
||||
unsupportedFeatures: Set<FeatureFlag>;
|
||||
}[] {
|
||||
const _engines = [...engines];
|
||||
|
||||
if (meta.internalOptions.useCache !== true) {
|
||||
const cacheIndex = engines.indexOf("cache");
|
||||
const cacheIndex = _engines.indexOf("cache");
|
||||
if (cacheIndex !== -1) {
|
||||
engines.splice(cacheIndex, 1);
|
||||
_engines.splice(cacheIndex, 1);
|
||||
}
|
||||
} else {
|
||||
meta.logger.debug("Cache engine enabled by useCache option");
|
||||
@@ -319,8 +321,8 @@ export function buildFallbackList(meta: Meta): {
|
||||
|
||||
const currentEngines =
|
||||
meta.internalOptions.forceEngine !== undefined
|
||||
? [meta.internalOptions.forceEngine]
|
||||
: engines;
|
||||
? (Array.isArray(meta.internalOptions.forceEngine) ? meta.internalOptions.forceEngine : [meta.internalOptions.forceEngine])
|
||||
: _engines;
|
||||
|
||||
for (const engine of currentEngines) {
|
||||
const supportedFlags = new Set([
|
||||
@@ -371,11 +373,13 @@ export function buildFallbackList(meta: Meta): {
|
||||
);
|
||||
}
|
||||
|
||||
selectedEngines.sort(
|
||||
(a, b) =>
|
||||
b.supportScore - a.supportScore ||
|
||||
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
|
||||
);
|
||||
if (meta.internalOptions.forceEngine === undefined) { // retain force engine order
|
||||
selectedEngines.sort(
|
||||
(a, b) =>
|
||||
b.supportScore - a.supportScore ||
|
||||
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
|
||||
);
|
||||
}
|
||||
|
||||
return selectedEngines;
|
||||
}
|
||||
|
@@ -152,7 +152,7 @@ async function buildMetaObject(
|
||||
|
||||
export type InternalOptions = {
|
||||
priority?: number; // Passed along to fire-engine
|
||||
forceEngine?: Engine;
|
||||
forceEngine?: Engine | Engine[];
|
||||
atsv?: boolean; // anti-bot solver, beta
|
||||
|
||||
v0CrawlOnlyUrls?: boolean;
|
||||
|
Loading…
x
Reference in New Issue
Block a user