mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 09:38:59 +08:00
fix(sitemap): better ordering
This commit is contained in:
parent
3761eb17a7
commit
bee2b2873e
@ -536,8 +536,6 @@ export class WebCrawler {
|
|||||||
? url
|
? url
|
||||||
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
: `${url}${url.endsWith("/") ? "" : "/"}sitemap.xml`;
|
||||||
|
|
||||||
this.sitemapsHit.add(sitemapUrl);
|
|
||||||
|
|
||||||
let sitemapCount: number = 0;
|
let sitemapCount: number = 0;
|
||||||
|
|
||||||
// Try to get sitemap from the provided URL first
|
// Try to get sitemap from the provided URL first
|
||||||
|
@ -36,86 +36,41 @@ export async function getLinksFromSitemap(
|
|||||||
try {
|
try {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
try {
|
try {
|
||||||
if (mode === "fire-engine" && useFireEngine) {
|
const response = await scrapeURL(
|
||||||
const fetchResponse = await scrapeURL(
|
"sitemap;" + crawlId,
|
||||||
"sitemap;" + crawlId,
|
sitemapUrl,
|
||||||
sitemapUrl,
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
{
|
||||||
{ forceEngine: "fetch" },
|
forceEngine: [
|
||||||
);
|
"fetch",
|
||||||
|
...((mode === "fire-engine" && useFireEngine) ? ["fire-engine;tlsclient" as const] : []),
|
||||||
|
],
|
||||||
|
v0DisableJsDom: true
|
||||||
|
},
|
||||||
|
);
|
||||||
|
|
||||||
if (
|
if (
|
||||||
fetchResponse.success &&
|
response.success &&
|
||||||
fetchResponse.document.metadata.statusCode >= 200 &&
|
response.document.metadata.statusCode >= 200 &&
|
||||||
fetchResponse.document.metadata.statusCode < 300
|
response.document.metadata.statusCode < 300
|
||||||
) {
|
) {
|
||||||
content = fetchResponse.document.rawHtml!;
|
content = response.document.rawHtml!;
|
||||||
} else {
|
|
||||||
logger.debug(
|
|
||||||
"Failed to scrape sitemap via fetch, falling back to TLSClient...",
|
|
||||||
{
|
|
||||||
error: fetchResponse.success
|
|
||||||
? fetchResponse.document
|
|
||||||
: fetchResponse.error,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
const tlsResponse = await scrapeURL(
|
|
||||||
"sitemap",
|
|
||||||
sitemapUrl,
|
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
|
||||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
|
||||||
);
|
|
||||||
|
|
||||||
if (
|
|
||||||
tlsResponse.success &&
|
|
||||||
tlsResponse.document.metadata.statusCode >= 200 &&
|
|
||||||
tlsResponse.document.metadata.statusCode < 300
|
|
||||||
) {
|
|
||||||
content = tlsResponse.document.rawHtml!;
|
|
||||||
} else {
|
|
||||||
logger.error(
|
|
||||||
`Request failed for ${sitemapUrl}, ran out of engines!`,
|
|
||||||
{
|
|
||||||
method: "getLinksFromSitemap",
|
|
||||||
mode,
|
|
||||||
sitemapUrl,
|
|
||||||
error: tlsResponse.success
|
|
||||||
? tlsResponse.document
|
|
||||||
: tlsResponse.error,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
const fetchResponse = await scrapeURL(
|
logger.error(
|
||||||
"sitemap;" + crawlId,
|
`Request failed for sitemap fetch`,
|
||||||
sitemapUrl,
|
{
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
method: "getLinksFromSitemap",
|
||||||
{ forceEngine: "fetch" },
|
mode,
|
||||||
|
sitemapUrl,
|
||||||
|
error: response.success
|
||||||
|
? response.document
|
||||||
|
: response.error,
|
||||||
|
},
|
||||||
);
|
);
|
||||||
|
return 0;
|
||||||
if (
|
|
||||||
fetchResponse.success &&
|
|
||||||
fetchResponse.document.metadata.statusCode >= 200 &&
|
|
||||||
fetchResponse.document.metadata.statusCode < 300
|
|
||||||
) {
|
|
||||||
content = fetchResponse.document.rawHtml!;
|
|
||||||
} else {
|
|
||||||
logger.error(
|
|
||||||
`Request failed for ${sitemapUrl}, ran out of engines!`,
|
|
||||||
{
|
|
||||||
method: "getLinksFromSitemap",
|
|
||||||
mode,
|
|
||||||
sitemapUrl,
|
|
||||||
},
|
|
||||||
);
|
|
||||||
return 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Request failed for ${sitemapUrl}`, {
|
logger.error(`Request failed for sitemap fetch`, {
|
||||||
method: "getLinksFromSitemap",
|
method: "getLinksFromSitemap",
|
||||||
mode,
|
mode,
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
|
@ -298,10 +298,12 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
engine: Engine;
|
engine: Engine;
|
||||||
unsupportedFeatures: Set<FeatureFlag>;
|
unsupportedFeatures: Set<FeatureFlag>;
|
||||||
}[] {
|
}[] {
|
||||||
|
const _engines = [...engines];
|
||||||
|
|
||||||
if (meta.internalOptions.useCache !== true) {
|
if (meta.internalOptions.useCache !== true) {
|
||||||
const cacheIndex = engines.indexOf("cache");
|
const cacheIndex = _engines.indexOf("cache");
|
||||||
if (cacheIndex !== -1) {
|
if (cacheIndex !== -1) {
|
||||||
engines.splice(cacheIndex, 1);
|
_engines.splice(cacheIndex, 1);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
meta.logger.debug("Cache engine enabled by useCache option");
|
meta.logger.debug("Cache engine enabled by useCache option");
|
||||||
@ -319,8 +321,8 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
|
|
||||||
const currentEngines =
|
const currentEngines =
|
||||||
meta.internalOptions.forceEngine !== undefined
|
meta.internalOptions.forceEngine !== undefined
|
||||||
? [meta.internalOptions.forceEngine]
|
? (Array.isArray(meta.internalOptions.forceEngine) ? meta.internalOptions.forceEngine : [meta.internalOptions.forceEngine])
|
||||||
: engines;
|
: _engines;
|
||||||
|
|
||||||
for (const engine of currentEngines) {
|
for (const engine of currentEngines) {
|
||||||
const supportedFlags = new Set([
|
const supportedFlags = new Set([
|
||||||
@ -371,11 +373,13 @@ export function buildFallbackList(meta: Meta): {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
selectedEngines.sort(
|
if (meta.internalOptions.forceEngine === undefined) { // retain force engine order
|
||||||
(a, b) =>
|
selectedEngines.sort(
|
||||||
b.supportScore - a.supportScore ||
|
(a, b) =>
|
||||||
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
|
b.supportScore - a.supportScore ||
|
||||||
);
|
engineOptions[b.engine].quality - engineOptions[a.engine].quality,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return selectedEngines;
|
return selectedEngines;
|
||||||
}
|
}
|
||||||
|
@ -152,7 +152,7 @@ async function buildMetaObject(
|
|||||||
|
|
||||||
export type InternalOptions = {
|
export type InternalOptions = {
|
||||||
priority?: number; // Passed along to fire-engine
|
priority?: number; // Passed along to fire-engine
|
||||||
forceEngine?: Engine;
|
forceEngine?: Engine | Engine[];
|
||||||
atsv?: boolean; // anti-bot solver, beta
|
atsv?: boolean; // anti-bot solver, beta
|
||||||
|
|
||||||
v0CrawlOnlyUrls?: boolean;
|
v0CrawlOnlyUrls?: boolean;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user