mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 21:09:03 +08:00
feat(sitemap): switch around engine order
This commit is contained in:
parent
5a039e7b64
commit
ed929221ab
@ -1,4 +1,3 @@
|
|||||||
import axios from "axios";
|
|
||||||
import { axiosTimeout } from "../../lib/timeout";
|
import { axiosTimeout } from "../../lib/timeout";
|
||||||
import { parseStringPromise } from "xml2js";
|
import { parseStringPromise } from "xml2js";
|
||||||
import { WebCrawler } from "./crawler";
|
import { WebCrawler } from "./crawler";
|
||||||
@ -24,44 +23,58 @@ export async function getLinksFromSitemap(
|
|||||||
let content: string = "";
|
let content: string = "";
|
||||||
try {
|
try {
|
||||||
if (mode === "fire-engine" && useFireEngine) {
|
if (mode === "fire-engine" && useFireEngine) {
|
||||||
// Try TLS client first
|
const fetchResponse = await scrapeURL(
|
||||||
const tlsResponse = await scrapeURL(
|
|
||||||
"sitemap",
|
"sitemap",
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
{ forceEngine: "fetch" },
|
||||||
);
|
);
|
||||||
|
|
||||||
if (tlsResponse.success) {
|
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
|
||||||
content = tlsResponse.document.rawHtml!;
|
content = fetchResponse.document.rawHtml!;
|
||||||
} else {
|
} else {
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Failed to scrape sitemap via TLSClient, trying Chrome CDP...",
|
"Failed to scrape sitemap via fetch, falling back to TLSClient...",
|
||||||
{ error: tlsResponse.error },
|
{ error: fetchResponse.success ? fetchResponse.document : fetchResponse.error },
|
||||||
);
|
);
|
||||||
|
|
||||||
// Try Chrome CDP next
|
const tlsResponse = await scrapeURL(
|
||||||
const cdpResponse = await scrapeURL(
|
|
||||||
"sitemap",
|
"sitemap",
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
{ forceEngine: "fire-engine;chrome-cdp" },
|
{ forceEngine: "fire-engine;tlsclient", v0DisableJsDom: true },
|
||||||
);
|
);
|
||||||
|
|
||||||
if (cdpResponse.success) {
|
if (tlsResponse.success && (tlsResponse.document.metadata.statusCode >= 200 && tlsResponse.document.metadata.statusCode < 300)) {
|
||||||
content = cdpResponse.document.rawHtml!;
|
content = tlsResponse.document.rawHtml!;
|
||||||
} else {
|
} else {
|
||||||
logger.debug(
|
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
|
||||||
"Failed to scrape sitemap via Chrome CDP, falling back to axios...",
|
method: "getLinksFromSitemap",
|
||||||
{ error: cdpResponse.error },
|
mode,
|
||||||
);
|
sitemapUrl,
|
||||||
const ar = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
error: tlsResponse.success ? tlsResponse.document : tlsResponse.error,
|
||||||
content = ar.data;
|
});
|
||||||
|
return 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
const fetchResponse = await scrapeURL(
|
||||||
content = response.data;
|
"sitemap",
|
||||||
|
sitemapUrl,
|
||||||
|
scrapeOptions.parse({ formats: ["rawHtml"] }),
|
||||||
|
{ forceEngine: "fetch" },
|
||||||
|
);
|
||||||
|
|
||||||
|
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
|
||||||
|
content = fetchResponse.document.rawHtml!;
|
||||||
|
} else {
|
||||||
|
logger.error(`Request failed for ${sitemapUrl}, ran out of engines!`, {
|
||||||
|
method: "getLinksFromSitemap",
|
||||||
|
mode,
|
||||||
|
sitemapUrl,
|
||||||
|
});
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Request failed for ${sitemapUrl}`, {
|
logger.error(`Request failed for ${sitemapUrl}`, {
|
||||||
@ -151,11 +164,15 @@ export const fetchSitemapData = async (
|
|||||||
): Promise<SitemapEntry[] | null> => {
|
): Promise<SitemapEntry[] | null> => {
|
||||||
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(sitemapUrl, {
|
const fetchResponse = await scrapeURL(
|
||||||
timeout: timeout || axiosTimeout,
|
"sitemap",
|
||||||
});
|
sitemapUrl,
|
||||||
if (response.status === 200) {
|
scrapeOptions.parse({ formats: ["rawHtml"], timeout: timeout || axiosTimeout }),
|
||||||
const xml = response.data;
|
{ forceEngine: "fetch" },
|
||||||
|
);
|
||||||
|
|
||||||
|
if (fetchResponse.success && (fetchResponse.document.metadata.statusCode >= 200 && fetchResponse.document.metadata.statusCode < 300)) {
|
||||||
|
const xml = fetchResponse.document.rawHtml!;
|
||||||
const parsedXml = await parseStringPromise(xml);
|
const parsedXml = await parseStringPromise(xml);
|
||||||
|
|
||||||
const sitemapData: SitemapEntry[] = [];
|
const sitemapData: SitemapEntry[] = [];
|
||||||
|
Loading…
x
Reference in New Issue
Block a user