mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-08 19:29:02 +08:00
Merge pull request #1015 from mendableai/nsc/improves-sitemap-fetching
Improves sitemap fetching
This commit is contained in:
commit
c5b6495e48
@ -446,44 +446,75 @@ export class WebCrawler {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
|
const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
|
||||||
|
|
||||||
let sitemapLinks: string[] = [];
|
let sitemapLinks: string[] = [];
|
||||||
|
|
||||||
|
// Try to get sitemap from the provided URL first
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
sitemapLinks = await getLinksFromSitemap(
|
||||||
if (response.status === 200) {
|
{ sitemapUrl, allUrls: [], mode: "fire-engine" },
|
||||||
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger);
|
|
||||||
}
|
|
||||||
} catch (error) {
|
|
||||||
this.logger.debug(
|
|
||||||
`Failed to fetch sitemap with axios from ${sitemapUrl}`,
|
|
||||||
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
|
|
||||||
);
|
|
||||||
if (error instanceof AxiosError && error.response?.status === 404) {
|
|
||||||
// ignore 404
|
|
||||||
} else {
|
|
||||||
const response = await getLinksFromSitemap(
|
|
||||||
{ sitemapUrl, mode: "fire-engine" },
|
|
||||||
this.logger,
|
this.logger,
|
||||||
);
|
);
|
||||||
if (response) {
|
} catch (error) {
|
||||||
sitemapLinks = response;
|
this.logger.debug(
|
||||||
}
|
`Failed to fetch sitemap from ${sitemapUrl}`,
|
||||||
}
|
{ method: "tryFetchSitemapLinks", sitemapUrl, error },
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// If this is a subdomain, also try to get sitemap from the main domain
|
||||||
|
try {
|
||||||
|
const urlObj = new URL(url);
|
||||||
|
const hostname = urlObj.hostname;
|
||||||
|
const domainParts = hostname.split('.');
|
||||||
|
|
||||||
|
// Check if this is a subdomain (has more than 2 parts and not www)
|
||||||
|
if (domainParts.length > 2 && domainParts[0] !== 'www') {
|
||||||
|
// Get the main domain by taking the last two parts
|
||||||
|
const mainDomain = domainParts.slice(-2).join('.');
|
||||||
|
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
|
||||||
|
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
|
||||||
|
|
||||||
|
try {
|
||||||
|
// Get all links from the main domain's sitemap
|
||||||
|
const mainDomainLinks = await getLinksFromSitemap(
|
||||||
|
{ sitemapUrl: mainDomainSitemapUrl, allUrls: [], mode: "fire-engine" },
|
||||||
|
this.logger,
|
||||||
|
);
|
||||||
|
// Filter links to only include those pointing to the current subdomain
|
||||||
|
const subdomainLinks = mainDomainLinks.filter(link => {
|
||||||
|
try {
|
||||||
|
const linkUrl = new URL(link);
|
||||||
|
return linkUrl.hostname.endsWith(hostname);
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
sitemapLinks = [...new Set([...sitemapLinks, ...subdomainLinks])];
|
||||||
|
} catch (error) {
|
||||||
|
this.logger.debug(
|
||||||
|
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
|
||||||
|
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
this.logger.debug(`Error processing main domain sitemap`, {
|
||||||
|
method: "tryFetchSitemapLinks",
|
||||||
|
url,
|
||||||
|
error,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// If no sitemap found yet, try the baseUrl as a last resort
|
||||||
if (sitemapLinks.length === 0) {
|
if (sitemapLinks.length === 0) {
|
||||||
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
|
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
|
||||||
try {
|
try {
|
||||||
const response = await axios.get(baseUrlSitemap, {
|
const baseLinks = await getLinksFromSitemap(
|
||||||
timeout: axiosTimeout,
|
{ sitemapUrl: baseUrlSitemap, allUrls: [], mode: "fire-engine" },
|
||||||
});
|
|
||||||
if (response.status === 200) {
|
|
||||||
sitemapLinks = await getLinksFromSitemap(
|
|
||||||
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" },
|
|
||||||
this.logger,
|
this.logger,
|
||||||
);
|
);
|
||||||
}
|
|
||||||
|
sitemapLinks = [...new Set([...sitemapLinks, ...baseLinks])];
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
|
||||||
method: "tryFetchSitemapLinks",
|
method: "tryFetchSitemapLinks",
|
||||||
|
@ -5,7 +5,9 @@ import { WebCrawler } from "./crawler";
|
|||||||
import { scrapeURL } from "../scrapeURL";
|
import { scrapeURL } from "../scrapeURL";
|
||||||
import { scrapeOptions } from "../../controllers/v1/types";
|
import { scrapeOptions } from "../../controllers/v1/types";
|
||||||
import type { Logger } from "winston";
|
import type { Logger } from "winston";
|
||||||
|
const useFireEngine =
|
||||||
|
process.env.FIRE_ENGINE_BETA_URL !== "" &&
|
||||||
|
process.env.FIRE_ENGINE_BETA_URL !== undefined;
|
||||||
export async function getLinksFromSitemap(
|
export async function getLinksFromSitemap(
|
||||||
{
|
{
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
@ -21,10 +23,7 @@ export async function getLinksFromSitemap(
|
|||||||
try {
|
try {
|
||||||
let content: string = "";
|
let content: string = "";
|
||||||
try {
|
try {
|
||||||
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") {
|
if (mode === "fire-engine" && useFireEngine) {
|
||||||
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
|
||||||
content = response.data;
|
|
||||||
} else if (mode === "fire-engine") {
|
|
||||||
const response = await scrapeURL(
|
const response = await scrapeURL(
|
||||||
"sitemap",
|
"sitemap",
|
||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
@ -35,6 +34,9 @@ export async function getLinksFromSitemap(
|
|||||||
throw response.error;
|
throw response.error;
|
||||||
}
|
}
|
||||||
content = response.document.rawHtml!;
|
content = response.document.rawHtml!;
|
||||||
|
} else {
|
||||||
|
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
|
||||||
|
content = response.data;
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error(`Request failed for ${sitemapUrl}`, {
|
logger.error(`Request failed for ${sitemapUrl}`, {
|
||||||
@ -43,7 +45,6 @@ export async function getLinksFromSitemap(
|
|||||||
sitemapUrl,
|
sitemapUrl,
|
||||||
error,
|
error,
|
||||||
});
|
});
|
||||||
|
|
||||||
return allUrls;
|
return allUrls;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -51,21 +52,55 @@ export async function getLinksFromSitemap(
|
|||||||
const root = parsed.urlset || parsed.sitemapindex;
|
const root = parsed.urlset || parsed.sitemapindex;
|
||||||
|
|
||||||
if (root && root.sitemap) {
|
if (root && root.sitemap) {
|
||||||
const sitemapPromises = root.sitemap
|
// Handle sitemap index files
|
||||||
|
const sitemapUrls = root.sitemap
|
||||||
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
|
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
|
||||||
.map((sitemap) =>
|
.map((sitemap) => sitemap.loc[0]);
|
||||||
|
|
||||||
|
const sitemapPromises = sitemapUrls.map((sitemapUrl) =>
|
||||||
getLinksFromSitemap(
|
getLinksFromSitemap(
|
||||||
{ sitemapUrl: sitemap.loc[0], allUrls, mode },
|
{ sitemapUrl, allUrls: [], mode },
|
||||||
logger,
|
logger,
|
||||||
),
|
),
|
||||||
);
|
);
|
||||||
await Promise.all(sitemapPromises);
|
|
||||||
|
const results = await Promise.all(sitemapPromises);
|
||||||
|
results.forEach(urls => {
|
||||||
|
allUrls.push(...urls);
|
||||||
|
});
|
||||||
} else if (root && root.url) {
|
} else if (root && root.url) {
|
||||||
|
// Check if any URLs point to additional sitemaps
|
||||||
|
const xmlSitemaps = root.url
|
||||||
|
.filter(
|
||||||
|
(url) =>
|
||||||
|
url.loc &&
|
||||||
|
url.loc.length > 0 &&
|
||||||
|
url.loc[0].toLowerCase().endsWith('.xml')
|
||||||
|
)
|
||||||
|
.map((url) => url.loc[0]);
|
||||||
|
|
||||||
|
if (xmlSitemaps.length > 0) {
|
||||||
|
// Recursively fetch links from additional sitemaps
|
||||||
|
const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
|
||||||
|
getLinksFromSitemap(
|
||||||
|
{ sitemapUrl, allUrls: [], mode },
|
||||||
|
logger,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
const results = await Promise.all(sitemapPromises);
|
||||||
|
results.forEach(urls => {
|
||||||
|
allUrls.push(...urls);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add regular URLs that aren't sitemaps
|
||||||
const validUrls = root.url
|
const validUrls = root.url
|
||||||
.filter(
|
.filter(
|
||||||
(url) =>
|
(url) =>
|
||||||
url.loc &&
|
url.loc &&
|
||||||
url.loc.length > 0 &&
|
url.loc.length > 0 &&
|
||||||
|
!url.loc[0].toLowerCase().endsWith('.xml') &&
|
||||||
!WebCrawler.prototype.isFile(url.loc[0]),
|
!WebCrawler.prototype.isFile(url.loc[0]),
|
||||||
)
|
)
|
||||||
.map((url) => url.loc[0]);
|
.map((url) => url.loc[0]);
|
||||||
@ -80,7 +115,7 @@ export async function getLinksFromSitemap(
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
return allUrls;
|
return [...new Set(allUrls)];
|
||||||
}
|
}
|
||||||
|
|
||||||
export const fetchSitemapData = async (
|
export const fetchSitemapData = async (
|
||||||
|
Loading…
x
Reference in New Issue
Block a user