Merge pull request #1015 from mendableai/nsc/improves-sitemap-fetching

Improves sitemap fetching
This commit is contained in:
Nicolas 2024-12-27 14:41:04 -03:00 committed by GitHub
commit c5b6495e48
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 104 additions and 38 deletions

View File

@ -446,44 +446,75 @@ export class WebCrawler {
}; };
const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`; const sitemapUrl = url.endsWith(".xml") ? url : `${url}/sitemap.xml`;
let sitemapLinks: string[] = []; let sitemapLinks: string[] = [];
// Try to get sitemap from the provided URL first
try { try {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); sitemapLinks = await getLinksFromSitemap(
if (response.status === 200) { { sitemapUrl, allUrls: [], mode: "fire-engine" },
sitemapLinks = await getLinksFromSitemap({ sitemapUrl }, this.logger); this.logger,
} );
} catch (error) { } catch (error) {
this.logger.debug( this.logger.debug(
`Failed to fetch sitemap with axios from ${sitemapUrl}`, `Failed to fetch sitemap from ${sitemapUrl}`,
{ method: "tryFetchSitemapLinks", sitemapUrl, error }, { method: "tryFetchSitemapLinks", sitemapUrl, error },
); );
if (error instanceof AxiosError && error.response?.status === 404) {
// ignore 404
} else {
const response = await getLinksFromSitemap(
{ sitemapUrl, mode: "fire-engine" },
this.logger,
);
if (response) {
sitemapLinks = response;
}
}
} }
// If this is a subdomain, also try to get sitemap from the main domain
try {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
const domainParts = hostname.split('.');
// Check if this is a subdomain (has more than 2 parts and not www)
if (domainParts.length > 2 && domainParts[0] !== 'www') {
// Get the main domain by taking the last two parts
const mainDomain = domainParts.slice(-2).join('.');
const mainDomainUrl = `${urlObj.protocol}//${mainDomain}`;
const mainDomainSitemapUrl = `${mainDomainUrl}/sitemap.xml`;
try {
// Get all links from the main domain's sitemap
const mainDomainLinks = await getLinksFromSitemap(
{ sitemapUrl: mainDomainSitemapUrl, allUrls: [], mode: "fire-engine" },
this.logger,
);
// Filter links to only include those pointing to the current subdomain
const subdomainLinks = mainDomainLinks.filter(link => {
try {
const linkUrl = new URL(link);
return linkUrl.hostname.endsWith(hostname);
} catch {
return false;
}
});
sitemapLinks = [...new Set([...sitemapLinks, ...subdomainLinks])];
} catch (error) {
this.logger.debug(
`Failed to fetch main domain sitemap from ${mainDomainSitemapUrl}`,
{ method: "tryFetchSitemapLinks", mainDomainSitemapUrl, error },
);
}
}
} catch (error) {
this.logger.debug(`Error processing main domain sitemap`, {
method: "tryFetchSitemapLinks",
url,
error,
});
}
// If no sitemap found yet, try the baseUrl as a last resort
if (sitemapLinks.length === 0) { if (sitemapLinks.length === 0) {
const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
try { try {
const response = await axios.get(baseUrlSitemap, { const baseLinks = await getLinksFromSitemap(
timeout: axiosTimeout, { sitemapUrl: baseUrlSitemap, allUrls: [], mode: "fire-engine" },
}); this.logger,
if (response.status === 200) { );
sitemapLinks = await getLinksFromSitemap(
{ sitemapUrl: baseUrlSitemap, mode: "fire-engine" }, sitemapLinks = [...new Set([...sitemapLinks, ...baseLinks])];
this.logger,
);
}
} catch (error) { } catch (error) {
this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, { this.logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}`, {
method: "tryFetchSitemapLinks", method: "tryFetchSitemapLinks",

View File

@ -5,7 +5,9 @@ import { WebCrawler } from "./crawler";
import { scrapeURL } from "../scrapeURL"; import { scrapeURL } from "../scrapeURL";
import { scrapeOptions } from "../../controllers/v1/types"; import { scrapeOptions } from "../../controllers/v1/types";
import type { Logger } from "winston"; import type { Logger } from "winston";
/**
 * Whether the fire-engine fetch path may be used.
 *
 * True only when the FIRE_ENGINE_BETA_URL environment variable is set to a
 * non-empty value. `process.env` entries are either strings or undefined, so
 * a truthiness check covers both the "unset" and the "empty string" cases.
 * Evaluated once at module load.
 */
const useFireEngine = Boolean(process.env.FIRE_ENGINE_BETA_URL);
export async function getLinksFromSitemap( export async function getLinksFromSitemap(
{ {
sitemapUrl, sitemapUrl,
@ -21,10 +23,7 @@ export async function getLinksFromSitemap(
try { try {
let content: string = ""; let content: string = "";
try { try {
if (mode === "axios" || process.env.FIRE_ENGINE_BETA_URL === "") { if (mode === "fire-engine" && useFireEngine) {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} else if (mode === "fire-engine") {
const response = await scrapeURL( const response = await scrapeURL(
"sitemap", "sitemap",
sitemapUrl, sitemapUrl,
@ -35,6 +34,9 @@ export async function getLinksFromSitemap(
throw response.error; throw response.error;
} }
content = response.document.rawHtml!; content = response.document.rawHtml!;
} else {
const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
content = response.data;
} }
} catch (error) { } catch (error) {
logger.error(`Request failed for ${sitemapUrl}`, { logger.error(`Request failed for ${sitemapUrl}`, {
@ -43,7 +45,6 @@ export async function getLinksFromSitemap(
sitemapUrl, sitemapUrl,
error, error,
}); });
return allUrls; return allUrls;
} }
@ -51,21 +52,55 @@ export async function getLinksFromSitemap(
const root = parsed.urlset || parsed.sitemapindex; const root = parsed.urlset || parsed.sitemapindex;
if (root && root.sitemap) { if (root && root.sitemap) {
const sitemapPromises = root.sitemap // Handle sitemap index files
const sitemapUrls = root.sitemap
.filter((sitemap) => sitemap.loc && sitemap.loc.length > 0) .filter((sitemap) => sitemap.loc && sitemap.loc.length > 0)
.map((sitemap) => .map((sitemap) => sitemap.loc[0]);
const sitemapPromises = sitemapUrls.map((sitemapUrl) =>
getLinksFromSitemap(
{ sitemapUrl, allUrls: [], mode },
logger,
),
);
const results = await Promise.all(sitemapPromises);
results.forEach(urls => {
allUrls.push(...urls);
});
} else if (root && root.url) {
// Check if any URLs point to additional sitemaps
const xmlSitemaps = root.url
.filter(
(url) =>
url.loc &&
url.loc.length > 0 &&
url.loc[0].toLowerCase().endsWith('.xml')
)
.map((url) => url.loc[0]);
if (xmlSitemaps.length > 0) {
// Recursively fetch links from additional sitemaps
const sitemapPromises = xmlSitemaps.map((sitemapUrl) =>
getLinksFromSitemap( getLinksFromSitemap(
{ sitemapUrl: sitemap.loc[0], allUrls, mode }, { sitemapUrl, allUrls: [], mode },
logger, logger,
), ),
); );
await Promise.all(sitemapPromises);
} else if (root && root.url) { const results = await Promise.all(sitemapPromises);
results.forEach(urls => {
allUrls.push(...urls);
});
}
// Add regular URLs that aren't sitemaps
const validUrls = root.url const validUrls = root.url
.filter( .filter(
(url) => (url) =>
url.loc && url.loc &&
url.loc.length > 0 && url.loc.length > 0 &&
!url.loc[0].toLowerCase().endsWith('.xml') &&
!WebCrawler.prototype.isFile(url.loc[0]), !WebCrawler.prototype.isFile(url.loc[0]),
) )
.map((url) => url.loc[0]); .map((url) => url.loc[0]);
@ -80,7 +115,7 @@ export async function getLinksFromSitemap(
}); });
} }
return allUrls; return [...new Set(allUrls)];
} }
export const fetchSitemapData = async ( export const fetchSitemapData = async (