Merge pull request #1018 from mendableai/feat/add-favicon-metadata

[FIR-37] feat: extract and return favicon URL during scraping
This commit is contained in:
Nicolas 2024-12-27 17:44:03 -03:00 committed by GitHub
commit 05d5f84d87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -8,6 +8,7 @@ export function extractMetadata(
): Partial<Document["metadata"]> {
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let favicon: string | undefined = undefined;
let language: string | undefined = undefined;
let keywords: string | undefined = undefined;
let robots: string | undefined = undefined;
@ -43,6 +44,12 @@ export function extractMetadata(
title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined;
// Locate the page's favicon: prefer an exact rel="icon" link, otherwise fall
// back to the first <link> whose rel merely contains "icon"
// (e.g. "shortcut icon", "apple-touch-icon").
const faviconLink =
  soup('link[rel="icon"]').attr("href") ||
  soup('link[rel*="icon"]').first().attr("href") ||
  undefined;
if (faviconLink) {
  try {
    // Resolve the href against the page URL instead of concatenating it onto
    // the origin. This handles absolute ("https://..."), protocol-relative
    // ("//cdn..."), root-relative ("/favicon.ico"), path-relative
    // ("favicon.ico", "../img/icon.png") and data: hrefs correctly.
    favicon = new URL(faviconLink, meta.url).href;
  } catch {
    // The href (or the page URL) is not parseable; favicon is optional
    // metadata, so leave it unset rather than aborting the extraction.
    favicon = undefined;
  }
}
// Read the document language from the lang attribute of the <html> element
language = soup("html").attr("lang") || undefined;
@ -121,6 +128,7 @@ export function extractMetadata(
return {
title,
description,
favicon,
language,
keywords,
robots,