Merge pull request #1018 from mendableai/feat/add-favicon-metadata

[FIR-37] feat: extract and return favicon URL during scraping
This commit is contained in:
Nicolas 2024-12-27 17:44:03 -03:00 committed by GitHub
commit 05d5f84d87
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -8,6 +8,7 @@ export function extractMetadata(
): Partial<Document["metadata"]> { ): Partial<Document["metadata"]> {
let title: string | undefined = undefined; let title: string | undefined = undefined;
let description: string | undefined = undefined; let description: string | undefined = undefined;
let favicon: string | undefined = undefined;
let language: string | undefined = undefined; let language: string | undefined = undefined;
let keywords: string | undefined = undefined; let keywords: string | undefined = undefined;
let robots: string | undefined = undefined; let robots: string | undefined = undefined;
@ -42,6 +43,12 @@ export function extractMetadata(
try { try {
title = soup("title").first().text().trim() || undefined; title = soup("title").first().text().trim() || undefined;
description = soup('meta[name="description"]').attr("content") || undefined; description = soup('meta[name="description"]').attr("content") || undefined;
const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
if (faviconLink) {
const baseUrl = new URL(meta.url).origin;
favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
}
// Assuming the language is part of the URL as per the regex pattern // Assuming the language is part of the URL as per the regex pattern
language = soup("html").attr("lang") || undefined; language = soup("html").attr("lang") || undefined;
@ -121,6 +128,7 @@ export function extractMetadata(
return { return {
title, title,
description, description,
favicon,
language, language,
keywords, keywords,
robots, robots,