mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 19:49:10 +08:00
feat: return favicon url when scraping
This commit is contained in:
parent
0421f81020
commit
a4cf814f70
@ -8,6 +8,7 @@ export function extractMetadata(
|
||||
): Partial<Document["metadata"]> {
|
||||
let title: string | undefined = undefined;
|
||||
let description: string | undefined = undefined;
|
||||
let favicon: string | undefined = undefined;
|
||||
let language: string | undefined = undefined;
|
||||
let keywords: string | undefined = undefined;
|
||||
let robots: string | undefined = undefined;
|
||||
@ -43,6 +44,12 @@ export function extractMetadata(
|
||||
title = soup("title").first().text().trim() || undefined;
|
||||
description = soup('meta[name="description"]').attr("content") || undefined;
|
||||
|
||||
const faviconLink = soup('link[rel="icon"]').attr("href") || soup('link[rel*="icon"]').first().attr("href") || undefined;
|
||||
if (faviconLink) {
|
||||
const baseUrl = new URL(meta.url).origin;
|
||||
favicon = faviconLink.startsWith('http') ? faviconLink : `${baseUrl}${faviconLink}`;
|
||||
}
|
||||
|
||||
// Read the document language from the lang attribute of the <html> element
|
||||
language = soup("html").attr("lang") || undefined;
|
||||
|
||||
@ -121,6 +128,7 @@ export function extractMetadata(
|
||||
return {
|
||||
title,
|
||||
description,
|
||||
favicon,
|
||||
language,
|
||||
keywords,
|
||||
robots,
|
||||
|
Loading…
x
Reference in New Issue
Block a user