mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 16:20:48 +08:00
Merge pull request #785 from mendableai/nsc/support-for-all-metadata
Return all the website metadata
This commit is contained in:
commit
a73b06589c
@ -121,6 +121,49 @@ describe("E2E Tests for v1 API Routes", () => {
|
|||||||
},
|
},
|
||||||
30000
|
30000
|
||||||
); // 30 seconds timeout
|
); // 30 seconds timeout
|
||||||
|
|
||||||
|
it.concurrent(
  "should return a successful response with a valid API key",
  async () => {
    // Scrape an arXiv abstract page: it exposes many citation_* <meta> tags,
    // which makes it a good fixture for checking that every meta tag is
    // surfaced in the returned metadata.
    const arxivUrl = "https://arxiv.org/abs/2410.04840";
    const scrapeRequest: ScrapeRequest = { url: arxivUrl };

    const response: ScrapeResponseRequestTest = await request(TEST_URL)
      .post("/v1/scrape")
      .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`)
      .set("Content-Type", "application/json")
      .send(scrapeRequest);

    expect(response.statusCode).toBe(200);

    // Narrow the response body to the success shape before touching `data`.
    if (!("data" in response.body)) {
      throw new Error("Expected response body to have 'data' property");
    }
    const { data } = response.body;
    const { metadata } = data;

    // Shape of the document: markdown + metadata only.
    expect(data).not.toHaveProperty("content");
    expect(data).toHaveProperty("markdown");
    expect(data).toHaveProperty("metadata");
    expect(data).not.toHaveProperty("html");
    expect(data.markdown).toContain("Strong Model Collapse");

    // Standard metadata fields.
    expect(metadata.error).toBeUndefined();
    expect(metadata.description).toContain(
      "Abstract page for arXiv paper 2410.04840: Strong Model Collapse"
    );

    // Page-specific meta tags; a tag that appears several times is expected
    // to come back as an array (citation_author).
    expect(metadata.citation_title).toBe("Strong Model Collapse");
    expect(metadata.citation_author).toEqual([
      "Dohmatob, Elvis",
      "Feng, Yunzhen",
      "Subramonian, Arjun",
      "Kempe, Julia",
    ]);
    expect(metadata.citation_date).toBe("2024/10/07");
    expect(metadata.citation_online_date).toBe("2024/10/08");
    expect(metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840");
    expect(metadata.citation_arxiv_id).toBe("2410.04840");
    expect(metadata.citation_abstract).toContain(
      "Within the scaling laws paradigm"
    );

    // Request bookkeeping fields.
    expect(metadata.sourceURL).toBe(arxivUrl);
    expect(metadata.statusCode).toBe(200);
  },
  30000
); // 30 seconds timeout
|
||||||
it.concurrent(
|
it.concurrent(
|
||||||
"should return a successful response with a valid API key and includeHtml set to true",
|
"should return a successful response with a valid API key and includeHtml set to true",
|
||||||
async () => {
|
async () => {
|
||||||
|
@ -259,6 +259,8 @@ export type Document = {
|
|||||||
sourceURL?: string;
|
sourceURL?: string;
|
||||||
statusCode?: number;
|
statusCode?: number;
|
||||||
error?: string;
|
error?: string;
|
||||||
|
[key: string]: string | string[] | number | undefined;
|
||||||
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -34,6 +34,7 @@ interface Metadata {
|
|||||||
sourceURL?: string;
|
sourceURL?: string;
|
||||||
pageStatusCode?: number;
|
pageStatusCode?: number;
|
||||||
pageError?: string;
|
pageError?: string;
|
||||||
|
[key: string]: string | string[] | number | undefined;
|
||||||
}
|
}
|
||||||
|
|
||||||
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
||||||
@ -70,40 +71,78 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
|||||||
let pageStatusCode: number | null = null;
|
let pageStatusCode: number | null = null;
|
||||||
let pageError: string | null = null;
|
let pageError: string | null = null;
|
||||||
|
|
||||||
|
const customMetadata: Record<string, string | string[]> = {};
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
// TODO: remove this as it is redundant with the below implementation
|
||||||
title = soup("title").text() || null;
|
title = soup("title").text() || null;
|
||||||
description = soup('meta[name="description"]').attr("content") || null;
|
description = soup('meta[name="description"]').attr("content") || null;
|
||||||
|
|
||||||
// Assuming the language is part of the URL as per the regex pattern
|
language = soup("html").attr("lang") || null;
|
||||||
language = soup('html').attr('lang') || null;
|
|
||||||
|
|
||||||
keywords = soup('meta[name="keywords"]').attr("content") || null;
|
keywords = soup('meta[name="keywords"]').attr("content") || null;
|
||||||
robots = soup('meta[name="robots"]').attr("content") || null;
|
robots = soup('meta[name="robots"]').attr("content") || null;
|
||||||
ogTitle = soup('meta[property="og:title"]').attr("content") || null;
|
ogTitle = soup('meta[property="og:title"]').attr("content") || null;
|
||||||
ogDescription = soup('meta[property="og:description"]').attr("content") || null;
|
ogDescription =
|
||||||
|
soup('meta[property="og:description"]').attr("content") || null;
|
||||||
ogUrl = soup('meta[property="og:url"]').attr("content") || null;
|
ogUrl = soup('meta[property="og:url"]').attr("content") || null;
|
||||||
ogImage = soup('meta[property="og:image"]').attr("content") || null;
|
ogImage = soup('meta[property="og:image"]').attr("content") || null;
|
||||||
ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
|
ogAudio = soup('meta[property="og:audio"]').attr("content") || null;
|
||||||
ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null;
|
ogDeterminer =
|
||||||
|
soup('meta[property="og:determiner"]').attr("content") || null;
|
||||||
ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
|
ogLocale = soup('meta[property="og:locale"]').attr("content") || null;
|
||||||
ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null;
|
ogLocaleAlternate =
|
||||||
|
soup('meta[property="og:locale:alternate"]')
|
||||||
|
.map((i, el) => soup(el).attr("content"))
|
||||||
|
.get() || null;
|
||||||
ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
|
ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null;
|
||||||
ogVideo = soup('meta[property="og:video"]').attr("content") || null;
|
ogVideo = soup('meta[property="og:video"]').attr("content") || null;
|
||||||
articleSection = soup('meta[name="article:section"]').attr("content") || null;
|
articleSection =
|
||||||
|
soup('meta[name="article:section"]').attr("content") || null;
|
||||||
articleTag = soup('meta[name="article:tag"]').attr("content") || null;
|
articleTag = soup('meta[name="article:tag"]').attr("content") || null;
|
||||||
publishedTime = soup('meta[property="article:published_time"]').attr("content") || null;
|
publishedTime =
|
||||||
modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null;
|
soup('meta[property="article:published_time"]').attr("content") || null;
|
||||||
dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null;
|
modifiedTime =
|
||||||
|
soup('meta[property="article:modified_time"]').attr("content") || null;
|
||||||
|
dctermsKeywords =
|
||||||
|
soup('meta[name="dcterms.keywords"]').attr("content") || null;
|
||||||
dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
|
dcDescription = soup('meta[name="dc.description"]').attr("content") || null;
|
||||||
dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
|
dcSubject = soup('meta[name="dc.subject"]').attr("content") || null;
|
||||||
dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null;
|
dctermsSubject =
|
||||||
dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null;
|
soup('meta[name="dcterms.subject"]').attr("content") || null;
|
||||||
|
dctermsAudience =
|
||||||
|
soup('meta[name="dcterms.audience"]').attr("content") || null;
|
||||||
dcType = soup('meta[name="dc.type"]').attr("content") || null;
|
dcType = soup('meta[name="dc.type"]').attr("content") || null;
|
||||||
dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
|
dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null;
|
||||||
dcDate = soup('meta[name="dc.date"]').attr("content") || null;
|
dcDate = soup('meta[name="dc.date"]').attr("content") || null;
|
||||||
dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null;
|
dcDateCreated =
|
||||||
dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null;
|
soup('meta[name="dc.date.created"]').attr("content") || null;
|
||||||
|
dctermsCreated =
|
||||||
|
soup('meta[name="dcterms.created"]').attr("content") || null;
|
||||||
|
|
||||||
|
try {
  // Collect every <meta> tag into `customMetadata`, keyed by its `name`
  // attribute (falling back to `property`). A key seen once maps to a
  // string; a key seen several times maps to an array of the values in
  // document order.
  soup("meta").each((_index, elem) => {
    try {
      const tag = soup(elem);
      const name = tag.attr("name") || tag.attr("property");
      const content = tag.attr("content");

      // Tags without a key or without a value carry nothing to record.
      if (!name || !content) {
        return;
      }

      // Meta names come from the scraped page, i.e. untrusted input.
      // Assigning to "__proto__" on a plain object goes through the
      // prototype setter instead of creating a key, so it is skipped.
      if (name === "__proto__") {
        return;
      }

      // Only look at own properties: a plain lookup would also find members
      // inherited from Object.prototype ("constructor", "toString", ...) and
      // treat them as a previously stored value.
      const existing = Object.prototype.hasOwnProperty.call(
        customMetadata,
        name
      )
        ? customMetadata[name]
        : undefined;

      if (existing === undefined) {
        customMetadata[name] = content;
      } else if (Array.isArray(existing)) {
        existing.push(content);
      } else {
        customMetadata[name] = [existing, content];
      }
    } catch (error) {
      // A single malformed tag must not abort the scan of the remaining tags.
      Logger.error(`Error extracting custom metadata (in): ${error}`);
    }
  });
} catch (error) {
  Logger.error(`Error extracting custom metadata: ${error}`);
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.error(`Error extracting metadata: ${error}`);
|
Logger.error(`Error extracting metadata: ${error}`);
|
||||||
}
|
}
|
||||||
@ -141,5 +180,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata {
|
|||||||
...(sourceURL ? { sourceURL } : {}),
|
...(sourceURL ? { sourceURL } : {}),
|
||||||
...(pageStatusCode ? { pageStatusCode } : {}),
|
...(pageStatusCode ? { pageStatusCode } : {}),
|
||||||
...(pageError ? { pageError } : {}),
|
...(pageError ? { pageError } : {}),
|
||||||
|
...customMetadata,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user