From 795e5a92287dd2969ca58de040d7ce0e1d46c308 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 15 Oct 2024 21:36:13 -0300 Subject: [PATCH 1/5] Update metadata.ts --- .../src/scraper/WebScraper/utils/metadata.ts | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index fac53b38..0c2af118 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -70,11 +70,12 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { let pageStatusCode: number | null = null; let pageError: string | null = null; + const customMetadata: Record<string, string | string[]> = {}; + try { title = soup("title").text() || null; description = soup('meta[name="description"]').attr("content") || null; - // Assuming the language is part of the URL as per the regex pattern language = soup('html').attr('lang') || null; keywords = soup('meta[name="keywords"]').attr("content") || null; @@ -104,6 +105,22 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null; dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; + // Extract all meta tags for custom metadata + soup("meta").each((i, elem) => { + const name = soup(elem).attr("name") || soup(elem).attr("property"); + const content = soup(elem).attr("content"); + + if (name && content) { + if (customMetadata[name] === undefined) { + customMetadata[name] = content; + } else if (Array.isArray(customMetadata[name])) { + (customMetadata[name] as string[]).push(content); + } else { + customMetadata[name] = [customMetadata[name] as string, content]; + } + } + }); + } catch (error) { Logger.error(`Error extracting metadata: ${error}`); } @@ -141,5 +158,6 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { ...(sourceURL ? 
{ sourceURL } : {}), ...(pageStatusCode ? { pageStatusCode } : {}), ...(pageError ? { pageError } : {}), + ...customMetadata, }; } From 417c7697c385c8e227a632dd2e13b74d331a847b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:26:46 -0300 Subject: [PATCH 2/5] Update metadata.ts --- apps/api/src/scraper/WebScraper/utils/metadata.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index 0c2af118..b009c20c 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -34,6 +34,7 @@ interface Metadata { sourceURL?: string; pageStatusCode?: number; pageError?: string; + [key: string]: string | string[] | number | undefined; } export function extractMetadata(soup: CheerioAPI, url: string): Metadata { From c0384ea381e08fedee49a9216561f8b312568d51 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:32:44 -0300 Subject: [PATCH 3/5] Nick: added tests --- .../__tests__/e2e_v1_withAuth/index.test.ts | 43 +++++++++++++++++++ apps/api/src/controllers/v1/types.ts | 2 + 2 files changed, 45 insertions(+) diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index eef65125..a4163472 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -121,6 +121,49 @@ describe("E2E Tests for v1 API Routes", () => { }, 30000 ); // 30 seconds timeout + + it.concurrent( + "should return a successful response with a valid API key", + async () => { + const scrapeRequest: ScrapeRequest = { + url: "https://arxiv.org/abs/2410.04840", + }; + + const response: ScrapeResponseRequestTest = await request(TEST_URL) + .post("/v1/scrape") + .set("Authorization", `Bearer ${process.env.TEST_API_KEY}`) + .set("Content-Type", "application/json") + .send(scrapeRequest); + + 
expect(response.statusCode).toBe(200); + + if (!("data" in response.body)) { + throw new Error("Expected response body to have 'data' property"); + } + expect(response.body.data).not.toHaveProperty("content"); + expect(response.body.data).toHaveProperty("markdown"); + expect(response.body.data).toHaveProperty("metadata"); + expect(response.body.data).not.toHaveProperty("html"); + expect(response.body.data.markdown).toContain("Strong Model Collapse"); + expect(response.body.data.metadata.error).toBeUndefined(); + expect(response.body.data.metadata.description).toContain("Abstract page for arXiv paper 2410.04840: Strong Model Collapse"); + expect(response.body.data.metadata.citation_title).toBe("Strong Model Collapse"); + expect(response.body.data.metadata.citation_author).toEqual([ + "Dohmatob, Elvis", + "Feng, Yunzhen", + "Subramonian, Arjun", + "Kempe, Julia" + ]); + expect(response.body.data.metadata.citation_date).toBe("2024/10/07"); + expect(response.body.data.metadata.citation_online_date).toBe("2024/10/08"); + expect(response.body.data.metadata.citation_pdf_url).toBe("http://arxiv.org/pdf/2410.04840"); + expect(response.body.data.metadata.citation_arxiv_id).toBe("2410.04840"); + expect(response.body.data.metadata.citation_abstract).toContain("Within the scaling laws paradigm"); + expect(response.body.data.metadata.sourceURL).toBe("https://arxiv.org/abs/2410.04840"); + expect(response.body.data.metadata.statusCode).toBe(200); + }, + 30000 + ); it.concurrent( "should return a successful response with a valid API key and includeHtml set to true", async () => { diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 998f2dfa..0975bb01 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -250,6 +250,8 @@ export type Document = { sourceURL?: string; statusCode?: number; error?: string; + [key: string]: string | string[] | number | undefined; + }; }; From 
8974230db47abd3dc041896fe032a4a91951eef5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:35:03 -0300 Subject: [PATCH 4/5] Nick: formatting + error handling --- .../src/scraper/WebScraper/utils/metadata.ts | 74 ++++++++++++------- 1 file changed, 47 insertions(+), 27 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index b009c20c..aecae481 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -76,52 +76,72 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { try { title = soup("title").text() || null; description = soup('meta[name="description"]').attr("content") || null; - - language = soup('html').attr('lang') || null; + + language = soup("html").attr("lang") || null; keywords = soup('meta[name="keywords"]').attr("content") || null; robots = soup('meta[name="robots"]').attr("content") || null; ogTitle = soup('meta[property="og:title"]').attr("content") || null; - ogDescription = soup('meta[property="og:description"]').attr("content") || null; + ogDescription = + soup('meta[property="og:description"]').attr("content") || null; ogUrl = soup('meta[property="og:url"]').attr("content") || null; ogImage = soup('meta[property="og:image"]').attr("content") || null; ogAudio = soup('meta[property="og:audio"]').attr("content") || null; - ogDeterminer = soup('meta[property="og:determiner"]').attr("content") || null; + ogDeterminer = + soup('meta[property="og:determiner"]').attr("content") || null; ogLocale = soup('meta[property="og:locale"]').attr("content") || null; - ogLocaleAlternate = soup('meta[property="og:locale:alternate"]').map((i, el) => soup(el).attr("content")).get() || null; + ogLocaleAlternate = + soup('meta[property="og:locale:alternate"]') + .map((i, el) => soup(el).attr("content")) + .get() || null; ogSiteName = soup('meta[property="og:site_name"]').attr("content") || null; 
ogVideo = soup('meta[property="og:video"]').attr("content") || null; - articleSection = soup('meta[name="article:section"]').attr("content") || null; + articleSection = + soup('meta[name="article:section"]').attr("content") || null; articleTag = soup('meta[name="article:tag"]').attr("content") || null; - publishedTime = soup('meta[property="article:published_time"]').attr("content") || null; - modifiedTime = soup('meta[property="article:modified_time"]').attr("content") || null; - dctermsKeywords = soup('meta[name="dcterms.keywords"]').attr("content") || null; + publishedTime = + soup('meta[property="article:published_time"]').attr("content") || null; + modifiedTime = + soup('meta[property="article:modified_time"]').attr("content") || null; + dctermsKeywords = + soup('meta[name="dcterms.keywords"]').attr("content") || null; dcDescription = soup('meta[name="dc.description"]').attr("content") || null; dcSubject = soup('meta[name="dc.subject"]').attr("content") || null; - dctermsSubject = soup('meta[name="dcterms.subject"]').attr("content") || null; - dctermsAudience = soup('meta[name="dcterms.audience"]').attr("content") || null; + dctermsSubject = + soup('meta[name="dcterms.subject"]').attr("content") || null; + dctermsAudience = + soup('meta[name="dcterms.audience"]').attr("content") || null; dcType = soup('meta[name="dc.type"]').attr("content") || null; dctermsType = soup('meta[name="dcterms.type"]').attr("content") || null; dcDate = soup('meta[name="dc.date"]').attr("content") || null; - dcDateCreated = soup('meta[name="dc.date.created"]').attr("content") || null; - dctermsCreated = soup('meta[name="dcterms.created"]').attr("content") || null; + dcDateCreated = + soup('meta[name="dc.date.created"]').attr("content") || null; + dctermsCreated = + soup('meta[name="dcterms.created"]').attr("content") || null; - // Extract all meta tags for custom metadata - soup("meta").each((i, elem) => { - const name = soup(elem).attr("name") || soup(elem).attr("property"); - const 
content = soup(elem).attr("content"); + try { + // Extract all meta tags for custom metadata + soup("meta").each((i, elem) => { + try { + const name = soup(elem).attr("name") || soup(elem).attr("property"); + const content = soup(elem).attr("content"); - if (name && content) { - if (customMetadata[name] === undefined) { - customMetadata[name] = content; - } else if (Array.isArray(customMetadata[name])) { - (customMetadata[name] as string[]).push(content); - } else { - customMetadata[name] = [customMetadata[name] as string, content]; + if (name && content) { + if (customMetadata[name] === undefined) { + customMetadata[name] = content; + } else if (Array.isArray(customMetadata[name])) { + (customMetadata[name] as string[]).push(content); + } else { + customMetadata[name] = [customMetadata[name] as string, content]; + } + } + } catch (error) { + Logger.error(`Error extracting custom metadata (in): ${error}`); } - } - }); - + }); + } catch (error) { + Logger.error(`Error extracting custom metadata: ${error}`); + } } catch (error) { Logger.error(`Error extracting metadata: ${error}`); } From 2ac50a16f536b834087759a3477b8d8221eb9b0e Mon Sep 17 00:00:00 2001 From: Nicolas Date: Wed, 16 Oct 2024 23:37:07 -0300 Subject: [PATCH 5/5] Update metadata.ts --- apps/api/src/scraper/WebScraper/utils/metadata.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/api/src/scraper/WebScraper/utils/metadata.ts b/apps/api/src/scraper/WebScraper/utils/metadata.ts index aecae481..531dc17c 100644 --- a/apps/api/src/scraper/WebScraper/utils/metadata.ts +++ b/apps/api/src/scraper/WebScraper/utils/metadata.ts @@ -74,6 +74,7 @@ export function extractMetadata(soup: CheerioAPI, url: string): Metadata { const customMetadata: Record<string, string | string[]> = {}; try { + // TODO: remove this as it is redundant with the below implementation title = soup("title").text() || null; description = soup('meta[name="description"]').attr("content") || null;