From 38c96b524f120a068dfae3a35c3a13fcb2d5e889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 29 May 2025 15:26:07 +0200 Subject: [PATCH] feat(scrapeURL): handle contentType JSON better in markdown conversion (#1604) --- apps/api/src/__tests__/snips/scrape.test.ts | 11 ++++++++++- apps/api/src/controllers/v1/types.ts | 1 + .../api/src/scraper/scrapeURL/engines/fetch/index.ts | 5 ++++- .../scraper/scrapeURL/engines/fire-engine/index.ts | 12 ++++++++++++ apps/api/src/scraper/scrapeURL/engines/index.ts | 2 ++ apps/api/src/scraper/scrapeURL/index.ts | 1 + apps/api/src/scraper/scrapeURL/transformers/index.ts | 11 +++++++++++ 7 files changed, 41 insertions(+), 2 deletions(-) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 9c2a9e2d..a27057f7 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -373,5 +373,14 @@ describe("Scrape tests", () => { }); expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog"); - }, 35000); + }, 30000); + + it.concurrent("application/json content type is markdownified properly", async () => { + const response = await scrape({ + url: "https://jsonplaceholder.typicode.com/todos/1", + formats: ["markdown"], + }); + + expect(response.markdown).toContain("```json"); + }, 30000); }); diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts index 908a51d3..5ee1328d 100644 --- a/apps/api/src/controllers/v1/types.ts +++ b/apps/api/src/controllers/v1/types.ts @@ -750,6 +750,7 @@ export type Document = { scrapeId?: string; error?: string; numPages?: number; + contentType?: string; proxyUsed: "basic" | "stealth"; // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined; }; diff --git a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts index 40c34399..299b0d35 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fetch/index.ts @@ -30,7 +30,7 @@ export async function scrapeURLWithFetch( url: string; body: string, status: number; - headers: any; + headers: [string, string][]; }; if (meta.mock !== null) { @@ -117,5 +117,8 @@ export async function scrapeURLWithFetch( url: response.url, html: response.body, statusCode: response.status, + contentType: (response.headers.find( + (x) => x[0].toLowerCase() === "content-type", + ) ?? [])[1] ?? undefined, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index 6c1648bf..8bbacb4c 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -273,6 +273,10 @@ export async function scrapeURLWithFireEngineChromeCDP( error: response.pageError, statusCode: response.pageStatusCode, + contentType: (Object.entries(response.responseHeaders ?? {}).find( + (x) => x[0].toLowerCase() === "content-type", + ) ?? [])[1] ?? undefined, + screenshot: response.screenshot, ...(actions.length > 0 ? { @@ -336,6 +340,10 @@ export async function scrapeURLWithFireEnginePlaywright( error: response.pageError, statusCode: response.pageStatusCode, + contentType: (Object.entries(response.responseHeaders ?? {}).find( + (x) => x[0].toLowerCase() === "content-type", + ) ?? [])[1] ?? undefined, + ...(response.screenshots !== undefined && response.screenshots.length > 0 ? { screenshot: response.screenshots[0], @@ -391,5 +399,9 @@ export async function scrapeURLWithFireEngineTLSClient( html: response.content, error: response.pageError, statusCode: response.pageStatusCode, + + contentType: (Object.entries(response.responseHeaders ?? {}).find( + (x) => x[0].toLowerCase() === "content-type", + ) ?? [])[1] ?? undefined, }; } diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index 06c5e072..fe0ae8c7 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -111,6 +111,8 @@ export type EngineScrapeResult = { }; numPages?: number; + + contentType?: string; }; const engineHandlers: { diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index fbc0c53b..90873ca6 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -379,6 +379,7 @@ async function scrapeURLLoop(meta: Meta): Promise { statusCode: result.result.statusCode, error: result.result.error, numPages: result.result.numPages, + contentType: result.result.contentType, proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic", }, }; diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts index d9fc4b8c..5186d6b4 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/index.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts @@ -61,6 +61,17 @@ export async function deriveMarkdownFromHTML( ); } + if (document.metadata.contentType?.includes("application/json")) { + if (document.rawHtml === undefined) { + throw new Error( + "rawHtml is undefined -- this transformer is being called out of order", + ); + } + + document.markdown = "```json\n" + document.rawHtml + "\n```"; + return document; + } + document.markdown = await parseMarkdown(document.html); return document; }