feat(scrapeURL): handle contentType JSON better in markdown conversion (#1604)

This commit is contained in:
Gergő Móricz 2025-05-29 15:26:07 +02:00 committed by GitHub
parent 7e73b01599
commit 38c96b524f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 41 additions and 2 deletions

View File

@ -373,5 +373,14 @@ describe("Scrape tests", () => {
}); });
expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog"); expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
}, 35000); }, 30000);
// Regression test for JSON handling in markdown derivation: scrapes an endpoint
// requesting only the "markdown" format and checks that the resulting markdown
// contains a ```json fence.
// NOTE(review): this hits a live third-party URL, so the test depends on that
// service being reachable and (presumably) answering with a JSON content type.
it.concurrent("application/json content type is markdownified properly", async () => {
const response = await scrape({
url: "https://jsonplaceholder.typicode.com/todos/1",
formats: ["markdown"],
});
// Only the presence of the opening fence is asserted, not the JSON payload itself.
expect(response.markdown).toContain("```json");
// Third argument: presumably the per-test timeout in milliseconds (Jest-style signature).
}, 30000);
}); });

View File

@ -750,6 +750,7 @@ export type Document = {
scrapeId?: string; scrapeId?: string;
error?: string; error?: string;
numPages?: number; numPages?: number;
contentType?: string;
proxyUsed: "basic" | "stealth"; proxyUsed: "basic" | "stealth";
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined; // [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
}; };

View File

@ -30,7 +30,7 @@ export async function scrapeURLWithFetch(
url: string; url: string;
body: string, body: string,
status: number; status: number;
headers: any; headers: [string, string][];
}; };
if (meta.mock !== null) { if (meta.mock !== null) {
@ -117,5 +117,8 @@ export async function scrapeURLWithFetch(
url: response.url, url: response.url,
html: response.body, html: response.body,
statusCode: response.status, statusCode: response.status,
contentType: (response.headers.find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
}; };
} }

View File

@ -273,6 +273,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
error: response.pageError, error: response.pageError,
statusCode: response.pageStatusCode, statusCode: response.pageStatusCode,
contentType: (Object.entries(response.responseHeaders ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
screenshot: response.screenshot, screenshot: response.screenshot,
...(actions.length > 0 ...(actions.length > 0
? { ? {
@ -336,6 +340,10 @@ export async function scrapeURLWithFireEnginePlaywright(
error: response.pageError, error: response.pageError,
statusCode: response.pageStatusCode, statusCode: response.pageStatusCode,
contentType: (Object.entries(response.responseHeaders ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
...(response.screenshots !== undefined && response.screenshots.length > 0 ...(response.screenshots !== undefined && response.screenshots.length > 0
? { ? {
screenshot: response.screenshots[0], screenshot: response.screenshots[0],
@ -391,5 +399,9 @@ export async function scrapeURLWithFireEngineTLSClient(
html: response.content, html: response.content,
error: response.pageError, error: response.pageError,
statusCode: response.pageStatusCode, statusCode: response.pageStatusCode,
contentType: (Object.entries(response.responseHeaders ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
}; };
} }

View File

@ -111,6 +111,8 @@ export type EngineScrapeResult = {
}; };
numPages?: number; numPages?: number;
contentType?: string;
}; };
const engineHandlers: { const engineHandlers: {

View File

@ -379,6 +379,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
statusCode: result.result.statusCode, statusCode: result.result.statusCode,
error: result.result.error, error: result.result.error,
numPages: result.result.numPages, numPages: result.result.numPages,
contentType: result.result.contentType,
proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic", proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
}, },
}; };

View File

@ -61,6 +61,17 @@ export async function deriveMarkdownFromHTML(
); );
} }
// JSON responses skip the HTML-to-markdown conversion that follows this branch:
// when the recorded content type mentions "application/json", the raw body is
// wrapped verbatim in a fenced ```json block and returned as the markdown.
// NOTE(review): `includes` is a case-sensitive substring match, so header values
// such as "Application/JSON" or "application/ld+json" do not take this branch —
// confirm whether that is intended.
if (document.metadata.contentType?.includes("application/json")) {
// rawHtml must already be set by the time this runs; fail loudly rather than
// emitting a fence around "undefined".
if (document.rawHtml === undefined) {
throw new Error(
"rawHtml is undefined -- this transformer is being called out of order",
);
}
// NOTE(review): the body is concatenated without escaping, so a ``` sequence
// inside rawHtml would close the fence early in the rendered markdown.
document.markdown = "```json\n" + document.rawHtml + "\n```";
return document;
}
document.markdown = await parseMarkdown(document.html); document.markdown = await parseMarkdown(document.html);
return document; return document;
} }