feat(scrapeURL): handle contentType JSON better in markdown conversion (#1604)

This commit is contained in:
Gergő Móricz 2025-05-29 15:26:07 +02:00 committed by GitHub
parent 7e73b01599
commit 38c96b524f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 41 additions and 2 deletions

View File

@ -373,5 +373,14 @@ describe("Scrape tests", () => {
});
expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
}, 35000);
}, 30000);
// A response served as application/json should come back as a fenced
// ```json code block in the markdown output, so only the opening fence
// is asserted here.
it.concurrent("application/json content type is markdownified properly", async () => {
const jsonEndpoint = "https://jsonplaceholder.typicode.com/todos/1";
const { markdown } = await scrape({
formats: ["markdown"],
url: jsonEndpoint,
});
expect(markdown).toContain("```json");
}, 30000);
});

View File

@ -750,6 +750,7 @@ export type Document = {
scrapeId?: string;
error?: string;
numPages?: number;
contentType?: string;
proxyUsed: "basic" | "stealth";
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
};

View File

@ -30,7 +30,7 @@ export async function scrapeURLWithFetch(
url: string;
body: string,
status: number;
headers: any;
headers: [string, string][];
};
if (meta.mock !== null) {
@ -117,5 +117,8 @@ export async function scrapeURLWithFetch(
url: response.url,
html: response.body,
statusCode: response.status,
contentType: (response.headers.find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
};
}

View File

@ -273,6 +273,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
error: response.pageError,
statusCode: response.pageStatusCode,
contentType: (Object.entries(response.responseHeaders ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
screenshot: response.screenshot,
...(actions.length > 0
? {
@ -336,6 +340,10 @@ export async function scrapeURLWithFireEnginePlaywright(
error: response.pageError,
statusCode: response.pageStatusCode,
contentType: (Object.entries(response.responseHeaders ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
...(response.screenshots !== undefined && response.screenshots.length > 0
? {
screenshot: response.screenshots[0],
@ -391,5 +399,9 @@ export async function scrapeURLWithFireEngineTLSClient(
html: response.content,
error: response.pageError,
statusCode: response.pageStatusCode,
contentType: (Object.entries(response.responseHeaders ?? {}).find(
(x) => x[0].toLowerCase() === "content-type",
) ?? [])[1] ?? undefined,
};
}

View File

@ -111,6 +111,8 @@ export type EngineScrapeResult = {
};
numPages?: number;
contentType?: string;
};
const engineHandlers: {

View File

@ -379,6 +379,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
statusCode: result.result.statusCode,
error: result.result.error,
numPages: result.result.numPages,
contentType: result.result.contentType,
proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
},
};

View File

@ -61,6 +61,17 @@ export async function deriveMarkdownFromHTML(
);
}
if (document.metadata.contentType?.includes("application/json")) {
if (document.rawHtml === undefined) {
throw new Error(
"rawHtml is undefined -- this transformer is being called out of order",
);
}
document.markdown = "```json\n" + document.rawHtml + "\n```";
return document;
}
document.markdown = await parseMarkdown(document.html);
return document;
}