mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-11 22:39:01 +08:00
feat(scrapeURL): handle contentType JSON better in markdown conversion (#1604)
This commit is contained in:
parent
7e73b01599
commit
38c96b524f
@ -373,5 +373,14 @@ describe("Scrape tests", () => {
|
||||
});
|
||||
|
||||
expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
|
||||
}, 35000);
|
||||
}, 30000);
|
||||
|
||||
it.concurrent("application/json content type is markdownified properly", async () => {
|
||||
const response = await scrape({
|
||||
url: "https://jsonplaceholder.typicode.com/todos/1",
|
||||
formats: ["markdown"],
|
||||
});
|
||||
|
||||
expect(response.markdown).toContain("```json");
|
||||
}, 30000);
|
||||
});
|
||||
|
@ -750,6 +750,7 @@ export type Document = {
|
||||
scrapeId?: string;
|
||||
error?: string;
|
||||
numPages?: number;
|
||||
contentType?: string;
|
||||
proxyUsed: "basic" | "stealth";
|
||||
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
|
||||
};
|
||||
|
@ -30,7 +30,7 @@ export async function scrapeURLWithFetch(
|
||||
url: string;
|
||||
body: string,
|
||||
status: number;
|
||||
headers: any;
|
||||
headers: [string, string][];
|
||||
};
|
||||
|
||||
if (meta.mock !== null) {
|
||||
@ -117,5 +117,8 @@ export async function scrapeURLWithFetch(
|
||||
url: response.url,
|
||||
html: response.body,
|
||||
statusCode: response.status,
|
||||
contentType: (response.headers.find(
|
||||
(x) => x[0].toLowerCase() === "content-type",
|
||||
) ?? [])[1] ?? undefined,
|
||||
};
|
||||
}
|
||||
|
@ -273,6 +273,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
||||
error: response.pageError,
|
||||
statusCode: response.pageStatusCode,
|
||||
|
||||
contentType: (Object.entries(response.responseHeaders ?? {}).find(
|
||||
(x) => x[0].toLowerCase() === "content-type",
|
||||
) ?? [])[1] ?? undefined,
|
||||
|
||||
screenshot: response.screenshot,
|
||||
...(actions.length > 0
|
||||
? {
|
||||
@ -336,6 +340,10 @@ export async function scrapeURLWithFireEnginePlaywright(
|
||||
error: response.pageError,
|
||||
statusCode: response.pageStatusCode,
|
||||
|
||||
contentType: (Object.entries(response.responseHeaders ?? {}).find(
|
||||
(x) => x[0].toLowerCase() === "content-type",
|
||||
) ?? [])[1] ?? undefined,
|
||||
|
||||
...(response.screenshots !== undefined && response.screenshots.length > 0
|
||||
? {
|
||||
screenshot: response.screenshots[0],
|
||||
@ -391,5 +399,9 @@ export async function scrapeURLWithFireEngineTLSClient(
|
||||
html: response.content,
|
||||
error: response.pageError,
|
||||
statusCode: response.pageStatusCode,
|
||||
|
||||
contentType: (Object.entries(response.responseHeaders ?? {}).find(
|
||||
(x) => x[0].toLowerCase() === "content-type",
|
||||
) ?? [])[1] ?? undefined,
|
||||
};
|
||||
}
|
||||
|
@ -111,6 +111,8 @@ export type EngineScrapeResult = {
|
||||
};
|
||||
|
||||
numPages?: number;
|
||||
|
||||
contentType?: string;
|
||||
};
|
||||
|
||||
const engineHandlers: {
|
||||
|
@ -379,6 +379,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
statusCode: result.result.statusCode,
|
||||
error: result.result.error,
|
||||
numPages: result.result.numPages,
|
||||
contentType: result.result.contentType,
|
||||
proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
|
||||
},
|
||||
};
|
||||
|
@ -61,6 +61,17 @@ export async function deriveMarkdownFromHTML(
|
||||
);
|
||||
}
|
||||
|
||||
if (document.metadata.contentType?.includes("application/json")) {
|
||||
if (document.rawHtml === undefined) {
|
||||
throw new Error(
|
||||
"rawHtml is undefined -- this transformer is being called out of order",
|
||||
);
|
||||
}
|
||||
|
||||
document.markdown = "```json\n" + document.rawHtml + "\n```";
|
||||
return document;
|
||||
}
|
||||
|
||||
document.markdown = await parseMarkdown(document.html);
|
||||
return document;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user