mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 06:39:02 +08:00
feat(scrapeURL): handle contentType JSON better in markdown conversion (#1604)
This commit is contained in:
parent
7e73b01599
commit
38c96b524f
@ -373,5 +373,14 @@ describe("Scrape tests", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
|
expect(response.metadata.sourceURL).toBe("https://firecrawl.dev/?pagewanted=all&et_blog");
|
||||||
}, 30000);
|
}, 30000);
|
||||||
|
|
||||||
|
it.concurrent("application/json content type is markdownified properly", async () => {
|
||||||
|
const response = await scrape({
|
||||||
|
url: "https://jsonplaceholder.typicode.com/todos/1",
|
||||||
|
formats: ["markdown"],
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(response.markdown).toContain("```json");
|
||||||
|
}, 30000);
|
||||||
});
|
});
|
||||||
|
@ -750,6 +750,7 @@ export type Document = {
|
|||||||
scrapeId?: string;
|
scrapeId?: string;
|
||||||
error?: string;
|
error?: string;
|
||||||
numPages?: number;
|
numPages?: number;
|
||||||
|
contentType?: string;
|
||||||
proxyUsed: "basic" | "stealth";
|
proxyUsed: "basic" | "stealth";
|
||||||
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
|
// [key: string]: string | string[] | number | { smartScrape: number; other: number; total: number } | undefined;
|
||||||
};
|
};
|
||||||
|
@ -30,7 +30,7 @@ export async function scrapeURLWithFetch(
|
|||||||
url: string;
|
url: string;
|
||||||
body: string,
|
body: string,
|
||||||
status: number;
|
status: number;
|
||||||
headers: any;
|
headers: [string, string][];
|
||||||
};
|
};
|
||||||
|
|
||||||
if (meta.mock !== null) {
|
if (meta.mock !== null) {
|
||||||
@ -117,5 +117,8 @@ export async function scrapeURLWithFetch(
|
|||||||
url: response.url,
|
url: response.url,
|
||||||
html: response.body,
|
html: response.body,
|
||||||
statusCode: response.status,
|
statusCode: response.status,
|
||||||
|
contentType: (response.headers.find(
|
||||||
|
(x) => x[0].toLowerCase() === "content-type",
|
||||||
|
) ?? [])[1] ?? undefined,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -273,6 +273,10 @@ export async function scrapeURLWithFireEngineChromeCDP(
|
|||||||
error: response.pageError,
|
error: response.pageError,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
|
|
||||||
|
contentType: (Object.entries(response.responseHeaders ?? {}).find(
|
||||||
|
(x) => x[0].toLowerCase() === "content-type",
|
||||||
|
) ?? [])[1] ?? undefined,
|
||||||
|
|
||||||
screenshot: response.screenshot,
|
screenshot: response.screenshot,
|
||||||
...(actions.length > 0
|
...(actions.length > 0
|
||||||
? {
|
? {
|
||||||
@ -336,6 +340,10 @@ export async function scrapeURLWithFireEnginePlaywright(
|
|||||||
error: response.pageError,
|
error: response.pageError,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
|
|
||||||
|
contentType: (Object.entries(response.responseHeaders ?? {}).find(
|
||||||
|
(x) => x[0].toLowerCase() === "content-type",
|
||||||
|
) ?? [])[1] ?? undefined,
|
||||||
|
|
||||||
...(response.screenshots !== undefined && response.screenshots.length > 0
|
...(response.screenshots !== undefined && response.screenshots.length > 0
|
||||||
? {
|
? {
|
||||||
screenshot: response.screenshots[0],
|
screenshot: response.screenshots[0],
|
||||||
@ -391,5 +399,9 @@ export async function scrapeURLWithFireEngineTLSClient(
|
|||||||
html: response.content,
|
html: response.content,
|
||||||
error: response.pageError,
|
error: response.pageError,
|
||||||
statusCode: response.pageStatusCode,
|
statusCode: response.pageStatusCode,
|
||||||
|
|
||||||
|
contentType: (Object.entries(response.responseHeaders ?? {}).find(
|
||||||
|
(x) => x[0].toLowerCase() === "content-type",
|
||||||
|
) ?? [])[1] ?? undefined,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
@ -111,6 +111,8 @@ export type EngineScrapeResult = {
|
|||||||
};
|
};
|
||||||
|
|
||||||
numPages?: number;
|
numPages?: number;
|
||||||
|
|
||||||
|
contentType?: string;
|
||||||
};
|
};
|
||||||
|
|
||||||
const engineHandlers: {
|
const engineHandlers: {
|
||||||
|
@ -379,6 +379,7 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
|||||||
statusCode: result.result.statusCode,
|
statusCode: result.result.statusCode,
|
||||||
error: result.result.error,
|
error: result.result.error,
|
||||||
numPages: result.result.numPages,
|
numPages: result.result.numPages,
|
||||||
|
contentType: result.result.contentType,
|
||||||
proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
|
proxyUsed: meta.featureFlags.has("stealthProxy") ? "stealth" : "basic",
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
@ -61,6 +61,17 @@ export async function deriveMarkdownFromHTML(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (document.metadata.contentType?.includes("application/json")) {
|
||||||
|
if (document.rawHtml === undefined) {
|
||||||
|
throw new Error(
|
||||||
|
"rawHtml is undefined -- this transformer is being called out of order",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
document.markdown = "```json\n" + document.rawHtml + "\n```";
|
||||||
|
return document;
|
||||||
|
}
|
||||||
|
|
||||||
document.markdown = await parseMarkdown(document.html);
|
document.markdown = await parseMarkdown(document.html);
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user