mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:39:00 +08:00
fix(scrape): allow getting valid JSON via rawHtml (FIR-852) (#1138)
* fix(scrape): allow getting valid JSON via rawHtml * fix(scrape/test):
This commit is contained in:
parent
401c18761a
commit
892f3a41f3
@ -10,7 +10,7 @@ use url::Url;
|
|||||||
/// # Safety
|
/// # Safety
|
||||||
/// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string.
|
/// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string.
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
|
pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut libc::c_char {
|
||||||
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
||||||
|
|
||||||
let document = parse_html().one(html);
|
let document = parse_html().one(html);
|
||||||
@ -54,7 +54,7 @@ macro_rules! insert_meta_property {
|
|||||||
/// # Safety
|
/// # Safety
|
||||||
/// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string.
|
/// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string.
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
|
pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut libc::c_char {
|
||||||
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
||||||
|
|
||||||
let document = parse_html().one(html);
|
let document = parse_html().one(html);
|
||||||
@ -334,7 +334,7 @@ fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
|
|||||||
/// # Safety
|
/// # Safety
|
||||||
/// Input options must be a C JSON string. Output will be an HTML string. Output string must be freed with free_string.
|
/// Input options must be a C JSON string. Output will be an HTML string. Output string must be freed with free_string.
|
||||||
#[no_mangle]
|
#[no_mangle]
|
||||||
pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
|
pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut libc::c_char {
|
||||||
let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) {
|
let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) {
|
||||||
Ok(x) => x,
|
Ok(x) => x,
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
@ -350,6 +350,26 @@ pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
|
|||||||
CString::new(out).unwrap().into_raw()
|
CString::new(out).unwrap().into_raw()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn _get_inner_json(html: &str) -> Result<String, ()> {
|
||||||
|
Ok(parse_html().one(html).select_first("body")?.text_contents())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// For JSON pages retrieved by browser engines, this function can be used to transform it back into valid JSON.
|
||||||
|
///
|
||||||
|
/// # Safety
|
||||||
|
/// Input must be a C HTML string. Output will be an HTML string. Output string must be freed with free_string.
|
||||||
|
#[no_mangle]
|
||||||
|
pub unsafe extern "C" fn get_inner_json(html: *const libc::c_char) -> *mut libc::c_char {
|
||||||
|
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
||||||
|
|
||||||
|
let out = match _get_inner_json(html) {
|
||||||
|
Ok(x) => x,
|
||||||
|
Err(_) => "RUSTFC:ERROR".to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
CString::new(out).unwrap().into_raw()
|
||||||
|
}
|
||||||
|
|
||||||
/// Frees a string allocated in Rust-land.
|
/// Frees a string allocated in Rust-land.
|
||||||
///
|
///
|
||||||
/// # Safety
|
/// # Safety
|
||||||
|
@ -76,4 +76,17 @@ describe("Scrape tests", () => {
|
|||||||
expect(response.body.data.markdown).toContain("| Country | United States |");
|
expect(response.body.data.markdown).toContain("| Country | United States |");
|
||||||
}, 10000);
|
}, 10000);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
describe("JSON scrape support", () => {
  // Scrapes a JSON endpoint with the rawHtml format and checks that the
  // returned rawHtml string can be parsed as JSON.
  it.concurrent("returns parseable JSON", async () => {
    const response = await scrape({
      url: "https://jsonplaceholder.typicode.com/todos/1",
      formats: ["rawHtml"],
    });

    expectScrapeToSucceed(response);

    const parsed = JSON.parse(response.body.data.rawHtml);
    expect(parsed.id).toBe(1);
  }, 25000); // TODO: mock and shorten
});
|
||||||
});
|
});
|
||||||
|
@ -24,6 +24,7 @@ class RustHTMLTransformer {
|
|||||||
private _extractMetadata: KoffiFunction;
|
private _extractMetadata: KoffiFunction;
|
||||||
private _transformHtml: KoffiFunction;
|
private _transformHtml: KoffiFunction;
|
||||||
private _freeString: KoffiFunction;
|
private _freeString: KoffiFunction;
|
||||||
|
private _getInnerJSON: KoffiFunction;
|
||||||
|
|
||||||
private constructor() {
|
private constructor() {
|
||||||
const lib = koffi.load(rustExecutablePath);
|
const lib = koffi.load(rustExecutablePath);
|
||||||
@ -33,6 +34,7 @@ class RustHTMLTransformer {
|
|||||||
this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
|
this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
|
||||||
this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
|
this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
|
||||||
this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
|
this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
|
||||||
|
this._getInnerJSON = lib.func("get_inner_json", freedResultString, ["string"]);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static async getInstance(): Promise<RustHTMLTransformer> {
|
public static async getInstance(): Promise<RustHTMLTransformer> {
|
||||||
@ -86,6 +88,22 @@ class RustHTMLTransformer {
|
|||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Invokes the native `get_inner_json` binding asynchronously with `html`.
 *
 * Resolves with the string produced by the binding. Rejects with the error
 * reported by the binding call, or with a generic error when the binding
 * returns the "RUSTFC:ERROR" sentinel.
 */
public async getInnerJSON(html: string): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    this._getInnerJSON.async(html, (failure: Error, output: string) => {
      if (failure) {
        reject(failure);
        return;
      }

      if (output === "RUSTFC:ERROR") {
        reject(new Error("Something went wrong on the Rust side."));
        return;
      }

      resolve(output);
    });
  });
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function extractLinks(
|
export async function extractLinks(
|
||||||
@ -116,3 +134,10 @@ export async function transformHtml(
|
|||||||
const converter = await RustHTMLTransformer.getInstance();
|
const converter = await RustHTMLTransformer.getInstance();
|
||||||
return await converter.transformHtml(opts);
|
return await converter.transformHtml(opts);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Passes `html` to `getInnerJSON` on the instance returned by
 * `RustHTMLTransformer.getInstance()` and returns the resulting string.
 */
export async function getInnerJSON(
  html: string,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const innerJson = await transformer.getInnerJSON(html);
  return innerJson;
}
|
||||||
|
@ -25,6 +25,7 @@ import { Action } from "../../../../lib/entities";
|
|||||||
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
import { specialtyScrapeCheck } from "../utils/specialtyHandler";
|
||||||
import { fireEngineDelete } from "./delete";
|
import { fireEngineDelete } from "./delete";
|
||||||
import { MockState, saveMock } from "../../lib/mock";
|
import { MockState, saveMock } from "../../lib/mock";
|
||||||
|
import { getInnerJSON } from "../../../../lib/html-transformer";
|
||||||
|
|
||||||
// This function does not take `Meta` on purpose. It may not access any
|
// This function does not take `Meta` on purpose. It may not access any
|
||||||
// meta values to construct the request -- that must be done by the
|
// meta values to construct the request -- that must be done by the
|
||||||
@ -126,6 +127,14 @@ async function performFireEngineScrape<
|
|||||||
status.responseHeaders,
|
status.responseHeaders,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const contentType = (Object.entries(status.responseHeaders ?? {}).find(
|
||||||
|
(x) => x[0].toLowerCase() === "content-type",
|
||||||
|
) ?? [])[1] ?? "";
|
||||||
|
|
||||||
|
if (contentType.includes("application/json")) {
|
||||||
|
status.content = await getInnerJSON(status.content);
|
||||||
|
}
|
||||||
|
|
||||||
if (status.file) {
|
if (status.file) {
|
||||||
const content = status.file.content;
|
const content = status.file.content;
|
||||||
delete status.file;
|
delete status.file;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user