From 892f3a41f3964d4634cbee1d3d94dd8a8a24a012 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 6 Feb 2025 18:35:28 +0100 Subject: [PATCH] fix(scrape): allow getting valid JSON via rawHtml (FIR-852) (#1138) * fix(scrape): allow getting valid JSON via rawHtml * fix(scrape/test): --- .../sharedLibs/html-transformer/src/lib.rs | 26 ++++++++++++++++--- apps/api/src/__tests__/snips/scrape.test.ts | 13 ++++++++++ apps/api/src/lib/html-transformer.ts | 25 ++++++++++++++++++ .../scrapeURL/engines/fire-engine/index.ts | 9 +++++++ 4 files changed, 70 insertions(+), 3 deletions(-) diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs index d4a29934..ffa54d32 100644 --- a/apps/api/sharedLibs/html-transformer/src/lib.rs +++ b/apps/api/sharedLibs/html-transformer/src/lib.rs @@ -10,7 +10,7 @@ use url::Url; /// # Safety /// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string. #[no_mangle] -pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 { +pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut libc::c_char { let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap(); let document = parse_html().one(html); @@ -54,7 +54,7 @@ macro_rules! insert_meta_property { /// # Safety /// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string. #[no_mangle] -pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 { +pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut libc::c_char { let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap(); let document = parse_html().one(html); @@ -334,7 +334,7 @@ fn _transform_html_inner(opts: TranformHTMLOptions) -> Result { /// # Safety /// Input options must be a C JSON string. Output will be an HTML string. 
Output string must be freed with free_string. #[no_mangle] -pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 { +pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut libc::c_char { let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) { Ok(x) => x, Err(_) => { @@ -350,6 +350,26 @@ pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 { CString::new(out).unwrap().into_raw() } +fn _get_inner_json(html: &str) -> Result { + Ok(parse_html().one(html).select_first("body")?.text_contents()) +} + +/// For JSON pages retrieved by browser engines, this function can be used to transform them back into valid JSON. +/// +/// # Safety +/// Input must be a C HTML string. Output will be the text content of the HTML body (expected to be the JSON payload), or the sentinel string "RUSTFC:ERROR" on failure. Output string must be freed with free_string. +#[no_mangle] +pub unsafe extern "C" fn get_inner_json(html: *const libc::c_char) -> *mut libc::c_char { + let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap(); + + let out = match _get_inner_json(html) { + Ok(x) => x, + Err(_) => "RUSTFC:ERROR".to_string(), + }; + + CString::new(out).unwrap().into_raw() +} + /// Frees a string allocated in Rust-land. 
/// /// # Safety diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 5b2e5db9..0c53edb8 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -76,4 +76,17 @@ describe("Scrape tests", () => { expect(response.body.data.markdown).toContain("| Country | United States |"); }, 10000); }); + + describe("JSON scrape support", () => { + it.concurrent("returns parseable JSON", async () => { + const response = await scrape({ + url: "https://jsonplaceholder.typicode.com/todos/1", + formats: ["rawHtml"], + }); + + expectScrapeToSucceed(response); + const obj = JSON.parse(response.body.data.rawHtml); + expect(obj.id).toBe(1); + }, 25000); // TODO: mock and shorten + }) }); diff --git a/apps/api/src/lib/html-transformer.ts b/apps/api/src/lib/html-transformer.ts index bab991c9..b089ccc6 100644 --- a/apps/api/src/lib/html-transformer.ts +++ b/apps/api/src/lib/html-transformer.ts @@ -24,6 +24,7 @@ class RustHTMLTransformer { private _extractMetadata: KoffiFunction; private _transformHtml: KoffiFunction; private _freeString: KoffiFunction; + private _getInnerJSON: KoffiFunction; private constructor() { const lib = koffi.load(rustExecutablePath); @@ -33,6 +34,7 @@ class RustHTMLTransformer { this._extractLinks = lib.func("extract_links", freedResultString, ["string"]); this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]); this._transformHtml = lib.func("transform_html", freedResultString, ["string"]); + this._getInnerJSON = lib.func("get_inner_json", freedResultString, ["string"]); } public static async getInstance(): Promise { @@ -86,6 +88,22 @@ class RustHTMLTransformer { }); }); } + + public async getInnerJSON(html: string): Promise { + return new Promise((resolve, reject) => { + this._getInnerJSON.async(html, (err: Error, res: string) => { + if (err) { + reject(err); + } else { + if (res === "RUSTFC:ERROR") { + reject(new Error("Something 
went wrong on the Rust side.")); + } else { + resolve(res); + } + } + }); + }); + } } export async function extractLinks( @@ -116,3 +134,10 @@ export async function transformHtml( const converter = await RustHTMLTransformer.getInstance(); return await converter.transformHtml(opts); } + +export async function getInnerJSON( + html: string, +): Promise { + const converter = await RustHTMLTransformer.getInstance(); + return await converter.getInnerJSON(html); +} diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index eda8402d..04601da6 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -25,6 +25,7 @@ import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; import { fireEngineDelete } from "./delete"; import { MockState, saveMock } from "../../lib/mock"; +import { getInnerJSON } from "../../../../lib/html-transformer"; // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the @@ -126,6 +127,14 @@ async function performFireEngineScrape< status.responseHeaders, ); + const contentType = (Object.entries(status.responseHeaders ?? {}).find( + (x) => x[0].toLowerCase() === "content-type", + ) ?? [])[1] ?? ""; + + if (contentType.includes("application/json")) { + status.content = await getInnerJSON(status.content); + } + if (status.file) { const content = status.file.content; delete status.file;