fix(scrape): allow getting valid JSON via rawHtml (FIR-852) (#1138)

* fix(scrape): allow getting valid JSON via rawHtml * fix(scrape/test):
2025-08-12 07:39:00 +08:00 · 2025-02-06 18:35:28 +01:00 · 2025-02-06 18:35:28 +01:00 · 892f3a41f3
commit 892f3a41f3
parent 401c18761a
4 changed files with 70 additions and 3 deletions
--- a/apps/api/sharedLibs/html-transformer/src/lib.rs
+++ b/apps/api/sharedLibs/html-transformer/src/lib.rs
@ -10,7 +10,7 @@ use url::Url;
 /// # Safety
 /// Input options must be a C HTML string. Output will be a JSON string array. Output string must be freed with free_string.
 #[no_mangle]
-pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
+pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut libc::c_char {
    let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();

    let document = parse_html().one(html);
@ -54,7 +54,7 @@ macro_rules! insert_meta_property {
 /// # Safety
 /// Input options must be a C HTML string. Output will be a JSON object. Output string must be freed with free_string.
 #[no_mangle]
-pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
+pub unsafe extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut libc::c_char {
    let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();

    let document = parse_html().one(html);
@ -334,7 +334,7 @@ fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
 /// # Safety
 /// Input options must be a C JSON string. Output will be an HTML string. Output string must be freed with free_string.
 #[no_mangle]
-pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
+pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut libc::c_char {
    let opts: TranformHTMLOptions = match unsafe { CStr::from_ptr(opts) }.to_str().map_err(|_| ()).and_then(|x| serde_json::de::from_str(x).map_err(|_| ())) {
        Ok(x) => x,
        Err(_) => {
@ -350,6 +350,26 @@ pub unsafe extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
    CString::new(out).unwrap().into_raw()
 }

+/// Parses `html` and returns the text contents of the first element matching the `body` selector.
+///
+/// Propagates the `Err(())` from `select_first` when no such element can be selected.
+fn _get_inner_json(html: &str) -> Result<String, ()> {
+    let document = parse_html().one(html);
+    let body = document.select_first("body")?;
+    Ok(body.text_contents())
+}
+
+/// For JSON pages retrieved by browser engines, this function can be used to transform it back into valid JSON.
+///
+/// Returns the text contents of the document's `body` element. If the input pointer is null, the input is
+/// not valid UTF-8, no `body` element can be selected, or the extracted text contains an interior NUL byte,
+/// the sentinel string `RUSTFC:ERROR` is returned instead of panicking.
+///
+/// # Safety
+/// Input must be null or a valid NUL-terminated C HTML string. Output will be a JSON string (the text of the
+/// page body) or the `RUSTFC:ERROR` sentinel. Output string must be freed with free_string.
+#[no_mangle]
+pub unsafe extern "C" fn get_inner_json(html: *const libc::c_char) -> *mut libc::c_char {
+    const ERROR_SENTINEL: &str = "RUSTFC:ERROR";
+
+    let extracted: Result<String, ()> = if html.is_null() {
+        Err(())
+    } else {
+        // SAFETY: `html` is non-null, and the caller guarantees it points to a NUL-terminated string.
+        unsafe { CStr::from_ptr(html) }
+            .to_str()
+            .map_err(|_| ())
+            .and_then(_get_inner_json)
+    };
+
+    let out = extracted.unwrap_or_else(|_| ERROR_SENTINEL.to_string());
+
+    // An interior NUL cannot be carried in a C string; report it through the sentinel
+    // rather than panicking inside an `extern "C"` function.
+    CString::new(out)
+        .unwrap_or_else(|_| CString::new(ERROR_SENTINEL).expect("sentinel contains no NUL byte"))
+        .into_raw()
+}
+
 /// Frees a string allocated in Rust-land.
 /// 
 /// # Safety
--- a/apps/api/src/tests/snips/scrape.test.ts
+++ b/apps/api/src/tests/snips/scrape.test.ts
@ -76,4 +76,17 @@ describe("Scrape tests", () => {
      expect(response.body.data.markdown).toContain("| Country | United States |");
    }, 10000);
  });
+
+  // Scrapes a JSON endpoint with the rawHtml format and checks that the returned
+  // rawHtml parses as JSON.
+  describe("JSON scrape support", () => {
+    it.concurrent("returns parseable JSON", async () => {
+      const response = await scrape({
+        url: "https://jsonplaceholder.typicode.com/todos/1",
+        formats: ["rawHtml"],
+      });
+
+      expectScrapeToSucceed(response);
+      const obj = JSON.parse(response.body.data.rawHtml);
+      expect(obj.id).toBe(1);
+    }, 25000); // TODO: mock and shorten
+  });
 });
--- a/apps/api/src/lib/html-transformer.ts
+++ b/apps/api/src/lib/html-transformer.ts
@ -24,6 +24,7 @@ class RustHTMLTransformer {
  private _extractMetadata: KoffiFunction;
  private _transformHtml: KoffiFunction;
  private _freeString: KoffiFunction;
+  private _getInnerJSON: KoffiFunction;

  private constructor() {
    const lib = koffi.load(rustExecutablePath);
@ -33,6 +34,7 @@ class RustHTMLTransformer {
    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
    this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
+    this._getInnerJSON = lib.func("get_inner_json", freedResultString, ["string"]);
  }

  public static async getInstance(): Promise<RustHTMLTransformer> {
@ -86,6 +88,22 @@ class RustHTMLTransformer {
      });
    });
  }
+
+  /**
+   * Calls the native `get_inner_json` binding asynchronously with `html`.
+   *
+   * Resolves with the string returned by the binding. Rejects with the callback
+   * error if one is reported, or with a generic error when the binding returns
+   * the `RUSTFC:ERROR` sentinel.
+   */
+  public async getInnerJSON(html: string): Promise<string> {
+    return new Promise<string>((resolve, reject) => {
+      this._getInnerJSON.async(html, (err: Error, res: string) => {
+        if (err) {
+          reject(err);
+          return;
+        }
+
+        if (res === "RUSTFC:ERROR") {
+          reject(new Error("Something went wrong on the Rust side."));
+          return;
+        }
+
+        resolve(res);
+      });
+    });
+  }
 }

 export async function extractLinks(
@ -116,3 +134,10 @@ export async function transformHtml(
  const converter = await RustHTMLTransformer.getInstance();
  return await converter.transformHtml(opts);
 }
+
+/**
+ * Module-level wrapper: obtains the transformer via `RustHTMLTransformer.getInstance()`
+ * and forwards `html` to its `getInnerJSON` method, returning the resulting string.
+ */
+export async function getInnerJSON(html: string): Promise<string> {
+  const transformer = await RustHTMLTransformer.getInstance();
+  const inner = await transformer.getInnerJSON(html);
+  return inner;
+}
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@ -25,6 +25,7 @@ import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
 import { fireEngineDelete } from "./delete";
 import { MockState, saveMock } from "../../lib/mock";
+import { getInnerJSON } from "../../../../lib/html-transformer";

 // This function does not take `Meta` on purpose. It may not access any
 // meta values to construct the request -- that must be done by the
@ -126,6 +127,14 @@ async function performFireEngineScrape<
    status.responseHeaders,
  );

+  const contentType = (Object.entries(status.responseHeaders ?? {}).find(
+    (x) => x[0].toLowerCase() === "content-type",
+  ) ?? [])[1] ?? "";
+
+  if (contentType.includes("application/json")) {
+    status.content = await getInnerJSON(status.content);
+  }
+
  if (status.file) {
    const content = status.file.content;
    delete status.file;