diff --git a/apps/api/sharedLibs/html-transformer/Cargo.lock b/apps/api/sharedLibs/html-transformer/Cargo.lock index 43696071..5e5e8b53 100644 --- a/apps/api/sharedLibs/html-transformer/Cargo.lock +++ b/apps/api/sharedLibs/html-transformer/Cargo.lock @@ -101,6 +101,17 @@ dependencies = [ "syn 2.0.96", ] +[[package]] +name = "displaydoc" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "dtoa" version = "1.0.9" @@ -137,6 +148,15 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" +[[package]] +name = "form_urlencoded" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456" +dependencies = [ + "percent-encoding", +] + [[package]] name = "futf" version = "0.1.5" @@ -204,6 +224,7 @@ dependencies = [ "lol_html", "serde", "serde_json", + "url", ] [[package]] @@ -220,6 +241,145 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "icu_collections" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locid" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_locid_transform" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_locid_transform_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_locid_transform_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e" + +[[package]] +name = "icu_normalizer" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "utf16_iter", + "utf8_iter", + "write16", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516" + +[[package]] +name = "icu_properties" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5" +dependencies = [ + "displaydoc", + "icu_collections", + "icu_locid_transform", + "icu_properties_data", + "icu_provider", + "tinystr", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569" + +[[package]] +name = "icu_provider" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9" +dependencies = [ + "displaydoc", + "icu_locid", + "icu_provider_macros", + "stable_deref_trait", + "tinystr", + "writeable", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_provider_macros" +version 
= "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + +[[package]] +name = "idna" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e" +dependencies = [ + "idna_adapter", + "smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + [[package]] name = "indexmap" version = "1.9.3" @@ -261,6 +421,12 @@ version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" +[[package]] +name = "litemap" +version = "0.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104" + [[package]] name = "lock_api" version = "0.4.12" @@ -373,6 +539,12 @@ dependencies = [ "windows-targets", ] +[[package]] +name = "percent-encoding" +version = "2.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e" + [[package]] name = "phf" version = "0.8.0" @@ -793,6 +965,17 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "synstructure" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + [[package]] name = "tendril" version = "0.4.3" @@ -830,18 +1013,51 @@ dependencies = [ "syn 2.0.96", ] +[[package]] +name = "tinystr" 
+version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f" +dependencies = [ + "displaydoc", + "zerovec", +] + [[package]] name = "unicode-ident" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243" +[[package]] +name = "url" +version = "2.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", +] + [[package]] name = "utf-8" version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" +[[package]] +name = "utf16_iter" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + [[package]] name = "wasi" version = "0.9.0+wasi-snapshot-preview1" @@ -918,6 +1134,42 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" +[[package]] +name = "write16" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936" + +[[package]] +name = "writeable" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51" + +[[package]] +name = "yoke" +version = "0.7.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40" +dependencies = [ + "serde", + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", + "synstructure", +] + [[package]] name = "zerocopy" version = "0.7.35" @@ -938,3 +1190,46 @@ dependencies = [ "quote", "syn 2.0.96", ] + +[[package]] +name = "zerofrom" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", + "synstructure", +] + +[[package]] +name = "zerovec" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] diff --git a/apps/api/sharedLibs/html-transformer/Cargo.toml b/apps/api/sharedLibs/html-transformer/Cargo.toml index 9e242060..0cd74dc6 100644 --- a/apps/api/sharedLibs/html-transformer/Cargo.toml +++ b/apps/api/sharedLibs/html-transformer/Cargo.toml @@ -9,6 +9,7 @@ lol_html = "2.2.0" kuchikiki = "0.8.2" serde = { 
version = "1.0", features = ["derive"] }
 serde_json = "1.0"
+url = "2.5.4"
 
 [lib]
 crate-type = ["cdylib"]
diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs
index 290e910b..13d86bfe 100644
--- a/apps/api/sharedLibs/html-transformer/src/lib.rs
+++ b/apps/api/sharedLibs/html-transformer/src/lib.rs
@@ -1,28 +1,9 @@
 use std::{collections::HashMap, ffi::{CStr, CString}};
 
 use kuchikiki::{parse_html, traits::TendrilSink};
+use serde::Deserialize;
 use serde_json::Value;
-
-// #[no_mangle]
-// pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
-//     let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
-
-//     let mut output = vec![];
-
-//     let mut rewriter = HtmlRewriter::new(
-//         Settings {
-//             element_content_handlers: vec! [
-//                 element!("")
-//             ],
-//             ..Settings::new()
-//         },
-//         |c: &[u8]| output.extend_from_slice(c)
-//     );
-
-//     rewriter.write(html.as_bytes()).unwrap();
-
-//     CString::new(String::from_utf8(output).unwrap()).unwrap().into_raw()
-// }
+use url::Url;
 
 #[no_mangle]
 pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
@@ -162,6 +143,216 @@ pub extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
     CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
 }
 
+/// CSS selectors for page chrome (headers, footers, navigation, ads, ...) that
+/// are detached from the document when `only_main_content` is set.
+const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
+    "header",
+    "footer",
+    "nav",
+    "aside",
+    ".header",
+    ".top",
+    ".navbar",
+    "#header",
+    ".footer",
+    ".bottom",
+    "#footer",
+    ".sidebar",
+    ".side",
+    ".aside",
+    "#sidebar",
+    ".modal",
+    ".popup",
+    "#modal",
+    ".overlay",
+    ".ad",
+    ".ads",
+    ".advert",
+    "#ad",
+    ".lang-selector",
+    ".language",
+    "#language-selector",
+    ".social",
+    ".social-media",
+    ".social-links",
+    "#social",
+    ".menu",
+    ".navigation",
+    "#nav",
+    ".breadcrumbs",
+    "#breadcrumbs",
+    ".share",
+    "#share",
+    ".widget",
+    "#widget",
+    ".cookie",
+    "#cookie",
+];
+
+/// Selectors that protect an excluded element: a match for
+/// `EXCLUDE_NON_MAIN_TAGS` is kept when `select` on it finds any of these.
+const FORCE_INCLUDE_MAIN_TAGS: [&str; 1] = [
+    "#main"
+];
+
+/// Options for `transform_html`, deserialized from the JSON string passed over FFI.
+#[derive(Deserialize)]
+struct TransformHTMLOptions {
+    /// Raw HTML to transform.
+    html: String,
+    /// Base URL against which `src`/`href` values are resolved.
+    url: String,
+    /// When non-empty, only elements matching these selectors are kept.
+    include_tags: Vec<String>,
+    /// Elements matching these selectors are detached.
+    exclude_tags: Vec<String>,
+    /// Whether to detach elements matching `EXCLUDE_NON_MAIN_TAGS`.
+    only_main_content: bool,
+}
+
+/// One candidate parsed from an `img` element's `srcset` attribute.
+struct ImageSource {
+    url: String,
+    /// Numeric part of the descriptor (`2` for `2x`, `640` for `640w`).
+    size: i32,
+    /// Whether the descriptor ends in `x`.
+    is_x: bool,
+}
+
+/// Parses one comma-separated `srcset` candidate of the form `url descriptor`.
+///
+/// A candidate without a descriptor counts as `1x`. Returns `None` for an empty
+/// candidate or a descriptor whose numeric part is not an integer.
+fn parse_srcset_candidate(candidate: &str) -> Option<ImageSource> {
+    let mut tokens = candidate.split_whitespace();
+    let url = tokens.next()?;
+    let descriptor = tokens.next().unwrap_or("1x");
+
+    // Drop the trailing unit on a char boundary: byte slicing with `len() - 1`
+    // panics on an empty or non-ASCII descriptor.
+    let mut digits = descriptor.chars();
+    digits.next_back()?;
+    let size: i32 = digits.as_str().parse().ok()?;
+
+    Some(ImageSource {
+        url: url.to_string(),
+        size,
+        is_x: descriptor.ends_with('x'),
+    })
+}
+
+/// Cleans `opts.html` and returns the serialized document.
+///
+/// Steps, in order: keep only `include_tags` matches (when given); drop
+/// `head`/`meta`/`noscript`/`style`/`script`; drop `exclude_tags` matches; drop
+/// `EXCLUDE_NON_MAIN_TAGS` matches when `only_main_content` is set; point each
+/// `img[srcset]` at its largest candidate; resolve `img[src]` and `a[href]`
+/// against `opts.url`.
+///
+/// Returns `Err(())` when `select` rejects an `include_tags` selector or
+/// `Url::parse` rejects `opts.url`.
+fn _transform_html_inner(opts: TransformHTMLOptions) -> Result<String, ()> {
+    let mut document = parse_html().one(opts.html);
+
+    if !opts.include_tags.is_empty() {
+        let new_document = parse_html().one("<div></div>");
+        let root = new_document.select_first("div")?;
+
+        for selector in opts.include_tags.iter() {
+            // Collect first: `append` re-parents the node, which would
+            // otherwise disturb the `select` traversal still in progress.
+            let matches: Vec<_> = document.select(selector)?.collect();
+            for tag in matches {
+                root.as_node().append(tag.as_node().clone());
+            }
+        }
+
+        document = new_document;
+    }
+
+    for selector in ["head", "meta", "noscript", "style", "script"].iter() {
+        while let Ok(x) = document.select_first(selector) {
+            x.as_node().detach();
+        }
+    }
+
+    for selector in opts.exclude_tags.iter() {
+        // TODO: implement weird version
+        while let Ok(x) = document.select_first(selector) {
+            x.as_node().detach();
+        }
+    }
+
+    if opts.only_main_content {
+        for selector in EXCLUDE_NON_MAIN_TAGS.iter() {
+            let matches: Vec<_> = document.select(selector)?.collect();
+            for tag in matches {
+                let keep = FORCE_INCLUDE_MAIN_TAGS.iter().any(|forced| {
+                    tag.as_node().select(forced).is_ok_and(|mut found| found.next().is_some())
+                });
+                if !keep {
+                    tag.as_node().detach();
+                }
+            }
+        }
+    }
+
+    for img in document.select("img[srcset]")? {
+        let srcset = img.attributes.borrow().get("srcset").ok_or(())?.to_string();
+        let mut sizes: Vec<ImageSource> = srcset.split(',').filter_map(parse_srcset_candidate).collect();
+
+        // With only density descriptors, the plain `src` competes as the `1x` candidate.
+        if sizes.iter().all(|x| x.is_x) {
+            let src = img.attributes.borrow().get("src").map(|x| x.to_string());
+            if let Some(src) = src {
+                sizes.push(ImageSource {
+                    url: src,
+                    size: 1,
+                    is_x: true,
+                });
+            }
+        }
+
+        sizes.sort_by(|a, b| b.size.cmp(&a.size));
+
+        if let Some(biggest) = sizes.first() {
+            img.attributes.borrow_mut().insert("src", biggest.url.clone());
+        }
+    }
+
+    let url = Url::parse(&opts.url).map_err(|_| ())?;
+
+    for &(selector, attr) in [("img[src]", "src"), ("a[href]", "href")].iter() {
+        for element in document.select(selector)? {
+            let old = element.attributes.borrow().get(attr).map(|x| x.to_string()).ok_or(())?;
+            if let Ok(new) = url.join(&old) {
+                element.attributes.borrow_mut().insert(attr, new.to_string());
+            }
+        }
+    }
+
+    Ok(document.to_string())
+}
+
+/// FFI entry point: takes a NUL-terminated JSON encoding of
+/// `TransformHTMLOptions` and returns the transformed HTML, or the sentinel
+/// `RUSTFC:ERROR` on any failure. Release the result with `free_string`.
+#[no_mangle]
+pub extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
+    // Invalid UTF-8 or JSON maps to the sentinel instead of panicking: the
+    // caller can only react to a returned string, not to a Rust panic.
+    let out = unsafe { CStr::from_ptr(opts) }
+        .to_str()
+        .ok()
+        .and_then(|raw| serde_json::de::from_str::<TransformHTMLOptions>(raw).ok())
+        .and_then(|opts| _transform_html_inner(opts).ok())
+        .unwrap_or_else(|| "RUSTFC:ERROR".to_string());
+
+    CString::new(out)
+        .unwrap_or_else(|_| CString::new("RUSTFC:ERROR").unwrap())
+        .into_raw()
+}
+
 #[no_mangle]
 pub extern "C" fn free_string(ptr: *mut i8) {
     drop(unsafe { CString::from_raw(ptr) })
diff --git a/apps/api/src/lib/html-transformer.ts b/apps/api/src/lib/html-transformer.ts
index f7d58fb9..bab991c9 100644
--- a/apps/api/src/lib/html-transformer.ts
+++ b/apps/api/src/lib/html-transformer.ts
@@ -10,10 +10,23 @@ const rustExecutablePath = join(
   platform() === "darwin" ? "libhtml_transformer.dylib" : "libhtml_transformer.so"
 );
 
+/**
+ * Options for the Rust `transform_html` export. Keys are snake_case because
+ * the object is JSON-serialized and deserialized by field name on the Rust side.
+ */
+type TransformHtmlOptions = {
+  html: string,
+  url: string,
+  include_tags: string[],
+  exclude_tags: string[],
+  only_main_content: boolean,
+};
+
 class RustHTMLTransformer {
   private static instance: RustHTMLTransformer;
   private _extractLinks: KoffiFunction;
   private _extractMetadata: KoffiFunction;
+  private _transformHtml: KoffiFunction;
   private _freeString: KoffiFunction;
 
   private constructor() {
@@ -23,6 +36,7 @@ class RustHTMLTransformer {
     const freedResultString = koffi.disposable(cstn, "string", this._freeString);
     this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
     this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
+    this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
   }
 
   public static async getInstance(): Promise<RustHTMLTransformer> {
@@ -60,6 +74,25 @@ class RustHTMLTransformer {
       });
     });
   }
+
+  /**
+   * Runs the Rust HTML transformer on `opts`.
+   * Rejects when the FFI call errors or the library returns its
+   * `RUSTFC:ERROR` sentinel.
+   */
+  public async transformHtml(opts: TransformHtmlOptions): Promise<string> {
+    return new Promise<string>((resolve, reject) => {
+      this._transformHtml.async(JSON.stringify(opts), (err: Error, res: string) => {
+        if (err) {
+          reject(err);
+        } else if (res === "RUSTFC:ERROR") {
+          reject(new Error("Something went wrong on the Rust side."));
+        } else {
+          resolve(res);
+        }
+      });
+    });
+  }
 }
 
 export async function extractLinks(
@@ -82,4 +115,12 @@ export async function extractMetadata(
 
   const converter = await RustHTMLTransformer.getInstance();
   return await converter.extractMetadata(html);
-}
\ No newline at end of file
+}
+
+/** Transforms HTML through the shared Rust library; rejects if the library reports an error. */
+export async function transformHtml(
+  opts: TransformHtmlOptions,
+): Promise<string> {
+  const converter = await RustHTMLTransformer.getInstance();
+  return await converter.transformHtml(opts);
+}
diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
index 4edb21f8..9e5ac215 100644
--- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
@@ -1,7 +1,9 @@
 // TODO: refactor
 
-import { AnyNode, Cheerio, load } from "cheerio"; // TODO: rustify
+import { AnyNode, Cheerio, load } from "cheerio"; // rustified
 import { ScrapeOptions } from "../../../controllers/v1/types";
+import { transformHtml } from "../../../lib/html-transformer";
+import { logger } from "../../../lib/logger";
 
 const excludeNonMainTags = [
   "header",
@@ -49,11 +51,27 @@ const excludeNonMainTags = [
 
 const forceIncludeMainTags = ["#main"];
 
-export const htmlTransform = (
+export const htmlTransform = async (
   html: string,
   url: string,
   scrapeOptions: ScrapeOptions,
 ) => {
+  // Prefer the Rust implementation; the cheerio code below is the fallback.
+  try {
+    return await transformHtml({
+      html,
+      url,
+      include_tags: (scrapeOptions.includeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
+      exclude_tags: (scrapeOptions.excludeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
+      only_main_content: scrapeOptions.onlyMainContent,
+    });
+  } catch (error) {
+    logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+      error,
+      module: "scrapeURL", method: "htmlTransform"
+    });
+  }
+
   let soup = load(html);
 
   // remove unwanted elements
diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts
index fe132ffd..2775afcc 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/index.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts
@@ -31,17 +31,17 @@ export async function deriveMetadataFromRawHTML(
   return document;
 }
 
-export function deriveHTMLFromRawHTML(
+export async function deriveHTMLFromRawHTML(
   meta: Meta,
   document: Document,
-): Document {
+): Promise<Document> {
   if (document.rawHtml === undefined) {
     throw new Error(
       "rawHtml is undefined -- this transformer is being called out of order",
     );
   }
 
-  document.html = htmlTransform(
+  document.html = await htmlTransform(
     document.rawHtml,
     document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
     meta.options,