feat: rewrite html transformer in rust

This commit is contained in:
Móricz Gergő 2025-01-25 09:41:33 +01:00
parent 9c40e0cc8d
commit a2d94b525f
6 changed files with 538 additions and 27 deletions

View File

@ -101,6 +101,17 @@ dependencies = [
"syn 2.0.96",
]
[[package]]
name = "displaydoc"
version = "0.2.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]
[[package]]
name = "dtoa"
version = "1.0.9"
@ -137,6 +148,15 @@ version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
[[package]]
name = "form_urlencoded"
version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
dependencies = [
"percent-encoding",
]
[[package]]
name = "futf"
version = "0.1.5"
@ -204,6 +224,7 @@ dependencies = [
"lol_html",
"serde",
"serde_json",
"url",
]
[[package]]
@ -220,6 +241,145 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "icu_collections"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
dependencies = [
"displaydoc",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_locid"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
dependencies = [
"displaydoc",
"litemap",
"tinystr",
"writeable",
"zerovec",
]
[[package]]
name = "icu_locid_transform"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
dependencies = [
"displaydoc",
"icu_locid",
"icu_locid_transform_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_locid_transform_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
[[package]]
name = "icu_normalizer"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
dependencies = [
"displaydoc",
"icu_collections",
"icu_normalizer_data",
"icu_properties",
"icu_provider",
"smallvec",
"utf16_iter",
"utf8_iter",
"write16",
"zerovec",
]
[[package]]
name = "icu_normalizer_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
[[package]]
name = "icu_properties"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
dependencies = [
"displaydoc",
"icu_collections",
"icu_locid_transform",
"icu_properties_data",
"icu_provider",
"tinystr",
"zerovec",
]
[[package]]
name = "icu_properties_data"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
[[package]]
name = "icu_provider"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
dependencies = [
"displaydoc",
"icu_locid",
"icu_provider_macros",
"stable_deref_trait",
"tinystr",
"writeable",
"yoke",
"zerofrom",
"zerovec",
]
[[package]]
name = "icu_provider_macros"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]
[[package]]
name = "idna"
version = "1.0.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
dependencies = [
"idna_adapter",
"smallvec",
"utf8_iter",
]
[[package]]
name = "idna_adapter"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
dependencies = [
"icu_normalizer",
"icu_properties",
]
[[package]]
name = "indexmap"
version = "1.9.3"
@ -261,6 +421,12 @@ version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "litemap"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
[[package]]
name = "lock_api"
version = "0.4.12"
@ -373,6 +539,12 @@ dependencies = [
"windows-targets",
]
[[package]]
name = "percent-encoding"
version = "2.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
[[package]]
name = "phf"
version = "0.8.0"
@ -793,6 +965,17 @@ dependencies = [
"unicode-ident",
]
[[package]]
name = "synstructure"
version = "0.13.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]
[[package]]
name = "tendril"
version = "0.4.3"
@ -830,18 +1013,51 @@ dependencies = [
"syn 2.0.96",
]
[[package]]
name = "tinystr"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
dependencies = [
"displaydoc",
"zerovec",
]
[[package]]
name = "unicode-ident"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243"
[[package]]
name = "url"
version = "2.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
dependencies = [
"form_urlencoded",
"idna",
"percent-encoding",
]
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "utf16_iter"
version = "1.0.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
[[package]]
name = "utf8_iter"
version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
@ -918,6 +1134,42 @@ version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "write16"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
[[package]]
name = "writeable"
version = "0.5.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
[[package]]
name = "yoke"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
dependencies = [
"serde",
"stable_deref_trait",
"yoke-derive",
"zerofrom",
]
[[package]]
name = "yoke-derive"
version = "0.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
"synstructure",
]
[[package]]
name = "zerocopy"
version = "0.7.35"
@ -938,3 +1190,46 @@ dependencies = [
"quote",
"syn 2.0.96",
]
[[package]]
name = "zerofrom"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
dependencies = [
"zerofrom-derive",
]
[[package]]
name = "zerofrom-derive"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
"synstructure",
]
[[package]]
name = "zerovec"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
dependencies = [
"yoke",
"zerofrom",
"zerovec-derive",
]
[[package]]
name = "zerovec-derive"
version = "0.10.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]

View File

@ -9,6 +9,7 @@ lol_html = "2.2.0"
kuchikiki = "0.8.2"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
url = "2.5.4"
[lib]
crate-type = ["cdylib"]

View File

@ -1,28 +1,9 @@
use std::{collections::HashMap, ffi::{CStr, CString}};
use kuchikiki::{parse_html, traits::TendrilSink};
use serde::Deserialize;
use serde_json::Value;
// #[no_mangle]
// pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
// let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
// let mut output = vec![];
// let mut rewriter = HtmlRewriter::new(
// Settings {
// element_content_handlers: vec! [
// element!("")
// ],
// ..Settings::new()
// },
// |c: &[u8]| output.extend_from_slice(c)
// );
// rewriter.write(html.as_bytes()).unwrap();
// CString::new(String::from_utf8(output).unwrap()).unwrap().into_raw()
// }
use url::Url;
#[no_mangle]
pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
@ -162,6 +143,190 @@ pub extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
}
/// CSS selectors for page chrome (headers, footers, navigation, ads, ...).
///
/// `_transform_html_inner` detaches every element matching one of these
/// selectors when `only_main_content` is set, unless the element contains a
/// match for one of `FORCE_INCLUDE_MAIN_TAGS`.
const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
    // Semantic layout elements
    "header", "footer", "nav", "aside",
    // Header variants
    ".header", ".top", ".navbar", "#header",
    // Footer variants
    ".footer", ".bottom", "#footer",
    // Sidebars
    ".sidebar", ".side", ".aside", "#sidebar",
    // Modals and overlays
    ".modal", ".popup", "#modal", ".overlay",
    // Advertising
    ".ad", ".ads", ".advert", "#ad",
    // Language pickers
    ".lang-selector", ".language", "#language-selector",
    // Social links
    ".social", ".social-media", ".social-links", "#social",
    // Menus and breadcrumbs
    ".menu", ".navigation", "#nav", ".breadcrumbs", "#breadcrumbs",
    // Sharing, widgets, cookie banners
    ".share", "#share", ".widget", "#widget", ".cookie", "#cookie",
];
/// CSS selectors that protect an element from `only_main_content` stripping:
/// an element matching `EXCLUDE_NON_MAIN_TAGS` is kept when one of its
/// descendants matches any selector listed here.
const FORCE_INCLUDE_MAIN_TAGS: [&str; 1] = ["#main"];
/// Options for `_transform_html_inner`, deserialized from the JSON string
/// passed to the `transform_html` FFI entry point.
#[derive(Deserialize)]
struct TranformHTMLOptions {
// Raw HTML document to parse and transform.
html: String,
// URL of the page; parsed with `Url::parse` and used as the base when
// resolving `img[src]` and `a[href]` values.
url: String,
// CSS selectors; when non-empty, matching elements are moved into a fresh
// `<div>` document that replaces the parsed one.
include_tags: Vec<String>,
// CSS selectors whose matching elements are detached from the document.
exclude_tags: Vec<String>,
// When true, elements matching `EXCLUDE_NON_MAIN_TAGS` are detached unless
// they contain a `FORCE_INCLUDE_MAIN_TAGS` match.
only_main_content: bool,
}
/// One candidate parsed from an `<img srcset="...">` attribute.
struct ImageSource {
// Candidate URL (first whitespace-separated token of the candidate).
url: String,
// Numeric part of the descriptor with its final character removed
// (e.g. `2` for `2x`, `800` for `800w`).
size: i32,
// True when the descriptor ends with "x".
is_x: bool,
}
/// Parses a single comma-separated `srcset` candidate such as `"big.png 2x"`.
///
/// A candidate without a descriptor is treated as `1x`. Returns `None` for an
/// empty candidate (e.g. after a trailing comma) or when the descriptor's
/// numeric part does not parse as an `i32` (so fractional densities such as
/// `1.5x` are skipped).
fn _parse_srcset_candidate(candidate: &str) -> Option<ImageSource> {
    // `split_whitespace` tolerates repeated spaces/tabs/newlines between the
    // URL and its descriptor and yields nothing for a blank candidate.
    let mut tokens = candidate.split_whitespace();
    let url = tokens.next()?;
    let descriptor = tokens.next().unwrap_or("1x");

    // Strip the trailing unit character by `char`, not by byte offset, so a
    // multi-byte final character can never trigger a slicing panic.
    let mut numeric = descriptor.chars();
    numeric.next_back();
    let size = numeric.as_str().parse::<i32>().ok()?;

    Some(ImageSource {
        url: url.to_string(),
        size,
        is_x: descriptor.ends_with('x'),
    })
}

/// Cleans up an HTML document and returns the serialized result.
///
/// Steps, in order:
/// 1. If `include_tags` is non-empty, keep only the elements matching those
///    selectors (moved into a fresh `<div>` document).
/// 2. Remove `head`, `meta`, `noscript`, `style` and `script` elements.
/// 3. Remove everything matching `exclude_tags`.
/// 4. If `only_main_content`, remove elements matching
///    `EXCLUDE_NON_MAIN_TAGS` unless they contain a `FORCE_INCLUDE_MAIN_TAGS`
///    match.
/// 5. For `img[srcset]`, set `src` to the candidate with the largest size.
/// 6. Resolve `img[src]` and `a[href]` against `opts.url`.
///
/// Returns `Err(())` when an `include_tags` selector or a built-in selector
/// fails to parse, or when `opts.url` is not a valid absolute URL.
fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
    let mut document = parse_html().one(opts.html);

    if !opts.include_tags.is_empty() {
        let new_document = parse_html().one("<div></div>");
        let root = new_document.select_first("div")?;

        for selector in opts.include_tags.iter() {
            // Collect before moving: `append` detaches the node from the tree
            // the lazy `select` iterator is walking, which would otherwise end
            // the traversal after the first match.
            let matches: Vec<_> = document.select(selector)?.collect();
            for tag in matches {
                root.as_node().append(tag.as_node().clone());
            }
        }

        document = new_document;
    }

    // Elements that never carry readable content.
    for selector in ["head", "meta", "noscript", "style", "script"].iter() {
        while let Ok(node) = document.select_first(selector) {
            node.as_node().detach();
        }
    }

    for selector in opts.exclude_tags.iter() {
        // TODO: implement weird version
        // An unparsable selector makes `select_first` return `Err`, so it is
        // skipped rather than failing the whole transform.
        while let Ok(node) = document.select_first(selector) {
            node.as_node().detach();
        }
    }

    if opts.only_main_content {
        for selector in EXCLUDE_NON_MAIN_TAGS.iter() {
            let candidates: Vec<_> = document.select(selector)?.collect();
            for tag in candidates {
                let contains_forced = FORCE_INCLUDE_MAIN_TAGS.iter().any(|forced| {
                    tag.as_node()
                        .select(forced)
                        .is_ok_and(|mut found| found.next().is_some())
                });
                if !contains_forced {
                    tag.as_node().detach();
                }
            }
        }
    }

    for img in document.select("img[srcset]")? {
        let srcset = img.attributes.borrow().get("srcset").ok_or(())?.to_string();
        let mut sizes: Vec<ImageSource> = srcset
            .split(',')
            .filter_map(_parse_srcset_candidate)
            .collect();

        // With density-only candidates the plain `src` counts as the 1x image.
        if sizes.iter().all(|source| source.is_x) {
            if let Some(src) = img.attributes.borrow().get("src").map(|x| x.to_string()) {
                sizes.push(ImageSource {
                    url: src,
                    size: 1,
                    is_x: true,
                });
            }
        }

        // Stable descending sort: on ties the earliest candidate wins.
        sizes.sort_by(|a, b| b.size.cmp(&a.size));

        if let Some(biggest) = sizes.first() {
            img.attributes.borrow_mut().insert("src", biggest.url.clone());
        }
    }

    let url = Url::parse(&opts.url).map_err(|_| ())?;

    // Make image sources and link targets absolute; values that cannot be
    // joined onto the base URL are left untouched.
    for (selector, attribute) in [("img[src]", "src"), ("a[href]", "href")].iter() {
        for element in document.select(selector)? {
            let old = element
                .attributes
                .borrow()
                .get(*attribute)
                .map(|x| x.to_string())
                .ok_or(())?;
            if let Ok(new) = url.join(&old) {
                element
                    .attributes
                    .borrow_mut()
                    .insert(*attribute, new.to_string());
            }
        }
    }

    Ok(document.to_string())
}
/// FFI entry point: transforms HTML according to a JSON-encoded
/// `TranformHTMLOptions` and returns the result as a heap-allocated C string.
///
/// `opts` must be null or point to a NUL-terminated string. The returned
/// pointer is produced by `CString::into_raw` and must be released with
/// `free_string`.
///
/// Every failure (null pointer, invalid UTF-8, invalid JSON, a transform
/// error, or output containing an interior NUL byte) yields the sentinel
/// string `RUSTFC:ERROR` instead of panicking, because a panic inside an
/// `extern "C"` function would take down the host process.
#[no_mangle]
pub extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
    const ERROR_SENTINEL: &str = "RUSTFC:ERROR";

    let transformed = if opts.is_null() {
        None
    } else {
        unsafe { CStr::from_ptr(opts) }
            .to_str()
            .ok()
            .and_then(|json| serde_json::de::from_str::<TranformHTMLOptions>(json).ok())
            .and_then(|parsed| _transform_html_inner(parsed).ok())
    };

    let out = transformed.unwrap_or_else(|| ERROR_SENTINEL.to_string());

    match CString::new(out) {
        Ok(c_string) => c_string.into_raw(),
        // The sentinel is a NUL-free literal, so this conversion cannot fail.
        Err(_) => CString::new(ERROR_SENTINEL).unwrap().into_raw(),
    }
}
#[no_mangle]
pub extern "C" fn free_string(ptr: *mut i8) {
drop(unsafe { CString::from_raw(ptr) })

View File

@ -10,10 +10,19 @@ const rustExecutablePath = join(
platform() === "darwin" ? "libhtml_transformer.dylib" : "libhtml_transformer.so"
);
/**
 * Payload sent to the native `transform_html` function.
 *
 * The object is serialized with `JSON.stringify`, so the snake_case keys must
 * match the field names of the Rust options struct exactly.
 */
type TransformHtmlOptions = {
  /** Raw HTML to transform. */
  html: string;
  /** Page URL passed alongside the HTML. */
  url: string;
  /** Selectors for the elements to keep. */
  include_tags: string[];
  /** Selectors for the elements to remove. */
  exclude_tags: string[];
  /** Whether to strip non-main page content. */
  only_main_content: boolean;
};
class RustHTMLTransformer {
private static instance: RustHTMLTransformer;
private _extractLinks: KoffiFunction;
private _extractMetadata: KoffiFunction;
private _transformHtml: KoffiFunction;
private _freeString: KoffiFunction;
private constructor() {
@ -23,6 +32,7 @@ class RustHTMLTransformer {
const freedResultString = koffi.disposable(cstn, "string", this._freeString);
this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
}
public static async getInstance(): Promise<RustHTMLTransformer> {
@ -60,6 +70,22 @@ class RustHTMLTransformer {
});
});
}
/**
 * Runs the native `transform_html` function asynchronously.
 *
 * @param opts - Options, sent to the native side as a JSON string.
 * @returns The string produced by the native function.
 * @throws Rejects with the koffi error, or with a generic `Error` when the
 *   native side answers with the `RUSTFC:ERROR` sentinel.
 */
public async transformHtml(opts: TransformHtmlOptions): Promise<string> {
  return new Promise<string>((resolve, reject) => {
    const payload = JSON.stringify(opts);
    this._transformHtml.async(payload, (err: Error, res: string) => {
      if (err) {
        reject(err);
        return;
      }
      if (res === "RUSTFC:ERROR") {
        reject(new Error("Something went wrong on the Rust side."));
        return;
      }
      resolve(res);
    });
  });
}
}
export async function extractLinks(
@ -82,4 +108,11 @@ export async function extractMetadata(
const converter = await RustHTMLTransformer.getInstance();
return await converter.extractMetadata(html);
}
}
/**
 * Transforms HTML through the shared `RustHTMLTransformer` instance.
 *
 * @param opts - Options forwarded unchanged to the transformer.
 * @returns The transformed HTML string.
 */
export async function transformHtml(
  opts: TransformHtmlOptions,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const transformed = await transformer.transformHtml(opts);
  return transformed;
}

View File

@ -1,7 +1,9 @@
// TODO: refactor
import { AnyNode, Cheerio, load } from "cheerio"; // TODO: rustify
import { AnyNode, Cheerio, load } from "cheerio"; // rustified
import { ScrapeOptions } from "../../../controllers/v1/types";
import { transformHtml } from "../../../lib/html-transformer";
import { logger } from "../../../lib/logger";
const excludeNonMainTags = [
"header",
@ -49,11 +51,26 @@ const excludeNonMainTags = [
const forceIncludeMainTags = ["#main"];
export const htmlTransform = (
export const htmlTransform = async (
html: string,
url: string,
scrapeOptions: ScrapeOptions,
) => {
try {
return await transformHtml({
html,
url,
include_tags: (scrapeOptions.includeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
exclude_tags: (scrapeOptions.excludeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
only_main_content: scrapeOptions.onlyMainContent,
})
} catch (error) {
logger.error("Failed to call html-transformer! Falling back to cheerio...", {
error,
module: "scrapeURL", method: "extractLinks"
});
}
let soup = load(html);
// remove unwanted elements

View File

@ -31,17 +31,17 @@ export async function deriveMetadataFromRawHTML(
return document;
}
export function deriveHTMLFromRawHTML(
export async function deriveHTMLFromRawHTML(
meta: Meta,
document: Document,
): Document {
): Promise<Document> {
if (document.rawHtml === undefined) {
throw new Error(
"rawHtml is undefined -- this transformer is being called out of order",
);
}
document.html = htmlTransform(
document.html = await htmlTransform(
document.rawHtml,
document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
meta.options,