mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-04 11:24:40 +08:00
feat: rewrite html transformer in rust
This commit is contained in:
parent
9c40e0cc8d
commit
a2d94b525f
295
apps/api/sharedLibs/html-transformer/Cargo.lock
generated
295
apps/api/sharedLibs/html-transformer/Cargo.lock
generated
@ -101,6 +101,17 @@ dependencies = [
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "displaydoc"
|
||||
version = "0.2.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtoa"
|
||||
version = "1.0.9"
|
||||
@ -137,6 +148,15 @@ version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
|
||||
|
||||
[[package]]
|
||||
name = "form_urlencoded"
|
||||
version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e13624c2627564efccf4934284bdd98cbaa14e79b0b5a141218e507b3a823456"
|
||||
dependencies = [
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "futf"
|
||||
version = "0.1.5"
|
||||
@ -204,6 +224,7 @@ dependencies = [
|
||||
"lol_html",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"url",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -220,6 +241,145 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_collections"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "db2fa452206ebee18c4b5c2274dbf1de17008e874b4dc4f0aea9d01ca79e4526"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13acbb8371917fc971be86fc8057c41a64b521c184808a698c02acc242dbf637"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"litemap",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid_transform"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "01d11ac35de8e40fdeda00d9e1e9d92525f3f9d887cdd7aa81d727596788b54e"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locid",
|
||||
"icu_locid_transform_data",
|
||||
"icu_provider",
|
||||
"tinystr",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_locid_transform_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fdc8ff3388f852bede6b579ad4e978ab004f139284d7b28715f773507b946f6e"
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "19ce3e0da2ec68599d193c93d088142efd7f9c5d6fc9b803774855747dc6a84f"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_normalizer_data",
|
||||
"icu_properties",
|
||||
"icu_provider",
|
||||
"smallvec",
|
||||
"utf16_iter",
|
||||
"utf8_iter",
|
||||
"write16",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_normalizer_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f8cafbf7aa791e9b22bec55a167906f9e1215fd475cd22adfcf660e03e989516"
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties"
|
||||
version = "1.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "93d6020766cfc6302c15dbbc9c8778c37e62c14427cb7f6e601d849e092aeef5"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_collections",
|
||||
"icu_locid_transform",
|
||||
"icu_properties_data",
|
||||
"icu_provider",
|
||||
"tinystr",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_properties_data"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "67a8effbc3dd3e4ba1afa8ad918d5684b8868b3b26500753effea8d2eed19569"
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ed421c8a8ef78d3e2dbc98a973be2f3770cb42b606e3ab18d6237c4dfde68d9"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"icu_locid",
|
||||
"icu_provider_macros",
|
||||
"stable_deref_trait",
|
||||
"tinystr",
|
||||
"writeable",
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "icu_provider_macros"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1ec89e9337638ecdc08744df490b221a7399bf8d164eb52a665454e60e075ad6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna"
|
||||
version = "1.0.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "686f825264d630750a544639377bae737628043f20d38bbc029e8f29ea968a7e"
|
||||
dependencies = [
|
||||
"idna_adapter",
|
||||
"smallvec",
|
||||
"utf8_iter",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "idna_adapter"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "daca1df1c957320b2cf139ac61e7bd64fed304c5040df000a745aa1de3b4ef71"
|
||||
dependencies = [
|
||||
"icu_normalizer",
|
||||
"icu_properties",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
@ -261,6 +421,12 @@ version = "0.2.169"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
||||
|
||||
[[package]]
|
||||
name = "litemap"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.12"
|
||||
@ -373,6 +539,12 @@ dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "percent-encoding"
|
||||
version = "2.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3148f5046208a5d56bcfc03053e3ca6334e51da8dfb19b6cdc8b306fae3283e"
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.8.0"
|
||||
@ -793,6 +965,17 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "synstructure"
|
||||
version = "0.13.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
@ -830,18 +1013,51 @@ dependencies = [
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinystr"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9117f5d4db391c1cf6927e7bea3db74b9a1c1add8f7eda9ffd5364f40f57b82f"
|
||||
dependencies = [
|
||||
"displaydoc",
|
||||
"zerovec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243"
|
||||
|
||||
[[package]]
|
||||
name = "url"
|
||||
version = "2.5.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32f8b686cadd1473f4bd0117a5d28d36b1ade384ea9b5069a1c40aefed7fda60"
|
||||
dependencies = [
|
||||
"form_urlencoded",
|
||||
"idna",
|
||||
"percent-encoding",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "utf16_iter"
|
||||
version = "1.0.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c8232dd3cdaed5356e0f716d285e4b40b932ac434100fe9b7e0e8e935b9e6246"
|
||||
|
||||
[[package]]
|
||||
name = "utf8_iter"
|
||||
version = "1.0.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.9.0+wasi-snapshot-preview1"
|
||||
@ -918,6 +1134,42 @@ version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "write16"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d1890f4022759daae28ed4fe62859b1236caebfc61ede2f63ed4e695f3f6d936"
|
||||
|
||||
[[package]]
|
||||
name = "writeable"
|
||||
version = "0.5.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e9df38ee2d2c3c5948ea468a8406ff0db0b29ae1ffde1bcf20ef305bcc95c51"
|
||||
|
||||
[[package]]
|
||||
name = "yoke"
|
||||
version = "0.7.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "120e6aef9aa629e3d4f52dc8cc43a015c7724194c97dfaf45180d2daf2b77f40"
|
||||
dependencies = [
|
||||
"serde",
|
||||
"stable_deref_trait",
|
||||
"yoke-derive",
|
||||
"zerofrom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "yoke-derive"
|
||||
version = "0.7.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2380878cad4ac9aac1e2435f3eb4020e8374b5f13c296cb75b4620ff8e229154"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
@ -938,3 +1190,46 @@ dependencies = [
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cff3ee08c995dee1859d998dea82f7374f2826091dd9cd47def953cae446cd2e"
|
||||
dependencies = [
|
||||
"zerofrom-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerofrom-derive"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "595eed982f7d355beb85837f651fa22e90b3c044842dc7f2c2842c086f295808"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
"synstructure",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec"
|
||||
version = "0.10.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "aa2b893d79df23bfb12d5461018d408ea19dfafe76c2c7ef6d4eba614f8ff079"
|
||||
dependencies = [
|
||||
"yoke",
|
||||
"zerofrom",
|
||||
"zerovec-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerovec-derive"
|
||||
version = "0.10.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6eafa6dfb17584ea3e2bd6e76e0cc15ad7af12b09abdd1ca55961bed9b1063c6"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
@ -9,6 +9,7 @@ lol_html = "2.2.0"
|
||||
kuchikiki = "0.8.2"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
url = "2.5.4"
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
||||
|
@ -1,28 +1,9 @@
|
||||
use std::{collections::HashMap, ffi::{CStr, CString}};
|
||||
|
||||
use kuchikiki::{parse_html, traits::TendrilSink};
|
||||
use serde::Deserialize;
|
||||
use serde_json::Value;
|
||||
|
||||
// #[no_mangle]
|
||||
// pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
|
||||
// let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
||||
|
||||
// let mut output = vec![];
|
||||
|
||||
// let mut rewriter = HtmlRewriter::new(
|
||||
// Settings {
|
||||
// element_content_handlers: vec! [
|
||||
// element!("")
|
||||
// ],
|
||||
// ..Settings::new()
|
||||
// },
|
||||
// |c: &[u8]| output.extend_from_slice(c)
|
||||
// );
|
||||
|
||||
// rewriter.write(html.as_bytes()).unwrap();
|
||||
|
||||
// CString::new(String::from_utf8(output).unwrap()).unwrap().into_raw()
|
||||
// }
|
||||
use url::Url;
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
|
||||
@ -162,6 +143,190 @@ pub extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
|
||||
CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
|
||||
}
|
||||
|
||||
/// CSS selectors for elements that are removed when `only_main_content` is
/// requested (unless the element contains a `FORCE_INCLUDE_MAIN_TAGS` match).
const EXCLUDE_NON_MAIN_TAGS: [&str; 41] = [
    // semantic page-chrome elements
    "header", "footer", "nav", "aside",
    // headers
    ".header", ".top", ".navbar", "#header",
    // footers
    ".footer", ".bottom", "#footer",
    // sidebars
    ".sidebar", ".side", ".aside", "#sidebar",
    // modals and overlays
    ".modal", ".popup", "#modal", ".overlay",
    // advertising
    ".ad", ".ads", ".advert", "#ad",
    // language selectors
    ".lang-selector", ".language", "#language-selector",
    // social widgets
    ".social", ".social-media", ".social-links", "#social",
    // menus and navigation
    ".menu", ".navigation", "#nav",
    // breadcrumbs
    ".breadcrumbs", "#breadcrumbs",
    // share buttons
    ".share", "#share",
    // generic widgets
    ".widget", "#widget",
    // cookie banners
    ".cookie", "#cookie",
];
|
||||
|
||||
/// Selectors that protect an otherwise-excluded element: when an element
/// matched for removal contains a match for any of these, it is kept.
const FORCE_INCLUDE_MAIN_TAGS: [&str; 1] = ["#main"];
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct TranformHTMLOptions {
|
||||
html: String,
|
||||
url: String,
|
||||
include_tags: Vec<String>,
|
||||
exclude_tags: Vec<String>,
|
||||
only_main_content: bool,
|
||||
}
|
||||
|
||||
/// One candidate parsed from an `<img srcset>` attribute.
struct ImageSource {
    /// Candidate image URL.
    url: String,
    /// Numeric part of the descriptor (e.g. `2` for `2x`, `640` for `640w`).
    size: i32,
    /// True when the descriptor ends in `x`.
    is_x: bool,
}
|
||||
|
||||
fn _transform_html_inner(opts: TranformHTMLOptions) -> Result<String, ()> {
|
||||
let mut document = parse_html().one(opts.html);
|
||||
|
||||
if opts.include_tags.len() > 0 {
|
||||
let new_document = parse_html().one("<div></div>");
|
||||
let root = new_document.select_first("div")?;
|
||||
|
||||
for x in opts.include_tags.iter() {
|
||||
for tag in document.select(&x)? {
|
||||
root.as_node().append(tag.as_node().clone());
|
||||
}
|
||||
}
|
||||
|
||||
document = new_document;
|
||||
}
|
||||
|
||||
while let Ok(x) = document.select_first("head") {
|
||||
x.as_node().detach();
|
||||
}
|
||||
|
||||
while let Ok(x) = document.select_first("meta") {
|
||||
x.as_node().detach();
|
||||
}
|
||||
|
||||
while let Ok(x) = document.select_first("noscript") {
|
||||
x.as_node().detach();
|
||||
}
|
||||
|
||||
while let Ok(x) = document.select_first("style") {
|
||||
x.as_node().detach();
|
||||
}
|
||||
|
||||
while let Ok(x) = document.select_first("script") {
|
||||
x.as_node().detach();
|
||||
}
|
||||
|
||||
for x in opts.exclude_tags.iter() {
|
||||
// TODO: implement weird version
|
||||
while let Ok(x) = document.select_first(&x) {
|
||||
x.as_node().detach();
|
||||
}
|
||||
}
|
||||
|
||||
if opts.only_main_content {
|
||||
for x in EXCLUDE_NON_MAIN_TAGS.iter() {
|
||||
let x: Vec<_> = document.select(&format!("{}", x))?.collect();
|
||||
for tag in x {
|
||||
if !FORCE_INCLUDE_MAIN_TAGS.iter().any(|x| tag.as_node().select(&x).is_ok_and(|mut x| x.next().is_some())) {
|
||||
tag.as_node().detach();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for img in document.select("img[srcset]")? {
|
||||
let mut sizes: Vec<ImageSource> = img.attributes.borrow().get("srcset").ok_or(())?.to_string().split(",").filter_map(|x| {
|
||||
let tok: Vec<&str> = x.trim().split(" ").collect();
|
||||
let tok_1 = if tok.len() > 0 {
|
||||
tok[1]
|
||||
} else {
|
||||
"1x"
|
||||
};
|
||||
if let Ok(parsed_size) = tok_1[..tok_1.len()-1].parse() {
|
||||
Some(ImageSource {
|
||||
url: tok[0].to_string(),
|
||||
size: parsed_size,
|
||||
is_x: tok_1.ends_with("x")
|
||||
})
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}).collect();
|
||||
|
||||
if sizes.iter().all(|x| x.is_x) {
|
||||
if let Some(src) = img.attributes.borrow().get("src").map(|x| x.to_string()) {
|
||||
sizes.push(ImageSource {
|
||||
url: src,
|
||||
size: 1,
|
||||
is_x: true,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
sizes.sort_by(|a, b| b.size.cmp(&a.size));
|
||||
|
||||
if let Some(biggest) = sizes.first() {
|
||||
img.attributes.borrow_mut().insert("src", biggest.url.clone());
|
||||
}
|
||||
}
|
||||
|
||||
let url = Url::parse(&opts.url).map_err(|_| ())?;
|
||||
|
||||
for img in document.select("img[src]")? {
|
||||
let old = img.attributes.borrow().get("src").map(|x| x.to_string()).ok_or(())?;
|
||||
if let Ok(new) = url.join(&old) {
|
||||
img.attributes.borrow_mut().insert("src", new.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
for anchor in document.select("a[href]")? {
|
||||
let old = anchor.attributes.borrow().get("href").map(|x| x.to_string()).ok_or(())?;
|
||||
if let Ok(new) = url.join(&old) {
|
||||
anchor.attributes.borrow_mut().insert("href", new.to_string());
|
||||
}
|
||||
}
|
||||
|
||||
Ok(document.to_string())
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn transform_html(opts: *const libc::c_char) -> *mut i8 {
|
||||
let opts: TranformHTMLOptions = serde_json::de::from_str(&unsafe { CStr::from_ptr(opts) }.to_str().unwrap()).unwrap();
|
||||
|
||||
let out = match _transform_html_inner(opts) {
|
||||
Ok(x) => x,
|
||||
Err(_) => "RUSTFC:ERROR".to_string(),
|
||||
};
|
||||
|
||||
CString::new(out).unwrap().into_raw()
|
||||
}
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn free_string(ptr: *mut i8) {
|
||||
drop(unsafe { CString::from_raw(ptr) })
|
||||
|
@ -10,10 +10,19 @@ const rustExecutablePath = join(
|
||||
platform() === "darwin" ? "libhtml_transformer.dylib" : "libhtml_transformer.so"
|
||||
);
|
||||
|
||||
/** Options object that is JSON-serialized and handed to the native `transform_html` function. */
type TransformHtmlOptions = {
  /** Raw HTML to transform. */
  html: string;
  /** URL of the page the HTML came from. */
  url: string;
  /** Selectors to keep. */
  include_tags: string[];
  /** Selectors to remove. */
  exclude_tags: string[];
  /** Whether to restrict the output to the main content. */
  only_main_content: boolean;
};
|
||||
|
||||
class RustHTMLTransformer {
|
||||
private static instance: RustHTMLTransformer;
|
||||
private _extractLinks: KoffiFunction;
|
||||
private _extractMetadata: KoffiFunction;
|
||||
private _transformHtml: KoffiFunction;
|
||||
private _freeString: KoffiFunction;
|
||||
|
||||
private constructor() {
|
||||
@ -23,6 +32,7 @@ class RustHTMLTransformer {
|
||||
const freedResultString = koffi.disposable(cstn, "string", this._freeString);
|
||||
this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
|
||||
this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
|
||||
this._transformHtml = lib.func("transform_html", freedResultString, ["string"]);
|
||||
}
|
||||
|
||||
public static async getInstance(): Promise<RustHTMLTransformer> {
|
||||
@ -60,6 +70,22 @@ class RustHTMLTransformer {
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Runs the native `transform_html` function asynchronously.
 *
 * @param opts - Options; serialized to JSON before being passed to the native side.
 * @returns The transformed HTML string.
 * @throws The error reported by the native call, or a generic `Error` when the
 *   native side returns the `RUSTFC:ERROR` sentinel.
 */
public async transformHtml(opts: TransformHtmlOptions): Promise<string> {
  const payload = JSON.stringify(opts);

  return new Promise<string>((resolve, reject) => {
    this._transformHtml.async(payload, (err: Error, res: string) => {
      if (err) {
        reject(err);
        return;
      }

      if (res === "RUSTFC:ERROR") {
        reject(new Error("Something went wrong on the Rust side."));
        return;
      }

      resolve(res);
    });
  });
}
|
||||
}
|
||||
|
||||
export async function extractLinks(
|
||||
@ -82,4 +108,11 @@ export async function extractMetadata(
|
||||
|
||||
const converter = await RustHTMLTransformer.getInstance();
|
||||
return await converter.extractMetadata(html);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Transforms HTML through the shared `RustHTMLTransformer` instance.
 *
 * @param opts - Transformation options forwarded unchanged to the transformer.
 * @returns The transformed HTML string.
 */
export async function transformHtml(
  opts: TransformHtmlOptions,
): Promise<string> {
  const transformer = await RustHTMLTransformer.getInstance();
  const transformed = await transformer.transformHtml(opts);
  return transformed;
}
|
||||
|
@ -1,7 +1,9 @@
|
||||
// TODO: refactor
|
||||
|
||||
import { AnyNode, Cheerio, load } from "cheerio"; // TODO: rustify
|
||||
import { AnyNode, Cheerio, load } from "cheerio"; // rustified
|
||||
import { ScrapeOptions } from "../../../controllers/v1/types";
|
||||
import { transformHtml } from "../../../lib/html-transformer";
|
||||
import { logger } from "../../../lib/logger";
|
||||
|
||||
const excludeNonMainTags = [
|
||||
"header",
|
||||
@ -49,11 +51,26 @@ const excludeNonMainTags = [
|
||||
|
||||
const forceIncludeMainTags = ["#main"];
|
||||
|
||||
export const htmlTransform = (
|
||||
export const htmlTransform = async (
|
||||
html: string,
|
||||
url: string,
|
||||
scrapeOptions: ScrapeOptions,
|
||||
) => {
|
||||
try {
|
||||
return await transformHtml({
|
||||
html,
|
||||
url,
|
||||
include_tags: (scrapeOptions.includeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
|
||||
exclude_tags: (scrapeOptions.excludeTags ?? []).map(x => x.trim()).filter((x) => x.length !== 0),
|
||||
only_main_content: scrapeOptions.onlyMainContent,
|
||||
})
|
||||
} catch (error) {
|
||||
logger.error("Failed to call html-transformer! Falling back to cheerio...", {
|
||||
error,
|
||||
module: "scrapeURL", method: "extractLinks"
|
||||
});
|
||||
}
|
||||
|
||||
let soup = load(html);
|
||||
|
||||
// remove unwanted elements
|
||||
|
@ -31,17 +31,17 @@ export async function deriveMetadataFromRawHTML(
|
||||
return document;
|
||||
}
|
||||
|
||||
export function deriveHTMLFromRawHTML(
|
||||
export async function deriveHTMLFromRawHTML(
|
||||
meta: Meta,
|
||||
document: Document,
|
||||
): Document {
|
||||
): Promise<Document> {
|
||||
if (document.rawHtml === undefined) {
|
||||
throw new Error(
|
||||
"rawHtml is undefined -- this transformer is being called out of order",
|
||||
);
|
||||
}
|
||||
|
||||
document.html = htmlTransform(
|
||||
document.html = await htmlTransform(
|
||||
document.rawHtml,
|
||||
document.metadata.url ?? document.metadata.sourceURL ?? meta.url,
|
||||
meta.options,
|
||||
|
Loading…
x
Reference in New Issue
Block a user