port most of cheerio stuff to rust (#1089)

This commit is contained in:
Gergő Móricz 2025-01-24 22:04:54 +01:00 committed by GitHub
parent 0d9c9f36b8
commit b005450a34
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
13 changed files with 1321 additions and 16 deletions

View File

@ -27,10 +27,20 @@ RUN cd /app/sharedLibs/go-html-to-md && \
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \
chmod +x html-to-markdown.so
# Install Rust
FROM rust:1-bullseye AS rust-base
COPY sharedLibs/html-transformer /app/sharedLibs/html-transformer
# Build the Rust html-transformer shared library
RUN cd /app/sharedLibs/html-transformer && \
cargo build --release && \
chmod +x target/release/libhtml_transformer.so
FROM base
COPY --from=prod-deps /app/node_modules /app/node_modules
COPY --from=build /app /app
COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so
COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so
# Start the server by default, this can be overwritten at runtime
EXPOSE 8080

View File

@ -0,0 +1 @@
target

View File

@ -0,0 +1,940 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "allocator-api2"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "bitflags"
version = "1.3.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
[[package]]
name = "bitflags"
version = "2.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
[[package]]
name = "byteorder"
version = "1.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "convert_case"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
[[package]]
name = "cssparser"
version = "0.27.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa 0.4.8",
"matches",
"phf 0.8.0",
"proc-macro2",
"quote",
"smallvec",
"syn 1.0.109",
]
[[package]]
name = "cssparser"
version = "0.29.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa"
dependencies = [
"cssparser-macros",
"dtoa-short",
"itoa 1.0.14",
"matches",
"phf 0.10.1",
"proc-macro2",
"quote",
"smallvec",
"syn 1.0.109",
]
[[package]]
name = "cssparser-macros"
version = "0.6.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.96",
]
[[package]]
name = "derive_more"
version = "0.99.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
dependencies = [
"convert_case",
"proc-macro2",
"quote",
"rustc_version",
"syn 2.0.96",
]
[[package]]
name = "dtoa"
version = "1.0.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
[[package]]
name = "dtoa-short"
version = "0.3.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
dependencies = [
"dtoa",
]
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
[[package]]
name = "equivalent"
version = "1.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
[[package]]
name = "foldhash"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
[[package]]
name = "futf"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
dependencies = [
"mac",
"new_debug_unreachable",
]
[[package]]
name = "fxhash"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
dependencies = [
"byteorder",
]
[[package]]
name = "getrandom"
version = "0.1.16"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
dependencies = [
"cfg-if",
"libc",
"wasi 0.9.0+wasi-snapshot-preview1",
]
[[package]]
name = "getrandom"
version = "0.2.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
"cfg-if",
"libc",
"wasi 0.11.0+wasi-snapshot-preview1",
]
[[package]]
name = "hashbrown"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hashbrown"
version = "0.15.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
dependencies = [
"allocator-api2",
"equivalent",
"foldhash",
]
[[package]]
name = "html-transformer"
version = "0.1.0"
dependencies = [
"kuchikiki",
"libc",
"lol_html",
"serde",
"serde_json",
]
[[package]]
name = "html5ever"
version = "0.26.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
dependencies = [
"log",
"mac",
"markup5ever",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "indexmap"
version = "1.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [
"autocfg",
"hashbrown 0.12.3",
]
[[package]]
name = "itoa"
version = "0.4.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
[[package]]
name = "itoa"
version = "1.0.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
[[package]]
name = "kuchikiki"
version = "0.8.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f29e4755b7b995046f510a7520c42b2fed58b77bd94d5a87a8eb43d2fd126da8"
dependencies = [
"cssparser 0.27.2",
"html5ever",
"indexmap",
"matches",
"selectors 0.22.0",
]
[[package]]
name = "libc"
version = "0.2.169"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
[[package]]
name = "lock_api"
version = "0.4.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
dependencies = [
"autocfg",
"scopeguard",
]
[[package]]
name = "log"
version = "0.4.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
[[package]]
name = "lol_html"
version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3b1058123f6262982b891dccc395cff0144d9439de366460b47fab719258b96e"
dependencies = [
"bitflags 2.8.0",
"cfg-if",
"cssparser 0.29.6",
"encoding_rs",
"hashbrown 0.15.2",
"memchr",
"mime",
"selectors 0.24.0",
"thiserror",
]
[[package]]
name = "mac"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
[[package]]
name = "markup5ever"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
dependencies = [
"log",
"phf 0.10.1",
"phf_codegen 0.10.0",
"string_cache",
"string_cache_codegen",
"tendril",
]
[[package]]
name = "matches"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "mime"
version = "0.3.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
[[package]]
name = "new_debug_unreachable"
version = "1.0.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
[[package]]
name = "nodrop"
version = "0.1.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
[[package]]
name = "once_cell"
version = "1.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
[[package]]
name = "parking_lot"
version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
dependencies = [
"lock_api",
"parking_lot_core",
]
[[package]]
name = "parking_lot_core"
version = "0.9.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
dependencies = [
"cfg-if",
"libc",
"redox_syscall",
"smallvec",
"windows-targets",
]
[[package]]
name = "phf"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
dependencies = [
"phf_macros 0.8.0",
"phf_shared 0.8.0",
"proc-macro-hack",
]
[[package]]
name = "phf"
version = "0.10.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
dependencies = [
"phf_macros 0.10.0",
"phf_shared 0.10.0",
"proc-macro-hack",
]
[[package]]
name = "phf_codegen"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
dependencies = [
"phf_generator 0.8.0",
"phf_shared 0.8.0",
]
[[package]]
name = "phf_codegen"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
]
[[package]]
name = "phf_generator"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
dependencies = [
"phf_shared 0.8.0",
"rand 0.7.3",
]
[[package]]
name = "phf_generator"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
dependencies = [
"phf_shared 0.10.0",
"rand 0.8.5",
]
[[package]]
name = "phf_macros"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
dependencies = [
"phf_generator 0.8.0",
"phf_shared 0.8.0",
"proc-macro-hack",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "phf_macros"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro-hack",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "phf_shared"
version = "0.8.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
dependencies = [
"siphasher",
]
[[package]]
name = "phf_shared"
version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
dependencies = [
"siphasher",
]
[[package]]
name = "ppv-lite86"
version = "0.2.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
dependencies = [
"zerocopy",
]
[[package]]
name = "precomputed-hash"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "proc-macro-hack"
version = "0.5.20+deprecated"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
[[package]]
name = "proc-macro2"
version = "1.0.93"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.38"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
dependencies = [
"proc-macro2",
]
[[package]]
name = "rand"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
dependencies = [
"getrandom 0.1.16",
"libc",
"rand_chacha 0.2.2",
"rand_core 0.5.1",
"rand_hc",
"rand_pcg",
]
[[package]]
name = "rand"
version = "0.8.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
dependencies = [
"libc",
"rand_chacha 0.3.1",
"rand_core 0.6.4",
]
[[package]]
name = "rand_chacha"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
dependencies = [
"ppv-lite86",
"rand_core 0.5.1",
]
[[package]]
name = "rand_chacha"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
dependencies = [
"ppv-lite86",
"rand_core 0.6.4",
]
[[package]]
name = "rand_core"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
dependencies = [
"getrandom 0.1.16",
]
[[package]]
name = "rand_core"
version = "0.6.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
dependencies = [
"getrandom 0.2.15",
]
[[package]]
name = "rand_hc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "rand_pcg"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
dependencies = [
"rand_core 0.5.1",
]
[[package]]
name = "redox_syscall"
version = "0.5.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
dependencies = [
"bitflags 2.8.0",
]
[[package]]
name = "rustc_version"
version = "0.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
dependencies = [
"semver",
]
[[package]]
name = "ryu"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
[[package]]
name = "scopeguard"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
[[package]]
name = "selectors"
version = "0.22.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
dependencies = [
"bitflags 1.3.2",
"cssparser 0.27.2",
"derive_more",
"fxhash",
"log",
"matches",
"phf 0.8.0",
"phf_codegen 0.8.0",
"precomputed-hash",
"servo_arc 0.1.1",
"smallvec",
"thin-slice",
]
[[package]]
name = "selectors"
version = "0.24.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
dependencies = [
"bitflags 1.3.2",
"cssparser 0.29.6",
"derive_more",
"fxhash",
"log",
"phf 0.8.0",
"phf_codegen 0.8.0",
"precomputed-hash",
"servo_arc 0.2.0",
"smallvec",
]
[[package]]
name = "semver"
version = "1.0.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03"
[[package]]
name = "serde"
version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.217"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]
[[package]]
name = "serde_json"
version = "1.0.137"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
dependencies = [
"itoa 1.0.14",
"memchr",
"ryu",
"serde",
]
[[package]]
name = "servo_arc"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
dependencies = [
"nodrop",
"stable_deref_trait",
]
[[package]]
name = "servo_arc"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741"
dependencies = [
"nodrop",
"stable_deref_trait",
]
[[package]]
name = "siphasher"
version = "0.3.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
[[package]]
name = "smallvec"
version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
[[package]]
name = "stable_deref_trait"
version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "string_cache"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
dependencies = [
"new_debug_unreachable",
"once_cell",
"parking_lot",
"phf_shared 0.10.0",
"precomputed-hash",
"serde",
]
[[package]]
name = "string_cache_codegen"
version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
dependencies = [
"phf_generator 0.10.0",
"phf_shared 0.10.0",
"proc-macro2",
"quote",
]
[[package]]
name = "syn"
version = "1.0.109"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "syn"
version = "2.0.96"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "tendril"
version = "0.4.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
dependencies = [
"futf",
"mac",
"utf-8",
]
[[package]]
name = "thin-slice"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
[[package]]
name = "thiserror"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.11"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]
[[package]]
name = "unicode-ident"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243"
[[package]]
name = "utf-8"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
[[package]]
name = "wasi"
version = "0.9.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
[[package]]
name = "wasi"
version = "0.11.0+wasi-snapshot-preview1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "zerocopy"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
dependencies = [
"byteorder",
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.7.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.96",
]

View File

@ -0,0 +1,14 @@
[package]
name = "html-transformer"
version = "0.1.0"
edition = "2021"
[dependencies]
libc = "0.2.0"
lol_html = "2.2.0"
kuchikiki = "0.8.2"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"
[lib]
crate-type = ["cdylib"]

View File

@ -0,0 +1,168 @@
use std::{collections::HashMap, ffi::{CStr, CString}};
use kuchikiki::{parse_html, traits::TendrilSink};
use serde_json::Value;
// #[no_mangle]
// pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
// let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
// let mut output = vec![];
// let mut rewriter = HtmlRewriter::new(
// Settings {
// element_content_handlers: vec! [
// element!("")
// ],
// ..Settings::new()
// },
// |c: &[u8]| output.extend_from_slice(c)
// );
// rewriter.write(html.as_bytes()).unwrap();
// CString::new(String::from_utf8(output).unwrap()).unwrap().into_raw()
// }
/// Collects the `href` of every `<a href>` element in an HTML document.
///
/// `html` must point to a NUL-terminated C string. The result is a newly
/// allocated NUL-terminated string containing a JSON array of strings, in
/// document order. The caller owns the returned pointer and must hand it back
/// to `free_string` to release it.
///
/// Links written with a single slash after the scheme (`http:/host`,
/// `https:/host`) are rewritten to the double-slash form.
///
/// # Safety
/// `html` must be non-null and valid for reads up to its NUL terminator.
#[no_mangle]
pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut libc::c_char {
    // `c_char` is `i8` on some targets and `u8` on others (e.g. aarch64 Linux);
    // spelling the return type as `c_char` keeps the crate building everywhere
    // without changing the C ABI.
    //
    // Decode leniently: invalid UTF-8 becomes U+FFFD instead of panicking
    // inside an `extern "C"` function.
    let html = unsafe { CStr::from_ptr(html) }.to_string_lossy();
    let document = parse_html().one(&*html);

    let mut out: Vec<String> = Vec::new();

    // The selector is a constant, so `select` cannot fail here.
    for anchor in document.select("a[href]").unwrap() {
        // `a[href]` only matches elements that carry the attribute.
        let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();

        // Repair "http:/example.com" style links that lost one slash.
        if href.starts_with("http:/") && !href.starts_with("http://") {
            href = format!("http://{}", &href[6..]);
        } else if href.starts_with("https:/") && !href.starts_with("https://") {
            href = format!("https://{}", &href[7..]);
        }

        out.push(href);
    }

    // serde_json escapes NUL as \u0000, so the JSON text has no interior NUL byte.
    CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
}
// Looks up the first `<meta name="...">` element with the given name and, if it
// has a `content` attribute, stores that content as a JSON string in `$out`
// under `$out_name`. Does nothing when no element matches or the first match
// has no `content`.
macro_rules! insert_meta_name {
    ($out:ident, $document:ident, $meta_name:expr, $out_name:expr) => {{
        let selector = format!("meta[name=\"{}\"]", $meta_name);
        let first_match = $document.select(&selector).unwrap().next();
        let content = first_match
            .and_then(|meta| meta.attributes.borrow().get("content").map(|value| value.to_string()));
        if let Some(content) = content {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    }};
}
// Looks up the first `<meta property="...">` element with the given property
// and, if it has a `content` attribute, stores that content as a JSON string
// in `$out` under `$out_name`. Does nothing when no element matches or the
// first match has no `content`.
macro_rules! insert_meta_property {
    ($out:ident, $document:ident, $meta_property:expr, $out_name:expr) => {{
        let selector = format!("meta[property=\"{}\"]", $meta_property);
        let first_match = $document.select(&selector).unwrap().next();
        let content = first_match
            .and_then(|meta| meta.attributes.borrow().get("content").map(|value| value.to_string()));
        if let Some(content) = content {
            $out.insert(($out_name).to_string(), Value::String(content));
        }
    }};
}
#[no_mangle]
pub extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
let document = parse_html().one(html);
let mut out = HashMap::<String, Value>::new();
if let Some(title) = document.select("title").unwrap().next() {
out.insert("title".to_string(), Value::String(title.text_contents()));
}
// insert_meta_name!(out, document, "description", "description");
if let Some(favicon_link) = document.select("link[rel=\"icon\"]").unwrap().next()
.and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))
.or_else(|| document.select("link[rel*=\"icon\"]").unwrap().next()
.and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))) {
out.insert("favicon".to_string(), Value::String(favicon_link));
}
if let Some(lang) = document.select("html[lang]").unwrap().next().and_then(|x| x.attributes.borrow().get("lang").map(|x| x.to_string())) {
out.insert("language".to_string(), Value::String(lang));
}
// insert_meta_name!(out, document, "keywords", "keywords");
// insert_meta_name!(out, document, "robots", "robots");
insert_meta_property!(out, document, "og:title", "ogTitle");
insert_meta_property!(out, document, "og:description", "ogDescription");
insert_meta_property!(out, document, "og:url", "ogUrl");
insert_meta_property!(out, document, "og:image", "ogImage");
insert_meta_property!(out, document, "og:audio", "ogAudio");
insert_meta_property!(out, document, "og:determiner", "ogDeterminer");
insert_meta_property!(out, document, "og:locale", "ogLocale");
for meta in document.select("meta[property=\"og:locale:alternate\"]").unwrap() {
let attrs = meta.attributes.borrow();
if let Some(content) = attrs.get("content") {
if let Some(v) = out.get_mut("og:locale:alternate") {
match v {
Value::Array(x) => {
x.push(Value::String(content.to_string()));
},
_ => unreachable!(),
}
} else {
out.insert("og:locale:alternate".to_string(), Value::Array(vec! [Value::String(content.to_string())]));
}
}
}
insert_meta_property!(out, document, "og:site_name", "ogSiteName");
insert_meta_property!(out, document, "og:video", "ogVideo");
insert_meta_name!(out, document, "article:section", "articleSection");
insert_meta_name!(out, document, "article:tag", "articleTag");
insert_meta_property!(out, document, "article:published_time", "publishedTime");
insert_meta_property!(out, document, "article:modified_time", "modifiedTime");
insert_meta_name!(out, document, "dcterms.keywords", "dcTermsKeywords");
insert_meta_name!(out, document, "dc.description", "dcDescription");
insert_meta_name!(out, document, "dc.subject", "dcSubject");
insert_meta_name!(out, document, "dcterms.subject", "dcTermsSubject");
insert_meta_name!(out, document, "dcterms.audience", "dcTermsAudience");
insert_meta_name!(out, document, "dc.type", "dcType");
insert_meta_name!(out, document, "dcterms.type", "dcTermsType");
insert_meta_name!(out, document, "dc.date", "dcDate");
insert_meta_name!(out, document, "dc.date.created", "dcDateCreated");
insert_meta_name!(out, document, "dcterms.created", "dcTermsCreated");
for meta in document.select("meta").unwrap() {
let meta = meta.as_node().as_element().unwrap();
let attrs = meta.attributes.borrow();
if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")) {
if let Some(content) = attrs.get("content") {
if let Some(v) = out.get(name) {
match v {
Value::String(_) => {
out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
},
Value::Array(_) => {
match out.get_mut(name) {
Some(Value::Array(x)) => {
x.push(Value::String(content.to_string()));
},
_ => unreachable!(),
}
},
_ => unreachable!(),
}
} else {
out.insert(name.to_string(), Value::String(content.to_string()));
}
}
}
}
CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
}
#[no_mangle]
pub extern "C" fn free_string(ptr: *mut i8) {
drop(unsafe { CString::from_raw(ptr) })
}

View File

@ -0,0 +1,84 @@
import koffi, { KoffiFunction } from "koffi";
import { join } from "path";
import { stat } from "fs/promises";
import { platform } from "os";
// TODO: add a timeout to the Rust transformer
/** File name of the compiled Rust library: `.dylib` on macOS, `.so` elsewhere. */
const rustLibraryFileName =
  platform() === "darwin"
    ? "libhtml_transformer.dylib"
    : "libhtml_transformer.so";

/** Path of the html-transformer shared library, resolved against the current working directory. */
const rustExecutablePath = join(
  process.cwd(),
  "sharedLibs/html-transformer/target/release/",
  rustLibraryFileName,
);
/**
 * Lazily created singleton that loads the native html-transformer shared
 * library through koffi and exposes its `extract_links` and
 * `extract_metadata` functions as promise-returning methods.
 */
class RustHTMLTransformer {
  private static instance: RustHTMLTransformer;
  private _extractLinks: KoffiFunction;
  private _extractMetadata: KoffiFunction;
  private _freeString: KoffiFunction;

  private constructor() {
    const lib = koffi.load(rustExecutablePath);
    this._freeString = lib.func("free_string", "void", ["string"]);
    // Both native functions return their result through this disposable type,
    // which is tied to `free_string` so the native allocation can be released.
    const freedResultString = koffi.disposable("CString", "string", this._freeString);
    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
  }

  /**
   * Returns the shared instance, creating it on first use.
   *
   * @throws Error when the shared library file cannot be stat'ed.
   */
  public static async getInstance(): Promise<RustHTMLTransformer> {
    if (!RustHTMLTransformer.instance) {
      try {
        await stat(rustExecutablePath);
      } catch (_) {
        throw Error("Rust html-transformer shared library not found");
      }
      // Re-check after the await: another caller may have created the instance
      // while this one was suspended. Constructing twice would load the library
      // again and re-register the fixed-name "CString" type.
      // NOTE(review): koffi appears to reject duplicate type names — confirm.
      if (!RustHTMLTransformer.instance) {
        RustHTMLTransformer.instance = new RustHTMLTransformer();
      }
    }
    return RustHTMLTransformer.instance;
  }

  /**
   * Calls a native function asynchronously with `html` and resolves with the
   * JSON-decoded result. Rejects on a call error or when the result is not
   * valid JSON.
   */
  private static callJson<T>(fn: KoffiFunction, html: string): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      fn.async(html, (err: Error | null, res: string) => {
        if (err) {
          reject(err);
          return;
        }
        try {
          resolve(JSON.parse(res) as T);
        } catch (parseError) {
          // Surface malformed output as a rejection instead of throwing
          // inside the koffi callback.
          reject(parseError);
        }
      });
    });
  }

  /** Runs the native `extract_links` on `html` and returns its parsed JSON result. */
  public async extractLinks(html: string): Promise<string[]> {
    return RustHTMLTransformer.callJson<string[]>(this._extractLinks, html);
  }

  /** Runs the native `extract_metadata` on `html` and returns its parsed JSON result. */
  public async extractMetadata(html: string): Promise<any> {
    return RustHTMLTransformer.callJson<any>(this._extractMetadata, html);
  }
}
/**
 * Returns the result of the native `extract_links` call for `html`.
 *
 * Missing or empty HTML short-circuits to an empty list without loading the
 * native library.
 */
export async function extractLinks(
  html: string | null | undefined,
): Promise<string[]> {
  if (html) {
    const transformer = await RustHTMLTransformer.getInstance();
    return await transformer.extractLinks(html);
  }
  return [];
}
/**
 * Returns the result of the native `extract_metadata` call for `html`.
 *
 * Missing or empty HTML short-circuits to an empty metadata object without
 * loading the native library. This used to be an empty array; an empty object
 * spreads to the same result and matches the object shape of the normal result.
 */
export async function extractMetadata(
  html: string | null | undefined,
): Promise<any> {
  if (!html) {
    return {};
  }
  const converter = await RustHTMLTransformer.getInstance();
  return await converter.extractMetadata(html);
}

View File

@ -1,5 +1,5 @@
import axios, { AxiosError } from "axios";
import cheerio, { load } from "cheerio";
import { load } from "cheerio"; // rustified
import { URL } from "url";
import { getLinksFromSitemap } from "./sitemap";
import robotsParser, { Robot } from "robots-parser";
@ -8,6 +8,7 @@ import { axiosTimeout } from "../../lib/timeout";
import { logger as _logger } from "../../lib/logger";
import https from "https";
import { redisConnection } from "../../services/queue-service";
import { extractLinks } from "../../lib/html-transformer";
export class WebCrawler {
private jobId: string;
private initialUrl: string;
@ -364,7 +365,11 @@ export class WebCrawler {
return null;
}
public extractLinksFromHTML(html: string, url: string) {
/**
 * Collects the links of `html` through the html-transformer helper and keeps
 * only those accepted by `filterURL` for the given page `url`.
 */
private async extractLinksFromHTMLRust(html: string, url: string) {
  const hrefs = await extractLinks(html);
  return hrefs.filter((href) => this.filterURL(href, url));
}
private extractLinksFromHTMLCheerio(html: string, url: string) {
let links: string[] = [];
const $ = load(html);
@ -386,7 +391,7 @@ export class WebCrawler {
const src = $(element).attr("src");
if (src && src.startsWith("data:text/html")) {
const iframeHtml = decodeURIComponent(src.split(",")[1]);
const iframeLinks = this.extractLinksFromHTML(iframeHtml, url);
const iframeLinks = this.extractLinksFromHTMLCheerio(iframeHtml, url);
links = links.concat(iframeLinks);
}
});
@ -394,6 +399,19 @@ export class WebCrawler {
return links;
}
/**
 * Extracts the links of `html` for the page at `url`.
 *
 * Tries the html-transformer based implementation first. If that throws, the
 * failure is logged and the cheerio implementation is used instead.
 */
public async extractLinksFromHTML(html: string, url: string) {
  try {
    return await this.extractLinksFromHTMLRust(html, url);
  } catch (error) {
    // Tag the log with this method's own name (it used to say "extractMetadata").
    // NOTE(review): `module` still says "scrapeURL" although this is a
    // WebCrawler method — confirm the intended value.
    this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractLinksFromHTML"
    });
  }
  return this.extractLinksFromHTMLCheerio(html, url);
}
private isRobotsAllowed(
url: string,
ignoreRobotsTxt: boolean = false,

View File

@ -1,8 +1,52 @@
// TODO: refactor
import { load } from "cheerio";
import { load } from "cheerio"; // rustified
import { logger } from "../../../lib/logger";
import { extractLinks as _extractLinks } from "../../../lib/html-transformer";
/**
 * Turns the hrefs reported by the html-transformer into a de-duplicated list
 * of links for the page at `baseUrl`.
 *
 * - `http://`, `https://` and `mailto:` hrefs are kept as they are.
 * - Fragment-only hrefs (starting with `#`) are dropped.
 * - Everything else is resolved against `baseUrl`; hrefs that cannot be
 *   resolved are logged and skipped.
 */
async function extractLinksRust(html: string, baseUrl: string): Promise<string[]> {
  const uniqueLinks = new Set<string>();

  for (const rawHref of await _extractLinks(html)) {
    const href = rawHref.trim();

    // Fragment-only links (#) are ignored
    if (href.startsWith("#")) {
      continue;
    }

    const keepAsIs =
      href.startsWith("http://") ||
      href.startsWith("https://") ||
      href.startsWith("mailto:");

    if (keepAsIs) {
      // Absolute URLs and mailto: links are added unchanged
      uniqueLinks.add(href);
      continue;
    }

    try {
      // Relative URL (with or without a leading '/'), resolve against the base URL
      uniqueLinks.add(new URL(href, baseUrl).href);
    } catch (error) {
      logger.error(
        `Failed to construct URL for href: ${href} with base: ${baseUrl}`,
        { error },
      );
    }
  }

  // The Set has already removed duplicates while keeping first-seen order
  return [...uniqueLinks];
}
export async function extractLinks(html: string, baseUrl: string): Promise<string[]> {
try {
return await extractLinksRust(html, baseUrl);
} catch (error) {
logger.error("Failed to call html-transformer! Falling back to cheerio...", {
error,
module: "scrapeURL", method: "extractLinks"
});
}
export function extractLinks(html: string, baseUrl: string): string[] {
const $ = load(html);
const links: string[] = [];

View File

@ -1,11 +1,37 @@
import { load } from "cheerio";
import { load } from "cheerio"; // rustified
import { Document } from "../../../controllers/v1/types";
import { Meta } from "..";
import { extractMetadata as _extractMetadata } from "../../../lib/html-transformer";
export function extractMetadata(
/**
 * Builds document metadata from the html-transformer's `extractMetadata`
 * result.
 *
 * The extracted fields are passed through. A favicon, when present, is
 * resolved against `meta.url` and stored as a string, and `scrapeId` is set
 * from `meta.id`.
 *
 * @throws if the transformer call fails or the favicon cannot be resolved
 *         into a URL.
 */
export async function extractMetadataRust(
  meta: Meta,
  html: string,
): Promise<Partial<Document["metadata"]>> {
  const fromRust = await _extractMetadata(html);
  return {
    ...fromRust,
    // Store the resolved favicon as its string form (`.href`), not as a URL object.
    ...(fromRust.favicon
      ? { favicon: new URL(fromRust.favicon, meta.url).href }
      : {}),
    scrapeId: meta.id,
  };
}
export async function extractMetadata(
meta: Meta,
html: string,
): Promise<Partial<Document["metadata"]>> {
try {
return await extractMetadataRust(meta, html);
} catch (error) {
meta.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
error,
module: "scrapeURL", method: "extractMetadata"
});
}
let title: string | undefined = undefined;
let description: string | undefined = undefined;
let favicon: string | undefined = undefined;

View File

@ -1,6 +1,6 @@
// TODO: refactor
import { AnyNode, Cheerio, load } from "cheerio";
import { AnyNode, Cheerio, load } from "cheerio"; // TODO: rustify
import { ScrapeOptions } from "../../../controllers/v1/types";
const excludeNonMainTags = [

View File

@ -14,10 +14,10 @@ export type Transformer = (
document: Document,
) => Document | Promise<Document>;
export function deriveMetadataFromRawHTML(
export async function deriveMetadataFromRawHTML(
meta: Meta,
document: Document,
): Document {
): Promise<Document> {
if (document.rawHtml === undefined) {
throw new Error(
"rawHtml is undefined -- this transformer is being called out of order",
@ -25,7 +25,7 @@ export function deriveMetadataFromRawHTML(
}
document.metadata = {
...extractMetadata(meta, document.rawHtml),
...(await extractMetadata(meta, document.rawHtml)),
...document.metadata,
};
return document;
@ -63,7 +63,7 @@ export async function deriveMarkdownFromHTML(
return document;
}
export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
export async function deriveLinksFromHTML(meta: Meta, document: Document): Promise<Document> {
// Only derive if the formats has links
if (meta.options.formats.includes("links")) {
if (document.html === undefined) {
@ -72,7 +72,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
);
}
document.links = extractLinks(document.html, meta.url);
document.links = await extractLinks(document.html, meta.url);
}
return document;

View File

@ -1,5 +1,5 @@
import axios from "axios";
import * as cheerio from "cheerio";
import * as cheerio from "cheerio"; // TODO: rustify
import * as querystring from "querystring";
import { SearchResult } from "../../src/lib/entities";
import { logger } from "../../src/lib/logger";

View File

@ -884,7 +884,7 @@ async function processJob(job: Job & { id: string }, token: string) {
);
const links = crawler.filterLinks(
crawler.extractLinksFromHTML(
await crawler.extractLinksFromHTML(
rawHtml ?? "",
doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!,
),