mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:29:04 +08:00
port most of cheerio stuff to rust (#1089)
This commit is contained in:
parent
0d9c9f36b8
commit
b005450a34
@ -27,10 +27,20 @@ RUN cd /app/sharedLibs/go-html-to-md && \
|
||||
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \
|
||||
chmod +x html-to-markdown.so
|
||||
|
||||
# Install Rust
|
||||
FROM rust:1-bullseye AS rust-base
|
||||
COPY sharedLibs/html-transformer /app/sharedLibs/html-transformer
|
||||
|
||||
# Install Rust dependencies and build the html-transformer lib
|
||||
RUN cd /app/sharedLibs/html-transformer && \
|
||||
cargo build --release && \
|
||||
chmod +x target/release/libhtml_transformer.so
|
||||
|
||||
FROM base
|
||||
COPY --from=prod-deps /app/node_modules /app/node_modules
|
||||
COPY --from=build /app /app
|
||||
COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so
|
||||
COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so
|
||||
|
||||
# Start the server by default, this can be overwritten at runtime
|
||||
EXPOSE 8080
|
||||
|
1
apps/api/sharedLibs/html-transformer/.gitignore
vendored
Normal file
1
apps/api/sharedLibs/html-transformer/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
target
|
940
apps/api/sharedLibs/html-transformer/Cargo.lock
generated
Normal file
940
apps/api/sharedLibs/html-transformer/Cargo.lock
generated
Normal file
@ -0,0 +1,940 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "allocator-api2"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923"
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "1.3.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36"
|
||||
|
||||
[[package]]
|
||||
name = "byteorder"
|
||||
version = "1.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "convert_case"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e"
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.27.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a"
|
||||
dependencies = [
|
||||
"cssparser-macros",
|
||||
"dtoa-short",
|
||||
"itoa 0.4.8",
|
||||
"matches",
|
||||
"phf 0.8.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"smallvec",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser"
|
||||
version = "0.29.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa"
|
||||
dependencies = [
|
||||
"cssparser-macros",
|
||||
"dtoa-short",
|
||||
"itoa 1.0.14",
|
||||
"matches",
|
||||
"phf 0.10.1",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"smallvec",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cssparser-macros"
|
||||
version = "0.6.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_more"
|
||||
version = "0.99.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce"
|
||||
dependencies = [
|
||||
"convert_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"rustc_version",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dtoa"
|
||||
version = "1.0.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653"
|
||||
|
||||
[[package]]
|
||||
name = "dtoa-short"
|
||||
version = "0.3.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87"
|
||||
dependencies = [
|
||||
"dtoa",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
|
||||
|
||||
[[package]]
|
||||
name = "foldhash"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f"
|
||||
|
||||
[[package]]
|
||||
name = "futf"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843"
|
||||
dependencies = [
|
||||
"mac",
|
||||
"new_debug_unreachable",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "fxhash"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.1.16"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi 0.9.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"wasi 0.11.0+wasi-snapshot-preview1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289"
|
||||
dependencies = [
|
||||
"allocator-api2",
|
||||
"equivalent",
|
||||
"foldhash",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html-transformer"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"kuchikiki",
|
||||
"libc",
|
||||
"lol_html",
|
||||
"serde",
|
||||
"serde_json",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "html5ever"
|
||||
version = "0.26.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7"
|
||||
dependencies = [
|
||||
"log",
|
||||
"mac",
|
||||
"markup5ever",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "1.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"hashbrown 0.12.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "0.4.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
|
||||
|
||||
[[package]]
|
||||
name = "kuchikiki"
|
||||
version = "0.8.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f29e4755b7b995046f510a7520c42b2fed58b77bd94d5a87a8eb43d2fd126da8"
|
||||
dependencies = [
|
||||
"cssparser 0.27.2",
|
||||
"html5ever",
|
||||
"indexmap",
|
||||
"matches",
|
||||
"selectors 0.22.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.169"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
|
||||
|
||||
[[package]]
|
||||
name = "lock_api"
|
||||
version = "0.4.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
"scopeguard",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f"
|
||||
|
||||
[[package]]
|
||||
name = "lol_html"
|
||||
version = "2.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3b1058123f6262982b891dccc395cff0144d9439de366460b47fab719258b96e"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
"cfg-if",
|
||||
"cssparser 0.29.6",
|
||||
"encoding_rs",
|
||||
"hashbrown 0.15.2",
|
||||
"memchr",
|
||||
"mime",
|
||||
"selectors 0.24.0",
|
||||
"thiserror",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "mac"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "markup5ever"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016"
|
||||
dependencies = [
|
||||
"log",
|
||||
"phf 0.10.1",
|
||||
"phf_codegen 0.10.0",
|
||||
"string_cache",
|
||||
"string_cache_codegen",
|
||||
"tendril",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "matches"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5"
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "mime"
|
||||
version = "0.3.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a"
|
||||
|
||||
[[package]]
|
||||
name = "new_debug_unreachable"
|
||||
version = "1.0.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086"
|
||||
|
||||
[[package]]
|
||||
name = "nodrop"
|
||||
version = "0.1.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb"
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot"
|
||||
version = "0.12.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27"
|
||||
dependencies = [
|
||||
"lock_api",
|
||||
"parking_lot_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "parking_lot_core"
|
||||
version = "0.9.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"redox_syscall",
|
||||
"smallvec",
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12"
|
||||
dependencies = [
|
||||
"phf_macros 0.8.0",
|
||||
"phf_shared 0.8.0",
|
||||
"proc-macro-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf"
|
||||
version = "0.10.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259"
|
||||
dependencies = [
|
||||
"phf_macros 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
"proc-macro-hack",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815"
|
||||
dependencies = [
|
||||
"phf_generator 0.8.0",
|
||||
"phf_shared 0.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_codegen"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526"
|
||||
dependencies = [
|
||||
"phf_shared 0.8.0",
|
||||
"rand 0.7.3",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_generator"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6"
|
||||
dependencies = [
|
||||
"phf_shared 0.10.0",
|
||||
"rand 0.8.5",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_macros"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c"
|
||||
dependencies = [
|
||||
"phf_generator 0.8.0",
|
||||
"phf_shared 0.8.0",
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_macros"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
"proc-macro-hack",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.8.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "phf_shared"
|
||||
version = "0.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096"
|
||||
dependencies = [
|
||||
"siphasher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.20"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04"
|
||||
dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "precomputed-hash"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro-hack"
|
||||
version = "0.5.20+deprecated"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.93"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.38"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.7.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
"libc",
|
||||
"rand_chacha 0.2.2",
|
||||
"rand_core 0.5.1",
|
||||
"rand_hc",
|
||||
"rand_pcg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.8.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"rand_chacha 0.3.1",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.3.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core 0.6.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
|
||||
dependencies = [
|
||||
"getrandom 0.1.16",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.6.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c"
|
||||
dependencies = [
|
||||
"getrandom 0.2.15",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_hc"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c"
|
||||
dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_pcg"
|
||||
version = "0.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429"
|
||||
dependencies = [
|
||||
"rand_core 0.5.1",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redox_syscall"
|
||||
version = "0.5.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834"
|
||||
dependencies = [
|
||||
"bitflags 2.8.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustc_version"
|
||||
version = "0.4.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92"
|
||||
dependencies = [
|
||||
"semver",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ryu"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f"
|
||||
|
||||
[[package]]
|
||||
name = "scopeguard"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49"
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.22.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"cssparser 0.27.2",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
"log",
|
||||
"matches",
|
||||
"phf 0.8.0",
|
||||
"phf_codegen 0.8.0",
|
||||
"precomputed-hash",
|
||||
"servo_arc 0.1.1",
|
||||
"smallvec",
|
||||
"thin-slice",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "selectors"
|
||||
version = "0.24.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416"
|
||||
dependencies = [
|
||||
"bitflags 1.3.2",
|
||||
"cssparser 0.29.6",
|
||||
"derive_more",
|
||||
"fxhash",
|
||||
"log",
|
||||
"phf 0.8.0",
|
||||
"phf_codegen 0.8.0",
|
||||
"precomputed-hash",
|
||||
"servo_arc 0.2.0",
|
||||
"smallvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "semver"
|
||||
version = "1.0.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.217"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.217"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_json"
|
||||
version = "1.0.137"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b"
|
||||
dependencies = [
|
||||
"itoa 1.0.14",
|
||||
"memchr",
|
||||
"ryu",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "servo_arc"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432"
|
||||
dependencies = [
|
||||
"nodrop",
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "servo_arc"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741"
|
||||
dependencies = [
|
||||
"nodrop",
|
||||
"stable_deref_trait",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "siphasher"
|
||||
version = "0.3.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d"
|
||||
|
||||
[[package]]
|
||||
name = "smallvec"
|
||||
version = "1.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67"
|
||||
|
||||
[[package]]
|
||||
name = "stable_deref_trait"
|
||||
version = "1.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "string_cache"
|
||||
version = "0.8.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b"
|
||||
dependencies = [
|
||||
"new_debug_unreachable",
|
||||
"once_cell",
|
||||
"parking_lot",
|
||||
"phf_shared 0.10.0",
|
||||
"precomputed-hash",
|
||||
"serde",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "string_cache_codegen"
|
||||
version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988"
|
||||
dependencies = [
|
||||
"phf_generator 0.10.0",
|
||||
"phf_shared 0.10.0",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "1.0.109"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.96"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tendril"
|
||||
version = "0.4.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0"
|
||||
dependencies = [
|
||||
"futf",
|
||||
"mac",
|
||||
"utf-8",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thin-slice"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c"
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "2.0.11"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243"
|
||||
|
||||
[[package]]
|
||||
name = "utf-8"
|
||||
version = "0.7.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.9.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.11.0+wasi-snapshot-preview1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.7.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.96",
|
||||
]
|
14
apps/api/sharedLibs/html-transformer/Cargo.toml
Normal file
14
apps/api/sharedLibs/html-transformer/Cargo.toml
Normal file
@ -0,0 +1,14 @@
|
||||
[package]
|
||||
name = "html-transformer"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
libc = "0.2.0"
|
||||
lol_html = "2.2.0"
|
||||
kuchikiki = "0.8.2"
|
||||
serde = { version = "1.0", features = ["derive"] }
|
||||
serde_json = "1.0"
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
168
apps/api/sharedLibs/html-transformer/src/lib.rs
Normal file
168
apps/api/sharedLibs/html-transformer/src/lib.rs
Normal file
@ -0,0 +1,168 @@
|
||||
use std::{collections::HashMap, ffi::{CStr, CString}};
|
||||
|
||||
use kuchikiki::{parse_html, traits::TendrilSink};
|
||||
use serde_json::Value;
|
||||
|
||||
// #[no_mangle]
|
||||
// pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
|
||||
// let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
||||
|
||||
// let mut output = vec![];
|
||||
|
||||
// let mut rewriter = HtmlRewriter::new(
|
||||
// Settings {
|
||||
// element_content_handlers: vec! [
|
||||
// element!("")
|
||||
// ],
|
||||
// ..Settings::new()
|
||||
// },
|
||||
// |c: &[u8]| output.extend_from_slice(c)
|
||||
// );
|
||||
|
||||
// rewriter.write(html.as_bytes()).unwrap();
|
||||
|
||||
// CString::new(String::from_utf8(output).unwrap()).unwrap().into_raw()
|
||||
// }
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
|
||||
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
||||
|
||||
let document = parse_html().one(html);
|
||||
|
||||
let mut out: Vec<String> = Vec::new();
|
||||
|
||||
for anchor in document.select("a[href]").unwrap() {
|
||||
let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
|
||||
|
||||
if href.starts_with("http:/") && !href.starts_with("http://") {
|
||||
href = format!("http://{}", &href[6..]);
|
||||
} else if href.starts_with("https:/") && !href.starts_with("https://") {
|
||||
href = format!("https://{}", &href[7..]);
|
||||
}
|
||||
|
||||
out.push(href);
|
||||
}
|
||||
|
||||
CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
|
||||
}
|
||||
|
||||
/// Looks up the first `<meta name="$metaName">` element in `$document` and, if
/// it has a `content` attribute, stores that value in `$out` under `$outName`.
macro_rules! insert_meta_name {
    ($out:ident, $document:ident, $metaName:expr, $outName:expr) => {
        {
            let selector = format!("meta[name=\"{}\"]", $metaName);
            let content = $document
                .select(&selector)
                .unwrap()
                .next()
                .and_then(|tag| tag.attributes.borrow().get("content").map(|value| value.to_string()));

            if let Some(content) = content {
                $out.insert(($outName).to_string(), Value::String(content));
            }
        }
    };
}
|
||||
|
||||
/// Looks up the first `<meta property="$metaName">` element in `$document` and,
/// if it has a `content` attribute, stores that value in `$out` under `$outName`.
macro_rules! insert_meta_property {
    ($out:ident, $document:ident, $metaName:expr, $outName:expr) => {
        {
            let selector = format!("meta[property=\"{}\"]", $metaName);
            let content = $document
                .select(&selector)
                .unwrap()
                .next()
                .and_then(|tag| tag.attributes.borrow().get("content").map(|value| value.to_string()));

            if let Some(content) = content {
                $out.insert(($outName).to_string(), Value::String(content));
            }
        }
    };
}
|
||||
|
||||
|
||||
#[no_mangle]
|
||||
pub extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut i8 {
|
||||
let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap();
|
||||
|
||||
let document = parse_html().one(html);
|
||||
let mut out = HashMap::<String, Value>::new();
|
||||
|
||||
if let Some(title) = document.select("title").unwrap().next() {
|
||||
out.insert("title".to_string(), Value::String(title.text_contents()));
|
||||
}
|
||||
// insert_meta_name!(out, document, "description", "description");
|
||||
|
||||
if let Some(favicon_link) = document.select("link[rel=\"icon\"]").unwrap().next()
|
||||
.and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))
|
||||
.or_else(|| document.select("link[rel*=\"icon\"]").unwrap().next()
|
||||
.and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))) {
|
||||
out.insert("favicon".to_string(), Value::String(favicon_link));
|
||||
}
|
||||
|
||||
if let Some(lang) = document.select("html[lang]").unwrap().next().and_then(|x| x.attributes.borrow().get("lang").map(|x| x.to_string())) {
|
||||
out.insert("language".to_string(), Value::String(lang));
|
||||
}
|
||||
|
||||
// insert_meta_name!(out, document, "keywords", "keywords");
|
||||
// insert_meta_name!(out, document, "robots", "robots");
|
||||
insert_meta_property!(out, document, "og:title", "ogTitle");
|
||||
insert_meta_property!(out, document, "og:description", "ogDescription");
|
||||
insert_meta_property!(out, document, "og:url", "ogUrl");
|
||||
insert_meta_property!(out, document, "og:image", "ogImage");
|
||||
insert_meta_property!(out, document, "og:audio", "ogAudio");
|
||||
insert_meta_property!(out, document, "og:determiner", "ogDeterminer");
|
||||
insert_meta_property!(out, document, "og:locale", "ogLocale");
|
||||
|
||||
for meta in document.select("meta[property=\"og:locale:alternate\"]").unwrap() {
|
||||
let attrs = meta.attributes.borrow();
|
||||
|
||||
if let Some(content) = attrs.get("content") {
|
||||
if let Some(v) = out.get_mut("og:locale:alternate") {
|
||||
match v {
|
||||
Value::Array(x) => {
|
||||
x.push(Value::String(content.to_string()));
|
||||
},
|
||||
_ => unreachable!(),
|
||||
}
|
||||
} else {
|
||||
out.insert("og:locale:alternate".to_string(), Value::Array(vec! [Value::String(content.to_string())]));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
insert_meta_property!(out, document, "og:site_name", "ogSiteName");
|
||||
insert_meta_property!(out, document, "og:video", "ogVideo");
|
||||
insert_meta_name!(out, document, "article:section", "articleSection");
|
||||
insert_meta_name!(out, document, "article:tag", "articleTag");
|
||||
insert_meta_property!(out, document, "article:published_time", "publishedTime");
|
||||
insert_meta_property!(out, document, "article:modified_time", "modifiedTime");
|
||||
insert_meta_name!(out, document, "dcterms.keywords", "dcTermsKeywords");
|
||||
insert_meta_name!(out, document, "dc.description", "dcDescription");
|
||||
insert_meta_name!(out, document, "dc.subject", "dcSubject");
|
||||
insert_meta_name!(out, document, "dcterms.subject", "dcTermsSubject");
|
||||
insert_meta_name!(out, document, "dcterms.audience", "dcTermsAudience");
|
||||
insert_meta_name!(out, document, "dc.type", "dcType");
|
||||
insert_meta_name!(out, document, "dcterms.type", "dcTermsType");
|
||||
insert_meta_name!(out, document, "dc.date", "dcDate");
|
||||
insert_meta_name!(out, document, "dc.date.created", "dcDateCreated");
|
||||
insert_meta_name!(out, document, "dcterms.created", "dcTermsCreated");
|
||||
|
||||
for meta in document.select("meta").unwrap() {
|
||||
let meta = meta.as_node().as_element().unwrap();
|
||||
let attrs = meta.attributes.borrow();
|
||||
|
||||
if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")) {
|
||||
if let Some(content) = attrs.get("content") {
|
||||
if let Some(v) = out.get(name) {
|
||||
match v {
|
||||
Value::String(_) => {
|
||||
out.insert(name.to_string(), Value::Array(vec! [v.clone(), Value::String(content.to_string())]));
|
||||
},
|
||||
Value::Array(_) => {
|
||||
match out.get_mut(name) {
|
||||
Some(Value::Array(x)) => {
|
||||
x.push(Value::String(content.to_string()));
|
||||
},
|
||||
_ => unreachable!(),
|
||||
}
|
||||
},
|
||||
_ => unreachable!(),
|
||||
}
|
||||
} else {
|
||||
out.insert(name.to_string(), Value::String(content.to_string()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
|
||||
}
|
||||
|
||||
/// Releases a string previously returned by one of this library's `extract_*`
/// functions.
///
/// `ptr` must be a pointer obtained from `CString::into_raw` that has not been
/// freed yet. A null pointer is accepted and ignored.
#[no_mangle]
pub extern "C" fn free_string(ptr: *mut std::ffi::c_char) {
    // `CString::from_raw` requires a pointer produced by `into_raw`; null is not one.
    if ptr.is_null() {
        return;
    }

    // Rebuilding the CString hands the allocation back to Rust, which frees it on drop.
    drop(unsafe { CString::from_raw(ptr) })
}
|
84
apps/api/src/lib/html-transformer.ts
Normal file
84
apps/api/src/lib/html-transformer.ts
Normal file
@ -0,0 +1,84 @@
|
||||
import koffi, { KoffiFunction } from "koffi";
|
||||
import { join } from "path";
|
||||
import { stat } from "fs/promises";
|
||||
import { platform } from "os";
|
||||
|
||||
// TODO: add a timeout to the Rust transformer

/**
 * File name cargo gives the `html_transformer` cdylib on the given platform:
 * `.dylib` on macOS, `.dll` (without the `lib` prefix) on Windows, and `.so`
 * everywhere else.
 */
function rustLibraryFileName(os: string): string {
  switch (os) {
    case "darwin":
      return "libhtml_transformer.dylib";
    case "win32":
      return "html_transformer.dll";
    default:
      return "libhtml_transformer.so";
  }
}

/**
 * Absolute path of the compiled Rust shared library, resolved against the
 * current working directory of the process.
 */
const rustExecutablePath = join(
  process.cwd(),
  "sharedLibs/html-transformer/target/release/",
  rustLibraryFileName(platform()),
);
|
||||
|
||||
/**
 * Lazily loaded singleton wrapper around the Rust `html-transformer` shared
 * library. Each native function takes an HTML string and returns a JSON
 * string, which is parsed before being handed back to the caller.
 */
class RustHTMLTransformer {
  // The in-flight or completed load is cached so that concurrent first
  // callers share one load instead of each constructing an instance.
  private static pending: Promise<RustHTMLTransformer> | undefined;
  private _extractLinks: KoffiFunction;
  private _extractMetadata: KoffiFunction;
  private _freeString: KoffiFunction;

  private constructor() {
    const lib = koffi.load(rustExecutablePath);
    this._freeString = lib.func("free_string", "void", ["string"]);
    // Result strings are declared as a disposable type bound to free_string.
    const freedResultString = koffi.disposable("CString", "string", this._freeString);
    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
  }

  /** Checks that the shared library exists on disk, then loads it. */
  private static async load(): Promise<RustHTMLTransformer> {
    try {
      await stat(rustExecutablePath);
    } catch {
      throw new Error("Rust html-transformer shared library not found");
    }
    return new RustHTMLTransformer();
  }

  /**
   * Returns the shared instance, loading the library on first use.
   *
   * @throws Error when the shared library file is missing or cannot be
   *   loaded. A failed load is not cached, so the next call tries again.
   */
  public static async getInstance(): Promise<RustHTMLTransformer> {
    let pending = RustHTMLTransformer.pending;
    if (!pending) {
      const loading = RustHTMLTransformer.load();
      RustHTMLTransformer.pending = loading;
      // Forget a failed attempt so that later calls can retry.
      loading.catch(() => {
        if (RustHTMLTransformer.pending === loading) {
          RustHTMLTransformer.pending = undefined;
        }
      });
      pending = loading;
    }
    return pending;
  }

  /**
   * Runs a native function asynchronously and parses its JSON result.
   * `JSON.parse` runs inside the native callback, so a parse failure is turned
   * into a rejection here; thrown there, it would leave the promise unsettled.
   */
  private callJson<T>(fn: KoffiFunction, html: string): Promise<T> {
    return new Promise<T>((resolve, reject) => {
      fn.async(html, (err: Error | null, res: string) => {
        if (err) {
          reject(err);
          return;
        }
        try {
          resolve(JSON.parse(res));
        } catch (parseError) {
          reject(parseError);
        }
      });
    });
  }

  /** Returns the JSON array produced by the native `extract_links`. */
  public async extractLinks(html: string): Promise<string[]> {
    return this.callJson<string[]>(this._extractLinks, html);
  }

  /** Returns the JSON value produced by the native `extract_metadata`. */
  public async extractMetadata(html: string): Promise<any> {
    return this.callJson<any>(this._extractMetadata, html);
  }
}
|
||||
|
||||
/**
 * Extracts link hrefs from an HTML string using the Rust transformer.
 *
 * @param html HTML to scan; `null`, `undefined` and `""` produce an empty list
 *   without touching the native library.
 * @returns The strings returned by the native `extract_links` call.
 */
export async function extractLinks(
  html: string | null | undefined,
): Promise<string[]> {
  if (html) {
    const transformer = await RustHTMLTransformer.getInstance();
    const hrefs = await transformer.extractLinks(html);
    return hrefs;
  }

  return [];
}
|
||||
|
||||
/**
 * Extracts page metadata from an HTML string using the Rust transformer.
 *
 * @param html HTML to scan; `null`, `undefined` and `""` produce an empty
 *   object without touching the native library.
 * @returns The parsed JSON value returned by the native `extract_metadata` call.
 */
export async function extractMetadata(
  html: string | null | undefined,
): Promise<any> {
  if (!html) {
    // Metadata is a key/value object, so "nothing found" is {} rather than [].
    return {};
  }

  const converter = await RustHTMLTransformer.getInstance();
  return await converter.extractMetadata(html);
}
|
@ -1,5 +1,5 @@
|
||||
import axios, { AxiosError } from "axios";
|
||||
import cheerio, { load } from "cheerio";
|
||||
import { load } from "cheerio"; // rustified
|
||||
import { URL } from "url";
|
||||
import { getLinksFromSitemap } from "./sitemap";
|
||||
import robotsParser, { Robot } from "robots-parser";
|
||||
@ -8,6 +8,7 @@ import { axiosTimeout } from "../../lib/timeout";
|
||||
import { logger as _logger } from "../../lib/logger";
|
||||
import https from "https";
|
||||
import { redisConnection } from "../../services/queue-service";
|
||||
import { extractLinks } from "../../lib/html-transformer";
|
||||
export class WebCrawler {
|
||||
private jobId: string;
|
||||
private initialUrl: string;
|
||||
@ -364,7 +365,11 @@ export class WebCrawler {
|
||||
return null;
|
||||
}
|
||||
|
||||
public extractLinksFromHTML(html: string, url: string) {
|
||||
/**
 * Extracts hrefs from `html` with the Rust transformer and keeps those for
 * which `filterURL(href, url)` returns a truthy value.
 *
 * NOTE(review): only the truthiness of `filterURL`'s result is used, so the
 * hrefs are returned exactly as extracted. If `filterURL` returns a
 * normalized URL, that value is discarded here; confirm this is intended.
 */
private async extractLinksFromHTMLRust(html: string, url: string) {
  const hrefs = await extractLinks(html);
  return hrefs.filter((href) => this.filterURL(href, url));
}
|
||||
|
||||
private extractLinksFromHTMLCheerio(html: string, url: string) {
|
||||
let links: string[] = [];
|
||||
|
||||
const $ = load(html);
|
||||
@ -386,7 +391,7 @@ export class WebCrawler {
|
||||
const src = $(element).attr("src");
|
||||
if (src && src.startsWith("data:text/html")) {
|
||||
const iframeHtml = decodeURIComponent(src.split(",")[1]);
|
||||
const iframeLinks = this.extractLinksFromHTML(iframeHtml, url);
|
||||
const iframeLinks = this.extractLinksFromHTMLCheerio(iframeHtml, url);
|
||||
links = links.concat(iframeLinks);
|
||||
}
|
||||
});
|
||||
@ -394,6 +399,19 @@ export class WebCrawler {
|
||||
return links;
|
||||
}
|
||||
|
||||
/**
 * Extracts the crawlable links of a page.
 *
 * Tries the Rust implementation first. If it throws, the error is logged and
 * the cheerio implementation is used instead.
 *
 * @param html Raw HTML of the page.
 * @param url URL passed through to the underlying extractor.
 */
public async extractLinksFromHTML(html: string, url: string) {
  try {
    return await this.extractLinksFromHTMLRust(html, url);
  } catch (error) {
    // The method label names this method; it previously said "extractMetadata".
    // NOTE(review): module is still "scrapeURL"; confirm against this class's other log calls.
    this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
      error,
      module: "scrapeURL", method: "extractLinksFromHTML"
    });
  }

  return this.extractLinksFromHTMLCheerio(html, url);
}
|
||||
|
||||
private isRobotsAllowed(
|
||||
url: string,
|
||||
ignoreRobotsTxt: boolean = false,
|
||||
|
@ -1,8 +1,52 @@
|
||||
// TODO: refactor
|
||||
import { load } from "cheerio";
|
||||
import { load } from "cheerio"; // rustified
|
||||
import { logger } from "../../../lib/logger";
|
||||
import { extractLinks as _extractLinks } from "../../../lib/html-transformer";
|
||||
|
||||
/**
 * Extracts links from `html` with the Rust transformer and resolves them
 * against `baseUrl`.
 *
 * - `http://`, `https://` and `mailto:` hrefs are kept as they are.
 * - Fragment-only hrefs (starting with `#`) are dropped.
 * - Everything else is resolved against `baseUrl`.
 *
 * An href that cannot be resolved is logged and skipped.
 *
 * @returns The links without duplicates, in order of first appearance.
 */
async function extractLinksRust(html: string, baseUrl: string): Promise<string[]> {
  const unique = new Set<string>();

  for (const rawHref of await _extractLinks(html)) {
    const href = rawHref.trim();

    try {
      const keepAsIs =
        href.startsWith("http://") ||
        href.startsWith("https://") ||
        href.startsWith("mailto:");

      if (keepAsIs) {
        unique.add(href);
      } else if (!href.startsWith("#")) {
        // Covers both root-relative ("/x") and path-relative ("x") hrefs.
        unique.add(new URL(href, baseUrl).href);
      }
    } catch (error) {
      logger.error(
        `Failed to construct URL for href: ${href} with base: ${baseUrl}`,
        { error },
      );
    }
  }

  return [...unique];
}
|
||||
|
||||
export async function extractLinks(html: string, baseUrl: string): Promise<string[]> {
|
||||
try {
|
||||
return await extractLinksRust(html, baseUrl);
|
||||
} catch (error) {
|
||||
logger.error("Failed to call html-transformer! Falling back to cheerio...", {
|
||||
error,
|
||||
module: "scrapeURL", method: "extractLinks"
|
||||
});
|
||||
}
|
||||
|
||||
export function extractLinks(html: string, baseUrl: string): string[] {
|
||||
const $ = load(html);
|
||||
const links: string[] = [];
|
||||
|
||||
|
@ -1,11 +1,37 @@
|
||||
import { load } from "cheerio";
|
||||
import { load } from "cheerio"; // rustified
|
||||
import { Document } from "../../../controllers/v1/types";
|
||||
import { Meta } from "..";
|
||||
import { extractMetadata as _extractMetadata } from "../../../lib/html-transformer";
|
||||
|
||||
export function extractMetadata(
|
||||
export async function extractMetadataRust(
|
||||
meta: Meta,
|
||||
html: string,
|
||||
): Partial<Document["metadata"]> {
|
||||
): Promise<Partial<Document["metadata"]>> {
|
||||
const fromRust = await _extractMetadata(html);
|
||||
|
||||
return {
|
||||
...fromRust,
|
||||
...(fromRust.favicon ? {
|
||||
favicon: new URL(fromRust.favicon, meta.url)
|
||||
} : {}),
|
||||
scrapeId: meta.id,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
export async function extractMetadata(
|
||||
meta: Meta,
|
||||
html: string,
|
||||
): Promise<Partial<Document["metadata"]>> {
|
||||
try {
|
||||
return await extractMetadataRust(meta, html);
|
||||
} catch (error) {
|
||||
meta.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
|
||||
error,
|
||||
module: "scrapeURL", method: "extractMetadata"
|
||||
});
|
||||
}
|
||||
|
||||
let title: string | undefined = undefined;
|
||||
let description: string | undefined = undefined;
|
||||
let favicon: string | undefined = undefined;
|
||||
|
@ -1,6 +1,6 @@
|
||||
// TODO: refactor
|
||||
|
||||
import { AnyNode, Cheerio, load } from "cheerio";
|
||||
import { AnyNode, Cheerio, load } from "cheerio"; // TODO: rustify
|
||||
import { ScrapeOptions } from "../../../controllers/v1/types";
|
||||
|
||||
const excludeNonMainTags = [
|
||||
|
@ -14,10 +14,10 @@ export type Transformer = (
|
||||
document: Document,
|
||||
) => Document | Promise<Document>;
|
||||
|
||||
export function deriveMetadataFromRawHTML(
|
||||
export async function deriveMetadataFromRawHTML(
|
||||
meta: Meta,
|
||||
document: Document,
|
||||
): Document {
|
||||
): Promise<Document> {
|
||||
if (document.rawHtml === undefined) {
|
||||
throw new Error(
|
||||
"rawHtml is undefined -- this transformer is being called out of order",
|
||||
@ -25,7 +25,7 @@ export function deriveMetadataFromRawHTML(
|
||||
}
|
||||
|
||||
document.metadata = {
|
||||
...extractMetadata(meta, document.rawHtml),
|
||||
...(await extractMetadata(meta, document.rawHtml)),
|
||||
...document.metadata,
|
||||
};
|
||||
return document;
|
||||
@ -63,7 +63,7 @@ export async function deriveMarkdownFromHTML(
|
||||
return document;
|
||||
}
|
||||
|
||||
export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
|
||||
export async function deriveLinksFromHTML(meta: Meta, document: Document): Promise<Document> {
|
||||
// Only derive if the formats has links
|
||||
if (meta.options.formats.includes("links")) {
|
||||
if (document.html === undefined) {
|
||||
@ -72,7 +72,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
|
||||
);
|
||||
}
|
||||
|
||||
document.links = extractLinks(document.html, meta.url);
|
||||
document.links = await extractLinks(document.html, meta.url);
|
||||
}
|
||||
|
||||
return document;
|
||||
|
@ -1,5 +1,5 @@
|
||||
import axios from "axios";
|
||||
import * as cheerio from "cheerio";
|
||||
import * as cheerio from "cheerio"; // TODO: rustify
|
||||
import * as querystring from "querystring";
|
||||
import { SearchResult } from "../../src/lib/entities";
|
||||
import { logger } from "../../src/lib/logger";
|
||||
|
@ -884,7 +884,7 @@ async function processJob(job: Job & { id: string }, token: string) {
|
||||
);
|
||||
|
||||
const links = crawler.filterLinks(
|
||||
crawler.extractLinksFromHTML(
|
||||
await crawler.extractLinksFromHTML(
|
||||
rawHtml ?? "",
|
||||
doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!,
|
||||
),
|
||||
|
Loading…
x
Reference in New Issue
Block a user