From b005450a342d38ff90deded0550548b5172c3aaa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 24 Jan 2025 22:04:54 +0100 Subject: [PATCH] port most of cheerio stuff to rust (#1089) --- apps/api/Dockerfile | 10 + .../sharedLibs/html-transformer/.gitignore | 1 + .../sharedLibs/html-transformer/Cargo.lock | 940 ++++++++++++++++++ .../sharedLibs/html-transformer/Cargo.toml | 14 + .../sharedLibs/html-transformer/src/lib.rs | 168 ++++ apps/api/src/lib/html-transformer.ts | 84 ++ apps/api/src/scraper/WebScraper/crawler.ts | 24 +- .../src/scraper/scrapeURL/lib/extractLinks.ts | 48 +- .../scraper/scrapeURL/lib/extractMetadata.ts | 32 +- .../scrapeURL/lib/removeUnwantedElements.ts | 2 +- .../scraper/scrapeURL/transformers/index.ts | 10 +- apps/api/src/search/googlesearch.ts | 2 +- apps/api/src/services/queue-worker.ts | 2 +- 13 files changed, 1321 insertions(+), 16 deletions(-) create mode 100644 apps/api/sharedLibs/html-transformer/.gitignore create mode 100644 apps/api/sharedLibs/html-transformer/Cargo.lock create mode 100644 apps/api/sharedLibs/html-transformer/Cargo.toml create mode 100644 apps/api/sharedLibs/html-transformer/src/lib.rs create mode 100644 apps/api/src/lib/html-transformer.ts diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile index adc78927..601bf3b4 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -27,10 +27,20 @@ RUN cd /app/sharedLibs/go-html-to-md && \ go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \ chmod +x html-to-markdown.so +# Install Rust +FROM rust:1-bullseye AS rust-base +COPY sharedLibs/html-transformer /app/sharedLibs/html-transformer + +# Install Rust dependencies and build the html-transformer lib +RUN cd /app/sharedLibs/html-transformer && \ + cargo build --release && \ + chmod +x target/release/libhtml_transformer.so + FROM base COPY --from=prod-deps /app/node_modules /app/node_modules COPY --from=build /app /app COPY --from=go-base 
/app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so +COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so # Start the server by default, this can be overwritten at runtime EXPOSE 8080 diff --git a/apps/api/sharedLibs/html-transformer/.gitignore b/apps/api/sharedLibs/html-transformer/.gitignore new file mode 100644 index 00000000..1de56593 --- /dev/null +++ b/apps/api/sharedLibs/html-transformer/.gitignore @@ -0,0 +1 @@ +target \ No newline at end of file diff --git a/apps/api/sharedLibs/html-transformer/Cargo.lock b/apps/api/sharedLibs/html-transformer/Cargo.lock new file mode 100644 index 00000000..43696071 --- /dev/null +++ b/apps/api/sharedLibs/html-transformer/Cargo.lock @@ -0,0 +1,940 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" + +[[package]] +name = "bitflags" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f68f53c83ab957f72c32642f3868eec03eb974d1fb82e453128456482613d36" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "cfg-if" 
+version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "convert_case" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6245d59a3e82a7fc217c5828a6692dbc6dfb63a0c8c90495621f7b9d79704a0e" + +[[package]] +name = "cssparser" +version = "0.27.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "754b69d351cdc2d8ee09ae203db831e005560fc6030da058f86ad60c92a9cb0a" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa 0.4.8", + "matches", + "phf 0.8.0", + "proc-macro2", + "quote", + "smallvec", + "syn 1.0.109", +] + +[[package]] +name = "cssparser" +version = "0.29.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f93d03419cb5950ccfd3daf3ff1c7a36ace64609a1a8746d493df1ca0afde0fa" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa 1.0.14", + "matches", + "phf 0.10.1", + "proc-macro2", + "quote", + "smallvec", + "syn 1.0.109", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn 2.0.96", +] + +[[package]] +name = "derive_more" +version = "0.99.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f33878137e4dafd7fa914ad4e259e18a4e8e532b9617a2d0150262bf53abfce" +dependencies = [ + "convert_case", + "proc-macro2", + "quote", + "rustc_version", + "syn 2.0.96", +] + +[[package]] +name = "dtoa" +version = "1.0.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcbb2bf8e87535c23f7a8a321e364ce21462d0ff10cb6407820e8e96dfff6653" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + +[[package]] +name = "foldhash" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0d2fde1f7b3d48b8395d5f2de76c18a528bd6a9cdde438df747bfcba3e05d6f" + +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + +[[package]] +name = "getrandom" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fc3cb4d91f53b50155bdcfd23f6a4c39ae1969c2ae85982b135750cccaf5fce" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.9.0+wasi-snapshot-preview1", +] + +[[package]] +name = "getrandom" +version = "0.2.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" +dependencies = [ + "cfg-if", + "libc", + "wasi 0.11.0+wasi-snapshot-preview1", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name 
= "hashbrown" +version = "0.15.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "html-transformer" +version = "0.1.0" +dependencies = [ + "kuchikiki", + "libc", + "lol_html", + "serde", + "serde_json", +] + +[[package]] +name = "html5ever" +version = "0.26.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bea68cab48b8459f17cf1c944c67ddc572d272d9f2b274140f223ecb1da4a3b7" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", +] + +[[package]] +name = "itoa" +version = "0.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" + +[[package]] +name = "itoa" +version = "1.0.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" + +[[package]] +name = "kuchikiki" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f29e4755b7b995046f510a7520c42b2fed58b77bd94d5a87a8eb43d2fd126da8" +dependencies = [ + "cssparser 0.27.2", + "html5ever", + "indexmap", + "matches", + "selectors 0.22.0", +] + +[[package]] +name = "libc" +version = "0.2.169" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" + +[[package]] +name = "lock_api" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"07af8b9cdd281b7915f413fa73f29ebd5d55d0d3f0155584dade1ff18cea1b17" +dependencies = [ + "autocfg", + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04cbf5b083de1c7e0222a7a51dbfdba1cbe1c6ab0b15e29fff3f6c077fd9cd9f" + +[[package]] +name = "lol_html" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b1058123f6262982b891dccc395cff0144d9439de366460b47fab719258b96e" +dependencies = [ + "bitflags 2.8.0", + "cfg-if", + "cssparser 0.29.6", + "encoding_rs", + "hashbrown 0.15.2", + "memchr", + "mime", + "selectors 0.24.0", + "thiserror", +] + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + +[[package]] +name = "markup5ever" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a2629bb1404f3d34c2e921f21fd34ba00b206124c81f65c50b43b6aaefeb016" +dependencies = [ + "log", + "phf 0.10.1", + "phf_codegen 0.10.0", + "string_cache", + "string_cache_codegen", + "tendril", +] + +[[package]] +name = "matches" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2532096657941c2fea9c289d370a250971c689d4f143798ff67113ec042024a5" + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "nodrop" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72ef4a56884ca558e5ddb05a1d1e7e1bfd9a68d9ed024c21704cc98872dae1bb" + +[[package]] +name = "once_cell" +version = "1.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" + +[[package]] +name = "parking_lot" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1bf18183cf54e8d6059647fc3063646a1801cf30896933ec2311622cc4b9a27" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e401f977ab385c9e4e3ab30627d6f26d00e2c73eef317493c4ec6d468726cf8" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-targets", +] + +[[package]] +name = "phf" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3dfb61232e34fcb633f43d12c58f83c1df82962dcdfa565a4e866ffc17dafe12" +dependencies = [ + "phf_macros 0.8.0", + "phf_shared 0.8.0", + "proc-macro-hack", +] + +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_macros 0.10.0", + "phf_shared 0.10.0", + "proc-macro-hack", +] + +[[package]] +name = "phf_codegen" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cbffee61585b0411840d3ece935cce9cb6321f01c45477d30066498cd5e1a815" +dependencies = [ + "phf_generator 0.8.0", + "phf_shared 0.8.0", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_generator" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "17367f0cc86f2d25802b2c26ee58a7b23faeccf78a396094c13dced0d0182526" +dependencies = [ + "phf_shared 0.8.0", + "rand 0.7.3", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.5", +] + +[[package]] +name = "phf_macros" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6fde18ff429ffc8fe78e2bf7f8b7a5a5a6e2a8b58bc5a9ac69198bbda9189c" +dependencies = [ + "phf_generator 0.8.0", + "phf_shared 0.8.0", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "phf_macros" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58fdf3184dd560f160dd73922bea2d5cd6e8f064bf4b13110abd81b03697b4e0" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro-hack", + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "phf_shared" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c00cf8b9eafe68dde5e9eaa2cef8ee84a9336a47d566ec55ca16589633b65af7" +dependencies = [ + "siphasher", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + +[[package]] +name = "proc-macro2" +version = "1.0.93" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "60946a68e5f9d28b0dc1c21bb8a97ee7d018a8b322fa57838ba31cc878e22d99" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "rand" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a6b1679d49b24bbfe0c803429aa1874472f50d9b363131f0e89fc356b544d03" +dependencies = [ + "getrandom 0.1.16", + "libc", + "rand_chacha 0.2.2", + "rand_core 0.5.1", + "rand_hc", + "rand_pcg", +] + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4c8ed856279c9737206bf725bf36935d8666ead7aa69b52be55af369d193402" +dependencies = [ + "ppv-lite86", + "rand_core 0.5.1", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_core" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" +dependencies = [ + "getrandom 0.1.16", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.15", +] + +[[package]] +name = "rand_hc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca3129af7b92a17112d59ad498c6f81eaf463253766b90396d39ea7a39d6613c" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "rand_pcg" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16abd0c1b639e9eb4d7c50c0b8100b0d0f849be2349829c740fe8e6eb4816429" +dependencies = [ + "rand_core 0.5.1", +] + +[[package]] +name = "redox_syscall" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03a862b389f93e68874fbf580b9de08dd02facb9a788ebadaf4a3fd33cf58834" +dependencies = [ + "bitflags 2.8.0", +] + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "ryu" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "selectors" 
+version = "0.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df320f1889ac4ba6bc0cdc9c9af7af4bd64bb927bccdf32d81140dc1f9be12fe" +dependencies = [ + "bitflags 1.3.2", + "cssparser 0.27.2", + "derive_more", + "fxhash", + "log", + "matches", + "phf 0.8.0", + "phf_codegen 0.8.0", + "precomputed-hash", + "servo_arc 0.1.1", + "smallvec", + "thin-slice", +] + +[[package]] +name = "selectors" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c37578180969d00692904465fb7f6b3d50b9a2b952b87c23d0e2e5cb5013416" +dependencies = [ + "bitflags 1.3.2", + "cssparser 0.29.6", + "derive_more", + "fxhash", + "log", + "phf 0.8.0", + "phf_codegen 0.8.0", + "precomputed-hash", + "servo_arc 0.2.0", + "smallvec", +] + +[[package]] +name = "semver" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" + +[[package]] +name = "serde" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.217" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + +[[package]] +name = "serde_json" +version = "1.0.137" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" +dependencies = [ + "itoa 1.0.14", + "memchr", + "ryu", + "serde", +] + +[[package]] +name = "servo_arc" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98238b800e0d1576d8b6e3de32827c2d74bee68bb97748dcf5071fb53965432" +dependencies = [ + "nodrop", + 
"stable_deref_trait", +] + +[[package]] +name = "servo_arc" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52aa42f8fdf0fed91e5ce7f23d8138441002fa31dca008acf47e6fd4721f741" +dependencies = [ + "nodrop", + "stable_deref_trait", +] + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "smallvec" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c5e1a9a646d36c3599cd173a41282daf47c44583ad367b8e6837255952e5c67" + +[[package]] +name = "stable_deref_trait" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" + +[[package]] +name = "string_cache" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f91138e76242f575eb1d3b38b4f1362f10d3a43f47d182a5b359af488a02293b" +dependencies = [ + "new_debug_unreachable", + "once_cell", + "parking_lot", + "phf_shared 0.10.0", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bb30289b722be4ff74a408c3cc27edeaad656e06cb1fe8fa9231fa59c728988" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", + "proc-macro2", + "quote", +] + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.96" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d5d0adab1ae378d7f53bdebc67a39f1f151407ef230f0ce2883572f5d8985c80" 
+dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + +[[package]] +name = "thin-slice" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eaa81235c7058867fa8c0e7314f33dcce9c215f535d1913822a2b3f5e289f3c" + +[[package]] +name = "thiserror" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d452f284b73e6d76dd36758a0c8684b1d5be31f92b89d07fd5822175732206fc" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26afc1baea8a989337eeb52b6e72a039780ce45c3edfcc9c5b9d112feeb173c2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] + +[[package]] +name = "unicode-ident" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243" + +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "wasi" +version = "0.9.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cccddf32554fecc6acb585f82a32a72e28b48f8c4c1883ddfeeeaa96f7d8e519" + +[[package]] +name = "wasi" +version = "0.11.0+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = 
"zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.96", +] diff --git a/apps/api/sharedLibs/html-transformer/Cargo.toml b/apps/api/sharedLibs/html-transformer/Cargo.toml new file mode 100644 index 00000000..9e242060 --- /dev/null +++ b/apps/api/sharedLibs/html-transformer/Cargo.toml @@ -0,0 +1,14 @@ +[package] +name = "html-transformer" +version = "0.1.0" +edition = "2021" + +[dependencies] +libc = "0.2.0" +lol_html = "2.2.0" +kuchikiki = "0.8.2" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" + +[lib] +crate-type = ["cdylib"] diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs new file mode 100644 index 00000000..290e910b --- /dev/null +++ b/apps/api/sharedLibs/html-transformer/src/lib.rs @@ -0,0 +1,168 @@ +use std::{collections::HashMap, ffi::{CStr, CString}}; + +use kuchikiki::{parse_html, traits::TendrilSink}; +use serde_json::Value; + +// #[no_mangle] +// pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 { +// let html = unsafe { CStr::from_ptr(html) }.to_str().unwrap(); + +// let mut output = vec![]; + +// let mut rewriter = HtmlRewriter::new( +// Settings { +// element_content_handlers: vec! 
[ +// element!("") +// ], +// ..Settings::new() +// }, +// |c: &[u8]| output.extend_from_slice(c) +// ); + +// rewriter.write(html.as_bytes()).unwrap(); + +// CString::new(String::from_utf8(output).unwrap()).unwrap().into_raw() +// }
+
+/// Collects the `href` of every `<a href>` element in `html` and returns them
+/// as a JSON array of strings.
+///
+/// Links written with a single slash after the scheme (`http:/host`,
+/// `https:/host`) are rewritten to the double-slash form.
+///
+/// `html` must point to a valid NUL-terminated C string. The returned pointer
+/// owns a heap-allocated, NUL-terminated JSON string; hand it back to
+/// `free_string` to release it.
+#[no_mangle]
+pub extern "C" fn extract_links(html: *const libc::c_char) -> *mut libc::c_char {
+    // The return type is `c_char` rather than `i8`: `CString::into_raw` yields
+    // `*mut c_char`, and `c_char` is `u8` on some targets (e.g. aarch64 Linux),
+    // where a hard-coded `i8` does not type-check.
+    //
+    // Decode lossily instead of `to_str().unwrap()`: a panic cannot safely
+    // unwind out of an `extern "C"` function, so invalid UTF-8 is replaced
+    // with U+FFFD rather than taking the whole process down.
+    // SAFETY: the caller guarantees `html` is a valid NUL-terminated string.
+    let html = unsafe { CStr::from_ptr(html) }.to_string_lossy();
+
+    let document = parse_html().one(&*html);
+
+    let mut out: Vec<String> = Vec::new();
+
+    for anchor in document.select("a[href]").unwrap() {
+        // The selector only matches anchors that carry `href`, so this lookup succeeds.
+        let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
+
+        // Repair `scheme:/rest` into `scheme://rest`; the slice offsets are the
+        // byte lengths of the ASCII prefixes "http:/" (6) and "https:/" (7).
+        if href.starts_with("http:/") && !href.starts_with("http://") {
+            href = format!("http://{}", &href[6..]);
+        } else if href.starts_with("https:/") && !href.starts_with("https://") {
+            href = format!("https://{}", &href[7..]);
+        }
+
+        out.push(href);
+    }
+
+    CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
+}
+
+/// Looks up the first `<meta name="$metaName">` in `$document` and, if it has a
+/// `content` attribute, stores that value in `$out` under the key `$outName`.
+macro_rules! insert_meta_name {
+    ($out:ident, $document:ident, $metaName:expr, $outName:expr) => {
+        if let Some(x) = $document
+            .select(&format!("meta[name=\"{}\"]", $metaName))
+            .unwrap()
+            .next()
+            .and_then(|meta| meta.attributes.borrow().get("content").map(|x| x.to_string()))
+        {
+            $out.insert(($outName).to_string(), Value::String(x));
+        }
+    };
+}
+
+/// Same as `insert_meta_name!`, but matches on the `property` attribute
+/// (`<meta property="$metaName">`) instead of `name`.
+macro_rules! 
insert_meta_property {
+    ($out:ident, $document:ident, $metaName:expr, $outName:expr) => {
+        // Copy the `content` of the first `<meta property="$metaName">`, if
+        // present, into `$out` under the key `$outName`.
+        if let Some(x) = $document.select(&format!("meta[property=\"{}\"]", $metaName)).unwrap().next().and_then(|description| description.attributes.borrow().get("content").map(|x| x.to_string())) {
+            $out.insert(($outName).to_string(), Value::String(x));
+        }
+    };
+}
+
+
+/// Extracts page metadata from `html` and returns it as a JSON object.
+///
+/// The object holds `title`, `favicon` and `language` when found, the
+/// camelCased Open Graph / article / Dublin Core fields inserted below, and
+/// finally every `<meta>` element keyed by its raw `name` (or, failing that,
+/// `property`) attribute. A raw key that occurs more than once, or that
+/// collides with a key set earlier, is collected into an array.
+///
+/// The returned pointer is a heap-allocated, NUL-terminated string whose
+/// ownership passes to the caller; release it with `free_string`.
+///
+/// # Safety
+/// `html` must point to a valid NUL-terminated C string.
+#[no_mangle]
+pub extern "C" fn extract_metadata(html: *const libc::c_char) -> *mut libc::c_char {
+    // Decode lossily rather than `to_str().unwrap()`: invalid UTF-8 from the
+    // caller must not panic inside an `extern "C"` function.
+    let html = unsafe { CStr::from_ptr(html) }.to_string_lossy();
+
+    let document = parse_html().one(&*html);
+    let mut out = HashMap::<String, Value>::new();
+
+    if let Some(title) = document.select("title").unwrap().next() {
+        out.insert("title".to_string(), Value::String(title.text_contents()));
+    }
+    // insert_meta_name!(out, document, "description", "description");
+
+    // Prefer an exact rel="icon"; otherwise accept any rel containing "icon".
+    if let Some(favicon_link) = document.select("link[rel=\"icon\"]").unwrap().next()
+        .and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))
+        .or_else(|| document.select("link[rel*=\"icon\"]").unwrap().next()
+            .and_then(|x| x.attributes.borrow().get("href").map(|x| x.to_string()))) {
+        out.insert("favicon".to_string(), Value::String(favicon_link));
+    }
+
+    if let Some(lang) = document.select("html[lang]").unwrap().next().and_then(|x| x.attributes.borrow().get("lang").map(|x| x.to_string())) {
+        out.insert("language".to_string(), Value::String(lang));
+    }
+
+    // insert_meta_name!(out, document, "keywords", "keywords");
+    // insert_meta_name!(out, document, "robots", "robots");
+    insert_meta_property!(out, document, "og:title", "ogTitle");
+    insert_meta_property!(out, document, "og:description", "ogDescription");
+    insert_meta_property!(out, document, "og:url", "ogUrl");
+    insert_meta_property!(out, document, "og:image", "ogImage");
+    insert_meta_property!(out, document, "og:audio", "ogAudio");
+    insert_meta_property!(out, document, "og:determiner", "ogDeterminer");
+    insert_meta_property!(out, document, "og:locale", "ogLocale");
+
+    // Stored under the camelCased key like the other og:* fields. Using the
+    // raw "og:locale:alternate" key here made the generic <meta> loop below
+    // append every value a second time.
+    let locale_alternates: Vec<Value> = document.select("meta[property=\"og:locale:alternate\"]").unwrap()
+        .filter_map(|meta| meta.attributes.borrow().get("content").map(|content| Value::String(content.to_string())))
+        .collect();
+    if !locale_alternates.is_empty() {
+        out.insert("ogLocaleAlternate".to_string(), Value::Array(locale_alternates));
+    }
+
+    insert_meta_property!(out, document, "og:site_name", "ogSiteName");
+    insert_meta_property!(out, document, "og:video", "ogVideo");
+    insert_meta_name!(out, document, "article:section", "articleSection");
+    insert_meta_name!(out, document, "article:tag", "articleTag");
+    insert_meta_property!(out, document, "article:published_time", "publishedTime");
+    insert_meta_property!(out, document, "article:modified_time", "modifiedTime");
+    insert_meta_name!(out, document, "dcterms.keywords", "dcTermsKeywords");
+    insert_meta_name!(out, document, "dc.description", "dcDescription");
+    insert_meta_name!(out, document, "dc.subject", "dcSubject");
+    insert_meta_name!(out, document, "dcterms.subject", "dcTermsSubject");
+    insert_meta_name!(out, document, "dcterms.audience", "dcTermsAudience");
+    insert_meta_name!(out, document, "dc.type", "dcType");
+    insert_meta_name!(out, document, "dcterms.type", "dcTermsType");
+    insert_meta_name!(out, document, "dc.date", "dcDate");
+    insert_meta_name!(out, document, "dc.date.created", "dcDateCreated");
+    insert_meta_name!(out, document, "dcterms.created", "dcTermsCreated");
+
+    // Generic pass: expose every <meta> under its raw name/property.
+    for meta in document.select("meta").unwrap() {
+        let meta = meta.as_node().as_element().unwrap();
+        let attrs = meta.attributes.borrow();
+
+        if let Some(name) = attrs.get("name").or_else(|| attrs.get("property")) {
+            if let Some(content) = attrs.get("content") {
+                let content = Value::String(content.to_string());
+
+                match out.get_mut(name) {
+                    // Third or later occurrence: extend the existing array.
+                    Some(Value::Array(values)) => values.push(content),
+                    // Second occurrence: promote the single value to an array.
+                    Some(existing) => {
+                        let first = existing.take();
+                        *existing = Value::Array(vec![first, content]);
+                    }
+                    // First occurrence: store the plain string.
+                    None => {
+                        out.insert(name.to_string(), content);
+                    }
+                }
+            }
+        }
+    }
+
+    // serde_json escapes NUL as \u0000, so the JSON text has no interior NUL.
+    CString::new(serde_json::ser::to_string(&out).unwrap()).unwrap().into_raw()
+}
+
+/// Releases a string previously returned by `extract_links` or
+/// `extract_metadata`. Passing a null pointer is a no-op.
+///
+/// # Safety
+/// `ptr` must be null or a pointer obtained from one of the functions above
+/// that has not been freed yet.
+#[no_mangle]
+pub extern "C" fn free_string(ptr: *mut libc::c_char) {
+    if ptr.is_null() {
+        return;
+    }
+    // Rebuild the CString so it is dropped by the allocator that created it.
+    drop(unsafe { CString::from_raw(ptr) })
+}
diff --git a/apps/api/src/lib/html-transformer.ts b/apps/api/src/lib/html-transformer.ts
new file mode 100644
index 00000000..6f4ca3d6
--- /dev/null
+++ b/apps/api/src/lib/html-transformer.ts
@@ -0,0 +1,96 @@
+import koffi, { KoffiFunction } from "koffi";
+import { join } from "path";
+import { stat } from "fs/promises";
+import { platform } from "os";
+
+// TODO: add a timeout to the Rust transformer
+const rustExecutablePath = join(
+  process.cwd(),
+  "sharedLibs/html-transformer/target/release/",
+  platform() === "darwin" ? "libhtml_transformer.dylib" : "libhtml_transformer.so"
+);
+
+/**
+ * Lazily created singleton that loads the Rust html-transformer shared
+ * library through koffi and exposes its functions as promises.
+ */
+class RustHTMLTransformer {
+  private static instance: RustHTMLTransformer;
+  private _extractLinks: KoffiFunction;
+  private _extractMetadata: KoffiFunction;
+  private _freeString: KoffiFunction;
+
+  private constructor() {
+    const lib = koffi.load(rustExecutablePath);
+    this._freeString = lib.func("free_string", "void", ["string"]);
+    const freedResultString = koffi.disposable("CString", "string", this._freeString);
+    this._extractLinks = lib.func("extract_links", freedResultString, ["string"]);
+    this._extractMetadata = lib.func("extract_metadata", freedResultString, ["string"]);
+  }
+
+  public static async getInstance(): Promise<RustHTMLTransformer> {
+    if (!RustHTMLTransformer.instance) {
+      try {
+        await stat(rustExecutablePath);
+      } catch (_) {
+        throw Error("Rust html-transformer shared library not found");
+      }
+      RustHTMLTransformer.instance = new RustHTMLTransformer();
+    }
+    return RustHTMLTransformer.instance;
+  }
+
+  public async extractLinks(html: string): Promise<string[]> {
+    return new Promise<string[]>((resolve, reject) => {
+      this._extractLinks.async(html, (err: Error, res: string) => {
+        if (err) {
+          reject(err);
+        } else {
+          resolve(JSON.parse(res));
+        }
+      });
+    });
+  }
+
+  public async extractMetadata(html: string): Promise<any> {
+    return new Promise<any>((resolve, reject) => {
+      this._extractMetadata.async(html, (err: Error, res: string) => {
+        if (err) {
+          reject(err);
+        } else {
+          resolve(JSON.parse(res));
+        }
+      });
+    });
+  }
+}
+
+/**
+ * Returns the anchor hrefs reported by the Rust `extract_links` function, or
+ * an empty array when `html` is empty or missing.
+ */
+export async function extractLinks(
+  html: string | null | undefined,
+): Promise<string[]> {
+  if (!html) {
+    return [];
+  }
+
+  const converter = await RustHTMLTransformer.getInstance();
+  return await converter.extractLinks(html);
+}
+
+/**
+ * Returns the metadata object produced by the Rust `extract_metadata`
+ * function, or an empty object when `html` is empty or missing.
+ */
+export async function extractMetadata(
+  html: string | null | undefined,
+): Promise<any> {
+  if (!html) {
+    return {};
+  }
+
+  const converter = await RustHTMLTransformer.getInstance();
+  return await converter.extractMetadata(html);
+}
\ No newline at end of file
diff --git 
a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 9ceaf434..ceee94ef 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -1,5 +1,5 @@
 import axios, { AxiosError } from "axios";
-import cheerio, { load } from "cheerio";
+import { load } from "cheerio"; // rustified
 import { URL } from "url";
 import { getLinksFromSitemap } from "./sitemap";
 import robotsParser, { Robot } from "robots-parser";
@@ -8,6 +8,7 @@ import { axiosTimeout } from "../../lib/timeout";
 import { logger as _logger } from "../../lib/logger";
 import https from "https";
 import { redisConnection } from "../../services/queue-service";
+import { extractLinks } from "../../lib/html-transformer";
 export class WebCrawler {
   private jobId: string;
   private initialUrl: string;
@@ -364,7 +365,11 @@ export class WebCrawler {
     return null;
   }
 
-  public extractLinksFromHTML(html: string, url: string) {
+  private async extractLinksFromHTMLRust(html: string, url: string) {
+    return (await extractLinks(html)).filter(x => this.filterURL(x, url));
+  }
+
+  private extractLinksFromHTMLCheerio(html: string, url: string) {
     let links: string[] = [];
 
     const $ = load(html);
@@ -386,7 +391,7 @@ export class WebCrawler {
         const src = $(element).attr("src");
         if (src && src.startsWith("data:text/html")) {
           const iframeHtml = decodeURIComponent(src.split(",")[1]);
-          const iframeLinks = this.extractLinksFromHTML(iframeHtml, url);
+          const iframeLinks = this.extractLinksFromHTMLCheerio(iframeHtml, url);
           links = links.concat(iframeLinks);
         }
       });
@@ -394,6 +399,23 @@ export class WebCrawler {
     return links;
   }
 
+  /**
+   * Extracts links from `html` with the Rust html-transformer, falling back
+   * to the cheerio implementation if the native call throws.
+   */
+  public async extractLinksFromHTML(html: string, url: string) {
+    try {
+      return await this.extractLinksFromHTMLRust(html, url);
+    } catch (error) {
+      this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+        error,
+        module: "WebCrawler", method: "extractLinksFromHTML"
+      });
+    }
+
+    return this.extractLinksFromHTMLCheerio(html, url);
+  }
+
   private isRobotsAllowed(
     url: string,
     ignoreRobotsTxt: boolean = false,
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts
index dab00004..ef784a71 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractLinks.ts
@@ -1,8 +1,60 @@
 // TODO: refactor
-import { load } from "cheerio";
+import { load } from "cheerio"; // rustified
 import { logger } from "../../../lib/logger";
+import { extractLinks as _extractLinks } from "../../../lib/html-transformer";
+
+/**
+ * Resolves the hrefs reported by the Rust html-transformer against `baseUrl`
+ * and returns them de-duplicated; links starting with "#" are dropped.
+ */
+async function extractLinksRust(html: string, baseUrl: string): Promise<string[]> {
+  const hrefs = await _extractLinks(html);
+
+  const links: string[] = [];
+
+  hrefs.forEach(href => {
+    href = href.trim();
+    try {
+      if (href.startsWith("http://") || href.startsWith("https://")) {
+        // Absolute URL, add as is
+        links.push(href);
+      } else if (href.startsWith("/")) {
+        // Relative URL starting with '/', append to origin
+        links.push(new URL(href, baseUrl).href);
+      } else if (!href.startsWith("#") && !href.startsWith("mailto:")) {
+        // Relative URL not starting with '/', append to base URL
+        links.push(new URL(href, baseUrl).href);
+      } else if (href.startsWith("mailto:")) {
+        // mailto: links, add as is
+        links.push(href);
+      }
+      // Fragment-only links (#) are ignored
+    } catch (error) {
+      logger.error(
+        `Failed to construct URL for href: ${href} with base: ${baseUrl}`,
+        { error },
+      );
+    }
+  });
+
+  // Remove duplicates and return
+  return [...new Set(links)];
+}
+
+/**
+ * Extracts links from `html`, preferring the Rust html-transformer and
+ * falling back to the cheerio implementation below if the native call throws.
+ */
+export async function extractLinks(html: string, baseUrl: string): Promise<string[]> {
+  try {
+    return await extractLinksRust(html, baseUrl);
+  } catch (error) {
+    logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+      error,
+      module: "scrapeURL", method: "extractLinks"
+    });
+  }
 
-export function extractLinks(html: string, baseUrl: string): string[] {
   const $ = load(html);
   const links: string[] = [];
 
diff --git a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
index a82dff79..858b7c29 100644
--- a/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/extractMetadata.ts
@@ -1,11 +1,45 @@
-import { load } from "cheerio";
+import { load } from "cheerio"; // rustified
 import { Document } from "../../../controllers/v1/types";
 import { Meta } from "..";
+import { extractMetadata as _extractMetadata } from "../../../lib/html-transformer";
 
-export function extractMetadata(
+/**
+ * Extracts metadata with the Rust html-transformer, resolves the favicon
+ * against `meta.url`, and tags the result with the scrape id.
+ */
+export async function extractMetadataRust(
   meta: Meta,
   html: string,
-): Partial<Document["metadata"]> {
+): Promise<Partial<Document["metadata"]>> {
+  const fromRust = await _extractMetadata(html);
+
+  return {
+    ...fromRust,
+    ...(fromRust.favicon ? {
+      favicon: new URL(fromRust.favicon, meta.url).href
+    } : {}),
+    scrapeId: meta.id,
+  };
+}
+
+
+/**
+ * Extracts metadata from `html`, preferring the Rust html-transformer and
+ * falling back to the cheerio implementation below if the native call throws.
+ */
+export async function extractMetadata(
+  meta: Meta,
+  html: string,
+): Promise<Partial<Document["metadata"]>> {
+  try {
+    return await extractMetadataRust(meta, html);
+  } catch (error) {
+    meta.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
+      error,
+      module: "scrapeURL", method: "extractMetadata"
+    });
+  }
+
   let title: string | undefined = undefined;
   let description: string | undefined = undefined;
   let favicon: string | undefined = undefined;
diff --git a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
index 4a3d2ae4..4edb21f8 100644
--- a/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
+++ b/apps/api/src/scraper/scrapeURL/lib/removeUnwantedElements.ts
@@ -1,6 +1,6 @@
 // TODO: refactor
 
-import { AnyNode, Cheerio, load } from "cheerio";
+import { AnyNode, Cheerio, load } from "cheerio"; // TODO: rustify
 import { ScrapeOptions } from "../../../controllers/v1/types";
 
 const excludeNonMainTags = [
diff --git a/apps/api/src/scraper/scrapeURL/transformers/index.ts b/apps/api/src/scraper/scrapeURL/transformers/index.ts
index cf29a562..fe132ffd 100644
--- a/apps/api/src/scraper/scrapeURL/transformers/index.ts
+++ b/apps/api/src/scraper/scrapeURL/transformers/index.ts
@@ -14,10 +14,10 @@ export type Transformer = (
   document: Document,
 ) => Document | Promise<Document>;
 
-export function deriveMetadataFromRawHTML(
+export async function deriveMetadataFromRawHTML(
   meta: Meta,
   document: Document,
-): Document {
+): Promise<Document> {
   if (document.rawHtml === undefined) {
     throw new Error(
       "rawHtml is undefined -- this transformer is being called out of order",
@@ -25,7 +25,7 @@ export function deriveMetadataFromRawHTML(
   }
 
   document.metadata = {
-    ...extractMetadata(meta, document.rawHtml),
+    ...(await extractMetadata(meta, document.rawHtml)),
     ...document.metadata,
   };
   return document;
@@ -63,7 +63,7 @@ export async function deriveMarkdownFromHTML(
   return document;
 }
 
-export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
+export async function deriveLinksFromHTML(meta: Meta, document: Document): Promise<Document> {
   // Only derive if the formats has links
   if (meta.options.formats.includes("links")) {
     if (document.html === undefined) {
@@ -72,7 +72,7 @@ export function deriveLinksFromHTML(meta: Meta, document: Document): Document {
       );
     }
 
-    document.links = extractLinks(document.html, meta.url);
+    document.links = await extractLinks(document.html, meta.url);
   }
 
   return document;
diff --git a/apps/api/src/search/googlesearch.ts b/apps/api/src/search/googlesearch.ts
index 74620651..07719ae5 100644
--- a/apps/api/src/search/googlesearch.ts
+++ b/apps/api/src/search/googlesearch.ts
@@ -1,5 +1,5 @@
 import axios from "axios";
-import * as cheerio from "cheerio";
+import * as cheerio from "cheerio"; // TODO: rustify
 import * as querystring from "querystring";
 import { SearchResult } from "../../src/lib/entities";
 import { logger } from "../../src/lib/logger";
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 00b73054..76e5f336 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -884,7 +884,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           );
 
           const links = crawler.filterLinks(
-            crawler.extractLinksFromHTML(
+            await crawler.extractLinksFromHTML(
               rawHtml ?? "",
               doc.metadata?.url ?? doc.metadata?.sourceURL ?? sc.originUrl!,
             ),