From b03670a8b7571a96ed520aa0349d27173d98d68f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 23 May 2025 13:45:53 +0200 Subject: [PATCH] feat: parse PDFs on fc side and reject if too long for timeout (FIR-2083) (#1592) * feat: pdf-parser, implementation in scrapeURL * use pdf-parser for page count instead of mu * fix(pdf-parser): bindings * feat(scrapeURL/pdf): adjust MILLISECONDS_PER_PAGE * implement post-runsync polling and fix * fix(Dockerfile): copy in the pdf-parser source code * fix(scrapeURL/pdf): better error for timeout below 0 --- .github/workflows/test-server-self-host.yml | 12 + .github/workflows/test-server.yml | 12 + apps/api/Dockerfile | 7 + apps/api/sharedLibs/pdf-parser/.gitignore | 1 + apps/api/sharedLibs/pdf-parser/Cargo.lock | 999 ++++++++++++++++++ apps/api/sharedLibs/pdf-parser/Cargo.toml | 11 + apps/api/sharedLibs/pdf-parser/src/lib.rs | 24 + apps/api/src/__tests__/snips/lib.ts | 15 + apps/api/src/__tests__/snips/scrape.test.ts | 39 +- apps/api/src/lib/gcs-pdf-cache.ts | 5 +- apps/api/src/lib/pdf-parser.ts | 70 ++ .../scraper/scrapeURL/engines/pdf/index.ts | 75 +- apps/api/src/scraper/scrapeURL/error.ts | 6 + apps/api/src/scraper/scrapeURL/index.ts | 5 + 14 files changed, 1257 insertions(+), 24 deletions(-) create mode 100644 apps/api/sharedLibs/pdf-parser/.gitignore create mode 100644 apps/api/sharedLibs/pdf-parser/Cargo.lock create mode 100644 apps/api/sharedLibs/pdf-parser/Cargo.toml create mode 100644 apps/api/sharedLibs/pdf-parser/src/lib.rs create mode 100644 apps/api/src/lib/pdf-parser.ts diff --git a/.github/workflows/test-server-self-host.yml b/.github/workflows/test-server-self-host.yml index 747d0ba0..7ab4d5c6 100644 --- a/.github/workflows/test-server-self-host.yml +++ b/.github/workflows/test-server-self-host.yml @@ -68,12 +68,24 @@ jobs: with: go-version: '1.19' cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum + - name: Set up Rust + uses: actions-rust-lang/setup-rust-toolchain@v1 - name: Build go-html-to-md run: | go mod tidy go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go chmod +x html-to-markdown.so working-directory: ./apps/api/sharedLibs/go-html-to-md + - name: Build html-transformer + run: | + cargo build --release + chmod +x target/release/libhtml_transformer.so + working-directory: ./apps/api/sharedLibs/html-transformer + - name: Build pdf-parser + run: | + cargo build --release + chmod +x target/release/libpdf_parser.so + working-directory: ./apps/api/sharedLibs/pdf-parser - name: Set up SearXNG if: matrix.search == 'searxng' run: | diff --git a/.github/workflows/test-server.yml b/.github/workflows/test-server.yml index cdde350b..8d043ba3 100644 --- a/.github/workflows/test-server.yml +++ b/.github/workflows/test-server.yml @@ -73,12 +73,24 @@ jobs: with: go-version: '1.19' cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum + - name: Set up Rust + uses: actions-rust-lang/setup-rust-toolchain@v1 - name: Build go-html-to-md run: | go mod tidy go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go chmod +x html-to-markdown.so working-directory: ./apps/api/sharedLibs/go-html-to-md + - name: Build html-transformer + run: | + cargo build --release + chmod +x target/release/libhtml_transformer.so + working-directory: ./apps/api/sharedLibs/html-transformer + - name: Build pdf-parser + run: | + cargo build --release + chmod +x target/release/libpdf_parser.so + working-directory: ./apps/api/sharedLibs/pdf-parser - name: Start the application run: npm start > api.log 2>&1 & working-directory: ./apps/api diff --git a/apps/api/Dockerfile b/apps/api/Dockerfile index ec15a2cf..2080e766 100644 --- a/apps/api/Dockerfile +++ b/apps/api/Dockerfile @@ -26,17 +26,24 @@ RUN cd /app/sharedLibs/go-html-to-md && \ # Install Rust FROM rust:1-slim AS rust-base COPY sharedLibs/html-transformer /app/sharedLibs/html-transformer +COPY sharedLibs/pdf-parser /app/sharedLibs/pdf-parser # Install Rust dependencies and build transformer lib RUN cd /app/sharedLibs/html-transformer && \ cargo build --release && \ chmod +x target/release/libhtml_transformer.so +# Install Rust dependencies and build PDF parser lib +RUN cd /app/sharedLibs/pdf-parser && \ + cargo build --release && \ + chmod +x target/release/libpdf_parser.so + FROM base COPY --from=build /app/dist /app/dist COPY --from=prod-deps /app/node_modules /app/node_modules COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so +COPY --from=rust-base /app/sharedLibs/pdf-parser/target/release/libpdf_parser.so /app/sharedLibs/pdf-parser/target/release/libpdf_parser.so # Install git RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/* diff --git a/apps/api/sharedLibs/pdf-parser/.gitignore b/apps/api/sharedLibs/pdf-parser/.gitignore new file mode 100644 index 00000000..1de56593 --- /dev/null +++ b/apps/api/sharedLibs/pdf-parser/.gitignore @@ -0,0 +1 @@ +target \ No newline at end of file diff --git a/apps/api/sharedLibs/pdf-parser/Cargo.lock b/apps/api/sharedLibs/pdf-parser/Cargo.lock new file mode 100644 index 00000000..6509899b --- /dev/null +++ b/apps/api/sharedLibs/pdf-parser/Cargo.lock @@ -0,0 +1,999 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "adler2" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627" + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "autocfg" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" + +[[package]] +name = "bitflags" +version = "2.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967" + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-padding" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bumpalo" +version = "3.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" + +[[package]] +name = "bytecount" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce" + +[[package]] +name = "cbc" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" +dependencies = [ + "cipher", +] + +[[package]] +name = "cc" +version = "1.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766" +dependencies = [ + "shlex", +] + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "chrono" +version = "0.4.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "windows-link", +] + +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32fast" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crypto-common" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "deranged" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e" +dependencies = [ + "powerfmt", +] + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "crypto-common", +] + +[[package]] +name = "ecb" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7" +dependencies = [ + "cipher", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "flate2" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4" +dependencies = [ + "cfg-if", + "libc", + "r-efi", + "wasi", +] + +[[package]] +name = "hashbrown" +version = "0.15.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3" + +[[package]] +name = "iana-time-zone" +version = "0.1.63" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "indexmap" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "block-padding", + "generic-array", +] + +[[package]] +name = "itoa" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c" + +[[package]] +name = "jiff" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a194df1107f33c79f4f93d02c80798520551949d59dfad22b6157048a88cca93" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde", + "windows-sys", +] + +[[package]] +name = "jiff-static" +version = "0.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c6e1db7ed32c6c71b759497fae34bf7933636f75a251b9e736555da426f6442" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1283705eb0a21404d2bfd6eef2a7593d240bc42a0bdb39db0ad6fa2ec026524" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + +[[package]] +name = "js-sys" +version = "0.3.77" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "libc" +version = "0.2.172" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa" + +[[package]] +name = "log" +version = "0.4.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94" + +[[package]] +name = "lopdf" +version = "0.36.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59fa2559e99ba0f26a12458aabc754432c805bbb8cba516c427825a997af1fb7" +dependencies = [ + "aes", + "bitflags", + "cbc", + "chrono", + "ecb", + "encoding_rs", + "flate2", + "indexmap", + "itoa", + "jiff", + "log", + "md-5", + "nom", + "nom_locate", + "rand", + "rangemap", + "rayon", + "sha2", + "stringprep", + "thiserror", + "time", + "weezl", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "memchr" +version = "2.7.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3" + +[[package]] +name = "miniz_oxide" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a" +dependencies = [ + "adler2", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "nom_locate" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d" +dependencies = [ + "bytecount", + "memchr", + "nom", +] + +[[package]] +name = "num-conv" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "pdf-parser" +version = "0.1.0" +dependencies = [ + "libc", + "lopdf", +] + +[[package]] +name = "portable-atomic" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e" + +[[package]] +name = "portable-atomic-util" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "proc-macro2" +version = "1.0.95" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "quote" +version = "1.0.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5" + +[[package]] +name = "rand" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97" +dependencies = [ + "rand_chacha", + "rand_core", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core", +] + +[[package]] +name = "rand_core" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38" +dependencies = [ + "getrandom", +] + +[[package]] +name = "rangemap" +version = "1.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684" + +[[package]] +name = "rayon" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "rustversion" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d" + +[[package]] +name = "serde" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.219" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "stringprep" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1" +dependencies = [ + "unicode-bidi", + "unicode-normalization", + "unicode-properties", +] + +[[package]] +name = "syn" +version = "2.0.101" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "thiserror" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "time" +version = "0.3.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c" + +[[package]] +name = "time-macros" +version = "0.2.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tinyvec" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "typenum" +version = "1.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f" + +[[package]] +name = "unicode-bidi" +version = "0.3.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5" + +[[package]] +name = "unicode-ident" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512" + +[[package]] +name = "unicode-normalization" +version = "0.1.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-properties" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "wasi" +version = "0.14.2+wasi-0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3" +dependencies = [ + "wit-bindgen-rt", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", +] + +[[package]] +name = "wasm-bindgen-backend" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6" +dependencies = [ + "bumpalo", + "log", + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de" +dependencies = [ + "proc-macro2", + "quote", + "syn", + "wasm-bindgen-backend", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.100" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "weezl" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3" + +[[package]] +name = "windows-core" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-interface" +version = "0.59.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "windows-link" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38" + +[[package]] +name = "windows-result" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm", + "windows_aarch64_msvc", + "windows_i686_gnu", + "windows_i686_gnullvm", + "windows_i686_msvc", + "windows_x86_64_gnu", + "windows_x86_64_gnullvm", + "windows_x86_64_msvc", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "wit-bindgen-rt" +version = "0.39.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1" +dependencies = [ + "bitflags", +] + +[[package]] +name = "zerocopy" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/apps/api/sharedLibs/pdf-parser/Cargo.toml b/apps/api/sharedLibs/pdf-parser/Cargo.toml new file mode 100644 index 00000000..ec2e163a --- /dev/null +++ b/apps/api/sharedLibs/pdf-parser/Cargo.toml @@ -0,0 +1,11 @@ +[package] +name = "pdf-parser" +version = "0.1.0" +edition = "2021" + +[dependencies] +libc = "0.2.0" +lopdf = "0.36.0" + +[lib] +crate-type = ["cdylib"] diff --git a/apps/api/sharedLibs/pdf-parser/src/lib.rs b/apps/api/sharedLibs/pdf-parser/src/lib.rs new file mode 100644 index 00000000..dbe64a06 --- /dev/null +++ b/apps/api/sharedLibs/pdf-parser/src/lib.rs @@ -0,0 +1,24 @@ +use std::{ffi::CStr}; + +/// Returns the number of pages in a PDF file +/// +/// # Safety +/// Input path must be a C string of a path pointing to a PDF file. Output will be an integer, either the number of pages in the PDF or -1 indicating an error. +#[no_mangle] +pub unsafe extern "C" fn get_page_count(path: *const libc::c_char) -> i32 { + let path: String = match unsafe { CStr::from_ptr(path) }.to_str().map_err(|_| ()) { + Ok(x) => x.to_string(), + Err(_) => { + return -1; + } + }; + + let doc = match lopdf::Document::load(&path) { + Ok(x) => x, + Err(_) => { + return -1; + } + }; + + doc.get_pages().len() as i32 +} \ No newline at end of file diff --git a/apps/api/src/__tests__/snips/lib.ts b/apps/api/src/__tests__/snips/lib.ts index 7c87d654..c8d81838 100644 --- a/apps/api/src/__tests__/snips/lib.ts +++ b/apps/api/src/__tests__/snips/lib.ts @@ -28,6 +28,12 @@ function expectScrapeToSucceed(response: Awaited>) expect(typeof response.body.data).toBe("object"); } +function expectScrapeToFail(response: Awaited>) { + expect(response.statusCode).not.toBe(200); + expect(response.body.success).toBe(false); + expect(typeof response.body.error).toBe("string"); +} + export async function scrape(body: ScrapeRequestInput): Promise { const raw = await scrapeRaw(body); expectScrapeToSucceed(raw); @@ -39,6 +45,15 @@ export async function scrape(body: ScrapeRequestInput): Promise { return raw.body.data; } +export async function scrapeWithFailure(body: ScrapeRequestInput): Promise<{ + success: false; + error: string; +}> { + const raw = await scrapeRaw(body); + expectScrapeToFail(raw); + return raw.body; +} + export async function scrapeStatusRaw(jobId: string) { return await request(TEST_URL) .get("/v1/scrape/" + encodeURIComponent(jobId)) diff --git a/apps/api/src/__tests__/snips/scrape.test.ts b/apps/api/src/__tests__/snips/scrape.test.ts index 2bcad148..4703964e 100644 --- a/apps/api/src/__tests__/snips/scrape.test.ts +++ b/apps/api/src/__tests__/snips/scrape.test.ts @@ -1,4 +1,4 @@ -import { scrape, scrapeStatus } from "./lib"; +import { scrape, scrapeStatus, scrapeWithFailure } from "./lib"; describe("Scrape tests", () => { it.concurrent("mocking works properly", async () => { @@ -297,16 +297,35 @@ describe("Scrape tests", () => { }, 130000); }); - // Temporarily disabled, too flaky - // describe("PDF (f-e dependant)", () => { - // it.concurrent("works for PDFs behind anti-bot", async () => { - // const response = await scrape({ - // url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf" - // }); + describe("PDF (f-e dependant)", () => { + // Temporarily disabled, too flaky + // it.concurrent("works for PDFs behind anti-bot", async () => { + // const response = await scrape({ + // url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf" + // }); - // expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix"); - // }, 60000); - // }); + // expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix"); + // }, 60000); + + it.concurrent("blocks long PDFs with insufficient timeout", async () => { + const response = await scrapeWithFailure({ + url: "https://ecma-international.org/wp-content/uploads/ECMA-262_15th_edition_june_2024.pdf", + timeout: 30000, + }); + + expect(response.error).toContain("Insufficient time to process PDF"); + }, 30000); + + it.concurrent("scrapes long PDFs with sufficient timeout", async () => { + const response = await scrape({ + url: "https://ecma-international.org/wp-content/uploads/ECMA-262_15th_edition_june_2024.pdf", + timeout: 300000, + }); + + // text on the last page + expect(response.markdown).toContain("Redistribution and use in source and binary forms, with or without modification"); + }, 310000); + }); } if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) { diff --git a/apps/api/src/lib/gcs-pdf-cache.ts b/apps/api/src/lib/gcs-pdf-cache.ts index 90eaa67c..77f6b828 100644 --- a/apps/api/src/lib/gcs-pdf-cache.ts +++ b/apps/api/src/lib/gcs-pdf-cache.ts @@ -21,7 +21,7 @@ export function createPdfCacheKey(pdfContent: string | Buffer): string { */ export async function savePdfResultToCache( pdfContent: string, - result: { markdown: string; html: string; numPages: number } + result: { markdown: string; html: string } ): Promise { try { if (!process.env.GCS_BUCKET_NAME) { @@ -76,7 +76,7 @@ export async function savePdfResultToCache( */ export async function getPdfResultFromCache( pdfContent: string -): Promise<{ markdown: string; html: string; numPages: number } | null> { +): Promise<{ markdown: string; html: string } | null> { try { if (!process.env.GCS_BUCKET_NAME) { return null; @@ -104,7 +104,6 @@ export async function getPdfResultFromCache( return { ...result, - numPages: result.numPages ?? 1, // default to 1 page if cache is old }; } catch (error) { logger.error(`Error retrieving PDF RunPod result from GCS cache`, { diff --git a/apps/api/src/lib/pdf-parser.ts b/apps/api/src/lib/pdf-parser.ts new file mode 100644 index 00000000..524eb9d0 --- /dev/null +++ b/apps/api/src/lib/pdf-parser.ts @@ -0,0 +1,70 @@ +import koffi, { KoffiFunction } from "koffi"; +import { join } from "path"; +import { stat } from "fs/promises"; +import { platform } from "os"; + +// TODO: add a timeout to the Rust parser +const rustExecutablePath = join( + process.cwd(), + "sharedLibs/pdf-parser/target/release/", + platform() === "darwin" ? "libpdf_parser.dylib" : "libpdf_parser.so" +); + +class RustPDFParser { + private static instance: RustPDFParser; + private _getPageCount: KoffiFunction; + + private constructor() { + const lib = koffi.load(rustExecutablePath); + this._getPageCount = lib.func("get_page_count", "int32", ["string"]); + } + + public static async isParserAvailable(): Promise { + if (RustPDFParser.instance) { + return true; + } + + try { + await stat(rustExecutablePath); + RustPDFParser.instance = new RustPDFParser(); + return true; + } catch (_) { + return false; + } + } + + public static async getInstance(): Promise { + if (!RustPDFParser.instance) { + try { + await stat(rustExecutablePath); + } catch (_) { + throw Error("Rust pdf-parser shared library not found"); + } + RustPDFParser.instance = new RustPDFParser(); + } + return RustPDFParser.instance; + } + + public async getPageCount(path: string): Promise { + return new Promise((resolve, reject) => { + this._getPageCount.async(path, (err: Error, res: number) => { + if (err) { + reject(err); + } else { + if (res === -1) { + reject(new Error("Failed to parse PDF.")); + } else { + resolve(res); + } + } + }); + }); + } +} + +export async function getPageCount( + path: string, +): Promise { + const converter = await RustPDFParser.getInstance(); + return await converter.getPageCount(path); +} diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index d66eb3f5..98455288 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -7,15 +7,17 @@ import * as Sentry from "@sentry/node"; import escapeHtml from "escape-html"; import PdfParse from "pdf-parse"; import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile"; -import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error"; +import { PDFAntibotError, PDFInsufficientTimeError, RemoveFeatureError, TimeoutError } from "../../error"; import { readFile, unlink } from "node:fs/promises"; import path from "node:path"; import type { Response } from "undici"; import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache"; +import { getPageCount } from "../../../../lib/pdf-parser"; -type PDFProcessorResult = { html: string; markdown?: string; numPages: number }; +type PDFProcessorResult = { html: string; markdown?: string; }; const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB +const MILLISECONDS_PER_PAGE = 150; async function scrapePDFWithRunPodMU( meta: Meta, @@ -46,8 +48,13 @@ async function scrapePDFWithRunPodMU( } const timeout = timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined; + if (timeout && timeout < 0) { + throw new TimeoutError("MU PDF parser already timed out before call"); + } - const result = await robustFetch({ + const abort = timeout ? AbortSignal.timeout(timeout) : undefined; + + const podStart = await robustFetch({ url: "https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync", method: "POST", @@ -63,22 +70,63 @@ async function scrapePDFWithRunPodMU( }, }, logger: meta.logger.child({ - method: "scrapePDFWithRunPodMU/robustFetch", + method: "scrapePDFWithRunPodMU/runsync/robustFetch", }), schema: z.object({ + id: z.string(), + status: z.string(), output: z.object({ markdown: z.string(), - num_pages: z.number(), - }), + }).optional(), }), mock: meta.mock, - abort: timeout ? AbortSignal.timeout(timeout) : undefined, + abort, }); + let status: string = podStart.status; + let result: { markdown: string } | undefined = podStart.output; + + if (status === "IN_QUEUE" || status === "IN_PROGRESS") { + meta.logger.info("RunPod MU returned while in status " + status); + do { + abort?.throwIfAborted(); + await new Promise(resolve => setTimeout(resolve, 2500)); + abort?.throwIfAborted(); + const podStatus = await robustFetch({ + url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${podStart.id}`, + method: "GET", + headers: { + Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`, + }, + logger: meta.logger.child({ + method: "scrapePDFWithRunPodMU/status/robustFetch", + }), + schema: z.object({ + status: z.string(), + output: z.object({ + markdown: z.string(), + }).optional(), + }), + mock: meta.mock, + abort, + }); + meta.logger.info("RunPod MU status " + podStatus.status); + status = podStatus.status; + result = podStatus.output; + } while (status !== "COMPLETED" && status !== "FAILED"); + } + + if (status === "FAILED") { + throw new Error("RunPod MU failed to parse PDF"); + } + + if (!result) { + throw new Error("RunPod MU returned no result"); + } + const processorResult = { - markdown: result.output.markdown, - html: await marked.parse(result.output.markdown, { async: true }), - numPages: result.output.num_pages, + markdown: result.markdown, + html: await marked.parse(result.markdown, { async: true }), }; try { @@ -105,7 +153,6 @@ async function scrapePDFWithParsePDF( return { markdown: escaped, html: escaped, - numPages: result.numpages, }; } @@ -160,6 +207,11 @@ export async function scrapePDF( } } + const pageCount = await getPageCount(tempFilePath); + if (pageCount * MILLISECONDS_PER_PAGE > (timeToRun ?? Infinity)) { + throw new PDFInsufficientTimeError(pageCount, pageCount * MILLISECONDS_PER_PAGE + 5000); + } + let result: PDFProcessorResult | null = null; const base64Content = (await readFile(tempFilePath)).toString("base64"); @@ -214,5 +266,6 @@ export async function scrapePDF( statusCode: response.status, html: result?.html ?? "", markdown: result?.markdown ?? "", + numPages: pageCount, }; } diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index 29b8970e..33f59c1d 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -86,3 +86,9 @@ export class PDFAntibotError extends Error { super("PDF scrape was prevented by anti-bot") } } + +export class PDFInsufficientTimeError extends Error { + constructor(pageCount: number, minTimeout: number) { + super(`Insufficient time to process PDF of ${pageCount} pages. Please increase the timeout parameter in your scrape request to at least ${minTimeout}ms.`); + } +} diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index dbe5867c..85254f37 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -22,6 +22,7 @@ import { TimeoutError, UnsupportedFileError, SSLError, + PDFInsufficientTimeError, } from "./error"; import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; @@ -340,6 +341,8 @@ async function scrapeURLLoop(meta: Meta): Promise { throw error; } else if (error instanceof TimeoutSignal) { throw error; + } else if (error instanceof PDFInsufficientTimeError) { + throw error; } else { Sentry.captureException(error); meta.logger.warn( @@ -489,6 +492,8 @@ export async function scrapeURL( meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error, }); + } else if (error instanceof PDFInsufficientTimeError) { + meta.logger.warn("scrapeURL: Insufficient time to process PDF", { error }); } else if (error instanceof TimeoutSignal) { throw error; } else {