feat: parse PDFs on fc side and reject if too long for timeout (FIR-2083) (#1592)

* feat: pdf-parser, implementation in scrapeURL

* use pdf-parser for page count instead of mu

* fix(pdf-parser): bindings

* feat(scrapeURL/pdf): adjust MILLISECONDS_PER_PAGE

* implement post-runsync polling and fix

* fix(Dockerfile): copy in the pdf-parser source code

* fix(scrapeURL/pdf): better error for timeout below 0
This commit is contained in:
Gergő Móricz 2025-05-23 13:45:53 +02:00 committed by GitHub
parent 321fff1695
commit b03670a8b7
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
14 changed files with 1257 additions and 24 deletions

View File

@ -68,12 +68,24 @@ jobs:
with:
go-version: '1.19'
cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum
- name: Set up Rust
uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Build go-html-to-md
run: |
go mod tidy
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
working-directory: ./apps/api/sharedLibs/go-html-to-md
- name: Build html-transformer
run: |
cargo build --release
chmod +x target/release/libhtml_transformer.so
working-directory: ./apps/api/sharedLibs/html-transformer
- name: Build pdf-parser
run: |
cargo build --release
chmod +x target/release/libpdf_parser.so
working-directory: ./apps/api/sharedLibs/pdf-parser
- name: Set up SearXNG
if: matrix.search == 'searxng'
run: |

View File

@ -73,12 +73,24 @@ jobs:
with:
go-version: '1.19'
cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum
- name: Set up Rust
uses: actions-rust-lang/setup-rust-toolchain@v1
- name: Build go-html-to-md
run: |
go mod tidy
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
chmod +x html-to-markdown.so
working-directory: ./apps/api/sharedLibs/go-html-to-md
- name: Build html-transformer
run: |
cargo build --release
chmod +x target/release/libhtml_transformer.so
working-directory: ./apps/api/sharedLibs/html-transformer
- name: Build pdf-parser
run: |
cargo build --release
chmod +x target/release/libpdf_parser.so
working-directory: ./apps/api/sharedLibs/pdf-parser
- name: Start the application
run: npm start > api.log 2>&1 &
working-directory: ./apps/api

View File

@ -26,17 +26,24 @@ RUN cd /app/sharedLibs/go-html-to-md && \
# Install Rust
FROM rust:1-slim AS rust-base
COPY sharedLibs/html-transformer /app/sharedLibs/html-transformer
COPY sharedLibs/pdf-parser /app/sharedLibs/pdf-parser
# Install Rust dependencies and build transformer lib
RUN cd /app/sharedLibs/html-transformer && \
cargo build --release && \
chmod +x target/release/libhtml_transformer.so
# Install Rust dependencies and build PDF parser lib
RUN cd /app/sharedLibs/pdf-parser && \
cargo build --release && \
chmod +x target/release/libpdf_parser.so
FROM base
COPY --from=build /app/dist /app/dist
COPY --from=prod-deps /app/node_modules /app/node_modules
COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so
COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so
COPY --from=rust-base /app/sharedLibs/pdf-parser/target/release/libpdf_parser.so /app/sharedLibs/pdf-parser/target/release/libpdf_parser.so
# Install git
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*

View File

@ -0,0 +1 @@
target

999
apps/api/sharedLibs/pdf-parser/Cargo.lock generated Normal file
View File

@ -0,0 +1,999 @@
# This file is automatically @generated by Cargo.
# It is not intended for manual editing.
version = 4
[[package]]
name = "adler2"
version = "2.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
[[package]]
name = "aes"
version = "0.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
dependencies = [
"cfg-if",
"cipher",
"cpufeatures",
]
[[package]]
name = "android-tzdata"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
[[package]]
name = "android_system_properties"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
dependencies = [
"libc",
]
[[package]]
name = "autocfg"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
[[package]]
name = "bitflags"
version = "2.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967"
[[package]]
name = "block-buffer"
version = "0.10.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
dependencies = [
"generic-array",
]
[[package]]
name = "block-padding"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
dependencies = [
"generic-array",
]
[[package]]
name = "bumpalo"
version = "3.17.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
[[package]]
name = "bytecount"
version = "0.6.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
[[package]]
name = "cbc"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
dependencies = [
"cipher",
]
[[package]]
name = "cc"
version = "1.2.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766"
dependencies = [
"shlex",
]
[[package]]
name = "cfg-if"
version = "1.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
[[package]]
name = "chrono"
version = "0.4.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d"
dependencies = [
"android-tzdata",
"iana-time-zone",
"num-traits",
"windows-link",
]
[[package]]
name = "cipher"
version = "0.4.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
dependencies = [
"crypto-common",
"inout",
]
[[package]]
name = "core-foundation-sys"
version = "0.8.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
[[package]]
name = "cpufeatures"
version = "0.2.17"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
dependencies = [
"libc",
]
[[package]]
name = "crc32fast"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
"cfg-if",
]
[[package]]
name = "crossbeam-deque"
version = "0.8.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
dependencies = [
"crossbeam-epoch",
"crossbeam-utils",
]
[[package]]
name = "crossbeam-epoch"
version = "0.9.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
dependencies = [
"crossbeam-utils",
]
[[package]]
name = "crossbeam-utils"
version = "0.8.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
[[package]]
name = "crypto-common"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
dependencies = [
"generic-array",
"typenum",
]
[[package]]
name = "deranged"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e"
dependencies = [
"powerfmt",
]
[[package]]
name = "digest"
version = "0.10.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
dependencies = [
"block-buffer",
"crypto-common",
]
[[package]]
name = "ecb"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7"
dependencies = [
"cipher",
]
[[package]]
name = "either"
version = "1.15.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
[[package]]
name = "encoding_rs"
version = "0.8.35"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
dependencies = [
"cfg-if",
]
[[package]]
name = "equivalent"
version = "1.0.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
[[package]]
name = "flate2"
version = "1.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece"
dependencies = [
"crc32fast",
"miniz_oxide",
]
[[package]]
name = "generic-array"
version = "0.14.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
dependencies = [
"typenum",
"version_check",
]
[[package]]
name = "getrandom"
version = "0.3.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
dependencies = [
"cfg-if",
"libc",
"r-efi",
"wasi",
]
[[package]]
name = "hashbrown"
version = "0.15.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3"
[[package]]
name = "iana-time-zone"
version = "0.1.63"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8"
dependencies = [
"android_system_properties",
"core-foundation-sys",
"iana-time-zone-haiku",
"js-sys",
"log",
"wasm-bindgen",
"windows-core",
]
[[package]]
name = "iana-time-zone-haiku"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
dependencies = [
"cc",
]
[[package]]
name = "indexmap"
version = "2.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
dependencies = [
"equivalent",
"hashbrown",
]
[[package]]
name = "inout"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
dependencies = [
"block-padding",
"generic-array",
]
[[package]]
name = "itoa"
version = "1.0.15"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
[[package]]
name = "jiff"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a194df1107f33c79f4f93d02c80798520551949d59dfad22b6157048a88cca93"
dependencies = [
"jiff-static",
"jiff-tzdb-platform",
"log",
"portable-atomic",
"portable-atomic-util",
"serde",
"windows-sys",
]
[[package]]
name = "jiff-static"
version = "0.2.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6c6e1db7ed32c6c71b759497fae34bf7933636f75a251b9e736555da426f6442"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "jiff-tzdb"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c1283705eb0a21404d2bfd6eef2a7593d240bc42a0bdb39db0ad6fa2ec026524"
[[package]]
name = "jiff-tzdb-platform"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8"
dependencies = [
"jiff-tzdb",
]
[[package]]
name = "js-sys"
version = "0.3.77"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
dependencies = [
"once_cell",
"wasm-bindgen",
]
[[package]]
name = "libc"
version = "0.2.172"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
[[package]]
name = "log"
version = "0.4.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
[[package]]
name = "lopdf"
version = "0.36.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59fa2559e99ba0f26a12458aabc754432c805bbb8cba516c427825a997af1fb7"
dependencies = [
"aes",
"bitflags",
"cbc",
"chrono",
"ecb",
"encoding_rs",
"flate2",
"indexmap",
"itoa",
"jiff",
"log",
"md-5",
"nom",
"nom_locate",
"rand",
"rangemap",
"rayon",
"sha2",
"stringprep",
"thiserror",
"time",
"weezl",
]
[[package]]
name = "md-5"
version = "0.10.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
dependencies = [
"cfg-if",
"digest",
]
[[package]]
name = "memchr"
version = "2.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
[[package]]
name = "miniz_oxide"
version = "0.8.8"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a"
dependencies = [
"adler2",
]
[[package]]
name = "nom"
version = "8.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
dependencies = [
"memchr",
]
[[package]]
name = "nom_locate"
version = "5.0.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d"
dependencies = [
"bytecount",
"memchr",
"nom",
]
[[package]]
name = "num-conv"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
[[package]]
name = "num-traits"
version = "0.2.19"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
dependencies = [
"autocfg",
]
[[package]]
name = "once_cell"
version = "1.21.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
[[package]]
name = "pdf-parser"
version = "0.1.0"
dependencies = [
"libc",
"lopdf",
]
[[package]]
name = "portable-atomic"
version = "1.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
[[package]]
name = "portable-atomic-util"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
dependencies = [
"portable-atomic",
]
[[package]]
name = "powerfmt"
version = "0.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
[[package]]
name = "ppv-lite86"
version = "0.2.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
dependencies = [
"zerocopy",
]
[[package]]
name = "proc-macro2"
version = "1.0.95"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
dependencies = [
"unicode-ident",
]
[[package]]
name = "quote"
version = "1.0.40"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
dependencies = [
"proc-macro2",
]
[[package]]
name = "r-efi"
version = "5.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
[[package]]
name = "rand"
version = "0.9.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
dependencies = [
"rand_chacha",
"rand_core",
]
[[package]]
name = "rand_chacha"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
dependencies = [
"ppv-lite86",
"rand_core",
]
[[package]]
name = "rand_core"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
dependencies = [
"getrandom",
]
[[package]]
name = "rangemap"
version = "1.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
[[package]]
name = "rayon"
version = "1.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
dependencies = [
"either",
"rayon-core",
]
[[package]]
name = "rayon-core"
version = "1.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
dependencies = [
"crossbeam-deque",
"crossbeam-utils",
]
[[package]]
name = "rustversion"
version = "1.0.21"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d"
[[package]]
name = "serde"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.219"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "sha2"
version = "0.10.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
dependencies = [
"cfg-if",
"cpufeatures",
"digest",
]
[[package]]
name = "shlex"
version = "1.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
[[package]]
name = "stringprep"
version = "0.1.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1"
dependencies = [
"unicode-bidi",
"unicode-normalization",
"unicode-properties",
]
[[package]]
name = "syn"
version = "2.0.101"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
dependencies = [
"proc-macro2",
"quote",
"unicode-ident",
]
[[package]]
name = "thiserror"
version = "2.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "2.0.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "time"
version = "0.3.41"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40"
dependencies = [
"deranged",
"itoa",
"num-conv",
"powerfmt",
"serde",
"time-core",
"time-macros",
]
[[package]]
name = "time-core"
version = "0.1.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c"
[[package]]
name = "time-macros"
version = "0.2.22"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49"
dependencies = [
"num-conv",
"time-core",
]
[[package]]
name = "tinyvec"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71"
dependencies = [
"tinyvec_macros",
]
[[package]]
name = "tinyvec_macros"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
[[package]]
name = "typenum"
version = "1.18.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
[[package]]
name = "unicode-bidi"
version = "0.3.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
[[package]]
name = "unicode-ident"
version = "1.0.18"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
[[package]]
name = "unicode-normalization"
version = "0.1.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
dependencies = [
"tinyvec",
]
[[package]]
name = "unicode-properties"
version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0"
[[package]]
name = "version_check"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
[[package]]
name = "wasi"
version = "0.14.2+wasi-0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
dependencies = [
"wit-bindgen-rt",
]
[[package]]
name = "wasm-bindgen"
version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
dependencies = [
"cfg-if",
"once_cell",
"rustversion",
"wasm-bindgen-macro",
]
[[package]]
name = "wasm-bindgen-backend"
version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
dependencies = [
"bumpalo",
"log",
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-macro"
version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
dependencies = [
"quote",
"wasm-bindgen-macro-support",
]
[[package]]
name = "wasm-bindgen-macro-support"
version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
dependencies = [
"proc-macro2",
"quote",
"syn",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
[[package]]
name = "wasm-bindgen-shared"
version = "0.2.100"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
dependencies = [
"unicode-ident",
]
[[package]]
name = "weezl"
version = "0.1.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3"
[[package]]
name = "windows-core"
version = "0.61.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
dependencies = [
"windows-implement",
"windows-interface",
"windows-link",
"windows-result",
"windows-strings",
]
[[package]]
name = "windows-implement"
version = "0.60.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "windows-interface"
version = "0.59.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8"
dependencies = [
"proc-macro2",
"quote",
"syn",
]
[[package]]
name = "windows-link"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
[[package]]
name = "windows-result"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
dependencies = [
"windows-link",
]
[[package]]
name = "windows-strings"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
dependencies = [
"windows-link",
]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets",
]
[[package]]
name = "windows-targets"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
dependencies = [
"windows_aarch64_gnullvm",
"windows_aarch64_msvc",
"windows_i686_gnu",
"windows_i686_gnullvm",
"windows_i686_msvc",
"windows_x86_64_gnu",
"windows_x86_64_gnullvm",
"windows_x86_64_msvc",
]
[[package]]
name = "windows_aarch64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
[[package]]
name = "windows_aarch64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
[[package]]
name = "windows_i686_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
[[package]]
name = "windows_i686_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
[[package]]
name = "windows_i686_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
[[package]]
name = "windows_x86_64_gnu"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
[[package]]
name = "windows_x86_64_gnullvm"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
[[package]]
name = "windows_x86_64_msvc"
version = "0.52.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
[[package]]
name = "wit-bindgen-rt"
version = "0.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
dependencies = [
"bitflags",
]
[[package]]
name = "zerocopy"
version = "0.8.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
dependencies = [
"zerocopy-derive",
]
[[package]]
name = "zerocopy-derive"
version = "0.8.25"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
dependencies = [
"proc-macro2",
"quote",
"syn",
]

View File

@ -0,0 +1,11 @@
[package]
name = "pdf-parser"
version = "0.1.0"
edition = "2021"
[dependencies]
libc = "0.2.0"
lopdf = "0.36.0"
[lib]
crate-type = ["cdylib"]

View File

@ -0,0 +1,24 @@
use std::{ffi::CStr};
/// Returns the number of pages in a PDF file
///
/// # Safety
/// Input path must be a C string of a path pointing to a PDF file. Output will be an integer, either the number of pages in the PDF or -1 indicating an error.
#[no_mangle]
pub unsafe extern "C" fn get_page_count(path: *const libc::c_char) -> i32 {
let path: String = match unsafe { CStr::from_ptr(path) }.to_str().map_err(|_| ()) {
Ok(x) => x.to_string(),
Err(_) => {
return -1;
}
};
let doc = match lopdf::Document::load(&path) {
Ok(x) => x,
Err(_) => {
return -1;
}
};
doc.get_pages().len() as i32
}

View File

@ -28,6 +28,12 @@ function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>)
expect(typeof response.body.data).toBe("object");
}
function expectScrapeToFail(response: Awaited<ReturnType<typeof scrapeRaw>>) {
expect(response.statusCode).not.toBe(200);
expect(response.body.success).toBe(false);
expect(typeof response.body.error).toBe("string");
}
export async function scrape(body: ScrapeRequestInput): Promise<Document> {
const raw = await scrapeRaw(body);
expectScrapeToSucceed(raw);
@ -39,6 +45,15 @@ export async function scrape(body: ScrapeRequestInput): Promise<Document> {
return raw.body.data;
}
export async function scrapeWithFailure(body: ScrapeRequestInput): Promise<{
success: false;
error: string;
}> {
const raw = await scrapeRaw(body);
expectScrapeToFail(raw);
return raw.body;
}
export async function scrapeStatusRaw(jobId: string) {
return await request(TEST_URL)
.get("/v1/scrape/" + encodeURIComponent(jobId))

View File

@ -1,4 +1,4 @@
import { scrape, scrapeStatus } from "./lib";
import { scrape, scrapeStatus, scrapeWithFailure } from "./lib";
describe("Scrape tests", () => {
it.concurrent("mocking works properly", async () => {
@ -297,16 +297,35 @@ describe("Scrape tests", () => {
}, 130000);
});
// Temporarily disabled, too flaky
// describe("PDF (f-e dependant)", () => {
// it.concurrent("works for PDFs behind anti-bot", async () => {
// const response = await scrape({
// url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
// });
describe("PDF (f-e dependant)", () => {
// Temporarily disabled, too flaky
// it.concurrent("works for PDFs behind anti-bot", async () => {
// const response = await scrape({
// url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
// });
// expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
// }, 60000);
// });
// expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
// }, 60000);
it.concurrent("blocks long PDFs with insufficient timeout", async () => {
const response = await scrapeWithFailure({
url: "https://ecma-international.org/wp-content/uploads/ECMA-262_15th_edition_june_2024.pdf",
timeout: 30000,
});
expect(response.error).toContain("Insufficient time to process PDF");
}, 30000);
it.concurrent("scrapes long PDFs with sufficient timeout", async () => {
const response = await scrape({
url: "https://ecma-international.org/wp-content/uploads/ECMA-262_15th_edition_june_2024.pdf",
timeout: 300000,
});
// text on the last page
expect(response.markdown).toContain("Redistribution and use in source and binary forms, with or without modification");
}, 310000);
});
}
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) {

View File

@ -21,7 +21,7 @@ export function createPdfCacheKey(pdfContent: string | Buffer): string {
*/
export async function savePdfResultToCache(
pdfContent: string,
result: { markdown: string; html: string; numPages: number }
result: { markdown: string; html: string }
): Promise<string | null> {
try {
if (!process.env.GCS_BUCKET_NAME) {
@ -76,7 +76,7 @@ export async function savePdfResultToCache(
*/
export async function getPdfResultFromCache(
pdfContent: string
): Promise<{ markdown: string; html: string; numPages: number } | null> {
): Promise<{ markdown: string; html: string } | null> {
try {
if (!process.env.GCS_BUCKET_NAME) {
return null;
@ -104,7 +104,6 @@ export async function getPdfResultFromCache(
return {
...result,
numPages: result.numPages ?? 1, // default to 1 page if cache is old
};
} catch (error) {
logger.error(`Error retrieving PDF RunPod result from GCS cache`, {

View File

@ -0,0 +1,70 @@
import koffi, { KoffiFunction } from "koffi";
import { join } from "path";
import { stat } from "fs/promises";
import { platform } from "os";
// TODO: add a timeout to the Rust parser
const rustExecutablePath = join(
process.cwd(),
"sharedLibs/pdf-parser/target/release/",
platform() === "darwin" ? "libpdf_parser.dylib" : "libpdf_parser.so"
);
class RustPDFParser {
private static instance: RustPDFParser;
private _getPageCount: KoffiFunction;
private constructor() {
const lib = koffi.load(rustExecutablePath);
this._getPageCount = lib.func("get_page_count", "int32", ["string"]);
}
public static async isParserAvailable(): Promise<boolean> {
if (RustPDFParser.instance) {
return true;
}
try {
await stat(rustExecutablePath);
RustPDFParser.instance = new RustPDFParser();
return true;
} catch (_) {
return false;
}
}
public static async getInstance(): Promise<RustPDFParser> {
if (!RustPDFParser.instance) {
try {
await stat(rustExecutablePath);
} catch (_) {
throw Error("Rust pdf-parser shared library not found");
}
RustPDFParser.instance = new RustPDFParser();
}
return RustPDFParser.instance;
}
public async getPageCount(path: string): Promise<number> {
return new Promise<number>((resolve, reject) => {
this._getPageCount.async(path, (err: Error, res: number) => {
if (err) {
reject(err);
} else {
if (res === -1) {
reject(new Error("Failed to parse PDF."));
} else {
resolve(res);
}
}
});
});
}
}
export async function getPageCount(
path: string,
): Promise<number> {
const converter = await RustPDFParser.getInstance();
return await converter.getPageCount(path);
}

View File

@ -7,15 +7,17 @@ import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
import { PDFAntibotError, PDFInsufficientTimeError, RemoveFeatureError, TimeoutError } from "../../error";
import { readFile, unlink } from "node:fs/promises";
import path from "node:path";
import type { Response } from "undici";
import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";
import { getPageCount } from "../../../../lib/pdf-parser";
type PDFProcessorResult = { html: string; markdown?: string; numPages: number };
type PDFProcessorResult = { html: string; markdown?: string; };
const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
const MILLISECONDS_PER_PAGE = 150;
async function scrapePDFWithRunPodMU(
meta: Meta,
@ -46,8 +48,13 @@ async function scrapePDFWithRunPodMU(
}
const timeout = timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined;
if (timeout && timeout < 0) {
throw new TimeoutError("MU PDF parser already timed out before call");
}
const result = await robustFetch({
const abort = timeout ? AbortSignal.timeout(timeout) : undefined;
const podStart = await robustFetch({
url:
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
method: "POST",
@ -63,22 +70,63 @@ async function scrapePDFWithRunPodMU(
},
},
logger: meta.logger.child({
method: "scrapePDFWithRunPodMU/robustFetch",
method: "scrapePDFWithRunPodMU/runsync/robustFetch",
}),
schema: z.object({
id: z.string(),
status: z.string(),
output: z.object({
markdown: z.string(),
num_pages: z.number(),
}),
}).optional(),
}),
mock: meta.mock,
abort: timeout ? AbortSignal.timeout(timeout) : undefined,
abort,
});
let status: string = podStart.status;
let result: { markdown: string } | undefined = podStart.output;
if (status === "IN_QUEUE" || status === "IN_PROGRESS") {
meta.logger.info("RunPod MU returned while in status " + status);
do {
abort?.throwIfAborted();
await new Promise(resolve => setTimeout(resolve, 2500));
abort?.throwIfAborted();
const podStatus = await robustFetch({
url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${podStart.id}`,
method: "GET",
headers: {
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
},
logger: meta.logger.child({
method: "scrapePDFWithRunPodMU/status/robustFetch",
}),
schema: z.object({
status: z.string(),
output: z.object({
markdown: z.string(),
}).optional(),
}),
mock: meta.mock,
abort,
});
meta.logger.info("RunPod MU status " + podStatus.status);
status = podStatus.status;
result = podStatus.output;
} while (status !== "COMPLETED" && status !== "FAILED");
}
if (status === "FAILED") {
throw new Error("RunPod MU failed to parse PDF");
}
if (!result) {
throw new Error("RunPod MU returned no result");
}
const processorResult = {
markdown: result.output.markdown,
html: await marked.parse(result.output.markdown, { async: true }),
numPages: result.output.num_pages,
markdown: result.markdown,
html: await marked.parse(result.markdown, { async: true }),
};
try {
@ -105,7 +153,6 @@ async function scrapePDFWithParsePDF(
return {
markdown: escaped,
html: escaped,
numPages: result.numpages,
};
}
@ -160,6 +207,11 @@ export async function scrapePDF(
}
}
const pageCount = await getPageCount(tempFilePath);
if (pageCount * MILLISECONDS_PER_PAGE > (timeToRun ?? Infinity)) {
throw new PDFInsufficientTimeError(pageCount, pageCount * MILLISECONDS_PER_PAGE + 5000);
}
let result: PDFProcessorResult | null = null;
const base64Content = (await readFile(tempFilePath)).toString("base64");
@ -214,5 +266,6 @@ export async function scrapePDF(
statusCode: response.status,
html: result?.html ?? "",
markdown: result?.markdown ?? "",
numPages: pageCount,
};
}

View File

@ -86,3 +86,9 @@ export class PDFAntibotError extends Error {
super("PDF scrape was prevented by anti-bot")
}
}
export class PDFInsufficientTimeError extends Error {
constructor(pageCount: number, minTimeout: number) {
super(`Insufficient time to process PDF of ${pageCount} pages. Please increase the timeout parameter in your scrape request to at least ${minTimeout}ms.`);
}
}

View File

@ -22,6 +22,7 @@ import {
TimeoutError,
UnsupportedFileError,
SSLError,
PDFInsufficientTimeError,
} from "./error";
import { executeTransformers } from "./transformers";
import { LLMRefusalError } from "./transformers/llmExtract";
@ -340,6 +341,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
throw error;
} else if (error instanceof TimeoutSignal) {
throw error;
} else if (error instanceof PDFInsufficientTimeError) {
throw error;
} else {
Sentry.captureException(error);
meta.logger.warn(
@ -489,6 +492,8 @@ export async function scrapeURL(
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
error,
});
} else if (error instanceof PDFInsufficientTimeError) {
meta.logger.warn("scrapeURL: Insufficient time to process PDF", { error });
} else if (error instanceof TimeoutSignal) {
throw error;
} else {