mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 15:46:05 +08:00
feat: parse PDFs on fc side and reject if too long for timeout (FIR-2083) (#1592)
* feat: pdf-parser, implementation in scrapeURL * use pdf-parser for page count instead of mu * fix(pdf-parser): bindings * feat(scrapeURL/pdf): adjust MILLISECONDS_PER_PAGE * implement post-runsync polling and fix * fix(Dockerfile): copy in the pdf-parser source code * fix(scrapeURL/pdf): better error for timeout below 0
This commit is contained in:
parent
321fff1695
commit
b03670a8b7
12
.github/workflows/test-server-self-host.yml
vendored
12
.github/workflows/test-server-self-host.yml
vendored
@ -68,12 +68,24 @@ jobs:
|
||||
with:
|
||||
go-version: '1.19'
|
||||
cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum
|
||||
- name: Set up Rust
|
||||
uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
- name: Build go-html-to-md
|
||||
run: |
|
||||
go mod tidy
|
||||
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
|
||||
chmod +x html-to-markdown.so
|
||||
working-directory: ./apps/api/sharedLibs/go-html-to-md
|
||||
- name: Build html-transformer
|
||||
run: |
|
||||
cargo build --release
|
||||
chmod +x target/release/libhtml_transformer.so
|
||||
working-directory: ./apps/api/sharedLibs/html-transformer
|
||||
- name: Build pdf-parser
|
||||
run: |
|
||||
cargo build --release
|
||||
chmod +x target/release/libpdf_parser.so
|
||||
working-directory: ./apps/api/sharedLibs/pdf-parser
|
||||
- name: Set up SearXNG
|
||||
if: matrix.search == 'searxng'
|
||||
run: |
|
||||
|
12
.github/workflows/test-server.yml
vendored
12
.github/workflows/test-server.yml
vendored
@ -73,12 +73,24 @@ jobs:
|
||||
with:
|
||||
go-version: '1.19'
|
||||
cache-dependency-path: ./apps/api/sharedLibs/go-html-to-md/go.sum
|
||||
- name: Set up Rust
|
||||
uses: actions-rust-lang/setup-rust-toolchain@v1
|
||||
- name: Build go-html-to-md
|
||||
run: |
|
||||
go mod tidy
|
||||
go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go
|
||||
chmod +x html-to-markdown.so
|
||||
working-directory: ./apps/api/sharedLibs/go-html-to-md
|
||||
- name: Build html-transformer
|
||||
run: |
|
||||
cargo build --release
|
||||
chmod +x target/release/libhtml_transformer.so
|
||||
working-directory: ./apps/api/sharedLibs/html-transformer
|
||||
- name: Build pdf-parser
|
||||
run: |
|
||||
cargo build --release
|
||||
chmod +x target/release/libpdf_parser.so
|
||||
working-directory: ./apps/api/sharedLibs/pdf-parser
|
||||
- name: Start the application
|
||||
run: npm start > api.log 2>&1 &
|
||||
working-directory: ./apps/api
|
||||
|
@ -26,17 +26,24 @@ RUN cd /app/sharedLibs/go-html-to-md && \
|
||||
# Install Rust
|
||||
FROM rust:1-slim AS rust-base
|
||||
COPY sharedLibs/html-transformer /app/sharedLibs/html-transformer
|
||||
COPY sharedLibs/pdf-parser /app/sharedLibs/pdf-parser
|
||||
|
||||
# Install Rust dependencies and build transformer lib
|
||||
RUN cd /app/sharedLibs/html-transformer && \
|
||||
cargo build --release && \
|
||||
chmod +x target/release/libhtml_transformer.so
|
||||
|
||||
# Install Rust dependencies and build PDF parser lib
|
||||
RUN cd /app/sharedLibs/pdf-parser && \
|
||||
cargo build --release && \
|
||||
chmod +x target/release/libpdf_parser.so
|
||||
|
||||
FROM base
|
||||
COPY --from=build /app/dist /app/dist
|
||||
COPY --from=prod-deps /app/node_modules /app/node_modules
|
||||
COPY --from=go-base /app/sharedLibs/go-html-to-md/html-to-markdown.so /app/sharedLibs/go-html-to-md/html-to-markdown.so
|
||||
COPY --from=rust-base /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so /app/sharedLibs/html-transformer/target/release/libhtml_transformer.so
|
||||
COPY --from=rust-base /app/sharedLibs/pdf-parser/target/release/libpdf_parser.so /app/sharedLibs/pdf-parser/target/release/libpdf_parser.so
|
||||
|
||||
# Install git
|
||||
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
|
||||
|
1
apps/api/sharedLibs/pdf-parser/.gitignore
vendored
Normal file
1
apps/api/sharedLibs/pdf-parser/.gitignore
vendored
Normal file
@ -0,0 +1 @@
|
||||
target
|
999
apps/api/sharedLibs/pdf-parser/Cargo.lock
generated
Normal file
999
apps/api/sharedLibs/pdf-parser/Cargo.lock
generated
Normal file
@ -0,0 +1,999 @@
|
||||
# This file is automatically @generated by Cargo.
|
||||
# It is not intended for manual editing.
|
||||
version = 4
|
||||
|
||||
[[package]]
|
||||
name = "adler2"
|
||||
version = "2.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "512761e0bb2578dd7380c6baaa0f4ce03e84f95e960231d1dec8bf4d7d6e2627"
|
||||
|
||||
[[package]]
|
||||
name = "aes"
|
||||
version = "0.8.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cipher",
|
||||
"cpufeatures",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "android-tzdata"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
|
||||
|
||||
[[package]]
|
||||
name = "android_system_properties"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "autocfg"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26"
|
||||
|
||||
[[package]]
|
||||
name = "bitflags"
|
||||
version = "2.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1b8e56985ec62d17e9c1001dc89c88ecd7dc08e47eba5ec7c29c7b5eeecde967"
|
||||
|
||||
[[package]]
|
||||
name = "block-buffer"
|
||||
version = "0.10.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "block-padding"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "bumpalo"
|
||||
version = "3.17.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf"
|
||||
|
||||
[[package]]
|
||||
name = "bytecount"
|
||||
version = "0.6.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
|
||||
|
||||
[[package]]
|
||||
name = "cbc"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6"
|
||||
dependencies = [
|
||||
"cipher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cc"
|
||||
version = "1.2.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f4ac86a9e5bc1e2b3449ab9d7d3a6a405e3d1bb28d7b9be8614f55846ae3766"
|
||||
dependencies = [
|
||||
"shlex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cfg-if"
|
||||
version = "1.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
||||
|
||||
[[package]]
|
||||
name = "chrono"
|
||||
version = "0.4.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c469d952047f47f91b68d1cba3f10d63c11d73e4636f24f08daf0278abf01c4d"
|
||||
dependencies = [
|
||||
"android-tzdata",
|
||||
"iana-time-zone",
|
||||
"num-traits",
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "cipher"
|
||||
version = "0.4.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad"
|
||||
dependencies = [
|
||||
"crypto-common",
|
||||
"inout",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "core-foundation-sys"
|
||||
version = "0.8.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b"
|
||||
|
||||
[[package]]
|
||||
name = "cpufeatures"
|
||||
version = "0.2.17"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280"
|
||||
dependencies = [
|
||||
"libc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crc32fast"
|
||||
version = "1.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-deque"
|
||||
version = "0.8.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
|
||||
dependencies = [
|
||||
"crossbeam-epoch",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-epoch"
|
||||
version = "0.9.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e"
|
||||
dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "crossbeam-utils"
|
||||
version = "0.8.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
||||
|
||||
[[package]]
|
||||
name = "crypto-common"
|
||||
version = "0.1.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1bfb12502f3fc46cca1bb51ac28df9d618d813cdc3d2f25b9fe775a34af26bb3"
|
||||
dependencies = [
|
||||
"generic-array",
|
||||
"typenum",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "deranged"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9c9e6a11ca8224451684bc0d7d5a7adbf8f2fd6887261a1cfc3c0432f9d4068e"
|
||||
dependencies = [
|
||||
"powerfmt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "digest"
|
||||
version = "0.10.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292"
|
||||
dependencies = [
|
||||
"block-buffer",
|
||||
"crypto-common",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ecb"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a8bfa975b1aec2145850fcaa1c6fe269a16578c44705a532ae3edc92b8881c7"
|
||||
dependencies = [
|
||||
"cipher",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "either"
|
||||
version = "1.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719"
|
||||
|
||||
[[package]]
|
||||
name = "encoding_rs"
|
||||
version = "0.8.35"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "equivalent"
|
||||
version = "1.0.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f"
|
||||
|
||||
[[package]]
|
||||
name = "flate2"
|
||||
version = "1.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7ced92e76e966ca2fd84c8f7aa01a4aea65b0eb6648d72f7c8f3e2764a67fece"
|
||||
dependencies = [
|
||||
"crc32fast",
|
||||
"miniz_oxide",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "generic-array"
|
||||
version = "0.14.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
|
||||
dependencies = [
|
||||
"typenum",
|
||||
"version_check",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.3.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"r-efi",
|
||||
"wasi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "hashbrown"
|
||||
version = "0.15.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "84b26c544d002229e640969970a2e74021aadf6e2f96372b9c58eff97de08eb3"
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone"
|
||||
version = "0.1.63"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b0c919e5debc312ad217002b8048a17b7d83f80703865bbfcfebb0458b0b27d8"
|
||||
dependencies = [
|
||||
"android_system_properties",
|
||||
"core-foundation-sys",
|
||||
"iana-time-zone-haiku",
|
||||
"js-sys",
|
||||
"log",
|
||||
"wasm-bindgen",
|
||||
"windows-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "iana-time-zone-haiku"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f"
|
||||
dependencies = [
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "indexmap"
|
||||
version = "2.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cea70ddb795996207ad57735b50c5982d8844f38ba9ee5f1aedcfb708a2aa11e"
|
||||
dependencies = [
|
||||
"equivalent",
|
||||
"hashbrown",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "inout"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01"
|
||||
dependencies = [
|
||||
"block-padding",
|
||||
"generic-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "itoa"
|
||||
version = "1.0.15"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "4a5f13b858c8d314ee3e8f639011f7ccefe71f97f96e50151fb991f267928e2c"
|
||||
|
||||
[[package]]
|
||||
name = "jiff"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a194df1107f33c79f4f93d02c80798520551949d59dfad22b6157048a88cca93"
|
||||
dependencies = [
|
||||
"jiff-static",
|
||||
"jiff-tzdb-platform",
|
||||
"log",
|
||||
"portable-atomic",
|
||||
"portable-atomic-util",
|
||||
"serde",
|
||||
"windows-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jiff-static"
|
||||
version = "0.2.14"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c6e1db7ed32c6c71b759497fae34bf7933636f75a251b9e736555da426f6442"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "jiff-tzdb"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c1283705eb0a21404d2bfd6eef2a7593d240bc42a0bdb39db0ad6fa2ec026524"
|
||||
|
||||
[[package]]
|
||||
name = "jiff-tzdb-platform"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8"
|
||||
dependencies = [
|
||||
"jiff-tzdb",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "js-sys"
|
||||
version = "0.3.77"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1cfaf33c695fc6e08064efbc1f72ec937429614f25eef83af942d0e227c3a28f"
|
||||
dependencies = [
|
||||
"once_cell",
|
||||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "libc"
|
||||
version = "0.2.172"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d750af042f7ef4f724306de029d18836c26c1765a54a6a3f094cbd23a7267ffa"
|
||||
|
||||
[[package]]
|
||||
name = "log"
|
||||
version = "0.4.27"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "13dc2df351e3202783a1fe0d44375f7295ffb4049267b0f3018346dc122a1d94"
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.36.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "59fa2559e99ba0f26a12458aabc754432c805bbb8cba516c427825a997af1fb7"
|
||||
dependencies = [
|
||||
"aes",
|
||||
"bitflags",
|
||||
"cbc",
|
||||
"chrono",
|
||||
"ecb",
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"indexmap",
|
||||
"itoa",
|
||||
"jiff",
|
||||
"log",
|
||||
"md-5",
|
||||
"nom",
|
||||
"nom_locate",
|
||||
"rand",
|
||||
"rangemap",
|
||||
"rayon",
|
||||
"sha2",
|
||||
"stringprep",
|
||||
"thiserror",
|
||||
"time",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "md-5"
|
||||
version = "0.10.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "memchr"
|
||||
version = "2.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "78ca9ab1a0babb1e7d5695e3530886289c18cf2f87ec19a575a0abdce112e3a3"
|
||||
|
||||
[[package]]
|
||||
name = "miniz_oxide"
|
||||
version = "0.8.8"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3be647b768db090acb35d5ec5db2b0e1f1de11133ca123b9eacf5137868f892a"
|
||||
dependencies = [
|
||||
"adler2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom"
|
||||
version = "8.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nom_locate"
|
||||
version = "5.0.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b577e2d69827c4740cba2b52efaad1c4cc7c73042860b199710b3575c68438d"
|
||||
dependencies = [
|
||||
"bytecount",
|
||||
"memchr",
|
||||
"nom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "num-conv"
|
||||
version = "0.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "51d515d32fb182ee37cda2ccdcb92950d6a3c2893aa280e540671c2cd0f3b1d9"
|
||||
|
||||
[[package]]
|
||||
name = "num-traits"
|
||||
version = "0.2.19"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
|
||||
dependencies = [
|
||||
"autocfg",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d"
|
||||
|
||||
[[package]]
|
||||
name = "pdf-parser"
|
||||
version = "0.1.0"
|
||||
dependencies = [
|
||||
"libc",
|
||||
"lopdf",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic"
|
||||
version = "1.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "350e9b48cbc6b0e028b0473b114454c6316e57336ee184ceab6e53f72c178b3e"
|
||||
|
||||
[[package]]
|
||||
name = "portable-atomic-util"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d8a2f0d8d040d7848a709caf78912debcc3f33ee4b3cac47d73d1e1069e83507"
|
||||
dependencies = [
|
||||
"portable-atomic",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "powerfmt"
|
||||
version = "0.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391"
|
||||
|
||||
[[package]]
|
||||
name = "ppv-lite86"
|
||||
version = "0.2.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9"
|
||||
dependencies = [
|
||||
"zerocopy",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "proc-macro2"
|
||||
version = "1.0.95"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "02b3e5e68a3a1a02aad3ec490a98007cbc13c37cbe84a3cd7b8e406d76e7f778"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quote"
|
||||
version = "1.0.40"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1885c039570dc00dcb4ff087a89e185fd56bae234ddc7f056a945bf36467248d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "r-efi"
|
||||
version = "5.2.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "74765f6d916ee2faa39bc8e68e4f3ed8949b48cccdac59983d287a7cb71ce9c5"
|
||||
|
||||
[[package]]
|
||||
name = "rand"
|
||||
version = "0.9.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
|
||||
dependencies = [
|
||||
"rand_chacha",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_chacha"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
|
||||
dependencies = [
|
||||
"ppv-lite86",
|
||||
"rand_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rand_core"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
|
||||
dependencies = [
|
||||
"getrandom",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rangemap"
|
||||
version = "1.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f60fcc7d6849342eff22c4350c8b9a989ee8ceabc4b481253e8946b9fe83d684"
|
||||
|
||||
[[package]]
|
||||
name = "rayon"
|
||||
version = "1.10.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b418a60154510ca1a002a752ca9714984e21e4241e804d32555251faf8b78ffa"
|
||||
dependencies = [
|
||||
"either",
|
||||
"rayon-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rayon-core"
|
||||
version = "1.12.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1465873a3dfdaa8ae7cb14b4383657caab0b3e8a0aa9ae8e04b044854c8dfce2"
|
||||
dependencies = [
|
||||
"crossbeam-deque",
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rustversion"
|
||||
version = "1.0.21"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a0d197bd2c9dc6e53b84da9556a69ba4cdfab8619eb41a8bd1cc2027a0f6b1d"
|
||||
|
||||
[[package]]
|
||||
name = "serde"
|
||||
version = "1.0.219"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5f0e2c6ed6606019b4e29e69dbaba95b11854410e5347d525002456dbbb786b6"
|
||||
dependencies = [
|
||||
"serde_derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "serde_derive"
|
||||
version = "1.0.219"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5b0276cf7f2c73365f7157c8123c21cd9a50fbbd844757af28ca1f5925fc2a00"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sha2"
|
||||
version = "0.10.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"cpufeatures",
|
||||
"digest",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "shlex"
|
||||
version = "1.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
|
||||
|
||||
[[package]]
|
||||
name = "stringprep"
|
||||
version = "0.1.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b4df3d392d81bd458a8a621b8bffbd2302a12ffe288a9d931670948749463b1"
|
||||
dependencies = [
|
||||
"unicode-bidi",
|
||||
"unicode-normalization",
|
||||
"unicode-properties",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "syn"
|
||||
version = "2.0.101"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ce2b7fc941b3a24138a0a7cf8e858bfc6a992e7978a068a5c760deb0ed43caf"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror"
|
||||
version = "2.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "567b8a2dae586314f7be2a752ec7474332959c6460e02bde30d702a66d488708"
|
||||
dependencies = [
|
||||
"thiserror-impl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "thiserror-impl"
|
||||
version = "2.0.12"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7f7cf42b4507d8ea322120659672cf1b9dbb93f8f2d4ecfd6e51350ff5b17a1d"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time"
|
||||
version = "0.3.41"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8a7619e19bc266e0f9c5e6686659d394bc57973859340060a69221e57dbc0c40"
|
||||
dependencies = [
|
||||
"deranged",
|
||||
"itoa",
|
||||
"num-conv",
|
||||
"powerfmt",
|
||||
"serde",
|
||||
"time-core",
|
||||
"time-macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "time-core"
|
||||
version = "0.1.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c9e9a38711f559d9e3ce1cdb06dd7c5b8ea546bc90052da6d06bb76da74bb07c"
|
||||
|
||||
[[package]]
|
||||
name = "time-macros"
|
||||
version = "0.2.22"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3526739392ec93fd8b359c8e98514cb3e8e021beb4e5f597b00a0221f8ed8a49"
|
||||
dependencies = [
|
||||
"num-conv",
|
||||
"time-core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec"
|
||||
version = "1.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09b3661f17e86524eccd4371ab0429194e0d7c008abb45f7a7495b1719463c71"
|
||||
dependencies = [
|
||||
"tinyvec_macros",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tinyvec_macros"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20"
|
||||
|
||||
[[package]]
|
||||
name = "typenum"
|
||||
version = "1.18.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1dccffe3ce07af9386bfd29e80c0ab1a8205a2fc34e4bcd40364df902cfa8f3f"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-bidi"
|
||||
version = "0.3.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5c1cb5db39152898a79168971543b1cb5020dff7fe43c8dc468b0885f5e29df5"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-ident"
|
||||
version = "1.0.18"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5a5f39404a5da50712a4c1eecf25e90dd62b613502b7e925fd4e4d19b5c96512"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-normalization"
|
||||
version = "0.1.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "5033c97c4262335cded6d6fc3e5c18ab755e1a3dc96376350f3d8e9f009ad956"
|
||||
dependencies = [
|
||||
"tinyvec",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "unicode-properties"
|
||||
version = "0.1.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e70f2a8b45122e719eb623c01822704c4e0907e7e426a05927e1a1cfff5b75d0"
|
||||
|
||||
[[package]]
|
||||
name = "version_check"
|
||||
version = "0.9.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
|
||||
|
||||
[[package]]
|
||||
name = "wasi"
|
||||
version = "0.14.2+wasi-0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
|
||||
dependencies = [
|
||||
"wit-bindgen-rt",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen"
|
||||
version = "0.2.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1edc8929d7499fc4e8f0be2262a241556cfc54a0bea223790e71446f2aab1ef5"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"once_cell",
|
||||
"rustversion",
|
||||
"wasm-bindgen-macro",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-backend"
|
||||
version = "0.2.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2f0a0651a5c2bc21487bde11ee802ccaf4c51935d0d3d42a6101f98161700bc6"
|
||||
dependencies = [
|
||||
"bumpalo",
|
||||
"log",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro"
|
||||
version = "0.2.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7fe63fc6d09ed3792bd0897b314f53de8e16568c2b3f7982f468c0bf9bd0b407"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"wasm-bindgen-macro-support",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-macro-support"
|
||||
version = "0.2.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8ae87ea40c9f689fc23f209965b6fb8a99ad69aeeb0231408be24920604395de"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
"wasm-bindgen-backend",
|
||||
"wasm-bindgen-shared",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wasm-bindgen-shared"
|
||||
version = "0.2.100"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a05d73b933a847d6cccdda8f838a22ff101ad9bf93e33684f39c1f5f0eece3d"
|
||||
dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "weezl"
|
||||
version = "0.1.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a751b3277700db47d3e574514de2eced5e54dc8a5436a3bf7a0b248b2cee16f3"
|
||||
|
||||
[[package]]
|
||||
name = "windows-core"
|
||||
version = "0.61.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c0fdd3ddb90610c7638aa2b3a3ab2904fb9e5cdbecc643ddb3647212781c4ae3"
|
||||
dependencies = [
|
||||
"windows-implement",
|
||||
"windows-interface",
|
||||
"windows-link",
|
||||
"windows-result",
|
||||
"windows-strings",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-implement"
|
||||
version = "0.60.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a47fddd13af08290e67f4acabf4b459f647552718f683a7b415d290ac744a836"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-interface"
|
||||
version = "0.59.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "bd9211b69f8dcdfa817bfd14bf1c97c9188afa36f4750130fcdf3f400eca9fa8"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-link"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76840935b766e1b0a05c0066835fb9ec80071d4c09a16f6bd5f7e655e3c14c38"
|
||||
|
||||
[[package]]
|
||||
name = "windows-result"
|
||||
version = "0.3.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56f42bd332cc6c8eac5af113fc0c1fd6a8fd2aa08a0119358686e5160d0586c6"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-strings"
|
||||
version = "0.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "56e6c93f3a0c3b36176cb1327a4958a0353d5d166c2a35cb268ace15e91d3b57"
|
||||
dependencies = [
|
||||
"windows-link",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.59.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
||||
dependencies = [
|
||||
"windows-targets",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-targets"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973"
|
||||
dependencies = [
|
||||
"windows_aarch64_gnullvm",
|
||||
"windows_aarch64_msvc",
|
||||
"windows_i686_gnu",
|
||||
"windows_i686_gnullvm",
|
||||
"windows_i686_msvc",
|
||||
"windows_x86_64_gnu",
|
||||
"windows_x86_64_gnullvm",
|
||||
"windows_x86_64_msvc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3"
|
||||
|
||||
[[package]]
|
||||
name = "windows_aarch64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_i686_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnu"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_gnullvm"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d"
|
||||
|
||||
[[package]]
|
||||
name = "windows_x86_64_msvc"
|
||||
version = "0.52.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec"
|
||||
|
||||
[[package]]
|
||||
name = "wit-bindgen-rt"
|
||||
version = "0.39.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy"
|
||||
version = "0.8.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a1702d9583232ddb9174e01bb7c15a2ab8fb1bc6f227aa1233858c351a3ba0cb"
|
||||
dependencies = [
|
||||
"zerocopy-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zerocopy-derive"
|
||||
version = "0.8.25"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "28a6e20d751156648aa063f3800b706ee209a32c0b4d9f24be3d980b01be55ef"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn",
|
||||
]
|
11
apps/api/sharedLibs/pdf-parser/Cargo.toml
Normal file
11
apps/api/sharedLibs/pdf-parser/Cargo.toml
Normal file
@ -0,0 +1,11 @@
|
||||
[package]
|
||||
name = "pdf-parser"
|
||||
version = "0.1.0"
|
||||
edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
libc = "0.2.0"
|
||||
lopdf = "0.36.0"
|
||||
|
||||
[lib]
|
||||
crate-type = ["cdylib"]
|
24
apps/api/sharedLibs/pdf-parser/src/lib.rs
Normal file
24
apps/api/sharedLibs/pdf-parser/src/lib.rs
Normal file
@ -0,0 +1,24 @@
|
||||
use std::{ffi::CStr};
|
||||
|
||||
/// Returns the number of pages in a PDF file
|
||||
///
|
||||
/// # Safety
|
||||
/// Input path must be a C string of a path pointing to a PDF file. Output will be an integer, either the number of pages in the PDF or -1 indicating an error.
|
||||
#[no_mangle]
|
||||
pub unsafe extern "C" fn get_page_count(path: *const libc::c_char) -> i32 {
|
||||
let path: String = match unsafe { CStr::from_ptr(path) }.to_str().map_err(|_| ()) {
|
||||
Ok(x) => x.to_string(),
|
||||
Err(_) => {
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
let doc = match lopdf::Document::load(&path) {
|
||||
Ok(x) => x,
|
||||
Err(_) => {
|
||||
return -1;
|
||||
}
|
||||
};
|
||||
|
||||
doc.get_pages().len() as i32
|
||||
}
|
@ -28,6 +28,12 @@ function expectScrapeToSucceed(response: Awaited<ReturnType<typeof scrapeRaw>>)
|
||||
expect(typeof response.body.data).toBe("object");
|
||||
}
|
||||
|
||||
function expectScrapeToFail(response: Awaited<ReturnType<typeof scrapeRaw>>) {
|
||||
expect(response.statusCode).not.toBe(200);
|
||||
expect(response.body.success).toBe(false);
|
||||
expect(typeof response.body.error).toBe("string");
|
||||
}
|
||||
|
||||
export async function scrape(body: ScrapeRequestInput): Promise<Document> {
|
||||
const raw = await scrapeRaw(body);
|
||||
expectScrapeToSucceed(raw);
|
||||
@ -39,6 +45,15 @@ export async function scrape(body: ScrapeRequestInput): Promise<Document> {
|
||||
return raw.body.data;
|
||||
}
|
||||
|
||||
export async function scrapeWithFailure(body: ScrapeRequestInput): Promise<{
|
||||
success: false;
|
||||
error: string;
|
||||
}> {
|
||||
const raw = await scrapeRaw(body);
|
||||
expectScrapeToFail(raw);
|
||||
return raw.body;
|
||||
}
|
||||
|
||||
export async function scrapeStatusRaw(jobId: string) {
|
||||
return await request(TEST_URL)
|
||||
.get("/v1/scrape/" + encodeURIComponent(jobId))
|
||||
|
@ -1,4 +1,4 @@
|
||||
import { scrape, scrapeStatus } from "./lib";
|
||||
import { scrape, scrapeStatus, scrapeWithFailure } from "./lib";
|
||||
|
||||
describe("Scrape tests", () => {
|
||||
it.concurrent("mocking works properly", async () => {
|
||||
@ -297,16 +297,35 @@ describe("Scrape tests", () => {
|
||||
}, 130000);
|
||||
});
|
||||
|
||||
// Temporarily disabled, too flaky
|
||||
// describe("PDF (f-e dependant)", () => {
|
||||
// it.concurrent("works for PDFs behind anti-bot", async () => {
|
||||
// const response = await scrape({
|
||||
// url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
|
||||
// });
|
||||
describe("PDF (f-e dependant)", () => {
|
||||
// Temporarily disabled, too flaky
|
||||
// it.concurrent("works for PDFs behind anti-bot", async () => {
|
||||
// const response = await scrape({
|
||||
// url: "https://www.researchgate.net/profile/Amir-Leshem/publication/220732050_Robust_adaptive_beamforming_based_on_jointly_estimating_covariance_matrix_and_steering_vector/links/0c96052d2fd8f0a84b000000/Robust-adaptive-beamforming-based-on-jointly-estimating-covariance-matrix-and-steering-vector.pdf"
|
||||
// });
|
||||
|
||||
// expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
|
||||
// }, 60000);
|
||||
// });
|
||||
// expect(response.markdown).toContain("Robust adaptive beamforming based on jointly estimating covariance matrix");
|
||||
// }, 60000);
|
||||
|
||||
it.concurrent("blocks long PDFs with insufficient timeout", async () => {
|
||||
const response = await scrapeWithFailure({
|
||||
url: "https://ecma-international.org/wp-content/uploads/ECMA-262_15th_edition_june_2024.pdf",
|
||||
timeout: 30000,
|
||||
});
|
||||
|
||||
expect(response.error).toContain("Insufficient time to process PDF");
|
||||
}, 30000);
|
||||
|
||||
it.concurrent("scrapes long PDFs with sufficient timeout", async () => {
|
||||
const response = await scrape({
|
||||
url: "https://ecma-international.org/wp-content/uploads/ECMA-262_15th_edition_june_2024.pdf",
|
||||
timeout: 300000,
|
||||
});
|
||||
|
||||
// text on the last page
|
||||
expect(response.markdown).toContain("Redistribution and use in source and binary forms, with or without modification");
|
||||
}, 310000);
|
||||
});
|
||||
}
|
||||
|
||||
if (!process.env.TEST_SUITE_SELF_HOSTED || process.env.OPENAI_API_KEY || process.env.OLLAMA_BASE_URL) {
|
||||
|
@ -21,7 +21,7 @@ export function createPdfCacheKey(pdfContent: string | Buffer): string {
|
||||
*/
|
||||
export async function savePdfResultToCache(
|
||||
pdfContent: string,
|
||||
result: { markdown: string; html: string; numPages: number }
|
||||
result: { markdown: string; html: string }
|
||||
): Promise<string | null> {
|
||||
try {
|
||||
if (!process.env.GCS_BUCKET_NAME) {
|
||||
@ -76,7 +76,7 @@ export async function savePdfResultToCache(
|
||||
*/
|
||||
export async function getPdfResultFromCache(
|
||||
pdfContent: string
|
||||
): Promise<{ markdown: string; html: string; numPages: number } | null> {
|
||||
): Promise<{ markdown: string; html: string } | null> {
|
||||
try {
|
||||
if (!process.env.GCS_BUCKET_NAME) {
|
||||
return null;
|
||||
@ -104,7 +104,6 @@ export async function getPdfResultFromCache(
|
||||
|
||||
return {
|
||||
...result,
|
||||
numPages: result.numPages ?? 1, // default to 1 page if cache is old
|
||||
};
|
||||
} catch (error) {
|
||||
logger.error(`Error retrieving PDF RunPod result from GCS cache`, {
|
||||
|
70
apps/api/src/lib/pdf-parser.ts
Normal file
70
apps/api/src/lib/pdf-parser.ts
Normal file
@ -0,0 +1,70 @@
|
||||
import koffi, { KoffiFunction } from "koffi";
|
||||
import { join } from "path";
|
||||
import { stat } from "fs/promises";
|
||||
import { platform } from "os";
|
||||
|
||||
// TODO: add a timeout to the Rust parser
|
||||
const rustExecutablePath = join(
|
||||
process.cwd(),
|
||||
"sharedLibs/pdf-parser/target/release/",
|
||||
platform() === "darwin" ? "libpdf_parser.dylib" : "libpdf_parser.so"
|
||||
);
|
||||
|
||||
class RustPDFParser {
|
||||
private static instance: RustPDFParser;
|
||||
private _getPageCount: KoffiFunction;
|
||||
|
||||
private constructor() {
|
||||
const lib = koffi.load(rustExecutablePath);
|
||||
this._getPageCount = lib.func("get_page_count", "int32", ["string"]);
|
||||
}
|
||||
|
||||
public static async isParserAvailable(): Promise<boolean> {
|
||||
if (RustPDFParser.instance) {
|
||||
return true;
|
||||
}
|
||||
|
||||
try {
|
||||
await stat(rustExecutablePath);
|
||||
RustPDFParser.instance = new RustPDFParser();
|
||||
return true;
|
||||
} catch (_) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
public static async getInstance(): Promise<RustPDFParser> {
|
||||
if (!RustPDFParser.instance) {
|
||||
try {
|
||||
await stat(rustExecutablePath);
|
||||
} catch (_) {
|
||||
throw Error("Rust pdf-parser shared library not found");
|
||||
}
|
||||
RustPDFParser.instance = new RustPDFParser();
|
||||
}
|
||||
return RustPDFParser.instance;
|
||||
}
|
||||
|
||||
public async getPageCount(path: string): Promise<number> {
|
||||
return new Promise<number>((resolve, reject) => {
|
||||
this._getPageCount.async(path, (err: Error, res: number) => {
|
||||
if (err) {
|
||||
reject(err);
|
||||
} else {
|
||||
if (res === -1) {
|
||||
reject(new Error("Failed to parse PDF."));
|
||||
} else {
|
||||
resolve(res);
|
||||
}
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
export async function getPageCount(
|
||||
path: string,
|
||||
): Promise<number> {
|
||||
const converter = await RustPDFParser.getInstance();
|
||||
return await converter.getPageCount(path);
|
||||
}
|
@ -7,15 +7,17 @@ import * as Sentry from "@sentry/node";
|
||||
import escapeHtml from "escape-html";
|
||||
import PdfParse from "pdf-parse";
|
||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||
import { PDFAntibotError, RemoveFeatureError, UnsupportedFileError } from "../../error";
|
||||
import { PDFAntibotError, PDFInsufficientTimeError, RemoveFeatureError, TimeoutError } from "../../error";
|
||||
import { readFile, unlink } from "node:fs/promises";
|
||||
import path from "node:path";
|
||||
import type { Response } from "undici";
|
||||
import { getPdfResultFromCache, savePdfResultToCache } from "../../../../lib/gcs-pdf-cache";
|
||||
import { getPageCount } from "../../../../lib/pdf-parser";
|
||||
|
||||
type PDFProcessorResult = { html: string; markdown?: string; numPages: number };
|
||||
type PDFProcessorResult = { html: string; markdown?: string; };
|
||||
|
||||
const MAX_FILE_SIZE = 19 * 1024 * 1024; // 19MB
|
||||
const MILLISECONDS_PER_PAGE = 150;
|
||||
|
||||
async function scrapePDFWithRunPodMU(
|
||||
meta: Meta,
|
||||
@ -46,8 +48,13 @@ async function scrapePDFWithRunPodMU(
|
||||
}
|
||||
|
||||
const timeout = timeToRun ? timeToRun - (Date.now() - preCacheCheckStartTime) : undefined;
|
||||
if (timeout && timeout < 0) {
|
||||
throw new TimeoutError("MU PDF parser already timed out before call");
|
||||
}
|
||||
|
||||
const result = await robustFetch({
|
||||
const abort = timeout ? AbortSignal.timeout(timeout) : undefined;
|
||||
|
||||
const podStart = await robustFetch({
|
||||
url:
|
||||
"https://api.runpod.ai/v2/" + process.env.RUNPOD_MU_POD_ID + "/runsync",
|
||||
method: "POST",
|
||||
@ -63,22 +70,63 @@ async function scrapePDFWithRunPodMU(
|
||||
},
|
||||
},
|
||||
logger: meta.logger.child({
|
||||
method: "scrapePDFWithRunPodMU/robustFetch",
|
||||
method: "scrapePDFWithRunPodMU/runsync/robustFetch",
|
||||
}),
|
||||
schema: z.object({
|
||||
id: z.string(),
|
||||
status: z.string(),
|
||||
output: z.object({
|
||||
markdown: z.string(),
|
||||
num_pages: z.number(),
|
||||
}),
|
||||
}).optional(),
|
||||
}),
|
||||
mock: meta.mock,
|
||||
abort: timeout ? AbortSignal.timeout(timeout) : undefined,
|
||||
abort,
|
||||
});
|
||||
|
||||
let status: string = podStart.status;
|
||||
let result: { markdown: string } | undefined = podStart.output;
|
||||
|
||||
if (status === "IN_QUEUE" || status === "IN_PROGRESS") {
|
||||
meta.logger.info("RunPod MU returned while in status " + status);
|
||||
do {
|
||||
abort?.throwIfAborted();
|
||||
await new Promise(resolve => setTimeout(resolve, 2500));
|
||||
abort?.throwIfAborted();
|
||||
const podStatus = await robustFetch({
|
||||
url: `https://api.runpod.ai/v2/${process.env.RUNPOD_MU_POD_ID}/status/${podStart.id}`,
|
||||
method: "GET",
|
||||
headers: {
|
||||
Authorization: `Bearer ${process.env.RUNPOD_MU_API_KEY}`,
|
||||
},
|
||||
logger: meta.logger.child({
|
||||
method: "scrapePDFWithRunPodMU/status/robustFetch",
|
||||
}),
|
||||
schema: z.object({
|
||||
status: z.string(),
|
||||
output: z.object({
|
||||
markdown: z.string(),
|
||||
}).optional(),
|
||||
}),
|
||||
mock: meta.mock,
|
||||
abort,
|
||||
});
|
||||
meta.logger.info("RunPod MU status " + podStatus.status);
|
||||
status = podStatus.status;
|
||||
result = podStatus.output;
|
||||
} while (status !== "COMPLETED" && status !== "FAILED");
|
||||
}
|
||||
|
||||
if (status === "FAILED") {
|
||||
throw new Error("RunPod MU failed to parse PDF");
|
||||
}
|
||||
|
||||
if (!result) {
|
||||
throw new Error("RunPod MU returned no result");
|
||||
}
|
||||
|
||||
const processorResult = {
|
||||
markdown: result.output.markdown,
|
||||
html: await marked.parse(result.output.markdown, { async: true }),
|
||||
numPages: result.output.num_pages,
|
||||
markdown: result.markdown,
|
||||
html: await marked.parse(result.markdown, { async: true }),
|
||||
};
|
||||
|
||||
try {
|
||||
@ -105,7 +153,6 @@ async function scrapePDFWithParsePDF(
|
||||
return {
|
||||
markdown: escaped,
|
||||
html: escaped,
|
||||
numPages: result.numpages,
|
||||
};
|
||||
}
|
||||
|
||||
@ -160,6 +207,11 @@ export async function scrapePDF(
|
||||
}
|
||||
}
|
||||
|
||||
const pageCount = await getPageCount(tempFilePath);
|
||||
if (pageCount * MILLISECONDS_PER_PAGE > (timeToRun ?? Infinity)) {
|
||||
throw new PDFInsufficientTimeError(pageCount, pageCount * MILLISECONDS_PER_PAGE + 5000);
|
||||
}
|
||||
|
||||
let result: PDFProcessorResult | null = null;
|
||||
|
||||
const base64Content = (await readFile(tempFilePath)).toString("base64");
|
||||
@ -214,5 +266,6 @@ export async function scrapePDF(
|
||||
statusCode: response.status,
|
||||
html: result?.html ?? "",
|
||||
markdown: result?.markdown ?? "",
|
||||
numPages: pageCount,
|
||||
};
|
||||
}
|
||||
|
@ -86,3 +86,9 @@ export class PDFAntibotError extends Error {
|
||||
super("PDF scrape was prevented by anti-bot")
|
||||
}
|
||||
}
|
||||
|
||||
export class PDFInsufficientTimeError extends Error {
|
||||
constructor(pageCount: number, minTimeout: number) {
|
||||
super(`Insufficient time to process PDF of ${pageCount} pages. Please increase the timeout parameter in your scrape request to at least ${minTimeout}ms.`);
|
||||
}
|
||||
}
|
||||
|
@ -22,6 +22,7 @@ import {
|
||||
TimeoutError,
|
||||
UnsupportedFileError,
|
||||
SSLError,
|
||||
PDFInsufficientTimeError,
|
||||
} from "./error";
|
||||
import { executeTransformers } from "./transformers";
|
||||
import { LLMRefusalError } from "./transformers/llmExtract";
|
||||
@ -340,6 +341,8 @@ async function scrapeURLLoop(meta: Meta): Promise<ScrapeUrlResponse> {
|
||||
throw error;
|
||||
} else if (error instanceof TimeoutSignal) {
|
||||
throw error;
|
||||
} else if (error instanceof PDFInsufficientTimeError) {
|
||||
throw error;
|
||||
} else {
|
||||
Sentry.captureException(error);
|
||||
meta.logger.warn(
|
||||
@ -489,6 +492,8 @@ export async function scrapeURL(
|
||||
meta.logger.warn("scrapeURL: Tried to scrape unsupported file", {
|
||||
error,
|
||||
});
|
||||
} else if (error instanceof PDFInsufficientTimeError) {
|
||||
meta.logger.warn("scrapeURL: Insufficient time to process PDF", { error });
|
||||
} else if (error instanceof TimeoutSignal) {
|
||||
throw error;
|
||||
} else {
|
||||
|
Loading…
x
Reference in New Issue
Block a user