mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-22 14:26:44 +08:00
Merge pull request #984 from mendableai/default-to-pdf-parse
Default to pdf2md, if under 500 chars (indicating failure) use LlamaParse
This commit is contained in:
commit
db8e9c36d6
@ -58,6 +58,7 @@
|
||||
"@devil7softwares/pos": "^1.0.2",
|
||||
"@dqbd/tiktoken": "^1.0.17",
|
||||
"@nangohq/node": "^0.40.8",
|
||||
"@opendocsg/pdf2md": "^0.2.1",
|
||||
"@sentry/cli": "^2.33.1",
|
||||
"@sentry/node": "^8.26.0",
|
||||
"@sentry/profiling-node": "^8.26.0",
|
||||
@ -100,7 +101,6 @@
|
||||
"mongoose": "^8.4.4",
|
||||
"natural": "^7.0.7",
|
||||
"openai": "^4.57.0",
|
||||
"pdf-parse": "^1.1.1",
|
||||
"pos": "^0.4.2",
|
||||
"posthog-node": "^4.0.1",
|
||||
"promptable": "^0.0.10",
|
||||
|
295
apps/api/pnpm-lock.yaml
generated
295
apps/api/pnpm-lock.yaml
generated
@ -29,6 +29,9 @@ importers:
|
||||
'@nangohq/node':
|
||||
specifier: ^0.40.8
|
||||
version: 0.40.8
|
||||
'@opendocsg/pdf2md':
|
||||
specifier: ^0.2.1
|
||||
version: 0.2.1
|
||||
'@sentry/cli':
|
||||
specifier: ^2.33.1
|
||||
version: 2.33.1
|
||||
@ -155,9 +158,6 @@ importers:
|
||||
openai:
|
||||
specifier: ^4.57.0
|
||||
version: 4.57.0(zod@3.23.8)
|
||||
pdf-parse:
|
||||
specifier: ^1.1.1
|
||||
version: 1.1.1
|
||||
pos:
|
||||
specifier: ^0.4.2
|
||||
version: 0.4.2
|
||||
@ -794,6 +794,10 @@ packages:
|
||||
resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==}
|
||||
engines: {node: '>=18'}
|
||||
|
||||
'@mapbox/node-pre-gyp@1.0.11':
|
||||
resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==}
|
||||
hasBin: true
|
||||
|
||||
'@mixmark-io/domino@2.2.0':
|
||||
resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==}
|
||||
|
||||
@ -837,6 +841,10 @@ packages:
|
||||
'@one-ini/wasm@0.1.1':
|
||||
resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==}
|
||||
|
||||
'@opendocsg/pdf2md@0.2.1':
|
||||
resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==}
|
||||
hasBin: true
|
||||
|
||||
'@opentelemetry/api-logs@0.52.1':
|
||||
resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==}
|
||||
engines: {node: '>=14'}
|
||||
@ -1615,6 +1623,9 @@ packages:
|
||||
resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==}
|
||||
engines: {node: '>=10.0.0'}
|
||||
|
||||
abbrev@1.1.1:
|
||||
resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==}
|
||||
|
||||
abbrev@2.0.0:
|
||||
resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==}
|
||||
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
|
||||
@ -1708,6 +1719,14 @@ packages:
|
||||
resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==}
|
||||
engines: {node: '>=0.2.6'}
|
||||
|
||||
aproba@2.0.0:
|
||||
resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==}
|
||||
|
||||
are-we-there-yet@2.0.0:
|
||||
resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==}
|
||||
engines: {node: '>=10'}
|
||||
deprecated: This package is no longer supported.
|
||||
|
||||
arg@4.1.3:
|
||||
resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==}
|
||||
|
||||
@ -1908,6 +1927,10 @@ packages:
|
||||
caniuse-lite@1.0.30001627:
|
||||
resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==}
|
||||
|
||||
canvas@2.11.2:
|
||||
resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
chalk@2.4.2:
|
||||
resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==}
|
||||
engines: {node: '>=4'}
|
||||
@ -1938,6 +1961,10 @@ packages:
|
||||
resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==}
|
||||
engines: {node: '>= 8.10.0'}
|
||||
|
||||
chownr@2.0.0:
|
||||
resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
chownr@3.0.0:
|
||||
resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==}
|
||||
engines: {node: '>=18'}
|
||||
@ -1995,6 +2022,10 @@ packages:
|
||||
color-string@1.9.1:
|
||||
resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==}
|
||||
|
||||
color-support@1.1.3:
|
||||
resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==}
|
||||
hasBin: true
|
||||
|
||||
color@3.2.1:
|
||||
resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==}
|
||||
|
||||
@ -2021,6 +2052,9 @@ packages:
|
||||
config-chain@1.1.13:
|
||||
resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==}
|
||||
|
||||
console-control-strings@1.1.0:
|
||||
resolution: {integrity: sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==}
|
||||
|
||||
content-disposition@0.5.4:
|
||||
resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==}
|
||||
engines: {node: '>= 0.6'}
|
||||
@ -2147,6 +2181,10 @@ packages:
|
||||
resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
decompress-response@4.2.1:
|
||||
resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
dedent@1.5.3:
|
||||
resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==}
|
||||
peerDependencies:
|
||||
@ -2171,6 +2209,9 @@ packages:
|
||||
resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==}
|
||||
engines: {node: '>=0.4.0'}
|
||||
|
||||
delegates@1.0.0:
|
||||
resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==}
|
||||
|
||||
denque@2.1.0:
|
||||
resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==}
|
||||
engines: {node: '>=0.10'}
|
||||
@ -2283,6 +2324,9 @@ packages:
|
||||
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
|
||||
engines: {node: '>=0.12'}
|
||||
|
||||
enumify@1.0.4:
|
||||
resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==}
|
||||
|
||||
env-paths@2.2.1:
|
||||
resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==}
|
||||
engines: {node: '>=6'}
|
||||
@ -2484,6 +2528,10 @@ packages:
|
||||
resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==}
|
||||
engines: {node: '>=14.14'}
|
||||
|
||||
fs-minipass@2.1.0:
|
||||
resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==}
|
||||
engines: {node: '>= 8'}
|
||||
|
||||
fs.realpath@1.0.0:
|
||||
resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==}
|
||||
|
||||
@ -2495,6 +2543,11 @@ packages:
|
||||
function-bind@1.1.2:
|
||||
resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==}
|
||||
|
||||
gauge@3.0.2:
|
||||
resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==}
|
||||
engines: {node: '>=10'}
|
||||
deprecated: This package is no longer supported.
|
||||
|
||||
generic-pool@3.9.0:
|
||||
resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==}
|
||||
engines: {node: '>= 4'}
|
||||
@ -2578,6 +2631,9 @@ packages:
|
||||
resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==}
|
||||
engines: {node: '>= 0.4'}
|
||||
|
||||
has-unicode@2.0.1:
|
||||
resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==}
|
||||
|
||||
hasown@2.0.2:
|
||||
resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
|
||||
engines: {node: '>= 0.4'}
|
||||
@ -3256,6 +3312,10 @@ packages:
|
||||
resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==}
|
||||
engines: {node: '>=12'}
|
||||
|
||||
make-dir@3.1.0:
|
||||
resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
make-dir@4.0.0:
|
||||
resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==}
|
||||
engines: {node: '>=10'}
|
||||
@ -3326,6 +3386,10 @@ packages:
|
||||
resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==}
|
||||
engines: {node: '>=6'}
|
||||
|
||||
mimic-response@2.1.0:
|
||||
resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
minimatch@3.1.2:
|
||||
resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==}
|
||||
|
||||
@ -3344,10 +3408,22 @@ packages:
|
||||
minimist@1.2.8:
|
||||
resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==}
|
||||
|
||||
minipass@3.3.6:
|
||||
resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
minipass@5.0.0:
|
||||
resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
minipass@7.1.2:
|
||||
resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==}
|
||||
engines: {node: '>=16 || 14 >=14.17'}
|
||||
|
||||
minizlib@2.1.2:
|
||||
resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==}
|
||||
engines: {node: '>= 8'}
|
||||
|
||||
minizlib@3.0.1:
|
||||
resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==}
|
||||
engines: {node: '>= 18'}
|
||||
@ -3359,6 +3435,11 @@ packages:
|
||||
resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==}
|
||||
hasBin: true
|
||||
|
||||
mkdirp@1.0.4:
|
||||
resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==}
|
||||
engines: {node: '>=10'}
|
||||
hasBin: true
|
||||
|
||||
mkdirp@3.0.1:
|
||||
resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==}
|
||||
engines: {node: '>=10'}
|
||||
@ -3447,6 +3528,9 @@ packages:
|
||||
resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==}
|
||||
hasBin: true
|
||||
|
||||
nan@2.22.0:
|
||||
resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==}
|
||||
|
||||
natural-compare@1.4.0:
|
||||
resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==}
|
||||
|
||||
@ -3507,6 +3591,11 @@ packages:
|
||||
engines: {node: '>=8.10.0'}
|
||||
hasBin: true
|
||||
|
||||
nopt@5.0.0:
|
||||
resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==}
|
||||
engines: {node: '>=6'}
|
||||
hasBin: true
|
||||
|
||||
nopt@7.2.1:
|
||||
resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==}
|
||||
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
|
||||
@ -3524,6 +3613,10 @@ packages:
|
||||
resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
npmlog@5.0.1:
|
||||
resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==}
|
||||
deprecated: This package is no longer supported.
|
||||
|
||||
nth-check@2.1.1:
|
||||
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}
|
||||
|
||||
@ -3949,6 +4042,11 @@ packages:
|
||||
resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==}
|
||||
engines: {node: '>= 4'}
|
||||
|
||||
rimraf@3.0.2:
|
||||
resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==}
|
||||
deprecated: Rimraf versions prior to v4 are no longer supported
|
||||
hasBin: true
|
||||
|
||||
rimraf@5.0.7:
|
||||
resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==}
|
||||
engines: {node: '>=14.18'}
|
||||
@ -4019,6 +4117,9 @@ packages:
|
||||
resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==}
|
||||
engines: {node: '>= 0.8.0'}
|
||||
|
||||
set-blocking@2.0.0:
|
||||
resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==}
|
||||
|
||||
set-function-length@1.2.2:
|
||||
resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==}
|
||||
engines: {node: '>= 0.4'}
|
||||
@ -4057,6 +4158,12 @@ packages:
|
||||
resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==}
|
||||
engines: {node: '>=14'}
|
||||
|
||||
simple-concat@1.0.1:
|
||||
resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==}
|
||||
|
||||
simple-get@3.1.1:
|
||||
resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==}
|
||||
|
||||
simple-swizzle@0.2.2:
|
||||
resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==}
|
||||
|
||||
@ -4215,6 +4322,10 @@ packages:
|
||||
tar-stream@3.1.7:
|
||||
resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==}
|
||||
|
||||
tar@6.2.1:
|
||||
resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==}
|
||||
engines: {node: '>=10'}
|
||||
|
||||
tar@7.2.0:
|
||||
resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==}
|
||||
engines: {node: '>=18'}
|
||||
@ -4369,6 +4480,9 @@ packages:
|
||||
resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==}
|
||||
engines: {node: '>= 10.0.0'}
|
||||
|
||||
unpdf@0.12.1:
|
||||
resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==}
|
||||
|
||||
unpipe@1.0.0:
|
||||
resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==}
|
||||
engines: {node: '>= 0.8'}
|
||||
@ -4456,6 +4570,9 @@ packages:
|
||||
engines: {node: '>= 8'}
|
||||
hasBin: true
|
||||
|
||||
wide-align@1.1.5:
|
||||
resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==}
|
||||
|
||||
winston-transport@4.8.0:
|
||||
resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==}
|
||||
engines: {node: '>= 12.0.0'}
|
||||
@ -5607,6 +5724,22 @@ snapshots:
|
||||
- langchain
|
||||
- openai
|
||||
|
||||
'@mapbox/node-pre-gyp@1.0.11':
|
||||
dependencies:
|
||||
detect-libc: 2.0.3
|
||||
https-proxy-agent: 5.0.1
|
||||
make-dir: 3.1.0
|
||||
node-fetch: 2.7.0
|
||||
nopt: 5.0.0
|
||||
npmlog: 5.0.1
|
||||
rimraf: 3.0.2
|
||||
semver: 7.6.2
|
||||
tar: 6.2.1
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- supports-color
|
||||
optional: true
|
||||
|
||||
'@mixmark-io/domino@2.2.0': {}
|
||||
|
||||
'@mongodb-js/saslprep@1.1.7':
|
||||
@ -5639,6 +5772,15 @@ snapshots:
|
||||
|
||||
'@one-ini/wasm@0.1.1': {}
|
||||
|
||||
'@opendocsg/pdf2md@0.2.1':
|
||||
dependencies:
|
||||
enumify: 1.0.4
|
||||
minimist: 1.2.8
|
||||
unpdf: 0.12.1
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- supports-color
|
||||
|
||||
'@opentelemetry/api-logs@0.52.1':
|
||||
dependencies:
|
||||
'@opentelemetry/api': 1.9.0
|
||||
@ -6673,6 +6815,9 @@ snapshots:
|
||||
|
||||
'@xmldom/xmldom@0.8.10': {}
|
||||
|
||||
abbrev@1.1.1:
|
||||
optional: true
|
||||
|
||||
abbrev@2.0.0: {}
|
||||
|
||||
abort-controller@3.0.0:
|
||||
@ -6755,6 +6900,15 @@ snapshots:
|
||||
dependencies:
|
||||
sylvester: 0.0.12
|
||||
|
||||
aproba@2.0.0:
|
||||
optional: true
|
||||
|
||||
are-we-there-yet@2.0.0:
|
||||
dependencies:
|
||||
delegates: 1.0.0
|
||||
readable-stream: 3.6.2
|
||||
optional: true
|
||||
|
||||
arg@4.1.3: {}
|
||||
|
||||
argparse@1.0.10:
|
||||
@ -7008,6 +7162,16 @@ snapshots:
|
||||
|
||||
caniuse-lite@1.0.30001627: {}
|
||||
|
||||
canvas@2.11.2:
|
||||
dependencies:
|
||||
'@mapbox/node-pre-gyp': 1.0.11
|
||||
nan: 2.22.0
|
||||
simple-get: 3.1.1
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- supports-color
|
||||
optional: true
|
||||
|
||||
chalk@2.4.2:
|
||||
dependencies:
|
||||
ansi-styles: 3.2.1
|
||||
@ -7056,6 +7220,9 @@ snapshots:
|
||||
optionalDependencies:
|
||||
fsevents: 2.3.3
|
||||
|
||||
chownr@2.0.0:
|
||||
optional: true
|
||||
|
||||
chownr@3.0.0: {}
|
||||
|
||||
chromium-bidi@0.5.24(devtools-protocol@0.0.1299070):
|
||||
@ -7121,6 +7288,9 @@ snapshots:
|
||||
color-name: 1.1.4
|
||||
simple-swizzle: 0.2.2
|
||||
|
||||
color-support@1.1.3:
|
||||
optional: true
|
||||
|
||||
color@3.2.1:
|
||||
dependencies:
|
||||
color-convert: 1.9.3
|
||||
@ -7148,6 +7318,9 @@ snapshots:
|
||||
ini: 1.3.8
|
||||
proto-list: 1.2.4
|
||||
|
||||
console-control-strings@1.1.0:
|
||||
optional: true
|
||||
|
||||
content-disposition@0.5.4:
|
||||
dependencies:
|
||||
safe-buffer: 5.2.1
|
||||
@ -7255,6 +7428,11 @@ snapshots:
|
||||
|
||||
decamelize@4.0.0: {}
|
||||
|
||||
decompress-response@4.2.1:
|
||||
dependencies:
|
||||
mimic-response: 2.1.0
|
||||
optional: true
|
||||
|
||||
dedent@1.5.3: {}
|
||||
|
||||
deepmerge@4.3.1: {}
|
||||
@ -7273,6 +7451,9 @@ snapshots:
|
||||
|
||||
delayed-stream@1.0.0: {}
|
||||
|
||||
delegates@1.0.0:
|
||||
optional: true
|
||||
|
||||
denque@2.1.0: {}
|
||||
|
||||
depd@2.0.0: {}
|
||||
@ -7364,6 +7545,8 @@ snapshots:
|
||||
|
||||
entities@4.5.0: {}
|
||||
|
||||
enumify@1.0.4: {}
|
||||
|
||||
env-paths@2.2.1: {}
|
||||
|
||||
error-ex@1.3.2:
|
||||
@ -7589,6 +7772,11 @@ snapshots:
|
||||
jsonfile: 6.1.0
|
||||
universalify: 2.0.1
|
||||
|
||||
fs-minipass@2.1.0:
|
||||
dependencies:
|
||||
minipass: 3.3.6
|
||||
optional: true
|
||||
|
||||
fs.realpath@1.0.0: {}
|
||||
|
||||
fsevents@2.3.3:
|
||||
@ -7596,6 +7784,19 @@ snapshots:
|
||||
|
||||
function-bind@1.1.2: {}
|
||||
|
||||
gauge@3.0.2:
|
||||
dependencies:
|
||||
aproba: 2.0.0
|
||||
color-support: 1.1.3
|
||||
console-control-strings: 1.1.0
|
||||
has-unicode: 2.0.1
|
||||
object-assign: 4.1.1
|
||||
signal-exit: 3.0.7
|
||||
string-width: 4.2.3
|
||||
strip-ansi: 6.0.1
|
||||
wide-align: 1.1.5
|
||||
optional: true
|
||||
|
||||
generic-pool@3.9.0: {}
|
||||
|
||||
gensync@1.0.0-beta.2: {}
|
||||
@ -7683,6 +7884,9 @@ snapshots:
|
||||
|
||||
has-symbols@1.0.3: {}
|
||||
|
||||
has-unicode@2.0.1:
|
||||
optional: true
|
||||
|
||||
hasown@2.0.2:
|
||||
dependencies:
|
||||
function-bind: 1.1.2
|
||||
@ -8463,6 +8667,11 @@ snapshots:
|
||||
|
||||
luxon@3.4.4: {}
|
||||
|
||||
make-dir@3.1.0:
|
||||
dependencies:
|
||||
semver: 6.3.1
|
||||
optional: true
|
||||
|
||||
make-dir@4.0.0:
|
||||
dependencies:
|
||||
semver: 7.6.2
|
||||
@ -8523,6 +8732,9 @@ snapshots:
|
||||
|
||||
mimic-fn@2.1.0: {}
|
||||
|
||||
mimic-response@2.1.0:
|
||||
optional: true
|
||||
|
||||
minimatch@3.1.2:
|
||||
dependencies:
|
||||
brace-expansion: 1.1.11
|
||||
@ -8541,8 +8753,22 @@ snapshots:
|
||||
|
||||
minimist@1.2.8: {}
|
||||
|
||||
minipass@3.3.6:
|
||||
dependencies:
|
||||
yallist: 4.0.0
|
||||
optional: true
|
||||
|
||||
minipass@5.0.0:
|
||||
optional: true
|
||||
|
||||
minipass@7.1.2: {}
|
||||
|
||||
minizlib@2.1.2:
|
||||
dependencies:
|
||||
minipass: 3.3.6
|
||||
yallist: 4.0.0
|
||||
optional: true
|
||||
|
||||
minizlib@3.0.1:
|
||||
dependencies:
|
||||
minipass: 7.1.2
|
||||
@ -8554,6 +8780,9 @@ snapshots:
|
||||
dependencies:
|
||||
minimist: 1.2.8
|
||||
|
||||
mkdirp@1.0.4:
|
||||
optional: true
|
||||
|
||||
mkdirp@3.0.1: {}
|
||||
|
||||
ml-array-mean@1.1.6:
|
||||
@ -8646,6 +8875,9 @@ snapshots:
|
||||
|
||||
mustache@4.2.0: {}
|
||||
|
||||
nan@2.22.0:
|
||||
optional: true
|
||||
|
||||
natural-compare@1.4.0: {}
|
||||
|
||||
natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3):
|
||||
@ -8692,7 +8924,8 @@ snapshots:
|
||||
|
||||
node-domexception@1.0.0: {}
|
||||
|
||||
node-ensure@0.0.0: {}
|
||||
node-ensure@0.0.0:
|
||||
optional: true
|
||||
|
||||
node-fetch@2.7.0:
|
||||
dependencies:
|
||||
@ -8726,6 +8959,11 @@ snapshots:
|
||||
touch: 3.1.1
|
||||
undefsafe: 2.0.5
|
||||
|
||||
nopt@5.0.0:
|
||||
dependencies:
|
||||
abbrev: 1.1.1
|
||||
optional: true
|
||||
|
||||
nopt@7.2.1:
|
||||
dependencies:
|
||||
abbrev: 2.0.0
|
||||
@ -8738,6 +8976,14 @@ snapshots:
|
||||
dependencies:
|
||||
path-key: 3.1.1
|
||||
|
||||
npmlog@5.0.1:
|
||||
dependencies:
|
||||
are-we-there-yet: 2.0.0
|
||||
console-control-strings: 1.1.0
|
||||
gauge: 3.0.2
|
||||
set-blocking: 2.0.0
|
||||
optional: true
|
||||
|
||||
nth-check@2.1.1:
|
||||
dependencies:
|
||||
boolbase: 1.0.0
|
||||
@ -8909,6 +9155,7 @@ snapshots:
|
||||
node-ensure: 0.0.0
|
||||
transitivePeerDependencies:
|
||||
- supports-color
|
||||
optional: true
|
||||
|
||||
peberminta@0.9.0: {}
|
||||
|
||||
@ -9210,6 +9457,11 @@ snapshots:
|
||||
|
||||
retry@0.13.1: {}
|
||||
|
||||
rimraf@3.0.2:
|
||||
dependencies:
|
||||
glob: 7.2.3
|
||||
optional: true
|
||||
|
||||
rimraf@5.0.7:
|
||||
dependencies:
|
||||
glob: 10.4.2
|
||||
@ -9285,6 +9537,9 @@ snapshots:
|
||||
transitivePeerDependencies:
|
||||
- supports-color
|
||||
|
||||
set-blocking@2.0.0:
|
||||
optional: true
|
||||
|
||||
set-function-length@1.2.2:
|
||||
dependencies:
|
||||
define-data-property: 1.1.4
|
||||
@ -9321,6 +9576,16 @@ snapshots:
|
||||
|
||||
signal-exit@4.1.0: {}
|
||||
|
||||
simple-concat@1.0.1:
|
||||
optional: true
|
||||
|
||||
simple-get@3.1.1:
|
||||
dependencies:
|
||||
decompress-response: 4.2.1
|
||||
once: 1.4.0
|
||||
simple-concat: 1.0.1
|
||||
optional: true
|
||||
|
||||
simple-swizzle@0.2.2:
|
||||
dependencies:
|
||||
is-arrayish: 0.3.2
|
||||
@ -9494,6 +9759,16 @@ snapshots:
|
||||
fast-fifo: 1.3.2
|
||||
streamx: 2.18.0
|
||||
|
||||
tar@6.2.1:
|
||||
dependencies:
|
||||
chownr: 2.0.0
|
||||
fs-minipass: 2.1.0
|
||||
minipass: 5.0.0
|
||||
minizlib: 2.1.2
|
||||
mkdirp: 1.0.4
|
||||
yallist: 4.0.0
|
||||
optional: true
|
||||
|
||||
tar@7.2.0:
|
||||
dependencies:
|
||||
'@isaacs/fs-minipass': 4.0.1
|
||||
@ -9626,6 +9901,13 @@ snapshots:
|
||||
|
||||
universalify@2.0.1: {}
|
||||
|
||||
unpdf@0.12.1:
|
||||
optionalDependencies:
|
||||
canvas: 2.11.2
|
||||
transitivePeerDependencies:
|
||||
- encoding
|
||||
- supports-color
|
||||
|
||||
unpipe@1.0.0: {}
|
||||
|
||||
unstructured-client@0.11.3(zod@3.23.8):
|
||||
@ -9698,6 +9980,11 @@ snapshots:
|
||||
dependencies:
|
||||
isexe: 2.0.0
|
||||
|
||||
wide-align@1.1.5:
|
||||
dependencies:
|
||||
string-width: 4.2.3
|
||||
optional: true
|
||||
|
||||
winston-transport@4.8.0:
|
||||
dependencies:
|
||||
logform: 2.6.1
|
||||
|
@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch";
|
||||
import { z } from "zod";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import escapeHtml from "escape-html";
|
||||
import PdfParse from "pdf-parse";
|
||||
import pdf2md from "@opendocsg/pdf2md";
|
||||
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
|
||||
import { RemoveFeatureError } from "../../error";
|
||||
|
||||
@ -113,10 +113,11 @@ async function scrapePDFWithParsePDF(
|
||||
meta: Meta,
|
||||
tempFilePath: string,
|
||||
): Promise<PDFProcessorResult> {
|
||||
meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
|
||||
meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath });
|
||||
|
||||
const result = await PdfParse(await fs.readFile(tempFilePath));
|
||||
const escaped = escapeHtml(result.text);
|
||||
const pdfBuffer = await fs.readFile(tempFilePath);
|
||||
const markdown = await pdf2md(pdfBuffer);
|
||||
const escaped = escapeHtml(markdown);
|
||||
|
||||
return {
|
||||
markdown: escaped,
|
||||
@ -140,36 +141,8 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
|
||||
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
|
||||
|
||||
let result: PDFProcessorResult | null = null;
|
||||
if (process.env.LLAMAPARSE_API_KEY) {
|
||||
try {
|
||||
result = await scrapePDFWithLlamaParse(
|
||||
{
|
||||
...meta,
|
||||
logger: meta.logger.child({
|
||||
method: "scrapePDF/scrapePDFWithLlamaParse",
|
||||
}),
|
||||
},
|
||||
tempFilePath,
|
||||
timeToRun,
|
||||
);
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
||||
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
|
||||
error,
|
||||
});
|
||||
} else if (error instanceof RemoveFeatureError) {
|
||||
throw error;
|
||||
} else {
|
||||
meta.logger.warn(
|
||||
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
|
||||
{ error },
|
||||
);
|
||||
Sentry.captureException(error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (result === null) {
|
||||
// First, try parsing with pdf2md
|
||||
result = await scrapePDFWithParsePDF(
|
||||
{
|
||||
...meta,
|
||||
@ -179,6 +152,37 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
|
||||
},
|
||||
tempFilePath,
|
||||
);
|
||||
|
||||
|
||||
// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
|
||||
if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
|
||||
try {
|
||||
const llamaResult = await scrapePDFWithLlamaParse(
|
||||
{
|
||||
...meta,
|
||||
logger: meta.logger.child({
|
||||
method: "scrapePDF/scrapePDFWithLlamaParse",
|
||||
}),
|
||||
},
|
||||
tempFilePath,
|
||||
timeToRun,
|
||||
);
|
||||
result = llamaResult; // Use LlamaParse result if successful
|
||||
} catch (error) {
|
||||
if (error instanceof Error && error.message === "LlamaParse timed out") {
|
||||
meta.logger.warn("LlamaParse timed out -- using pdf2md result", {
|
||||
error,
|
||||
});
|
||||
} else if (error instanceof RemoveFeatureError) {
|
||||
throw error;
|
||||
} else {
|
||||
meta.logger.warn(
|
||||
"LlamaParse failed to parse PDF -- using pdf2md result",
|
||||
{ error },
|
||||
);
|
||||
Sentry.captureException(error);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
await fs.unlink(tempFilePath);
|
||||
|
Loading…
x
Reference in New Issue
Block a user