Merge pull request #984 from mendableai/default-to-pdf-parse

Default to pdf2md, if under 500 chars (indicating failure) use LlamaParse
This commit is contained in:
Eric Ciarla 2024-12-17 10:06:40 -05:00 committed by GitHub
commit db8e9c36d6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 316 additions and 25 deletions

View File

@ -58,6 +58,7 @@
"@devil7softwares/pos": "^1.0.2",
"@dqbd/tiktoken": "^1.0.17",
"@nangohq/node": "^0.40.8",
"@opendocsg/pdf2md": "^0.2.1",
"@sentry/cli": "^2.33.1",
"@sentry/node": "^8.26.0",
"@sentry/profiling-node": "^8.26.0",
@ -100,7 +101,6 @@
"mongoose": "^8.4.4",
"natural": "^7.0.7",
"openai": "^4.57.0",
"pdf-parse": "^1.1.1",
"pos": "^0.4.2",
"posthog-node": "^4.0.1",
"promptable": "^0.0.10",

295
apps/api/pnpm-lock.yaml generated
View File

@ -29,6 +29,9 @@ importers:
'@nangohq/node':
specifier: ^0.40.8
version: 0.40.8
'@opendocsg/pdf2md':
specifier: ^0.2.1
version: 0.2.1
'@sentry/cli':
specifier: ^2.33.1
version: 2.33.1
@ -155,9 +158,6 @@ importers:
openai:
specifier: ^4.57.0
version: 4.57.0(zod@3.23.8)
pdf-parse:
specifier: ^1.1.1
version: 1.1.1
pos:
specifier: ^0.4.2
version: 0.4.2
@ -794,6 +794,10 @@ packages:
resolution: {integrity: sha512-cXWgKE3sdWLSqAa8ykbCcUsUF1Kyr5J3HOWYGuobhPEycXW4WI++d5DhzdpL238mzoEXTi90VqfSCra37l5YqA==}
engines: {node: '>=18'}
'@mapbox/node-pre-gyp@1.0.11':
resolution: {integrity: sha512-Yhlar6v9WQgUp/He7BdgzOz8lqMQ8sU+jkCq7Wx8Myc5YFJLbEe7lgui/V7G1qB1DJykHSGwreceSaD60Y0PUQ==}
hasBin: true
'@mixmark-io/domino@2.2.0':
resolution: {integrity: sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==}
@ -837,6 +841,10 @@ packages:
'@one-ini/wasm@0.1.1':
resolution: {integrity: sha512-XuySG1E38YScSJoMlqovLru4KTUNSjgVTIjyh7qMX6aNN5HY5Ct5LhRJdxO79JtTzKfzV/bnWpz+zquYrISsvw==}
'@opendocsg/pdf2md@0.2.1':
resolution: {integrity: sha512-k/yvfrTb+GPTIIm/bMm5IsenTqAFl+IqvkBgFwFlmflS5TT7FOfyRLp8MypVWLAG4G9AnT7AZFbdQYgN/CR5BA==}
hasBin: true
'@opentelemetry/api-logs@0.52.1':
resolution: {integrity: sha512-qnSqB2DQ9TPP96dl8cDubDvrUyWc0/sK81xHTK8eSUspzDM3bsewX903qclQFvVhgStjRWdC5bLb3kQqMkfV5A==}
engines: {node: '>=14'}
@ -1615,6 +1623,9 @@ packages:
resolution: {integrity: sha512-2WALfTl4xo2SkGCYRt6rDTFfk9R1czmBvUQy12gK2KuRKIpWEhcbbzy8EZXtz/jkRqHX8bFEc6FC1HjX4TUWYw==}
engines: {node: '>=10.0.0'}
abbrev@1.1.1:
resolution: {integrity: sha512-nne9/IiQ/hzIhY6pdDnbBtz7DjPTKrY00P/zvPSm5pOFkl6xuGrGnXn/VtTNNfNtAfZ9/1RtehkszU9qcTii0Q==}
abbrev@2.0.0:
resolution: {integrity: sha512-6/mh1E2u2YgEsCHdY0Yx5oW+61gZU+1vXaoiHHrpKeuRNNgFvS+/jrwHiQhB5apAf5oB7UB7E19ol2R2LKH8hQ==}
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
@ -1708,6 +1719,14 @@ packages:
resolution: {integrity: sha512-KLy/ugo33KZA7nugtQ7O0E1c8kQ52N3IvD/XgIh4w/Nr28ypfkwDfA67F1ev4N1m5D+BOk1+b2dEJDfpj/VvZg==}
engines: {node: '>=0.2.6'}
aproba@2.0.0:
resolution: {integrity: sha512-lYe4Gx7QT+MKGbDsA+Z+he/Wtef0BiwDOlK/XkBrdfsh9J/jPPXbX0tE9x9cl27Tmu5gg3QUbUrQYa/y+KOHPQ==}
are-we-there-yet@2.0.0:
resolution: {integrity: sha512-Ci/qENmwHnsYo9xKIcUJN5LeDKdJ6R1Z1j9V/J5wyq8nh/mYPEpIKJbBZXtZjG04HiK7zV/p6Vs9952MrMeUIw==}
engines: {node: '>=10'}
deprecated: This package is no longer supported.
arg@4.1.3:
resolution: {integrity: sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==}
@ -1908,6 +1927,10 @@ packages:
caniuse-lite@1.0.30001627:
resolution: {integrity: sha512-4zgNiB8nTyV/tHhwZrFs88ryjls/lHiqFhrxCW4qSTeuRByBVnPYpDInchOIySWknznucaf31Z4KYqjfbrecVw==}
canvas@2.11.2:
resolution: {integrity: sha512-ItanGBMrmRV7Py2Z+Xhs7cT+FNt5K0vPL4p9EZ/UX/Mu7hFbkxSjKF2KVtPwX7UYWp7dRKnrTvReflgrItJbdw==}
engines: {node: '>=6'}
chalk@2.4.2:
resolution: {integrity: sha512-Mti+f9lpJNcwF4tWV8/OrTTtF1gZi+f8FqlyAdouralcFWFQWF2+NgCHShjkCb+IFBLq9buZwE1xckQU4peSuQ==}
engines: {node: '>=4'}
@ -1938,6 +1961,10 @@ packages:
resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==}
engines: {node: '>= 8.10.0'}
chownr@2.0.0:
resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==}
engines: {node: '>=10'}
chownr@3.0.0:
resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==}
engines: {node: '>=18'}
@ -1995,6 +2022,10 @@ packages:
color-string@1.9.1:
resolution: {integrity: sha512-shrVawQFojnZv6xM40anx4CkoDP+fZsw/ZerEMsW/pyzsRbElpsL/DBVW7q3ExxwusdNXI3lXpuhEZkzs8p5Eg==}
color-support@1.1.3:
resolution: {integrity: sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==}
hasBin: true
color@3.2.1:
resolution: {integrity: sha512-aBl7dZI9ENN6fUGC7mWpMTPNHmWUSNan9tuWN6ahh5ZLNk9baLJOnSMlrQkHcrfFgz2/RigjUVAjdx36VcemKA==}
@ -2021,6 +2052,9 @@ packages:
config-chain@1.1.13:
resolution: {integrity: sha512-qj+f8APARXHrM0hraqXYb2/bOVSV4PvJQlNZ/DVj0QrmNM2q2euizkeuVckQ57J+W0mRH6Hvi+k50M4Jul2VRQ==}
console-control-strings@1.1.0:
resolution: {integrity: sha512-ty/fTekppD2fIwRvnZAVdeOiGd1c7YXEixbgJTNzqcxJWKQnjJ/V1bNEEE6hygpM3WjwHFUVK6HTjWSzV4a8sQ==}
content-disposition@0.5.4:
resolution: {integrity: sha512-FveZTNuGw04cxlAiWbzi6zTAL/lhehaWbTtgluJh4/E95DqMwTmha3KZN1aAWA8cFIhHzMZUvLevkw5Rqk+tSQ==}
engines: {node: '>= 0.6'}
@ -2147,6 +2181,10 @@ packages:
resolution: {integrity: sha512-9iE1PgSik9HeIIw2JO94IidnE3eBoQrFJ3w7sFuzSX4DpmZ3v5sZpUiV5Swcf6mQEF+Y0ru8Neo+p+nyh2J+hQ==}
engines: {node: '>=10'}
decompress-response@4.2.1:
resolution: {integrity: sha512-jOSne2qbyE+/r8G1VU+G/82LBs2Fs4LAsTiLSHOCOMZQl2OKZ6i8i4IyHemTe+/yIXOtTcRQMzPcgyhoFlqPkw==}
engines: {node: '>=8'}
dedent@1.5.3:
resolution: {integrity: sha512-NHQtfOOW68WD8lgypbLA5oT+Bt0xXJhiYvoR6SmmNXZfpzOGXwdKWmcwG8N7PwVVWV3eF/68nmD9BaJSsTBhyQ==}
peerDependencies:
@ -2171,6 +2209,9 @@ packages:
resolution: {integrity: sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==}
engines: {node: '>=0.4.0'}
delegates@1.0.0:
resolution: {integrity: sha512-bd2L678uiWATM6m5Z1VzNCErI3jiGzt6HGY8OVICs40JQq/HALfbyNJmp0UDakEY4pMMaN0Ly5om/B1VI/+xfQ==}
denque@2.1.0:
resolution: {integrity: sha512-HVQE3AAb/pxF8fQAoiqpvg9i3evqug3hoiwakOyZAwJm+6vZehbkYXZ0l4JxS+I3QxM97v5aaRNhj8v5oBhekw==}
engines: {node: '>=0.10'}
@ -2283,6 +2324,9 @@ packages:
resolution: {integrity: sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==}
engines: {node: '>=0.12'}
enumify@1.0.4:
resolution: {integrity: sha512-5mwWXaVzJaqyUdOW/PDH5QySRgmQ8VvujmxmvXoXj9w0n+6omhVuyD56eI37FMqy/LxueJzsQ4DrHVQzuT/TXg==}
env-paths@2.2.1:
resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==}
engines: {node: '>=6'}
@ -2484,6 +2528,10 @@ packages:
resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==}
engines: {node: '>=14.14'}
fs-minipass@2.1.0:
resolution: {integrity: sha512-V/JgOLFCS+R6Vcq0slCuaeWEdNC3ouDlJMNIsacH2VtALiu9mV4LPrHc5cDl8k5aw6J8jwgWWpiTo5RYhmIzvg==}
engines: {node: '>= 8'}
fs.realpath@1.0.0:
resolution: {integrity: sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==}
@ -2495,6 +2543,11 @@ packages:
function-bind@1.1.2:
resolution: {integrity: sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==}
gauge@3.0.2:
resolution: {integrity: sha512-+5J6MS/5XksCuXq++uFRsnUd7Ovu1XenbeuIuNRJxYWjgQbPuFhT14lAvsWfqfAmnwluf1OwMjz39HjfLPci0Q==}
engines: {node: '>=10'}
deprecated: This package is no longer supported.
generic-pool@3.9.0:
resolution: {integrity: sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==}
engines: {node: '>= 4'}
@ -2578,6 +2631,9 @@ packages:
resolution: {integrity: sha512-l3LCuF6MgDNwTDKkdYGEihYjt5pRPbEg46rtlmnSPlUbgmB8LOIrKJbYYFBSbnPaJexMKtiPO8hmeRjRz2Td+A==}
engines: {node: '>= 0.4'}
has-unicode@2.0.1:
resolution: {integrity: sha512-8Rf9Y83NBReMnx0gFzA8JImQACstCYWUplepDa9xprwwtmgEZUF0h/i5xSA625zB/I37EtrswSST6OXxwaaIJQ==}
hasown@2.0.2:
resolution: {integrity: sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==}
engines: {node: '>= 0.4'}
@ -3256,6 +3312,10 @@ packages:
resolution: {integrity: sha512-zobTr7akeGHnv7eBOXcRgMeCP6+uyYsczwmeRCauvpvaAltgNyTbLH/+VaEAPUeWBT+1GuNmz4wC/6jtQzbbVA==}
engines: {node: '>=12'}
make-dir@3.1.0:
resolution: {integrity: sha512-g3FeP20LNwhALb/6Cz6Dd4F2ngze0jz7tbzrD2wAV+o9FeNHe4rL+yK2md0J/fiSf1sa1ADhXqi5+oVwOM/eGw==}
engines: {node: '>=8'}
make-dir@4.0.0:
resolution: {integrity: sha512-hXdUTZYIVOt1Ex//jAQi+wTZZpUpwBj/0QsOzqegb3rGMMeJiSEu5xLHnYfBrRV4RH2+OCSOO95Is/7x1WJ4bw==}
engines: {node: '>=10'}
@ -3326,6 +3386,10 @@ packages:
resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==}
engines: {node: '>=6'}
mimic-response@2.1.0:
resolution: {integrity: sha512-wXqjST+SLt7R009ySCglWBCFpjUygmCIfD790/kVbiGmUgfYGuB14PiTd5DwVxSV4NcYHjzMkoj5LjQZwTQLEA==}
engines: {node: '>=8'}
minimatch@3.1.2:
resolution: {integrity: sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==}
@ -3344,10 +3408,22 @@ packages:
minimist@1.2.8:
resolution: {integrity: sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==}
minipass@3.3.6:
resolution: {integrity: sha512-DxiNidxSEK+tHG6zOIklvNOwm3hvCrbUrdtzY74U6HKTJxvIDfOUL5W5P2Ghd3DTkhhKPYGqeNUIh5qcM4YBfw==}
engines: {node: '>=8'}
minipass@5.0.0:
resolution: {integrity: sha512-3FnjYuehv9k6ovOEbyOswadCDPX1piCfhV8ncmYtHOjuPwylVWsghTLo7rabjC3Rx5xD4HDx8Wm1xnMF7S5qFQ==}
engines: {node: '>=8'}
minipass@7.1.2:
resolution: {integrity: sha512-qOOzS1cBTWYF4BH8fVePDBOO9iptMnGUEZwNc/cMWnTV2nVLZ7VoNWEPHkYczZA0pdoA7dl6e7FL659nX9S2aw==}
engines: {node: '>=16 || 14 >=14.17'}
minizlib@2.1.2:
resolution: {integrity: sha512-bAxsR8BVfj60DWXHE3u30oHzfl4G7khkSuPW+qvpd7jFRHm7dLxOjUk1EHACJ/hxLY8phGJ0YhYHZo7jil7Qdg==}
engines: {node: '>= 8'}
minizlib@3.0.1:
resolution: {integrity: sha512-umcy022ILvb5/3Djuu8LWeqUa8D68JaBzlttKeMWen48SjabqS3iY5w/vzeMzMUNhLDifyhbOwKDSznB1vvrwg==}
engines: {node: '>= 18'}
@ -3359,6 +3435,11 @@ packages:
resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==}
hasBin: true
mkdirp@1.0.4:
resolution: {integrity: sha512-vVqVZQyf3WLx2Shd0qJ9xuvqgAyKPLAiqITEtqW0oIUjzo3PePDd6fW9iFz30ef7Ysp/oiWqbhszeGWW2T6Gzw==}
engines: {node: '>=10'}
hasBin: true
mkdirp@3.0.1:
resolution: {integrity: sha512-+NsyUUAZDmo6YVHzL/stxSu3t9YS1iljliy3BSDrXJ/dkn1KYdmtZODGGjLcc9XLgVVpH4KshHB8XmZgMhaBXg==}
engines: {node: '>=10'}
@ -3447,6 +3528,9 @@ packages:
resolution: {integrity: sha512-71ippSywq5Yb7/tVYyGbkBggbU8H3u5Rz56fH60jGFgr8uHwxs+aSKeqmluIVzM0m0kB7xQjKS6qPfd0b2ZoqQ==}
hasBin: true
nan@2.22.0:
resolution: {integrity: sha512-nbajikzWTMwsW+eSsNm3QwlOs7het9gGJU5dDZzRTQGk03vyBOauxgI4VakDzE0PtsGTmXPsXTbbjVhRwR5mpw==}
natural-compare@1.4.0:
resolution: {integrity: sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==}
@ -3507,6 +3591,11 @@ packages:
engines: {node: '>=8.10.0'}
hasBin: true
nopt@5.0.0:
resolution: {integrity: sha512-Tbj67rffqceeLpcRXrT7vKAN8CwfPeIBgM7E6iBkmKLV7bEMwpGgYLGv0jACUsECaa/vuxP0IjEont6umdMgtQ==}
engines: {node: '>=6'}
hasBin: true
nopt@7.2.1:
resolution: {integrity: sha512-taM24ViiimT/XntxbPyJQzCG+p4EKOpgD3mxFwW38mGjVUrfERQOeY4EDHjdnptttfHuHQXFx+lTP08Q+mLa/w==}
engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0}
@ -3524,6 +3613,10 @@ packages:
resolution: {integrity: sha512-S48WzZW777zhNIrn7gxOlISNAqi9ZC/uQFnRdbeIHhZhCA6UqpkOT8T1G7BvfdgP4Er8gF4sUbaS0i7QvIfCWw==}
engines: {node: '>=8'}
npmlog@5.0.1:
resolution: {integrity: sha512-AqZtDUWOMKs1G/8lwylVjrdYgqA4d9nu8hc+0gzRxlDb1I10+FHBGMXs6aiQHFdCUUlqH99MUMuLfzWDNDtfxw==}
deprecated: This package is no longer supported.
nth-check@2.1.1:
resolution: {integrity: sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w==}
@ -3949,6 +4042,11 @@ packages:
resolution: {integrity: sha512-XQBQ3I8W1Cge0Seh+6gjj03LbmRFWuoszgK9ooCpwYIrhhoO80pfq4cUkU5DkknwfOfFteRwlZ56PYOGYyFWdg==}
engines: {node: '>= 4'}
rimraf@3.0.2:
resolution: {integrity: sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==}
deprecated: Rimraf versions prior to v4 are no longer supported
hasBin: true
rimraf@5.0.7:
resolution: {integrity: sha512-nV6YcJo5wbLW77m+8KjH8aB/7/rxQy9SZ0HY5shnwULfS+9nmTtVXAJET5NdZmCzA4fPI/Hm1wo/Po/4mopOdg==}
engines: {node: '>=14.18'}
@ -4019,6 +4117,9 @@ packages:
resolution: {integrity: sha512-XGuRDNjXUijsUL0vl6nSD7cwURuzEgglbOaFuZM9g3kwDXOWVTck0jLzjPzGD+TazWbboZYu52/9/XPdUgne9g==}
engines: {node: '>= 0.8.0'}
set-blocking@2.0.0:
resolution: {integrity: sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==}
set-function-length@1.2.2:
resolution: {integrity: sha512-pgRc4hJ4/sNjWCSS9AmnS40x3bNMDTknHgL5UaMBTMyJnU90EgWh1Rz+MC9eFu4BuN/UwZjKQuY/1v3rM7HMfg==}
engines: {node: '>= 0.4'}
@ -4057,6 +4158,12 @@ packages:
resolution: {integrity: sha512-bzyZ1e88w9O1iNJbKnOlvYTrWPDl46O1bG0D3XInv+9tkPrxrN8jUUTiFlDkkmKWgn1M6CfIA13SuGqOa9Korw==}
engines: {node: '>=14'}
simple-concat@1.0.1:
resolution: {integrity: sha512-cSFtAPtRhljv69IK0hTVZQ+OfE9nePi/rtJmw5UjHeVyVroEqJXP1sFztKUy1qU+xvz3u/sfYJLa947b7nAN2Q==}
simple-get@3.1.1:
resolution: {integrity: sha512-CQ5LTKGfCpvE1K0n2us+kuMPbk/q0EKl82s4aheV9oXjFEz6W/Y7oQFVJuU6QG77hRT4Ghb5RURteF5vnWjupA==}
simple-swizzle@0.2.2:
resolution: {integrity: sha512-JA//kQgZtbuY83m+xT+tXJkmJncGMTFT+C+g2h2R9uxkYIrE2yy9sgmcLhCnw57/WSD+Eh3J97FPEDFnbXnDUg==}
@ -4215,6 +4322,10 @@ packages:
tar-stream@3.1.7:
resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==}
tar@6.2.1:
resolution: {integrity: sha512-DZ4yORTwrbTj/7MZYq2w+/ZFdI6OZ/f9SFHR+71gIVUZhOQPHzVCLpvRnPgyaMpfWxxk/4ONva3GQSyNIKRv6A==}
engines: {node: '>=10'}
tar@7.2.0:
resolution: {integrity: sha512-hctwP0Nb4AB60bj8WQgRYaMOuJYRAPMGiQUAotms5igN8ppfQM+IvjQ5HcKu1MaZh2Wy2KWVTe563Yj8dfc14w==}
engines: {node: '>=18'}
@ -4369,6 +4480,9 @@ packages:
resolution: {integrity: sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==}
engines: {node: '>= 10.0.0'}
unpdf@0.12.1:
resolution: {integrity: sha512-ktP8+TTLDBrlu/j8rQVNbHoMMpFXzkVAkb1rt/JdshFC3jOHdZjuGCNl/voPL0kraUrUOH7ZC88kVxMvlvDBzA==}
unpipe@1.0.0:
resolution: {integrity: sha512-pjy2bYhSsufwWlKwPc+l3cN7+wuJlK6uz0YdJEOlQDbl6jo/YlPi4mb8agUkVC8BF7V8NuzeyPNqRksA3hztKQ==}
engines: {node: '>= 0.8'}
@ -4456,6 +4570,9 @@ packages:
engines: {node: '>= 8'}
hasBin: true
wide-align@1.1.5:
resolution: {integrity: sha512-eDMORYaPNZ4sQIuuYPDHdQvf4gyCF9rEEV/yPxGfwPkRodwEgiMUUXTx/dex+Me0wxx53S+NgUHaP7y3MGlDmg==}
winston-transport@4.8.0:
resolution: {integrity: sha512-qxSTKswC6llEMZKgCQdaWgDuMJQnhuvF5f2Nk3SNXc4byfQ+voo2mX1Px9dkNOuR8p0KAjfPG29PuYUSIb+vSA==}
engines: {node: '>= 12.0.0'}
@ -5607,6 +5724,22 @@ snapshots:
- langchain
- openai
'@mapbox/node-pre-gyp@1.0.11':
dependencies:
detect-libc: 2.0.3
https-proxy-agent: 5.0.1
make-dir: 3.1.0
node-fetch: 2.7.0
nopt: 5.0.0
npmlog: 5.0.1
rimraf: 3.0.2
semver: 7.6.2
tar: 6.2.1
transitivePeerDependencies:
- encoding
- supports-color
optional: true
'@mixmark-io/domino@2.2.0': {}
'@mongodb-js/saslprep@1.1.7':
@ -5639,6 +5772,15 @@ snapshots:
'@one-ini/wasm@0.1.1': {}
'@opendocsg/pdf2md@0.2.1':
dependencies:
enumify: 1.0.4
minimist: 1.2.8
unpdf: 0.12.1
transitivePeerDependencies:
- encoding
- supports-color
'@opentelemetry/api-logs@0.52.1':
dependencies:
'@opentelemetry/api': 1.9.0
@ -6673,6 +6815,9 @@ snapshots:
'@xmldom/xmldom@0.8.10': {}
abbrev@1.1.1:
optional: true
abbrev@2.0.0: {}
abort-controller@3.0.0:
@ -6755,6 +6900,15 @@ snapshots:
dependencies:
sylvester: 0.0.12
aproba@2.0.0:
optional: true
are-we-there-yet@2.0.0:
dependencies:
delegates: 1.0.0
readable-stream: 3.6.2
optional: true
arg@4.1.3: {}
argparse@1.0.10:
@ -7008,6 +7162,16 @@ snapshots:
caniuse-lite@1.0.30001627: {}
canvas@2.11.2:
dependencies:
'@mapbox/node-pre-gyp': 1.0.11
nan: 2.22.0
simple-get: 3.1.1
transitivePeerDependencies:
- encoding
- supports-color
optional: true
chalk@2.4.2:
dependencies:
ansi-styles: 3.2.1
@ -7056,6 +7220,9 @@ snapshots:
optionalDependencies:
fsevents: 2.3.3
chownr@2.0.0:
optional: true
chownr@3.0.0: {}
chromium-bidi@0.5.24(devtools-protocol@0.0.1299070):
@ -7121,6 +7288,9 @@ snapshots:
color-name: 1.1.4
simple-swizzle: 0.2.2
color-support@1.1.3:
optional: true
color@3.2.1:
dependencies:
color-convert: 1.9.3
@ -7148,6 +7318,9 @@ snapshots:
ini: 1.3.8
proto-list: 1.2.4
console-control-strings@1.1.0:
optional: true
content-disposition@0.5.4:
dependencies:
safe-buffer: 5.2.1
@ -7255,6 +7428,11 @@ snapshots:
decamelize@4.0.0: {}
decompress-response@4.2.1:
dependencies:
mimic-response: 2.1.0
optional: true
dedent@1.5.3: {}
deepmerge@4.3.1: {}
@ -7273,6 +7451,9 @@ snapshots:
delayed-stream@1.0.0: {}
delegates@1.0.0:
optional: true
denque@2.1.0: {}
depd@2.0.0: {}
@ -7364,6 +7545,8 @@ snapshots:
entities@4.5.0: {}
enumify@1.0.4: {}
env-paths@2.2.1: {}
error-ex@1.3.2:
@ -7589,6 +7772,11 @@ snapshots:
jsonfile: 6.1.0
universalify: 2.0.1
fs-minipass@2.1.0:
dependencies:
minipass: 3.3.6
optional: true
fs.realpath@1.0.0: {}
fsevents@2.3.3:
@ -7596,6 +7784,19 @@ snapshots:
function-bind@1.1.2: {}
gauge@3.0.2:
dependencies:
aproba: 2.0.0
color-support: 1.1.3
console-control-strings: 1.1.0
has-unicode: 2.0.1
object-assign: 4.1.1
signal-exit: 3.0.7
string-width: 4.2.3
strip-ansi: 6.0.1
wide-align: 1.1.5
optional: true
generic-pool@3.9.0: {}
gensync@1.0.0-beta.2: {}
@ -7683,6 +7884,9 @@ snapshots:
has-symbols@1.0.3: {}
has-unicode@2.0.1:
optional: true
hasown@2.0.2:
dependencies:
function-bind: 1.1.2
@ -8463,6 +8667,11 @@ snapshots:
luxon@3.4.4: {}
make-dir@3.1.0:
dependencies:
semver: 6.3.1
optional: true
make-dir@4.0.0:
dependencies:
semver: 7.6.2
@ -8523,6 +8732,9 @@ snapshots:
mimic-fn@2.1.0: {}
mimic-response@2.1.0:
optional: true
minimatch@3.1.2:
dependencies:
brace-expansion: 1.1.11
@ -8541,8 +8753,22 @@ snapshots:
minimist@1.2.8: {}
minipass@3.3.6:
dependencies:
yallist: 4.0.0
optional: true
minipass@5.0.0:
optional: true
minipass@7.1.2: {}
minizlib@2.1.2:
dependencies:
minipass: 3.3.6
yallist: 4.0.0
optional: true
minizlib@3.0.1:
dependencies:
minipass: 7.1.2
@ -8554,6 +8780,9 @@ snapshots:
dependencies:
minimist: 1.2.8
mkdirp@1.0.4:
optional: true
mkdirp@3.0.1: {}
ml-array-mean@1.1.6:
@ -8646,6 +8875,9 @@ snapshots:
mustache@4.2.0: {}
nan@2.22.0:
optional: true
natural-compare@1.4.0: {}
natural@7.0.7(@aws-sdk/credential-providers@3.679.0(@aws-sdk/client-sso-oidc@3.679.0(@aws-sdk/client-sts@3.679.0)))(socks@2.8.3):
@ -8692,7 +8924,8 @@ snapshots:
node-domexception@1.0.0: {}
node-ensure@0.0.0: {}
node-ensure@0.0.0:
optional: true
node-fetch@2.7.0:
dependencies:
@ -8726,6 +8959,11 @@ snapshots:
touch: 3.1.1
undefsafe: 2.0.5
nopt@5.0.0:
dependencies:
abbrev: 1.1.1
optional: true
nopt@7.2.1:
dependencies:
abbrev: 2.0.0
@ -8738,6 +8976,14 @@ snapshots:
dependencies:
path-key: 3.1.1
npmlog@5.0.1:
dependencies:
are-we-there-yet: 2.0.0
console-control-strings: 1.1.0
gauge: 3.0.2
set-blocking: 2.0.0
optional: true
nth-check@2.1.1:
dependencies:
boolbase: 1.0.0
@ -8909,6 +9155,7 @@ snapshots:
node-ensure: 0.0.0
transitivePeerDependencies:
- supports-color
optional: true
peberminta@0.9.0: {}
@ -9210,6 +9457,11 @@ snapshots:
retry@0.13.1: {}
rimraf@3.0.2:
dependencies:
glob: 7.2.3
optional: true
rimraf@5.0.7:
dependencies:
glob: 10.4.2
@ -9285,6 +9537,9 @@ snapshots:
transitivePeerDependencies:
- supports-color
set-blocking@2.0.0:
optional: true
set-function-length@1.2.2:
dependencies:
define-data-property: 1.1.4
@ -9321,6 +9576,16 @@ snapshots:
signal-exit@4.1.0: {}
simple-concat@1.0.1:
optional: true
simple-get@3.1.1:
dependencies:
decompress-response: 4.2.1
once: 1.4.0
simple-concat: 1.0.1
optional: true
simple-swizzle@0.2.2:
dependencies:
is-arrayish: 0.3.2
@ -9494,6 +9759,16 @@ snapshots:
fast-fifo: 1.3.2
streamx: 2.18.0
tar@6.2.1:
dependencies:
chownr: 2.0.0
fs-minipass: 2.1.0
minipass: 5.0.0
minizlib: 2.1.2
mkdirp: 1.0.4
yallist: 4.0.0
optional: true
tar@7.2.0:
dependencies:
'@isaacs/fs-minipass': 4.0.1
@ -9626,6 +9901,13 @@ snapshots:
universalify@2.0.1: {}
unpdf@0.12.1:
optionalDependencies:
canvas: 2.11.2
transitivePeerDependencies:
- encoding
- supports-color
unpipe@1.0.0: {}
unstructured-client@0.11.3(zod@3.23.8):
@ -9698,6 +9980,11 @@ snapshots:
dependencies:
isexe: 2.0.0
wide-align@1.1.5:
dependencies:
string-width: 4.2.3
optional: true
winston-transport@4.8.0:
dependencies:
logform: 2.6.1

View File

@ -6,7 +6,7 @@ import { robustFetch } from "../../lib/fetch";
import { z } from "zod";
import * as Sentry from "@sentry/node";
import escapeHtml from "escape-html";
import PdfParse from "pdf-parse";
import pdf2md from "@opendocsg/pdf2md";
import { downloadFile, fetchFileToBuffer } from "../utils/downloadFile";
import { RemoveFeatureError } from "../../error";
@ -113,10 +113,11 @@ async function scrapePDFWithParsePDF(
meta: Meta,
tempFilePath: string,
): Promise<PDFProcessorResult> {
meta.logger.debug("Processing PDF document with parse-pdf", { tempFilePath });
meta.logger.debug("Processing PDF document with pdf2md", { tempFilePath });
const result = await PdfParse(await fs.readFile(tempFilePath));
const escaped = escapeHtml(result.text);
const pdfBuffer = await fs.readFile(tempFilePath);
const markdown = await pdf2md(pdfBuffer);
const escaped = escapeHtml(markdown);
return {
markdown: escaped,
@ -140,9 +141,23 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
const { response, tempFilePath } = await downloadFile(meta.id, meta.url);
let result: PDFProcessorResult | null = null;
if (process.env.LLAMAPARSE_API_KEY) {
// First, try parsing with pdf2md
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
// If the parsed text is under 500 characters and LLAMAPARSE_API_KEY exists, try LlamaParse
if (result.markdown && result.markdown.length < 500 && process.env.LLAMAPARSE_API_KEY) {
try {
result = await scrapePDFWithLlamaParse(
const llamaResult = await scrapePDFWithLlamaParse(
{
...meta,
logger: meta.logger.child({
@ -152,16 +167,17 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
tempFilePath,
timeToRun,
);
result = llamaResult; // Use LlamaParse result if successful
} catch (error) {
if (error instanceof Error && error.message === "LlamaParse timed out") {
meta.logger.warn("LlamaParse timed out -- falling back to parse-pdf", {
meta.logger.warn("LlamaParse timed out -- using pdf2md result", {
error,
});
} else if (error instanceof RemoveFeatureError) {
throw error;
} else {
meta.logger.warn(
"LlamaParse failed to parse PDF -- falling back to parse-pdf",
"LlamaParse failed to parse PDF -- using pdf2md result",
{ error },
);
Sentry.captureException(error);
@ -169,18 +185,6 @@ export async function scrapePDF(meta: Meta, timeToRun: number | undefined): Prom
}
}
if (result === null) {
result = await scrapePDFWithParsePDF(
{
...meta,
logger: meta.logger.child({
method: "scrapePDF/scrapePDFWithParsePDF",
}),
},
tempFilePath,
);
}
await fs.unlink(tempFilePath);
return {