Merge branch 'main' of github.com:jina-ai/url2text

This commit is contained in:
yanlong.wang 2024-04-24 19:21:50 +08:00
commit ae99af50aa
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
8 changed files with 804 additions and 193 deletions

View File

@ -13,7 +13,8 @@
".git", ".git",
"*.log", "*.log",
"*.local", "*.local",
".secret.*" ".secret.*",
".firebase-emu"
], ],
"predeploy": [ "predeploy": [
"npm --prefix \"$RESOURCE_DIR\" run build:clean", "npm --prefix \"$RESOURCE_DIR\" run build:clean",

View File

@ -29,7 +29,10 @@
"puppeteer": "^22.6.3", "puppeteer": "^22.6.3",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3", "puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-page-proxy": "^1.3.0",
"set-cookie-parser": "^2.6.0",
"stripe": "^11.11.0", "stripe": "^11.11.0",
"tiktoken": "^1.0.10", "tiktoken": "^1.0.10",
"turndown": "^7.1.3", "turndown": "^7.1.3",
@ -42,6 +45,7 @@
"@types/cors": "^2.8.17", "@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1", "@types/generic-pool": "^3.8.1",
"@types/node": "^18", "@types/node": "^18",
"@types/set-cookie-parser": "^2.4.7",
"@typescript-eslint/eslint-plugin": "^5.12.0", "@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0", "@typescript-eslint/parser": "^5.12.0",
"eslint": "^8.9.0", "eslint": "^8.9.0",
@ -1986,6 +1990,17 @@
"dev": true, "dev": true,
"peer": true "peer": true
}, },
"node_modules/@sindresorhus/is": {
"version": "4.6.0",
"resolved": "https://registry.npmjs.org/@sindresorhus/is/-/is-4.6.0.tgz",
"integrity": "sha512-t09vSN3MdfsyCHoFcTRCH/iUtG7OJ0CsjzB8cjAmKc/va/kIgeDI/TxsigdncE/4be734m0cvIYwNaV4i2XqAw==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sindresorhus/is?sponsor=1"
}
},
"node_modules/@sinonjs/commons": { "node_modules/@sinonjs/commons": {
"version": "3.0.1", "version": "3.0.1",
"resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-3.0.1.tgz", "resolved": "https://registry.npmjs.org/@sinonjs/commons/-/commons-3.0.1.tgz",
@ -2006,6 +2021,17 @@
"@sinonjs/commons": "^3.0.0" "@sinonjs/commons": "^3.0.0"
} }
}, },
"node_modules/@szmarczak/http-timer": {
"version": "4.0.6",
"resolved": "https://registry.npmjs.org/@szmarczak/http-timer/-/http-timer-4.0.6.tgz",
"integrity": "sha512-4BAffykYOgO+5nzBWYwE3W90sBgLJoUPRWWcL8wlyiM8IB8ipJz3UMJ9KXQd1RKQXpKp8Tutn80HZtWsu2u76w==",
"dependencies": {
"defer-to-connect": "^2.0.0"
},
"engines": {
"node": ">=10"
}
},
"node_modules/@tootallnate/once": { "node_modules/@tootallnate/once": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz", "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz",
@ -2091,6 +2117,17 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/cacheable-request": {
"version": "6.0.3",
"resolved": "https://registry.npmjs.org/@types/cacheable-request/-/cacheable-request-6.0.3.tgz",
"integrity": "sha512-IQ3EbTzGxIigb1I3qPZc1rWJnH0BmSKv5QYTalEwweFvyBDLSAe24zP0le/hyi7ecGfZVlIVAg4BZqb8WBwKqw==",
"dependencies": {
"@types/http-cache-semantics": "*",
"@types/keyv": "^3.1.4",
"@types/node": "*",
"@types/responselike": "^1.0.0"
}
},
"node_modules/@types/caseless": { "node_modules/@types/caseless": {
"version": "0.12.5", "version": "0.12.5",
"resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz", "resolved": "https://registry.npmjs.org/@types/caseless/-/caseless-0.12.5.tgz",
@ -2161,6 +2198,11 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/http-cache-semantics": {
"version": "4.0.4",
"resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
"integrity": "sha512-1m0bIFVc7eJWyve9S0RnuRgcQqF/Xd5QsUZAZeQFr1Q3/p9JWoQQEqmVy+DPTNpGXwhgIetAoYF8JSc33q29QA=="
},
"node_modules/@types/http-errors": { "node_modules/@types/http-errors": {
"version": "2.0.4", "version": "2.0.4",
"resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz", "resolved": "https://registry.npmjs.org/@types/http-errors/-/http-errors-2.0.4.tgz",
@ -2213,6 +2255,14 @@
"@types/node": "*" "@types/node": "*"
} }
}, },
"node_modules/@types/keyv": {
"version": "3.1.4",
"resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
"integrity": "sha512-BQ5aZNSCpj7D6K2ksrRCTmKRLEpnPvWDiLPfoGyhZ++8YtiK9d/3DBKPJgry359X/P1PfruyYwvnvwFjuEiEIg==",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/lodash": { "node_modules/@types/lodash": {
"version": "4.17.0", "version": "4.17.0",
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz", "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
@ -2294,6 +2344,14 @@
"node": ">= 0.12" "node": ">= 0.12"
} }
}, },
"node_modules/@types/responselike": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/@types/responselike/-/responselike-1.0.3.tgz",
"integrity": "sha512-H/+L+UkTV33uf49PH5pCAUBVPNj2nDBXTN+qS1dOwyyg24l3CcicicCA7ca+HMvJBZcFgl5r8e+RR6elsb4Lyw==",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/semver": { "node_modules/@types/semver": {
"version": "7.5.8", "version": "7.5.8",
"resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.8.tgz", "resolved": "https://registry.npmjs.org/@types/semver/-/semver-7.5.8.tgz",
@ -2319,6 +2377,15 @@
"@types/send": "*" "@types/send": "*"
} }
}, },
"node_modules/@types/set-cookie-parser": {
"version": "2.4.7",
"resolved": "https://registry.npmjs.org/@types/set-cookie-parser/-/set-cookie-parser-2.4.7.tgz",
"integrity": "sha512-+ge/loa0oTozxip6zmhRIk8Z/boU51wl9Q6QdLZcokIGMzY5lFXYy/x7Htj2HTC6/KZP1hUbZ1ekx8DYXICvWg==",
"dev": true,
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/stack-utils": { "node_modules/@types/stack-utils": {
"version": "2.0.3", "version": "2.0.3",
"resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz", "resolved": "https://registry.npmjs.org/@types/stack-utils/-/stack-utils-2.0.3.tgz",
@ -3447,6 +3514,45 @@
"node": ">= 6.0.0" "node": ">= 6.0.0"
} }
}, },
"node_modules/cacheable-lookup": {
"version": "5.0.4",
"resolved": "https://registry.npmjs.org/cacheable-lookup/-/cacheable-lookup-5.0.4.tgz",
"integrity": "sha512-2/kNscPhpcxrOigMZzbiWF7dz8ilhb/nIHU3EyZiXWXpeq/au8qJ8VhdftMkty3n7Gj6HIGalQG8oiBNB3AJgA==",
"engines": {
"node": ">=10.6.0"
}
},
"node_modules/cacheable-request": {
"version": "7.0.4",
"resolved": "https://registry.npmjs.org/cacheable-request/-/cacheable-request-7.0.4.tgz",
"integrity": "sha512-v+p6ongsrp0yTGbJXjgxPow2+DL93DASP4kXCDKb8/bwRtt9OEF3whggkkDkGNzgcWy2XaF4a8nZglC7uElscg==",
"dependencies": {
"clone-response": "^1.0.2",
"get-stream": "^5.1.0",
"http-cache-semantics": "^4.0.0",
"keyv": "^4.0.0",
"lowercase-keys": "^2.0.0",
"normalize-url": "^6.0.1",
"responselike": "^2.0.0"
},
"engines": {
"node": ">=8"
}
},
"node_modules/cacheable-request/node_modules/get-stream": {
"version": "5.2.0",
"resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz",
"integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==",
"dependencies": {
"pump": "^3.0.0"
},
"engines": {
"node": ">=8"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/call-bind": { "node_modules/call-bind": {
"version": "1.0.7", "version": "1.0.7",
"resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz", "resolved": "https://registry.npmjs.org/call-bind/-/call-bind-1.0.7.tgz",
@ -3642,6 +3748,25 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/clone-response": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/clone-response/-/clone-response-1.0.3.tgz",
"integrity": "sha512-ROoL94jJH2dUVML2Y/5PEDNaSHgeOdSDicUyS7izcF63G6sTc/FTjLub4b8Il9S8S0beOfYt0TaA5qvFK+w0wA==",
"dependencies": {
"mimic-response": "^1.0.0"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/clone-response/node_modules/mimic-response": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-1.0.1.tgz",
"integrity": "sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ==",
"engines": {
"node": ">=4"
}
},
"node_modules/co": { "node_modules/co": {
"version": "4.6.0", "version": "4.6.0",
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
@ -4003,6 +4128,17 @@
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
}, },
"node_modules/decompress-response/node_modules/mimic-response": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/dedent": { "node_modules/dedent": {
"version": "1.5.3", "version": "1.5.3",
"resolved": "https://registry.npmjs.org/dedent/-/dedent-1.5.3.tgz", "resolved": "https://registry.npmjs.org/dedent/-/dedent-1.5.3.tgz",
@ -4046,6 +4182,14 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/defer-to-connect": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/defer-to-connect/-/defer-to-connect-2.0.1.tgz",
"integrity": "sha512-4tvttepXG1VaYGrRibk5EwJd1t4udunSOVMdLSAL6mId1ix438oPwPZMALY41FCijukO1L0twNcGsdzS7dHgDg==",
"engines": {
"node": ">=10"
}
},
"node_modules/define-data-property": { "node_modules/define-data-property": {
"version": "1.1.4", "version": "1.1.4",
"resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz", "resolved": "https://registry.npmjs.org/define-data-property/-/define-data-property-1.1.4.tgz",
@ -5837,6 +5981,30 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/got": {
"version": "11.8.6",
"resolved": "https://registry.npmjs.org/got/-/got-11.8.6.tgz",
"integrity": "sha512-6tfZ91bOr7bOXnK7PRDCGBLa1H4U080YHNaAQ2KsMGlLEzRbk44nsZF2E1IeRc3vtJHPVbKCYgdFbaGO2ljd8g==",
"dependencies": {
"@sindresorhus/is": "^4.0.0",
"@szmarczak/http-timer": "^4.0.5",
"@types/cacheable-request": "^6.0.1",
"@types/responselike": "^1.0.0",
"cacheable-lookup": "^5.0.3",
"cacheable-request": "^7.0.2",
"decompress-response": "^6.0.0",
"http2-wrapper": "^1.0.0-beta.5.2",
"lowercase-keys": "^2.0.0",
"p-cancelable": "^2.0.0",
"responselike": "^2.0.0"
},
"engines": {
"node": ">=10.19.0"
},
"funding": {
"url": "https://github.com/sindresorhus/got?sponsor=1"
}
},
"node_modules/graceful-fs": { "node_modules/graceful-fs": {
"version": "4.2.11", "version": "4.2.11",
"resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz",
@ -6024,6 +6192,11 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/http-cache-semantics": {
"version": "4.1.1",
"resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.1.1.tgz",
"integrity": "sha512-er295DKPVsV82j5kw1Gjt+ADA/XYHsajl82cGNQG2eyoPkvgUhX+nDIyelzhIWbbsXP39EHcI6l5tYs2FYqYXQ=="
},
"node_modules/http-errors": { "node_modules/http-errors": {
"version": "2.0.0", "version": "2.0.0",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-2.0.0.tgz",
@ -6067,6 +6240,18 @@
"node": ">= 14" "node": ">= 14"
} }
}, },
"node_modules/http2-wrapper": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-1.0.3.tgz",
"integrity": "sha512-V+23sDMr12Wnz7iTcDeJr3O6AIxlnvT/bmaAAAP/Xda35C90p9599p0F1eHR/N1KILWSoWVAiOMFjBBXaXSMxg==",
"dependencies": {
"quick-lru": "^5.1.1",
"resolve-alpn": "^1.0.0"
},
"engines": {
"node": ">=10.19.0"
}
},
"node_modules/https-proxy-agent": { "node_modules/https-proxy-agent": {
"version": "5.0.1", "version": "5.0.1",
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
@ -7391,8 +7576,7 @@
"node_modules/json-buffer": { "node_modules/json-buffer": {
"version": "3.0.1", "version": "3.0.1",
"resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz",
"integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ==", "integrity": "sha512-4bV5BfR2mqfQTJm+V5tPPdf+ZpuhiIvTuAB5g8kcrXOZpTT/QwwVRWBywX1ozr6lEuPdbHxwaJlm9G6mI2sfSQ=="
"dev": true
}, },
"node_modules/json-parse-even-better-errors": { "node_modules/json-parse-even-better-errors": {
"version": "2.3.1", "version": "2.3.1",
@ -7550,7 +7734,6 @@
"version": "4.5.4", "version": "4.5.4",
"resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz", "resolved": "https://registry.npmjs.org/keyv/-/keyv-4.5.4.tgz",
"integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==", "integrity": "sha512-oxVHkHR/EJf2CNXnWxRLW6mg7JyCCUcG0DtEGmL2ctUo1PNTin1PUil+r/+4r5MpVgC/fn1kjsx7mjSujKqIpw==",
"dev": true,
"dependencies": { "dependencies": {
"json-buffer": "3.0.1" "json-buffer": "3.0.1"
} }
@ -7869,6 +8052,14 @@
"integrity": "sha512-BFRuQUqc7x2NWxfJBCyUrN8iYUYznzL9JROmRz1gZ6KlOIgmoD+njPVbb+VNn2nGMKggMsK79iUNErillsrx7w==", "integrity": "sha512-BFRuQUqc7x2NWxfJBCyUrN8iYUYznzL9JROmRz1gZ6KlOIgmoD+njPVbb+VNn2nGMKggMsK79iUNErillsrx7w==",
"optional": true "optional": true
}, },
"node_modules/lowercase-keys": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/lowercase-keys/-/lowercase-keys-2.0.0.tgz",
"integrity": "sha512-tqNXrS78oMOE73NMxK4EMLQsQowWf8jKooH9g7xPavRT706R6bkQJ6DY2Te7QukaZsulxa30wQ7bk0pm4XiHmA==",
"engines": {
"node": ">=8"
}
},
"node_modules/lru-cache": { "node_modules/lru-cache": {
"version": "5.1.1", "version": "5.1.1",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
@ -8054,17 +8245,6 @@
"node": ">=6" "node": ">=6"
} }
}, },
"node_modules/mimic-response": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/mimic-response/-/mimic-response-3.1.0.tgz",
"integrity": "sha512-z0yWI+4FDrrweS8Zmt4Ej5HdJmky15+L2e6Wgn3+iK5fWzb6T3fhNFq2+MeTRb064c6Wr4N/wv0DzQTjNzHNGQ==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/minimatch": { "node_modules/minimatch": {
"version": "3.1.2", "version": "3.1.2",
"resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
@ -8423,6 +8603,17 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/normalize-url": {
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/normalize-url/-/normalize-url-6.1.0.tgz",
"integrity": "sha512-DlL+XwOy3NxAQ8xuC0okPgK46iuVNAK01YN7RueYBqqFeGsBjV9XmCAzAdgt+667bCl5kPh9EqKKDwnaPG1I7A==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/npm-run-path": { "node_modules/npm-run-path": {
"version": "4.0.1", "version": "4.0.1",
"resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz", "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-4.0.1.tgz",
@ -8632,6 +8823,14 @@
"node": ">= 0.8.0" "node": ">= 0.8.0"
} }
}, },
"node_modules/p-cancelable": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/p-cancelable/-/p-cancelable-2.1.1.tgz",
"integrity": "sha512-BZOr3nRQHOntUjTrH8+Lh54smKHoHyur8We1V8DSMVrl5A2malOOwuJRnKRDjSnkoeBh4at6BwEnb5I7Jl31wg==",
"engines": {
"node": ">=8"
}
},
"node_modules/p-limit": { "node_modules/p-limit": {
"version": "3.1.0", "version": "3.1.0",
"resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz", "resolved": "https://registry.npmjs.org/p-limit/-/p-limit-3.1.0.tgz",
@ -9243,6 +9442,11 @@
"resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz",
"integrity": "sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ==" "integrity": "sha512-b/YwNhb8lk1Zz2+bXXpS/LK9OisiZZ1SNsSLxN1x2OXVEhW2Ckr/7mWE5vrC1ZTiJlD9g19jWszTmJsB+oEpFQ=="
}, },
"node_modules/psl": {
"version": "1.9.0",
"resolved": "https://registry.npmjs.org/psl/-/psl-1.9.0.tgz",
"integrity": "sha512-E/ZsdU4HLs/68gYzgGTkMicWTLPdAftJLfJFlLUAAKZGkStNU72sZjT66SnMDVOfOWY/YAoiD7Jxa9iHvngcag=="
},
"node_modules/pump": { "node_modules/pump": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz", "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.0.tgz",
@ -9367,6 +9571,58 @@
} }
} }
}, },
"node_modules/puppeteer-extra-plugin-page-proxy": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-page-proxy/-/puppeteer-extra-plugin-page-proxy-2.0.0.tgz",
"integrity": "sha512-G8pvHdHK1dO1dgFvYL+dJIlykUKjLkGUvPjzHE3R/eurqAkD4VZ9lWOU/CxYiKPhK2JxlG9QmjGjhxR6IOuP7w==",
"dependencies": {
"debug": "^4.1.1",
"got": "^11.8.5",
"http-proxy-agent": "^5.0.0",
"https-proxy-agent": "^5.0.1",
"puppeteer-extra-plugin": "^3.2.3",
"socks-proxy-agent": "^7.0.0",
"tough-cookie": "^4.1.2"
},
"peerDependencies": {
"playwright-extra": "*",
"puppeteer-extra": "*"
},
"peerDependenciesMeta": {
"playwright-extra": {
"optional": true
},
"puppeteer-extra": {
"optional": true
}
}
},
"node_modules/puppeteer-extra-plugin-page-proxy/node_modules/http-proxy-agent": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
"integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
"dependencies": {
"@tootallnate/once": "2",
"agent-base": "6",
"debug": "4"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/puppeteer-extra-plugin-page-proxy/node_modules/socks-proxy-agent": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz",
"integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==",
"dependencies": {
"agent-base": "^6.0.2",
"debug": "^4.3.3",
"socks": "^2.6.2"
},
"engines": {
"node": ">= 10"
}
},
"node_modules/puppeteer-extra-plugin-stealth": { "node_modules/puppeteer-extra-plugin-stealth": {
"version": "2.11.2", "version": "2.11.2",
"resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz", "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz",
@ -9457,6 +9713,44 @@
} }
} }
}, },
"node_modules/puppeteer-page-proxy": {
"version": "1.3.0",
"resolved": "https://registry.npmjs.org/puppeteer-page-proxy/-/puppeteer-page-proxy-1.3.0.tgz",
"integrity": "sha512-PDpLjJfcUKiLvUZ3yQJeUcP1d+7nW17s2LZIrKH0gyxEN4zTGkCvB9/HwquPgYq5YcVi8QugsvBckP/K9Vn/iw==",
"dependencies": {
"got": "^11.8.5",
"http-proxy-agent": "^5.0.0",
"https-proxy-agent": "^5.0.1",
"socks-proxy-agent": "^7.0.0",
"tough-cookie": "^4.1.2"
}
},
"node_modules/puppeteer-page-proxy/node_modules/http-proxy-agent": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz",
"integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==",
"dependencies": {
"@tootallnate/once": "2",
"agent-base": "6",
"debug": "4"
},
"engines": {
"node": ">= 6"
}
},
"node_modules/puppeteer-page-proxy/node_modules/socks-proxy-agent": {
"version": "7.0.0",
"resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-7.0.0.tgz",
"integrity": "sha512-Fgl0YPZ902wEsAyiQ+idGd1A7rSFx/ayC1CQVMw5P+EQx2V0SgpGtf6OKFhVjPflPUl9YMmEOnmfjCdMUsygww==",
"dependencies": {
"agent-base": "^6.0.2",
"debug": "^4.3.3",
"socks": "^2.6.2"
},
"engines": {
"node": ">= 10"
}
},
"node_modules/pure-rand": { "node_modules/pure-rand": {
"version": "6.1.0", "version": "6.1.0",
"resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz",
@ -9523,6 +9817,11 @@
"url": "https://github.com/sponsors/sindresorhus" "url": "https://github.com/sponsors/sindresorhus"
} }
}, },
"node_modules/querystringify": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/querystringify/-/querystringify-2.2.0.tgz",
"integrity": "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ=="
},
"node_modules/queue-microtask": { "node_modules/queue-microtask": {
"version": "1.2.3", "version": "1.2.3",
"resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz", "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
@ -9554,6 +9853,17 @@
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==", "integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
"optional": true "optional": true
}, },
"node_modules/quick-lru": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
"engines": {
"node": ">=10"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/range-parser": { "node_modules/range-parser": {
"version": "1.2.1", "version": "1.2.1",
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz", "resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
@ -9709,6 +10019,11 @@
"node": ">=0.10.0" "node": ">=0.10.0"
} }
}, },
"node_modules/requires-port": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/requires-port/-/requires-port-1.0.0.tgz",
"integrity": "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="
},
"node_modules/resolve": { "node_modules/resolve": {
"version": "1.22.8", "version": "1.22.8",
"resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz", "resolved": "https://registry.npmjs.org/resolve/-/resolve-1.22.8.tgz",
@ -9726,6 +10041,11 @@
"url": "https://github.com/sponsors/ljharb" "url": "https://github.com/sponsors/ljharb"
} }
}, },
"node_modules/resolve-alpn": {
"version": "1.2.1",
"resolved": "https://registry.npmjs.org/resolve-alpn/-/resolve-alpn-1.2.1.tgz",
"integrity": "sha512-0a1F4l73/ZFZOakJnQ3FvkJ2+gSTQWz/r2KE5OdDY0TxPm5h4GkqkWWfM47T7HsbnOtcJVEF4epCVy6u7Q3K+g=="
},
"node_modules/resolve-cwd": { "node_modules/resolve-cwd": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz", "resolved": "https://registry.npmjs.org/resolve-cwd/-/resolve-cwd-3.0.0.tgz",
@ -9767,6 +10087,17 @@
"node": ">=10" "node": ">=10"
} }
}, },
"node_modules/responselike": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/responselike/-/responselike-2.0.1.tgz",
"integrity": "sha512-4gl03wn3hj1HP3yzgdI7d3lCkF95F21Pz4BPGvKHinyQzALR5CapwC8yIi0Rh58DEMQ/SguC03wFj2k0M/mHhw==",
"dependencies": {
"lowercase-keys": "^2.0.0"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/retry": { "node_modules/retry": {
"version": "0.13.1", "version": "0.13.1",
"resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz", "resolved": "https://registry.npmjs.org/retry/-/retry-0.13.1.tgz",
@ -10029,6 +10360,11 @@
"resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz", "resolved": "https://registry.npmjs.org/set-blocking/-/set-blocking-2.0.0.tgz",
"integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw==" "integrity": "sha512-KiKBS8AnWGEyLzofFfmvKwpdPzqiy16LvQfK3yv/fVH7Bj13/wl3JSR1J+rfgRE9q7xUJK4qvgS8raSOeLUehw=="
}, },
"node_modules/set-cookie-parser": {
"version": "2.6.0",
"resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.6.0.tgz",
"integrity": "sha512-RVnVQxTXuerk653XfuliOxBP81Sf0+qfQE73LIYKcyMYHG94AuH0kgrQpRDuTZnSmjpysHmzxJXKNfa6PjFhyQ=="
},
"node_modules/set-function-length": { "node_modules/set-function-length": {
"version": "1.2.2", "version": "1.2.2",
"resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz", "resolved": "https://registry.npmjs.org/set-function-length/-/set-function-length-1.2.2.tgz",
@ -10767,6 +11103,28 @@
"node": ">=0.6" "node": ">=0.6"
} }
}, },
"node_modules/tough-cookie": {
"version": "4.1.3",
"resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-4.1.3.tgz",
"integrity": "sha512-aX/y5pVRkfRnfmuX+OdbSdXvPe6ieKX/G2s7e98f4poJHnqH3281gDPm/metm6E/WRamfx7WC4HUqkWHfQHprw==",
"dependencies": {
"psl": "^1.1.33",
"punycode": "^2.1.1",
"universalify": "^0.2.0",
"url-parse": "^1.5.3"
},
"engines": {
"node": ">=6"
}
},
"node_modules/tough-cookie/node_modules/universalify": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/universalify/-/universalify-0.2.0.tgz",
"integrity": "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg==",
"engines": {
"node": ">= 4.0.0"
}
},
"node_modules/tr46": { "node_modules/tr46": {
"version": "3.0.0", "version": "3.0.0",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz", "resolved": "https://registry.npmjs.org/tr46/-/tr46-3.0.0.tgz",
@ -11127,6 +11485,15 @@
"punycode": "^2.1.0" "punycode": "^2.1.0"
} }
}, },
"node_modules/url-parse": {
"version": "1.5.10",
"resolved": "https://registry.npmjs.org/url-parse/-/url-parse-1.5.10.tgz",
"integrity": "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ==",
"dependencies": {
"querystringify": "^2.1.1",
"requires-port": "^1.0.0"
}
},
"node_modules/urlpattern-polyfill": { "node_modules/urlpattern-polyfill": {
"version": "10.0.0", "version": "10.0.0",
"resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz", "resolved": "https://registry.npmjs.org/urlpattern-polyfill/-/urlpattern-polyfill-10.0.0.tgz",

View File

@ -49,7 +49,10 @@
"puppeteer": "^22.6.3", "puppeteer": "^22.6.3",
"puppeteer-extra": "^3.3.6", "puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3", "puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
"puppeteer-extra-plugin-stealth": "^2.11.2", "puppeteer-extra-plugin-stealth": "^2.11.2",
"puppeteer-page-proxy": "^1.3.0",
"set-cookie-parser": "^2.6.0",
"stripe": "^11.11.0", "stripe": "^11.11.0",
"tiktoken": "^1.0.10", "tiktoken": "^1.0.10",
"turndown": "^7.1.3", "turndown": "^7.1.3",
@ -62,6 +65,7 @@
"@types/cors": "^2.8.17", "@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1", "@types/generic-pool": "^3.8.1",
"@types/node": "^18", "@types/node": "^18",
"@types/set-cookie-parser": "^2.4.7",
"@typescript-eslint/eslint-plugin": "^5.12.0", "@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0", "@typescript-eslint/parser": "^5.12.0",
"eslint": "^8.9.0", "eslint": "^8.9.0",

View File

@ -1,51 +1,25 @@
import { assignTransferProtocolMeta, marshalErrorLike, RPCHost, RPCReflection, AssertionFailureError, ParamValidationError } from 'civkit'; import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
HashManager,
AssertionFailureError, ParamValidationError,
} from 'civkit';
import { singleton } from 'tsyringe'; import { singleton } from 'tsyringe';
import { CloudHTTPv2, Ctx, Logger, OutputServerEventStream, RPCReflect } from '../shared'; import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
import _ from 'lodash'; import _ from 'lodash';
import { PageSnapshot, PuppeteerControl } from '../services/puppeteer'; import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express'; import { Request, Response } from 'express';
import normalizeUrl from "@esm2cjs/normalize-url"; import normalizeUrl from "@esm2cjs/normalize-url";
import { AltTextService } from '../services/alt-text'; import { AltTextService } from '../services/alt-text';
import TurndownService from 'turndown'; import TurndownService from 'turndown';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
import { CookieParam } from 'puppeteer';
import { Crawled } from '../db/crawled';
import { tidyMarkdown } from '../utils/markdown';
import { cleanAttribute } from '../utils/misc';
import { randomUUID } from 'crypto';
function tidyMarkdown(markdown: string): string { const md5Hasher = new HashManager('md5', 'hex');
// Step 1: Handle complex broken links with text and optional images spread across multiple lines
let normalizedMarkdown = markdown.replace(/\[\s*([^]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, url) => {
// Remove internal new lines and excessive spaces within the text
text = text.replace(/\s+/g, ' ').trim();
url = url.replace(/\s+/g, '').trim();
return `[${text}](${url})`;
});
normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^!]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (match, text, alt, imgUrl, linkUrl) => {
// Normalize by removing excessive spaces and new lines
text = text.replace(/\s+/g, ' ').trim();
alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
linkUrl = linkUrl.replace(/\s+/g, '').trim();
if (imgUrl) {
return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
} else {
return `[${text}](${linkUrl})`;
}
});
// Step 2: Normalize regular links that may be broken across lines
normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (match, text, url) => {
text = text.replace(/\s+/g, ' ').trim();
url = url.replace(/\s+/g, '').trim();
return `[${text}](${url})`;
});
// Step 3: Replace more than two consecutive empty lines with exactly two empty lines
normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');
// Step 4: Remove leading spaces from each line
normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');
return normalizedMarkdown.trim();
}
@singleton() @singleton()
export class CrawlerHost extends RPCHost { export class CrawlerHost extends RPCHost {
@ -53,12 +27,29 @@ export class CrawlerHost extends RPCHost {
turnDownPlugins = [require('turndown-plugin-gfm').tables]; turnDownPlugins = [require('turndown-plugin-gfm').tables];
cacheRetentionMs = 1000 * 3600 * 24 * 7;
cacheValidMs = 1000 * 300;
urlValidMs = 1000 * 3600 * 4;
constructor( constructor(
protected globalLogger: Logger, protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl, protected puppeteerControl: PuppeteerControl,
protected altTextService: AltTextService, protected altTextService: AltTextService,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
) { ) {
super(...arguments); super(...arguments);
puppeteerControl.on('crawled', async (snapshot: PageSnapshot, options: ScrappingOptions & { url: URL; }) => {
if (!snapshot.title?.trim()) {
return;
}
if (options.cookies?.length) {
// Potential privacy issue, dont cache if cookies are used
return;
}
await this.setToCache(options.url, snapshot);
});
} }
override async init() { override async init() {
@ -67,16 +58,51 @@ export class CrawlerHost extends RPCHost {
this.emit('ready'); this.emit('ready');
} }
async formatSnapshot(snapshot: PageSnapshot, nominalUrl?: string) { async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
screenshotUrl?: string;
}, nominalUrl?: URL) {
if (mode === 'screenshot') {
if (snapshot.screenshot && !snapshot.screenshotUrl) {
const fid = `instant-screenshots/${randomUUID()}`;
await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
metadata: {
contentType: 'image/png',
}
});
snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
}
return {
screenshotUrl: snapshot.screenshotUrl,
toString() {
return this.screenshotUrl;
}
};
}
if (mode === 'html') {
return {
html: snapshot.html,
toString() {
return this.html;
}
};
}
if (mode === 'text') {
return {
text: snapshot.text,
toString() {
return this.text;
}
};
}
const toBeTurnedToMd = snapshot.parsed?.content; const toBeTurnedToMd = snapshot.parsed?.content;
let turnDownService = new TurndownService(); let turnDownService = new TurndownService();
for (const plugin of this.turnDownPlugins) { for (const plugin of this.turnDownPlugins) {
turnDownService = turnDownService.use(plugin); turnDownService = turnDownService.use(plugin);
} }
const urlToAltMap: { [k: string]: string | undefined; } = {};
let contentText = ''; if (snapshot.imgs?.length) {
if (toBeTurnedToMd) {
const urlToAltMap: { [k: string]: string | undefined; } = {};
const tasks = (snapshot.imgs || []).map(async (x) => { const tasks = (snapshot.imgs || []).map(async (x) => {
const r = await this.altTextService.getAltText(x).catch((err: any) => { const r = await this.altTextService.getAltText(x).catch((err: any) => {
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) }); this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
@ -88,25 +114,27 @@ export class CrawlerHost extends RPCHost {
}); });
await Promise.all(tasks); await Promise.all(tasks);
let imgIdx = 0; }
let imgIdx = 0;
turnDownService.addRule('img-generated-alt', { turnDownService.addRule('img-generated-alt', {
filter: 'img', filter: 'img',
replacement: (_content, node) => { replacement: (_content, node) => {
const src = (node.getAttribute('src') || '').trim(); const src = (node.getAttribute('src') || '').trim();
const alt = cleanAttribute(node.getAttribute('alt')); const alt = cleanAttribute(node.getAttribute('alt'));
if (!src) { if (!src) {
return ''; return '';
}
const mapped = urlToAltMap[src];
imgIdx++;
if (mapped) {
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
}
return `![Image ${imgIdx}: ${alt}](${src})`;
} }
}); const mapped = urlToAltMap[src];
imgIdx++;
if (mapped) {
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
}
return `![Image ${imgIdx}: ${alt}](${src})`;
}
});
let contentText = '';
if (toBeTurnedToMd) {
try { try {
contentText = turnDownService.turndown(toBeTurnedToMd).trim(); contentText = turnDownService.turndown(toBeTurnedToMd).trim();
} catch (err) { } catch (err) {
@ -141,7 +169,7 @@ export class CrawlerHost extends RPCHost {
const formatted = { const formatted = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(), title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: nominalUrl || snapshot.href?.trim(), url: nominalUrl?.toString() || snapshot.href?.trim(),
content: cleanText, content: cleanText,
publishedTime: snapshot.parsed?.publishedTime || undefined, publishedTime: snapshot.parsed?.publishedTime || undefined,
@ -171,6 +199,7 @@ ${this.content}
timeoutSeconds: 540, timeoutSeconds: 540,
concurrency: 4, concurrency: 4,
}, },
tags: ['Crawler'],
httpMethod: ['get', 'post'], httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream], returnType: [String, OutputServerEventStream],
}) })
@ -181,6 +210,57 @@ ${this.content}
concurrency: 21, concurrency: 21,
maxInstances: 476, maxInstances: 476,
}, },
openapi: {
operation: {
parameters: {
'Accept': {
description: `Specifies your preference for the response format. \n\n` +
`Supported formats:\n` +
`- text/event-stream\n` +
`- application/json or text/json\n` +
`- text/plain`
,
in: 'header',
schema: { type: 'string' }
},
'X-No-Cache': {
description: `Ignores internal cache if this header is specified with a value.`,
in: 'header',
schema: { type: 'string' }
},
'X-Respond-With': {
description: `Specifies the form factor of the crawled data you prefer. \n\n` +
`Supported formats:\n` +
`- markdown\n` +
`- html\n` +
`- text\n` +
`- screenshot\n\n` +
`Defaults to: markdown`
,
in: 'header',
schema: { type: 'string' }
},
'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
`Supported protocols:\n` +
`- http\n` +
`- https\n` +
`- socks4\n` +
`- socks5\n\n` +
`For authentication, https://user:pass@host:port`,
in: 'header',
schema: { type: 'string' }
},
'X-Set-Cookie': {
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
`Syntax is the same with standard Set-Cookie`,
in: 'header',
schema: { type: 'string' }
},
}
}
},
tags: ['Crawler'],
httpMethod: ['get', 'post'], httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream], returnType: [String, OutputServerEventStream],
}) })
@ -207,27 +287,41 @@ ${this.content}
path: 'url' path: 'url'
}); });
} }
const screenshotEnabled = Boolean(ctx.req.headers['x-screenshot']);
const noCache = Boolean(ctx.req.headers['x-no-cache']); const customMode = ctx.req.get('x-respond-with') || 'markdown';
const noCache = Boolean(ctx.req.get('x-no-cache'));
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx.req.headers['x-set-cookie'];
if (Array.isArray(setCookieHeaders)) {
for (const setCookie of setCookieHeaders) {
cookies.push({
...parseSetCookieString(setCookie, { decodeValues: false }) as CookieParam,
domain: urlToCrawl.hostname,
});
}
} else if (setCookieHeaders) {
cookies.push({
...parseSetCookieString(setCookieHeaders, { decodeValues: false }) as CookieParam,
domain: urlToCrawl.hostname,
});
}
const crawlOpts: ScrappingOptions = {
proxyUrl: ctx.req.get('x-proxy-url'),
cookies,
};
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream(); const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream); rpcReflect.return(sseStream);
try { try {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
if (!scrapped) { if (!scrapped) {
continue; continue;
} }
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString()); const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
if (scrapped.screenshot && screenshotEnabled) {
sseStream.write({
event: 'screenshot',
data: scrapped.screenshot.toString('base64'),
});
}
sseStream.write({ sseStream.write({
event: 'data', event: 'data',
@ -235,7 +329,7 @@ ${this.content}
}); });
} }
} catch (err: any) { } catch (err: any) {
this.logger.error(`Failed to crawl ${urlToCrawl.toString()}`, { err: marshalErrorLike(err) }); this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
sseStream.write({ sseStream.write({
event: 'error', event: 'error',
data: marshalErrorLike(err), data: marshalErrorLike(err),
@ -249,13 +343,13 @@ ${this.content}
let lastScrapped; let lastScrapped;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
lastScrapped = scrapped; lastScrapped = scrapped;
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) { if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue; continue;
} }
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString()); const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
return formatted; return formatted;
} }
@ -264,16 +358,22 @@ ${this.content}
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`); throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
} }
return await this.formatSnapshot(lastScrapped, urlToCrawl?.toString()); return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
} }
for await (const scrapped of this.puppeteerControl.scrap(urlToCrawl.toString(), noCache)) { for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
lastScrapped = scrapped; lastScrapped = scrapped;
if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) { if (!scrapped?.parsed?.content || !(scrapped.title?.trim())) {
continue; continue;
} }
const formatted = await this.formatSnapshot(scrapped, urlToCrawl?.toString()); const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
} }
@ -282,12 +382,111 @@ ${this.content}
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`); throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
} }
return `${await this.formatSnapshot(lastScrapped, urlToCrawl?.toString())}`; const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
} }
/**
 * Derives the cache lookup key for a URL.
 *
 * The URL is copied (the argument is left untouched), its fragment is
 * removed, the serialized form is lower-cased, and the result is hashed
 * with the module-level `md5Hasher`.
 *
 * @param urlToCrawl - URL whose digest is wanted.
 * @returns Whatever `md5Hasher.hash` yields for the normalized URL string.
 */
getUrlDigest(urlToCrawl: URL) {
    const fragmentless = new URL(urlToCrawl);
    fragmentless.hash = '';

    const canonical = fragmentless.toString().toLowerCase();

    return md5Hasher.hash(canonical);
}
/**
 * Looks up the newest cached snapshot recorded for a URL.
 *
 * The lookup key is `getUrlDigest(urlToCrawl)`; only the most recently
 * created matching record is considered.
 *
 * @param urlToCrawl - URL to look up.
 * @returns `undefined` when no record exists. Otherwise an object with
 *   `isFresh` (true while the record is younger than `cacheValidMs`) and
 *   `snapshot`, the stored snapshot with `screenshot` cleared and, when the
 *   record is flagged `screenshotAvailable`, a signed `screenshotUrl`.
 */
async queryCache(urlToCrawl: URL) {
    const digest = this.getUrlDigest(urlToCrawl);

    const cache = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0];

    if (!cache) {
        return undefined;
    }

    const age = Date.now() - cache.createdAt.valueOf();
    // A record is stale once its age reaches the freshness window. The
    // previous comparison had the operator the wrong way round, which
    // reported fresh records as stale and expired ones as fresh.
    const stale = age >= this.cacheValidMs;
    this.logger.info(`${stale ? 'Only stale ' : ''}Cache exists for ${urlToCrawl}, normalized digest: ${digest}, ${age}ms old`, {
        url: urlToCrawl, digest, age, stale
    });

    const r = cache.snapshot;

    return {
        isFresh: !stale,
        snapshot: {
            ...r,
            // The binary screenshot is never returned from cache; callers get a signed URL instead.
            screenshot: undefined,
            screenshotUrl: cache.screenshotAvailable ?
                await this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs) : undefined,
        } as PageSnapshot & { screenshotUrl?: string; }
    };
}
/**
 * Persists a snapshot as a cache record for a URL (best effort).
 *
 * The record is keyed by `getUrlDigest(urlToCrawl)` and expires
 * `cacheRetentionMs` after creation. The screenshot buffer is not stored in
 * the record; when present it is uploaded to object storage under
 * `screenshots/<record id>` and the record is flagged `screenshotAvailable`.
 *
 * Neither the upload nor the record save is allowed to reject: both
 * failures are logged. A failed upload still saves the record, just without
 * the `screenshotAvailable` flag.
 *
 * @param urlToCrawl - URL the snapshot was taken from.
 * @param snapshot - Snapshot to cache.
 * @returns The result of `Crawled.save`, or `undefined` if saving failed.
 */
async setToCache(urlToCrawl: URL, snapshot: PageSnapshot) {
    const digest = this.getUrlDigest(urlToCrawl);

    this.logger.info(`Caching snapshot of ${urlToCrawl}...`, { url: urlToCrawl, digest, title: snapshot?.title, href: snapshot?.href });
    const nowDate = new Date();

    const cache = Crawled.from({
        _id: randomUUID(),
        url: urlToCrawl.toString(),
        createdAt: nowDate,
        expireAt: new Date(nowDate.valueOf() + this.cacheRetentionMs),
        urlPathDigest: digest,
        snapshot: {
            ...snapshot,
            screenshot: null
        },
    });

    if (snapshot.screenshot) {
        try {
            await this.firebaseObjectStorage.saveFile(`screenshots/${cache._id}`, snapshot.screenshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            // Only advertise the screenshot once the upload has actually succeeded.
            cache.screenshotAvailable = true;
        } catch (err: any) {
            // Handled the same way as a failed record save below: log and carry on,
            // so an upload failure cannot surface as an unhandled rejection.
            this.logger.warn(`Failed to save screenshot for ${urlToCrawl}`, { err: marshalErrorLike(err) });
        }
    }

    const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
        this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });

        return undefined;
    });

    return r;
}
/**
 * Yields snapshots for a URL, consulting the cache before crawling.
 *
 * The cache is queried only when `noCache` is false and the request carries
 * no cookies. A fresh hit is yielded as the sole result. Otherwise every
 * snapshot from `puppeteerControl.scrap` is forwarded; if that throws and a
 * (stale) cache entry was found, the cached snapshot is yielded instead of
 * propagating the error.
 *
 * @param urlToCrawl - URL to fetch.
 * @param crawlOpts - Options passed through to the scraper.
 * @param noCache - When true, skip the cache lookup entirely.
 */
async *cachedScrap(urlToCrawl: URL, crawlOpts: ScrappingOptions, noCache: boolean = false) {
    const cacheUsable = !noCache && !crawlOpts.cookies?.length;
    const cache = cacheUsable ? await this.queryCache(urlToCrawl) : undefined;

    if (cache?.isFresh) {
        yield cache.snapshot;

        return;
    }

    try {
        yield* this.puppeteerControl.scrap(urlToCrawl, crawlOpts);
    } catch (err: any) {
        // Without any cache entry there is nothing to fall back on.
        if (!cache) {
            throw err;
        }
        this.logger.warn(`Failed to scrap ${urlToCrawl}, but a stale cache is available. Falling back to cache`, { err: marshalErrorLike(err) });
        yield cache.snapshot;
    }
}
} }
/**
 * Collapses every newline, together with the whitespace that follows it,
 * into a single `\n`.
 *
 * The parameter accepts `null`/`undefined` because the value comes from
 * `getAttribute`, which returns `null` for a missing attribute.
 *
 * @param attribute - Raw attribute value, possibly absent.
 * @returns The cleaned value, or `''` when the input is empty or absent.
 */
function cleanAttribute(attribute: string | null | undefined) {
    // `\n\s*` matches exactly what the former `(\n+\s*)+` did, without nested quantifiers.
    return attribute ? attribute.replace(/\n\s*/g, '\n') : '';
}

View File

@ -1,6 +1,7 @@
import { Also, parseJSONText, Prop } from 'civkit'; import { Also, parseJSONText, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore'; import { FirestoreRecord } from '../shared/lib/firestore';
import _ from 'lodash'; import _ from 'lodash';
import type { PageSnapshot } from '../services/puppeteer';
@Also({ @Also({
dictOf: Object dictOf: Object
@ -21,7 +22,10 @@ export class Crawled extends FirestoreRecord {
urlPathDigest!: string; urlPathDigest!: string;
@Prop() @Prop()
snapshot!: any; snapshot!: PageSnapshot & { screenshot: never; };
@Prop()
screenshotAvailable?: boolean;
@Prop() @Prop()
createdAt!: Date; createdAt!: Date;

View File

@ -1,13 +1,17 @@
import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
import { container, singleton } from 'tsyringe';
import type { Browser, Page } from 'puppeteer';
import { Logger } from '../shared/services/logger';
import genericPool from 'generic-pool';
import os from 'os'; import os from 'os';
import fs from 'fs'; import fs from 'fs';
import { Crawled } from '../db/crawled'; import { container, singleton } from 'tsyringe';
import genericPool from 'generic-pool';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError } from 'civkit';
import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, Page } from 'puppeteer';
import puppeteer from 'puppeteer-extra'; import puppeteer from 'puppeteer-extra';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
import puppeteerPageProxy from 'puppeteer-extra-plugin-page-proxy';
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
export interface ImgBrief { export interface ImgBrief {
@ -42,7 +46,12 @@ export interface PageSnapshot {
screenshot?: Buffer; screenshot?: Buffer;
imgs?: ImgBrief[]; imgs?: ImgBrief[];
} }
const md5Hasher = new HashManager('md5', 'hex');
export interface ScrappingOptions {
proxyUrl?: string;
cookies?: CookieParam[];
}
const puppeteerStealth = require('puppeteer-extra-plugin-stealth'); const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
puppeteer.use(puppeteerStealth()); puppeteer.use(puppeteerStealth());
@ -51,9 +60,13 @@ puppeteer.use(puppeteerStealth());
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`, // userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
// platform: `Linux`, // platform: `Linux`,
// })) // }))
const puppeteerBlockResources = require('puppeteer-extra-plugin-block-resources');
puppeteer.use(puppeteerBlockResources({ puppeteer.use(puppeteerBlockResources({
blockedTypes: new Set(['media']), blockedTypes: new Set(['media']),
interceptResolutionPriority: 1,
}));
puppeteer.use(puppeteerPageProxy({
interceptResolutionPriority: 1,
})); }));
@singleton() @singleton()
@ -74,7 +87,7 @@ export class PuppeteerControl extends AsyncService {
return page.browser().connected && !page.isClosed(); return page.browser().connected && !page.isClosed();
} }
}, { }, {
max: Math.max(1 + Math.floor(os.freemem() / (1024 * 1024 * 1024)), 16), max: Math.max(1 + Math.floor(os.totalmem() / (384 * 1024 * 1024)), 16),
min: 1, min: 1,
acquireTimeoutMillis: 60_000, acquireTimeoutMillis: 60_000,
testOnBorrow: true, testOnBorrow: true,
@ -88,7 +101,7 @@ export class PuppeteerControl extends AsyncService {
override async init() { override async init() {
await this.dependencyReady(); await this.dependencyReady();
this.logger.info(`PuppeteerControl initializing with pool size ${this.pagePool.max}`, { poolSize: this.pagePool.max });
this.pagePool.start(); this.pagePool.start();
if (this.browser) { if (this.browser) {
@ -128,7 +141,10 @@ export class PuppeteerControl extends AsyncService {
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
preparations.push(page.setBypassCSP(true)); preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1024, height: 1024 })); preparations.push(page.setViewport({ width: 1024, height: 1024 }));
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => { preparations.push(page.exposeFunction('reportSnapshot', (snapshot: PageSnapshot) => {
if (snapshot.href === 'about:blank') {
return;
}
page.emit('snapshot', snapshot); page.emit('snapshot', snapshot);
})); }));
preparations.push(page.evaluateOnNewDocument(READABILITY_JS)); preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
@ -166,40 +182,39 @@ function giveSnapshot() {
const elem = document.createElement('div'); const elem = document.createElement('div');
elem.innerHTML = parsed.content; elem.innerHTML = parsed.content;
r.imgs = briefImgs(elem); r.imgs = briefImgs(elem);
} else {
const allImgs = briefImgs();
if (allImgs.length === 1) {
r.imgs = allImgs;
}
} }
return r; return r;
} }
`)); `));
preparations.push(page.evaluateOnNewDocument(() => { preparations.push(page.evaluateOnNewDocument(`
let aftershot: any; let aftershot = undefined;
const handlePageLoad = () => { const handlePageLoad = () => {
// @ts-expect-error if (document.readyState !== 'complete') {
if (document.readyState !== 'complete' && document.readyState !== 'interactive') { return;
return; }
} const parsed = giveSnapshot();
// @ts-expect-error window.reportSnapshot(parsed);
const parsed = giveSnapshot(); if (!parsed.text) {
if (parsed) { if (aftershot) {
// @ts-expect-error clearTimeout(aftershot);
window.reportSnapshot(parsed); }
} else { aftershot = setTimeout(() => {
if (aftershot) { const r = giveSnapshot();
clearTimeout(aftershot); if (r && r.text) {
} window.reportSnapshot(r);
aftershot = setTimeout(() => { }
// @ts-expect-error }, 500);
window.reportSnapshot(giveSnapshot()); }
}, 500); };
} document.addEventListener('readystatechange', handlePageLoad);
}; document.addEventListener('load', handlePageLoad);
// setInterval(handlePageLoad, 1000); `));
// @ts-expect-error
document.addEventListener('readystatechange', handlePageLoad);
// @ts-expect-error
document.addEventListener('load', handlePageLoad);
}));
await Promise.all(preparations); await Promise.all(preparations);
// TODO: further setup the page; // TODO: further setup the page;
@ -207,41 +222,23 @@ function giveSnapshot() {
return page; return page;
} }
async *scrap(url: string, noCache: string | boolean = false): AsyncGenerator<PageSnapshot | undefined> { async *scrap(parsedUrl: URL, options: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
const parsedUrl = new URL(url);
// parsedUrl.search = ''; // parsedUrl.search = '';
parsedUrl.hash = ''; const url = parsedUrl.toString();
const normalizedUrl = parsedUrl.toString().toLowerCase();
const digest = md5Hasher.hash(normalizedUrl); this.logger.info(`Scraping ${url}`, { url });
this.logger.info(`Scraping ${url}, normalized digest: ${digest}`, { url, digest });
let snapshot: PageSnapshot | undefined; let snapshot: PageSnapshot | undefined;
let screenshot: Buffer | undefined; let screenshot: Buffer | undefined;
if (!noCache) { const page = await this.pagePool.acquire();
const cached = (await Crawled.fromFirestoreQuery(Crawled.COLLECTION.where('urlPathDigest', '==', digest).orderBy('createdAt', 'desc').limit(1)))?.[0]; if (options.proxyUrl) {
await page.useProxy(options.proxyUrl);
if (cached && cached.createdAt.valueOf() > (Date.now() - 1000 * 300)) { }
const age = Date.now() - cached.createdAt.valueOf(); if (options.cookies) {
this.logger.info(`Cache hit for ${url}, normalized digest: ${digest}, ${age}ms old`, { url, digest, age }); await page.setCookie(...options.cookies);
snapshot = {
...cached.snapshot
};
if (snapshot) {
delete snapshot.screenshot;
}
screenshot = cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined;
yield {
...cached.snapshot,
screenshot: cached.snapshot?.screenshot ? Buffer.from(cached.snapshot.screenshot, 'base64') : undefined
};
return;
}
} }
const page = await this.pagePool.acquire();
let nextSnapshotDeferred = Defer(); let nextSnapshotDeferred = Defer();
let finalized = false; let finalized = false;
const hdl = (s: any) => { const hdl = (s: any) => {
@ -262,48 +259,43 @@ function giveSnapshot() {
cause: err, cause: err,
})); }));
}).finally(async () => { }).finally(async () => {
finalized = true;
if (!snapshot?.html) { if (!snapshot?.html) {
finalized = true;
return; return;
} }
screenshot = await page.screenshot({
type: 'jpeg',
quality: 75,
});
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot; snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
screenshot = await page.screenshot();
if (!snapshot.title || !snapshot.parsed?.content) { if (!snapshot.title || !snapshot.parsed?.content) {
const salvaged = await this.salvage(url, page); const salvaged = await this.salvage(url, page);
if (salvaged) { if (salvaged) {
screenshot = await page.screenshot({
type: 'jpeg',
quality: 75,
});
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot; snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
screenshot = await page.screenshot();
} }
} }
this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href }); finalized = true;
const nowDate = new Date(); this.logger.info(`Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
Crawled.save( this.emit(
Crawled.from({ 'crawled',
url, { ...snapshot, screenshot },
createdAt: nowDate, { ...options, url: parsedUrl }
expireAt: new Date(nowDate.valueOf() + 1000 * 3600 * 24 * 7), );
urlPathDigest: digest,
snapshot: { ...snapshot, screenshot: screenshot?.toString('base64') || '' },
}).degradeForFireStore()
).catch((err) => {
this.logger.warn(`Failed to save snapshot`, { err: marshalErrorLike(err) });
});
}); });
try { try {
let lastHTML = snapshot?.html;
while (true) { while (true) {
await Promise.race([nextSnapshotDeferred.promise, gotoPromise]); await Promise.race([nextSnapshotDeferred.promise, gotoPromise]);
if (finalized) { if (finalized) {
yield { ...snapshot, screenshot } as PageSnapshot; yield { ...snapshot, screenshot } as PageSnapshot;
break; break;
} }
yield snapshot; if (snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot();
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
yield { ...snapshot, screenshot } as PageSnapshot;
}
} }
} finally { } finally {
gotoPromise.finally(() => { gotoPromise.finally(() => {
@ -333,6 +325,8 @@ function giveSnapshot() {
this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) }); this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
}); });
this.logger.info(`Salvation completed.`);
return true; return true;
} }
} }

View File

@ -0,0 +1,39 @@
/**
 * Normalizes Markdown produced from HTML conversion.
 *
 * - Repairs links (optionally wrapping an image) whose text or URL was
 *   broken across lines: whitespace runs in link text collapse to a single
 *   space, and all whitespace is removed from URLs.
 * - Collapses runs of three or more newlines to a single blank line.
 * - Strips leading spaces and tabs from every line.
 * - Trims the result.
 *
 * Link text is not allowed to contain `]`, so a stray bracketed token such
 * as `[1]` cannot be fused with a later, unrelated link and the paragraphs
 * between them are left alone.
 *
 * @param markdown - Markdown text to tidy.
 * @returns The normalized Markdown.
 */
export function tidyMarkdown(markdown: string): string {

    // Step 1: Handle complex broken links with text and optional images spread across multiple lines.
    // The text class excludes `]` so a match can never run past the end of its own bracket pair.
    let normalizedMarkdown = markdown.replace(/\[\s*([^\]]+?)\s*\]\s*\(\s*([^)]+)\s*\)/g, (_match, text, url) => {
        // Remove internal new lines and excessive spaces within the text
        text = text.replace(/\s+/g, ' ').trim();
        url = url.replace(/\s+/g, '').trim();
        return `[${text}](${url})`;
    });

    // Links that wrap an image: `[text ![alt](img)](link)`. The text part stops at `!` or `]`.
    normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^!\]]*?)\s*\n*(?:!\[([^\]]*)\]\((.*?)\))?\s*\n*\]\s*\(\s*([^)]+)\s*\)/g, (_match, text, alt, imgUrl, linkUrl) => {
        // Normalize by removing excessive spaces and new lines
        text = text.replace(/\s+/g, ' ').trim();
        alt = alt ? alt.replace(/\s+/g, ' ').trim() : '';
        imgUrl = imgUrl ? imgUrl.replace(/\s+/g, '').trim() : '';
        linkUrl = linkUrl.replace(/\s+/g, '').trim();
        if (imgUrl) {
            return `[${text} ![${alt}](${imgUrl})](${linkUrl})`;
        } else {
            return `[${text}](${linkUrl})`;
        }
    });

    // Step 2: Normalize regular links that may be broken across lines
    normalizedMarkdown = normalizedMarkdown.replace(/\[\s*([^\]]+)\]\s*\(\s*([^)]+)\)/g, (_match, text, url) => {
        text = text.replace(/\s+/g, ' ').trim();
        url = url.replace(/\s+/g, '').trim();
        return `[${text}](${url})`;
    });

    // Step 3: Collapse three or more consecutive newlines into two (i.e. one blank line)
    normalizedMarkdown = normalizedMarkdown.replace(/\n{3,}/g, '\n\n');

    // Step 4: Remove leading spaces and tabs from each line
    normalizedMarkdown = normalizedMarkdown.replace(/^[ \t]+/gm, '');

    return normalizedMarkdown.trim();
}

View File

@ -0,0 +1,3 @@
/**
 * Collapses every newline, together with the whitespace that follows it,
 * into a single `\n`.
 *
 * `null` and `undefined` are accepted so that the result of
 * `Element.getAttribute` (which is `null` for a missing attribute) can be
 * passed in directly.
 *
 * @param attribute - Raw attribute value, possibly absent.
 * @returns The cleaned value, or `''` when the input is empty or absent.
 */
export function cleanAttribute(attribute: string | null | undefined) {
    // `\n\s*` matches exactly what the former `(\n+\s*)+` did, without nested quantifiers.
    return attribute ? attribute.replace(/\n\s*/g, '\n') : '';
}