mirror of
https://git-proxy.hk.martin98.com/https://github.com/jina-ai/reader
synced 2025-04-10 13:10:32 +08:00
restructure: nolonger a firebase application (#1160)
* fix: fine allow redefining Function.prototype.toString * wip * wip * wip * wip * wip * wip * wip * fix: contentType encoding * wip * fix: error throwing * wip * fix * wip * fix * fix * fix: jsdom * wip * wip * fix: links summary uniqueness * wip * wip * robots-txt catch no robots.txt * deps: remove puppeteer-extra-plugin-stealth * fix: dont change waring type * fix: curl * fix: replace firebase-roundtrip-check with blackhole-detector * fix: black hole detection * sercher: black hole detecting * fix: no h2c for searcher * fix: bhd * fix: search and crawl conflict * fix: bhd * fix * fix: server script * canvas: fixed avif issue * logging: move some to debug * fix * fix: pptr declare ready only when page can be created without issues * fix: bhd * cd: cloud run deploy-health-check cannot complete pptr newPage * cd: fix * fix: curl body can be null * fix * fix * fix: major fix regarding TC pdfs * fix * fix * deps: fix civkit trie router issue * fix * boom: total restructure * cd: fix docker ctx * fix * fix: switch to h2c * cd: ensure http2
This commit is contained in:
parent
ed80c9a4a2
commit
23a3b807c9
12
.github/workflows/cd.yml
vendored
12
.github/workflows/cd.yml
vendored
@ -14,9 +14,6 @@ jobs:
|
||||
concurrency:
|
||||
group: ${{ github.ref_type == 'branch' && github.ref }}
|
||||
cancel-in-progress: true
|
||||
defaults:
|
||||
run:
|
||||
working-directory: backend/functions
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
@ -30,6 +27,8 @@ jobs:
|
||||
credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
|
||||
- name: 'Set up Cloud SDK'
|
||||
uses: 'google-github-actions/setup-gcloud@v2'
|
||||
with:
|
||||
install_components: beta
|
||||
- name: "Docker auth"
|
||||
run: |-
|
||||
gcloud auth configure-docker us-docker.pkg.dev --quiet
|
||||
@ -40,7 +39,6 @@ jobs:
|
||||
with:
|
||||
node-version: 22.12.0
|
||||
cache: npm
|
||||
cache-dependency-path: backend/functions/package-lock.json
|
||||
|
||||
- name: npm install
|
||||
run: npm ci
|
||||
@ -65,13 +63,13 @@ jobs:
|
||||
id: container
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: backend/functions
|
||||
context: .
|
||||
push: true
|
||||
tags: ${{ steps.meta.outputs.tags }}
|
||||
labels: ${{ steps.meta.outputs.labels }}
|
||||
- name: Deploy CRAWL with Tag
|
||||
run: |
|
||||
gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0
|
||||
gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
|
||||
- name: Deploy SEARCH with Tag
|
||||
run: |
|
||||
gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0
|
||||
gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
|
79
.gitignore
vendored
79
.gitignore
vendored
@ -1,4 +1,79 @@
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
firebase-debug.log*
|
||||
firebase-debug.*.log*
|
||||
|
||||
# Firebase cache
|
||||
.firebase/
|
||||
|
||||
# Firebase config
|
||||
|
||||
# Uncomment this if you'd like others to create their own Firebase project.
|
||||
# For a team working on the same Firebase project(s), it is recommended to leave
|
||||
# it commented so all members can deploy to the same project(s) in .firebaserc.
|
||||
# .firebaserc
|
||||
|
||||
# Runtime data
|
||||
pids
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
coverage
|
||||
|
||||
# nyc test coverage
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (http://nodejs.org/api/addons.html)
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
node_modules/
|
||||
|
||||
# Optional npm cache directory
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
.eslintcache
|
||||
|
||||
# Optional REPL history
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variables file
|
||||
.env
|
||||
.secret.local
|
||||
|
||||
toy*.ts
|
||||
|
||||
.DS_Store
|
||||
/package-lock.json
|
||||
backend/functions/test.js
|
||||
build/
|
||||
.firebase-emu/
|
||||
*.log
|
||||
.DS_Store
|
||||
|
||||
*.local
|
||||
.secret.*
|
||||
licensed/
|
59
.vscode/launch.json
vendored
59
.vscode/launch.json
vendored
@ -1,26 +1,6 @@
|
||||
{
|
||||
"version": "0.2.0",
|
||||
"configurations": [
|
||||
{
|
||||
"name": "Debug Fullstack: attach",
|
||||
"request": "attach",
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node",
|
||||
"preLaunchTask": "Fullstack:debug"
|
||||
},
|
||||
{
|
||||
"name": "Debug Fullstack: attach: with proxy",
|
||||
"request": "attach",
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node",
|
||||
"preLaunchTask": "Fullstack:debug:with-proxy"
|
||||
},
|
||||
{
|
||||
"name": "Attach",
|
||||
"port": 9229,
|
||||
@ -40,21 +20,44 @@
|
||||
"type": "node"
|
||||
},
|
||||
{
|
||||
"name": "Debug Fullstack",
|
||||
"name": "Debug Stand Alone Crawl",
|
||||
"request": "launch",
|
||||
"runtimeArgs": [
|
||||
"emulators:start",
|
||||
"--import=../.firebase-emu",
|
||||
"--export-on-exit=../.firebase-emu",
|
||||
"--env-file=.secret.local",
|
||||
],
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
|
||||
"env": {
|
||||
"GCLOUD_PROJECT": "reader-6b7dc",
|
||||
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
|
||||
},
|
||||
"cwd": "${workspaceFolder}",
|
||||
"program": "build/stand-alone/crawl.js",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node",
|
||||
"preLaunchTask": "Fullstack:prepare",
|
||||
"killBehavior": "polite"
|
||||
"outputCapture": "std",
|
||||
"preLaunchTask": "Backend:build:watch",
|
||||
"killBehavior": "forceful"
|
||||
},
|
||||
{
|
||||
"name": "Debug Stand Alone Search",
|
||||
"request": "launch",
|
||||
"runtimeArgs": [
|
||||
"--env-file=.secret.local",
|
||||
],
|
||||
"env": {
|
||||
"GCLOUD_PROJECT": "reader-6b7dc",
|
||||
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
|
||||
},
|
||||
"cwd": "${workspaceFolder}",
|
||||
"program": "build/stand-alone/search.js",
|
||||
"skipFiles": [
|
||||
"<node_internals>/**"
|
||||
],
|
||||
"type": "node",
|
||||
"outputCapture": "std",
|
||||
"preLaunchTask": "Backend:build:watch",
|
||||
"killBehavior": "forceful"
|
||||
},
|
||||
]
|
||||
}
|
132
.vscode/tasks.json
vendored
132
.vscode/tasks.json
vendored
@ -6,29 +6,18 @@
|
||||
"script": "build",
|
||||
"group": "build",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
"cwd": "${workspaceFolder}"
|
||||
},
|
||||
"problemMatcher": [],
|
||||
"label": "Backend:rebuild",
|
||||
"detail": "Backend:rebuild"
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "emu:reset",
|
||||
"group": "build",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
},
|
||||
"problemMatcher": [],
|
||||
"label": "Backend:reset-emulator",
|
||||
"detail": "Backend:reset-emulator"
|
||||
},
|
||||
{
|
||||
"type": "typescript",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
"cwd": "${workspaceFolder}"
|
||||
},
|
||||
"tsconfig": "backend/functions/tsconfig.json",
|
||||
"tsconfig": "tsconfig.json",
|
||||
"option": "watch",
|
||||
"isBackground": true,
|
||||
"problemMatcher": [
|
||||
@ -36,121 +25,6 @@
|
||||
],
|
||||
"group": "build",
|
||||
"label": "Backend:build:watch"
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "emu:debug",
|
||||
"group": "none",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions"
|
||||
},
|
||||
"problemMatcher": [
|
||||
{
|
||||
"base": "$tsc",
|
||||
"background": {
|
||||
"activeOnStart": false,
|
||||
"beginsPattern": "shutdown requested|Starting emulators",
|
||||
"endsPattern": "Debugger listening"
|
||||
}
|
||||
}
|
||||
],
|
||||
"label": "Backend:start-emulator-debug",
|
||||
"detail": "Backend:start-emulator-debug",
|
||||
"dependsOn": [
|
||||
"Backend:build:watch"
|
||||
],
|
||||
"isBackground": true,
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "dev",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/webapp",
|
||||
},
|
||||
"group": "build",
|
||||
"label": "Frontend:start:dev",
|
||||
"detail": "Frontend:start:dev",
|
||||
"isBackground": true,
|
||||
"problemMatcher": {
|
||||
"base": "$vite",
|
||||
"background": {
|
||||
"activeOnStart": true,
|
||||
"endsPattern": "OK",
|
||||
"beginsPattern": "vite"
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "dev",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/webapp",
|
||||
"env": {
|
||||
"FIREBASE_EMULATE": "true",
|
||||
}
|
||||
},
|
||||
"group": "build",
|
||||
"label": "Frontend:start:emu",
|
||||
"detail": "Frontend:start:emu",
|
||||
"isBackground": true,
|
||||
"problemMatcher": {
|
||||
"base": "$vite",
|
||||
"background": {
|
||||
"activeOnStart": true,
|
||||
"endsPattern": "OK",
|
||||
"beginsPattern": "vite"
|
||||
}
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "npm",
|
||||
"script": "emu:debug2",
|
||||
"group": "none",
|
||||
"options": {
|
||||
"cwd": "${workspaceFolder}/backend/functions",
|
||||
"env": {
|
||||
"https_proxy": "http://127.0.0.1:7890",
|
||||
"http_proxy": "http://127.0.0.1:7890",
|
||||
"all_proxy": "socks5://127.0.0.1:7890"
|
||||
}
|
||||
},
|
||||
"problemMatcher": [
|
||||
{
|
||||
"base": "$tsc",
|
||||
"background": {
|
||||
"activeOnStart": false,
|
||||
"beginsPattern": "shutdown requested|Starting emulators",
|
||||
"endsPattern": "Debugger listening"
|
||||
}
|
||||
}
|
||||
],
|
||||
"label": "Backend:start-emulator-debug:with-proxy",
|
||||
"detail": "Backend:start-emulator-debug:with-proxy",
|
||||
"dependsOn": [
|
||||
"Backend:build:watch"
|
||||
],
|
||||
"isBackground": true,
|
||||
},
|
||||
{
|
||||
"label": "Fullstack:prepare",
|
||||
"dependsOn": [
|
||||
"Frontend:start:emu",
|
||||
"Backend:build:watch",
|
||||
],
|
||||
},
|
||||
{
|
||||
"label": "Fullstack:debug",
|
||||
"dependsOn": [
|
||||
// "Frontend:start:emu",
|
||||
"Backend:start-emulator-debug",
|
||||
],
|
||||
},
|
||||
{
|
||||
"label": "Fullstack:debug:with-proxy",
|
||||
"dependsOn": [
|
||||
"Frontend:start:emu",
|
||||
"Backend:start-emulator-debug:with-proxy",
|
||||
],
|
||||
}
|
||||
]
|
||||
}
|
@ -158,13 +158,9 @@ curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.or
|
||||
|
||||
You will need the following tools to run the project:
|
||||
- Node v18 (The build fails for Node version >18)
|
||||
- Firebase CLI (`npm install -g firebase-tools`)
|
||||
|
||||
For backend, go to the `backend/functions` directory and install the npm dependencies.
|
||||
|
||||
```bash
|
||||
git clone git@github.com:jina-ai/reader.git
|
||||
cd backend/functions
|
||||
npm install
|
||||
```
|
||||
|
||||
|
@ -1,5 +0,0 @@
|
||||
{
|
||||
"projects": {
|
||||
"default": "reader-6b7dc"
|
||||
}
|
||||
}
|
79
backend/.gitignore
vendored
79
backend/.gitignore
vendored
@ -1,79 +0,0 @@
|
||||
# Logs
|
||||
logs
|
||||
*.log
|
||||
npm-debug.log*
|
||||
yarn-debug.log*
|
||||
yarn-error.log*
|
||||
firebase-debug.log*
|
||||
firebase-debug.*.log*
|
||||
|
||||
# Firebase cache
|
||||
.firebase/
|
||||
|
||||
# Firebase config
|
||||
|
||||
# Uncomment this if you'd like others to create their own Firebase project.
|
||||
# For a team working on the same Firebase project(s), it is recommended to leave
|
||||
# it commented so all members can deploy to the same project(s) in .firebaserc.
|
||||
# .firebaserc
|
||||
|
||||
# Runtime data
|
||||
pids
|
||||
*.pid
|
||||
*.seed
|
||||
*.pid.lock
|
||||
|
||||
# Directory for instrumented libs generated by jscoverage/JSCover
|
||||
lib-cov
|
||||
|
||||
# Coverage directory used by tools like istanbul
|
||||
coverage
|
||||
|
||||
# nyc test coverage
|
||||
.nyc_output
|
||||
|
||||
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
|
||||
.grunt
|
||||
|
||||
# Bower dependency directory (https://bower.io/)
|
||||
bower_components
|
||||
|
||||
# node-waf configuration
|
||||
.lock-wscript
|
||||
|
||||
# Compiled binary addons (http://nodejs.org/api/addons.html)
|
||||
build/Release
|
||||
|
||||
# Dependency directories
|
||||
node_modules/
|
||||
|
||||
# Optional npm cache directory
|
||||
.npm
|
||||
|
||||
# Optional eslint cache
|
||||
.eslintcache
|
||||
|
||||
# Optional REPL history
|
||||
.node_repl_history
|
||||
|
||||
# Output of 'npm pack'
|
||||
*.tgz
|
||||
|
||||
# Yarn Integrity file
|
||||
.yarn-integrity
|
||||
|
||||
# dotenv environment variables file
|
||||
.env
|
||||
.secret.local
|
||||
|
||||
toy*.ts
|
||||
|
||||
.DS_Store
|
||||
build/
|
||||
.firebase-emu/
|
||||
*.log
|
||||
.DS_Store
|
||||
|
||||
*.local
|
||||
.secret.*
|
||||
licensed/
|
@ -1,43 +0,0 @@
|
||||
{
|
||||
"firestore": {
|
||||
"rules": "firestore.rules",
|
||||
"indexes": "firestore.indexes.json"
|
||||
},
|
||||
"functions": [
|
||||
{
|
||||
"source": "functions",
|
||||
"codebase": "default",
|
||||
"ignore": [
|
||||
"node_modules",
|
||||
"src",
|
||||
".git",
|
||||
"*.log",
|
||||
"*.local",
|
||||
".secret.*",
|
||||
".firebase-emu"
|
||||
],
|
||||
"predeploy": [
|
||||
"npm --prefix \"$RESOURCE_DIR\" run build:clean",
|
||||
"npm --prefix \"$RESOURCE_DIR\" run build"
|
||||
]
|
||||
}
|
||||
],
|
||||
"storage": {
|
||||
"rules": "storage.rules"
|
||||
},
|
||||
"emulators": {
|
||||
"ui": {
|
||||
"enabled": true
|
||||
},
|
||||
"singleProjectMode": true,
|
||||
"functions": {
|
||||
"port": 5001
|
||||
},
|
||||
"firestore": {
|
||||
"port": 9098
|
||||
},
|
||||
"storage": {
|
||||
"port": 9097
|
||||
}
|
||||
}
|
||||
}
|
@ -1,19 +0,0 @@
|
||||
{
|
||||
"indexes": [
|
||||
{
|
||||
"collectionGroup": "prompts",
|
||||
"queryScope": "COLLECTION_GROUP",
|
||||
"fields": [
|
||||
{
|
||||
"fieldPath": "id",
|
||||
"order": "ASCENDING"
|
||||
},
|
||||
{
|
||||
"fieldPath": "isPublic",
|
||||
"order": "ASCENDING"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"fieldOverrides": []
|
||||
}
|
@ -1,32 +0,0 @@
|
||||
rules_version = '2';
|
||||
service cloud.firestore {
|
||||
match /databases/{database}/documents {
|
||||
// match /questions/{document=**} {
|
||||
// allow read: if request.auth != null
|
||||
// }
|
||||
|
||||
// match /answers/{userId}/profiles/default {
|
||||
// allow read, write: if request.auth != null && request.auth.uid == userId
|
||||
// }
|
||||
|
||||
match /credits/{userId}/{document=**} {
|
||||
allow read: if request.auth != null && request.auth.uid == userId
|
||||
}
|
||||
|
||||
match /users/{userId}/prompts/{document=**} {
|
||||
allow read: if request.auth != null && request.auth.uid == userId
|
||||
}
|
||||
|
||||
// match /users/{userId}/profiles/{document=**} {
|
||||
// allow read: if request.auth != null && request.auth.uid == userId
|
||||
// }
|
||||
|
||||
match /users/{userId}/creditHistory/{document=**} {
|
||||
allow read: if request.auth != null && request.auth.uid == userId
|
||||
}
|
||||
|
||||
match /{document=**} {
|
||||
allow read, write: if false;
|
||||
}
|
||||
}
|
||||
}
|
@ -1 +0,0 @@
|
||||
node_modules/
|
@ -1,36 +0,0 @@
|
||||
root = true
|
||||
|
||||
[*]
|
||||
end_of_line = lf
|
||||
charset = utf-8
|
||||
indent_style = space
|
||||
insert_final_newline = true
|
||||
trim_trailing_whitespace = true
|
||||
indent_size = 4
|
||||
quote_type = single
|
||||
max_line_length = 120
|
||||
|
||||
[*.py]
|
||||
indent_size = 4
|
||||
|
||||
[*.ts]
|
||||
indent_size = 4
|
||||
|
||||
[*.js]
|
||||
indent_size = 2
|
||||
|
||||
[*.vue]
|
||||
indent_size = 2
|
||||
|
||||
[*.*sx]
|
||||
indent_size = 2
|
||||
|
||||
[*.*ml]
|
||||
indent_size = 2
|
||||
|
||||
[*.json]
|
||||
indent_size = 2
|
||||
|
||||
[*.md]
|
||||
indent_size = 2
|
||||
trim_trailing_whitespace = false
|
@ -1,9 +0,0 @@
|
||||
const { join } = require('path');
|
||||
|
||||
/**
|
||||
* @type {import("puppeteer").Configuration}
|
||||
*/
|
||||
module.exports = {
|
||||
// Changes the cache location for Puppeteer: the cache directory is placed
// under this file's own directory, inside node_modules/puppeteer, in a
// sub-directory named 'walk-around-lame-gcp-build'.
|
||||
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
|
||||
};
|
@ -1,93 +0,0 @@
|
||||
{
|
||||
"name": "reader",
|
||||
"scripts": {
|
||||
"lint": "eslint --ext .js,.ts .",
|
||||
"build": "node ./integrity-check.cjs && tsc -p .",
|
||||
"build:watch": "tsc --watch",
|
||||
"build:clean": "rm -rf ./build",
|
||||
"shell": "npm run build && firebase functions:shell",
|
||||
"emu:stage": "cd .. && tar -czvf firebase-emu-preset.tgz .firebase-emu",
|
||||
"emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../",
|
||||
"emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit",
|
||||
"emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
|
||||
"emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
|
||||
"emu:kill": "killall java",
|
||||
"serve": "npm run build && npm run emu:start",
|
||||
"debug": "npm run build && npm run emu:start -- --inspect-functions",
|
||||
"from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit",
|
||||
"from-preset": "npm run build && npm run emu:reset && npm run emu:start",
|
||||
"start": "npm run shell",
|
||||
"deploy": "firebase deploy --only functions",
|
||||
"logs": "firebase functions:log",
|
||||
"gcp-build": "node node_modules/puppeteer/install.mjs"
|
||||
},
|
||||
"engines": {
|
||||
"node": "20"
|
||||
},
|
||||
"main": "build/index.js",
|
||||
"dependencies": {
|
||||
"@esm2cjs/normalize-url": "^8.0.0",
|
||||
"@google-cloud/translate": "^8.2.0",
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"@napi-rs/canvas": "^0.1.67",
|
||||
"@types/turndown": "^5.0.4",
|
||||
"@xmldom/xmldom": "^0.9.3",
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.3-3e69606",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
"express": "^4.19.2",
|
||||
"firebase-admin": "^12.1.0",
|
||||
"firebase-functions": "^6.1.1",
|
||||
"htmlparser2": "^9.0.0",
|
||||
"jose": "^5.1.0",
|
||||
"langdetect": "^0.2.1",
|
||||
"linkedom": "^0.18.4",
|
||||
"maxmind": "^4.3.18",
|
||||
"minio": "^7.1.3",
|
||||
"node-libcurl": "^4.1.0",
|
||||
"openai": "^4.20.0",
|
||||
"pdfjs-dist": "^4.2.67",
|
||||
"puppeteer": "^23.3.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
||||
"puppeteer-page-proxy": "^1.3.0",
|
||||
"robots-parser": "^3.0.1",
|
||||
"set-cookie-parser": "^2.6.0",
|
||||
"simple-zstd": "^1.4.2",
|
||||
"stripe": "^11.11.0",
|
||||
"tiktoken": "^1.0.16",
|
||||
"tld-extract": "^2.1.0",
|
||||
"turndown": "^7.1.3",
|
||||
"turndown-plugin-gfm": "^1.0.2",
|
||||
"undici": "^5.24.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/archiver": "^5.3.4",
|
||||
"@types/bcrypt": "^5.0.0",
|
||||
"@types/busboy": "^1.5.4",
|
||||
"@types/cors": "^2.8.17",
|
||||
"@types/generic-pool": "^3.8.1",
|
||||
"@types/node": "^20.14.13",
|
||||
"@types/set-cookie-parser": "^2.4.7",
|
||||
"@types/xmldom": "^0.1.34",
|
||||
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
||||
"@typescript-eslint/parser": "^5.12.0",
|
||||
"eslint": "^8.9.0",
|
||||
"eslint-config-google": "^0.14.0",
|
||||
"eslint-plugin-import": "^2.25.4",
|
||||
"firebase-functions-test": "^3.0.0",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"replicate": "^0.16.1",
|
||||
"typescript": "^5.5.4"
|
||||
},
|
||||
"private": true,
|
||||
"exports": {
|
||||
".": "./build/index.js"
|
||||
}
|
||||
}
|
@ -1,218 +0,0 @@
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { singleton } from 'tsyringe';
|
||||
|
||||
import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
|
||||
import { PageSnapshot, ScrappingOptions } from './puppeteer';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { JSDomControl } from './jsdom';
|
||||
import { AssertionFailureError, FancyFile } from 'civkit';
|
||||
import { TempFileManager } from '../shared';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { pathToFileURL } from 'url';
|
||||
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
|
||||
import { ZSTDDecompress } from 'simple-zstd';
|
||||
|
||||
/**
 * Fetches a URL directly through libcurl (no browser) and converts the
 * response into a `PageSnapshot`.
 *
 * The request is dressed with Chrome-like headers. The response body is
 * decompressed according to its `Content-Encoding`, spooled into a temp file,
 * and then interpreted according to its content type (HTML, text/JSON, PDF, image).
 */
@singleton()
export class CurlControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected jsdomControl: JSDomControl,
        protected tempFileManager: TempFileManager,
    ) {
        super(...arguments);
    }

    /**
     * Waits for the injected dependencies, then signals readiness.
     */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Installs a Chrome-like header set on the given curl handle.
     *
     * @param curl - Handle to configure; it is mutated and returned.
     * @param headers - Caller headers. They are spread after the built-in ones, so an
     *   entry with the exact same key replaces the built-in value.
     * @param chromeVersion - Major Chrome version advertised in `sec-ch-ua` and `User-Agent`.
     * @returns The same `curl` handle, for chaining.
     */
    curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
        const mixinHeaders = {
            // Client-hint header names start with `sec-`, and the brand list is a list of
            // quoted strings, so the first brand needs its opening quote as well.
            'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
            'sec-ch-ua-mobile': '?0',
            // The platform is a structured-field string and is sent quoted.
            'sec-ch-ua-platform': '"Windows"',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'en-US,en;q=0.9',
        };

        const headerLines = Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`);
        curl.setOpt(Curl.option.HTTPHEADER, headerLines);

        return curl;
    }

    /**
     * Downloads `urlToCrawl` with curl and builds a snapshot from the response.
     *
     * @param urlToCrawl - Target URL.
     * @param crawlOpts - Optional request tuning read here: `timeoutMs` (capped at 10s),
     *   `overrideUserAgent`, `extraHeaders`, `proxyUrl`, `cookies`, `referer`. The whole
     *   object is also forwarded to `jsdomControl.narrowSnapshot`.
     * @param throwOnNon200 - When true, a status outside 200-299 raises instead of being parsed.
     * @returns The snapshot produced by `jsdomControl.narrowSnapshot`.
     * @throws AssertionFailureError on transport errors, a missing content-type, an
     *   unsupported content type, an HTML/text body above 32 MiB, or (with
     *   `throwOnNon200`) a non-2xx status.
     */
    async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
        const snapshot = {
            href: urlToCrawl.toString(),
            html: '',
            title: '',
            text: '',
        } as PageSnapshot;

        let contentType = '';
        const result = await new Promise<{
            statusCode: number,
            data?: FancyFile,
            headers: Buffer | HeaderInfo[],
        }>((resolve, reject) => {
            const curl = new Curl();
            curl.enable(CurlFeature.StreamResponse);
            curl.setOpt('URL', urlToCrawl.toString());
            curl.setOpt(Curl.option.FOLLOWLOCATION, true);

            curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));

            // A `User-Agent` entry in the HTTPHEADER list takes precedence over the
            // USERAGENT option, so the override must also be part of the header list;
            // otherwise the impersonation default would always win.
            const callerHeaders: Record<string, unknown> = {};
            if (crawlOpts?.overrideUserAgent) {
                curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
                callerHeaders['User-Agent'] = crawlOpts.overrideUserAgent;
            }
            Object.assign(callerHeaders, crawlOpts?.extraHeaders);

            this.curlImpersonateHeader(curl, callerHeaders);

            if (crawlOpts?.proxyUrl) {
                curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
            }
            if (crawlOpts?.cookies?.length) {
                const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`);
                curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; '));
            }
            if (crawlOpts?.referer) {
                curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
            }

            curl.on('end', (statusCode, _data, headers) => {
                this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
                curl.close();
            });

            curl.on('error', (err) => {
                curl.close();
                this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
                reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
            });
            curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB

            let status = -1;
            let contentEncoding = '';
            curl.on('stream', (stream, statusCode, headers) => {
                status = statusCode;

                // Only the header set of the last response is inspected.
                const lastResHeaders = headers[headers.length - 1];
                for (const [k, v] of Object.entries(lastResHeaders)) {
                    const kl = k.toLowerCase();
                    if (kl === 'content-type') {
                        contentType = v.toLowerCase();
                    }
                    if (kl === 'content-encoding') {
                        contentEncoding = v.toLowerCase();
                    }
                    if (contentType && contentEncoding) {
                        break;
                    }
                }

                if (!contentType) {
                    reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
                    stream.destroy();
                    return;
                }

                if (contentType.startsWith('image/')) {
                    // Images are not downloaded; a minimal page embedding the image URL stands in.
                    snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
                    stream.destroy();
                    resolve({
                        statusCode: status,
                        headers,
                    });
                    return;
                }

                const makeDecoder = () => {
                    switch (contentEncoding) {
                        case 'gzip':
                            return createGunzip();
                        case 'deflate':
                            return createInflate();
                        case 'br':
                            return createBrotliDecompress();
                        case 'zstd':
                            return ZSTDDecompress();
                        default:
                            return undefined;
                    }
                };

                let payload: typeof stream = stream;
                const decoder = makeDecoder();
                if (decoder) {
                    // pipe() does not propagate source errors. Forward them so a failed
                    // transfer fails the decoded stream instead of surfacing as an
                    // unhandled 'error' event or leaving the body pending forever.
                    stream.once('error', (err) => decoder.destroy(err));
                    stream.pipe(decoder);
                    payload = decoder;
                }

                const fpath = this.tempFileManager.alloc();
                const fancyFile = FancyFile.auto(payload, fpath);
                this.tempFileManager.bindPathTo(fancyFile, fpath);
                resolve({
                    statusCode: status,
                    data: fancyFile,
                    headers,
                });
            });

            curl.perform();
        });

        if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
            throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
        }

        if (contentType === 'application/octet-stream') {
            // Content declared as binary is same as unknown.
            contentType = '';
        }

        if (result.data) {
            const maxTextualSize = 1024 * 1024 * 32;
            // Fall back to the type reported by the file itself when the server gave none.
            const mimeType: string = contentType || await result.data.mimeType;
            if (mimeType.startsWith('text/html')) {
                if ((await result.data.size) > maxTextualSize) {
                    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
                }
                snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
            } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
                if ((await result.data.size) > maxTextualSize) {
                    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
                }
                snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
                // NOTE(review): the text is embedded without HTML escaping, so markup-like
                // content will be parsed as markup downstream — confirm this is intended.
                snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${snapshot.text}</pre></body></html>`;
            } else if (mimeType.startsWith('application/pdf')) {
                snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
            } else {
                throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
            }
        }

        const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);

        return curlSnapshot!;
    }

}
|
@ -1 +0,0 @@
|
||||
../../../thinapps-shared/backend
|
@ -1,168 +0,0 @@
|
||||
import 'reflect-metadata';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { initializeApp, applicationDefault } from 'firebase-admin/app';
|
||||
|
||||
// Provide a default FIREBASE_CONFIG only when the environment has none, then
// initialise the Firebase Admin app.
if (process.env['FIREBASE_CONFIG'] == null) {
    const projectId = process.env['GCLOUD_PROJECT'] || 'reader-6b7dc';

    process.env['FIREBASE_CONFIG'] = JSON.stringify({
        projectId,
        storageBucket: `${projectId}.appspot.com`,
        credential: applicationDefault(),
    });
}

initializeApp();
|
||||
|
||||
|
||||
import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
|
||||
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
|
||||
import { ExpressServer } from 'civkit/civ-rpc/express';
|
||||
import http2 from 'http2';
|
||||
import { CrawlerHost } from '../cloud-functions/crawler';
|
||||
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import { mimeOfExt } from 'civkit/mime';
|
||||
import { NextFunction, Request, Response } from 'express';
|
||||
|
||||
// Rejections nobody awaited are reported but do not stop the process.
process.on('unhandledRejection', (reason) => {
    console.error('Unhandled rejection', reason);
});

// Uncaught exceptions are reported and then the process is terminated.
process.on('uncaughtException', (error) => {
    console.log('Uncaught exception', error);

    // The Firebase runtime does not appear to handle this properly,
    // so exit explicitly on the next tick with a failure code.
    console.error('Uncaught exception, process quit.');
    process.nextTick(() => {
        process.exit(1);
    });
});
|
||||
|
||||
@singleton()
|
||||
export class CrawlStandAloneServer extends ExpressServer {
|
||||
/** Static files found by `walkForAssets`, keyed by their relative path. */
assets = new Map<string, WalkOutEntity>();

/** Holds the previous `httpServer` once `h2c()` has swapped in an HTTP/2 server. */
httpAlternativeServer?: typeof this['httpServer'];

/** Child logger tagged with this class name. */
logger = this.globalLogger.child({ service: this.constructor.name });

/**
 * Wires up the injected services, kicks off the registry (failures are
 * deliberately ignored here) and stamps its title and version.
 */
constructor(
    protected globalLogger: Logger,
    protected registry: CloudFunctionRegistry,
    protected crawlerHost: CrawlerHost,
    protected threadLocal: AsyncContext,
) {
    super(...arguments);

    registry.allHandsOnDeck().catch(() => undefined);

    registry.title = 'reader';
    registry.version = '0.1.0';
}
|
||||
|
||||
h2c() {
|
||||
this.httpAlternativeServer = this.httpServer;
|
||||
this.httpServer = http2.createServer(this.expressApp);
|
||||
// useResourceBasedDefaultTracker();
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
override async init() {
|
||||
await this.walkForAssets();
|
||||
await super.init();
|
||||
}
|
||||
|
||||
async walkForAssets() {
|
||||
const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
|
||||
|
||||
for (const file of files) {
|
||||
if (file.type !== 'file') {
|
||||
continue;
|
||||
}
|
||||
this.assets.set(file.relativePath.toString(), file);
|
||||
}
|
||||
}
|
||||
|
||||
makeAssetsServingController() {
|
||||
return (req: Request, res: Response, next: NextFunction) => {
|
||||
const requestPath = req.url;
|
||||
const file = requestPath.slice(1);
|
||||
if (!file) {
|
||||
return next();
|
||||
}
|
||||
|
||||
const asset = this.assets.get(file);
|
||||
if (asset?.type !== 'file') {
|
||||
return next();
|
||||
}
|
||||
res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
|
||||
res.set('Content-Length', asset.stats.size.toString());
|
||||
fs.createReadStream(asset.path).pipe(res);
|
||||
|
||||
return;
|
||||
};
|
||||
}
|
||||
|
||||
makeMiscMiddleware() {
|
||||
return (req: Request, res: Response, next: NextFunction) => {
|
||||
if (req.method === 'OPTIONS') {
|
||||
return res.status(200).end();
|
||||
}
|
||||
this.threadLocal.set('ip', req.ip);
|
||||
|
||||
return next();
|
||||
};
|
||||
}
|
||||
|
||||
override listen(port: number) {
|
||||
const r = super.listen(port);
|
||||
if (this.httpAlternativeServer) {
|
||||
const altPort = port + 1;
|
||||
this.httpAlternativeServer.listen(altPort, () => {
|
||||
this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
|
||||
});
|
||||
}
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
override registerRoutes(): void {
|
||||
|
||||
const openAPIManager = new OpenAPIManager();
|
||||
openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!);
|
||||
const openapiJsonPath = '/openapi.json';
|
||||
this.expressRootRouter.get(openapiJsonPath, (req, res) => {
|
||||
const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
|
||||
baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
|
||||
baseURL.search = '';
|
||||
const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
|
||||
info: {
|
||||
title: this.registry.title,
|
||||
description: `${this.registry.title} openAPI documentations`,
|
||||
'x-logo': {
|
||||
url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
|
||||
}
|
||||
}
|
||||
}, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
|
||||
res.statusCode = 200;
|
||||
res.end(JSON.stringify(content));
|
||||
});
|
||||
|
||||
this.expressRootRouter.use('/',
|
||||
...this.registry.expressMiddlewares,
|
||||
this.makeAssetsServingController(),
|
||||
this.makeMiscMiddleware(),
|
||||
this.registry.makeShimController('crawl')
|
||||
);
|
||||
}
|
||||
|
||||
protected override featureSelect(): void {
|
||||
this.insertAsyncHookMiddleware();
|
||||
this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
|
||||
this.insertLogRequestsMiddleware();
|
||||
this.registerOpenAPIDocsRoutes('/docs');
|
||||
|
||||
this.registerRoutes();
|
||||
}
|
||||
}
|
||||
// Resolve the singleton server through the DI container and expose it as the module default.
const instance = container.resolve(CrawlStandAloneServer);

export default instance;

// Once the service reports ready, listen on $PORT, falling back to 3000 when it is unset or not a number.
instance.serviceReady().then((readyServer) => {
    const requestedPort = parseInt(process.env.PORT || '');

    return readyServer.listen(requestedPort || 3000);
});
|
@ -1,168 +0,0 @@
|
||||
import 'reflect-metadata';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { initializeApp, applicationDefault } from 'firebase-admin/app';
|
||||
|
||||
// Seed FIREBASE_CONFIG only when the environment has not supplied one already,
// then initialise the default Firebase app with no explicit options.
if (process.env['FIREBASE_CONFIG'] == null) {
    // Project id falls back to the hard-coded default when GCLOUD_PROJECT is unset or empty.
    const projectId = process.env['GCLOUD_PROJECT'] || 'reader-6b7dc';
    const firebaseConfig = {
        projectId,
        storageBucket: `${projectId}.appspot.com`,
        credential: applicationDefault(),
    };

    process.env['FIREBASE_CONFIG'] = JSON.stringify(firebaseConfig);
}

initializeApp();
|
||||
|
||||
|
||||
import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
|
||||
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
|
||||
import { ExpressServer } from 'civkit/civ-rpc/express';
|
||||
import http2 from 'http2';
|
||||
import { SearcherHost } from '../cloud-functions/searcher-serper';
|
||||
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import { mimeOfExt } from 'civkit/mime';
|
||||
import { NextFunction, Request, Response } from 'express';
|
||||
|
||||
// Unhandled promise rejections are reported on stderr; the process keeps running.
process.on('unhandledRejection', (err) => {
    console.error('Unhandled rejection', err);
});

// An uncaught exception is fatal: report it, then exit with a non-zero code.
process.on('uncaughtException', (err) => {
    // Send the error itself to stderr as well, so the stack trace lands in the
    // same stream as the "process quit" notice instead of being split to stdout.
    console.error('Uncaught exception', err);

    // Do not rely on the hosting runtime to terminate a broken process.
    // Make sure to quit the process.
    console.error('Uncaught exception, process quit.');
    // Exit on the next tick so the synchronous logging above completes first.
    process.nextTick(() => process.exit(1));
});
|
||||
|
||||
/**
 * Stand-alone Express server that exposes the `search` RPC of the shared registry.
 *
 * In addition to the search controller it serves the files found under the
 * `public` directory (resolved two levels above this module) and an
 * `/openapi.json` document describing the search endpoint.
 */
@singleton()
export class SearchStandAloneServer extends ExpressServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** The server that was primary before `h2c()` swapped it out; bound to `port + 1` by `listen()`. */
    httpAlternativeServer?: typeof this['httpServer'];
    /** Static files under `public`, keyed by their path relative to that directory. */
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: Logger,
        protected registry: CloudFunctionRegistry,
        protected searcherHost: SearcherHost,
        protected threadLocal: AsyncContext,
    ) {
        super(...arguments);

        // Fire-and-forget: a rejection from the registry start-up is swallowed here.
        registry.allHandsOnDeck().catch(() => void 0);
        registry.title = 'reader';
        registry.version = '0.1.0';
    }

    /**
     * Replaces the primary server with a cleartext HTTP/2 server built on the
     * same Express app, keeping the previous server as `httpAlternativeServer`.
     *
     * @returns this instance, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        this.httpServer = http2.createServer(this.expressApp);
        // useResourceBasedDefaultTracker();

        return this;
    }

    /** Indexes the static assets before running the base-class initialisation. */
    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    /** Walks the `public` directory and records every regular file in `assets`. */
    async walkForAssets() {
        const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));

        for (const file of files) {
            if (file.type !== 'file') {
                continue;
            }
            this.assets.set(file.relativePath.toString(), file);
        }
    }

    /**
     * Builds the middleware that serves files indexed by `walkForAssets()`.
     *
     * Requests whose path does not match an indexed file fall through to the
     * next middleware.
     */
    makeAssetsServingController() {
        return (req: Request, res: Response, next: NextFunction) => {
            // Match on the path only: `req.url` still carries the query string,
            // so a request such as `/favicon.ico?v=2` would never hit the asset map.
            const file = req.path.slice(1);
            if (!file) {
                return next();
            }

            const asset = this.assets.get(file);
            if (asset?.type !== 'file') {
                return next();
            }
            res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
            res.set('Content-Length', asset.stats.size.toString());

            const stream = fs.createReadStream(asset.path);
            // Without a listener a read failure (e.g. the file vanished after the
            // walk) becomes an uncaught exception and takes the whole process down.
            stream.once('error', (err) => {
                if (res.headersSent) {
                    // Part of the body is already out; all we can do is abort the response.
                    res.destroy(err);

                    return;
                }
                res.removeHeader('Content-Length');
                next(err);
            });
            // `pipe()` does not close the source when the client goes away early.
            res.once('close', () => stream.destroy());
            stream.pipe(res);

            return;
        };
    }

    /**
     * Builds the middleware that answers every `OPTIONS` request with an empty
     * 200 and stores the caller's IP under `ip` in the async context.
     */
    makeMiscMiddleware() {
        return (req: Request, res: Response, next: NextFunction) => {
            if (req.method === 'OPTIONS') {
                return res.status(200).end();
            }
            this.threadLocal.set('ip', req.ip);

            return next();
        };
    }

    /**
     * Starts the primary server on `port` and, when `h2c()` was called, the
     * alternative server on `port + 1`.
     *
     * @param port port for the primary server.
     * @returns whatever the base-class `listen` returns.
     */
    override listen(port: number) {
        const r = super.listen(port);
        if (this.httpAlternativeServer) {
            const altPort = port + 1;
            this.httpAlternativeServer.listen(altPort, () => {
                this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
            });
        }

        return r;
    }

    /** Registers `/openapi.json` and the catch-all middleware chain ending in the `search` controller. */
    override registerRoutes(): void {

        const openAPIManager = new OpenAPIManager();
        openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);
        const openapiJsonPath = '/openapi.json';
        this.expressRootRouter.get(openapiJsonPath, (req, res) => {
            // Derive the API base URL from the request by dropping the
            // `/openapi.json` suffix, trailing slashes and the query string.
            const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
            baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
            baseURL.search = '';
            const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
                info: {
                    title: this.registry.title,
                    description: `${this.registry.title} openAPI documentations`,
                    'x-logo': {
                        url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
                    }
                }
            }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
            res.statusCode = 200;
            // Declare the payload type; `res.end()` does not set one by itself.
            res.type('application/json');
            res.end(JSON.stringify(content));
        });

        // Order matters: registry middlewares, misc handling, static assets, then search.
        this.expressRootRouter.use('/',
            ...this.registry.expressMiddlewares,
            this.makeMiscMiddleware(),
            this.makeAssetsServingController(),
            this.registry.makeShimController('search')
        );
    }

    /** Selects the base-class middlewares to install, then registers this server's own routes. */
    protected override featureSelect(): void {
        this.insertAsyncHookMiddleware();
        this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
        this.insertLogRequestsMiddleware();
        this.registerOpenAPIDocsRoutes('/docs');

        this.registerRoutes();
    }
}
|
||||
// Resolve the singleton server through the DI container and expose it as the module default.
const instance = container.resolve(SearchStandAloneServer);

export default instance;

// Once the service reports ready, listen on $PORT, falling back to 3000 when it is unset or not a number.
instance.serviceReady().then((readyServer) => {
    const requestedPort = parseInt(process.env.PORT || '');

    return readyServer.listen(requestedPort || 3000);
});
|
@ -1,8 +0,0 @@
|
||||
// Firebase Storage security rules: every read and write on every object path is denied.
rules_version = '2';
|
||||
service firebase.storage {
|
||||
match /b/{bucket}/o {
|
||||
// Recursive wildcard: applies to all object paths in the bucket.
match /{allPaths=**} {
|
||||
// The condition is the constant `false`, so no request is ever allowed by this rule.
allow read, write: if false;
|
||||
}
|
||||
}
|
||||
}
|
238
backend/functions/package-lock.json → package-lock.json
generated
238
backend/functions/package-lock.json → package-lock.json
generated
@ -8,15 +8,16 @@
|
||||
"dependencies": {
|
||||
"@esm2cjs/normalize-url": "^8.0.0",
|
||||
"@google-cloud/translate": "^8.2.0",
|
||||
"@koa/bodyparser": "^5.1.1",
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"@napi-rs/canvas": "^0.1.67",
|
||||
"@napi-rs/canvas": "^0.1.68",
|
||||
"@types/turndown": "^5.0.4",
|
||||
"@xmldom/xmldom": "^0.9.3",
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.3-3e69606",
|
||||
"civkit": "^0.8.4-32482a3",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -31,7 +32,7 @@
|
||||
"minio": "^7.1.3",
|
||||
"node-libcurl": "^4.1.0",
|
||||
"openai": "^4.20.0",
|
||||
"pdfjs-dist": "^4.2.67",
|
||||
"pdfjs-dist": "^4.10.38",
|
||||
"puppeteer": "^23.3.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||
@ -53,6 +54,7 @@
|
||||
"@types/busboy": "^1.5.4",
|
||||
"@types/cors": "^2.8.17",
|
||||
"@types/generic-pool": "^3.8.1",
|
||||
"@types/koa": "^2.15.0",
|
||||
"@types/node": "^20.14.13",
|
||||
"@types/set-cookie-parser": "^2.4.7",
|
||||
"@types/xmldom": "^0.1.34",
|
||||
@ -62,6 +64,7 @@
|
||||
"eslint-config-google": "^0.14.0",
|
||||
"eslint-plugin-import": "^2.25.4",
|
||||
"firebase-functions-test": "^3.0.0",
|
||||
"koa": "^2.16.0",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"replicate": "^0.16.1",
|
||||
"typescript": "^5.5.4"
|
||||
@ -1626,6 +1629,23 @@
|
||||
"url": "https://opencollective.com/js-sdsl"
|
||||
}
|
||||
},
|
||||
"node_modules/@koa/bodyparser": {
|
||||
"version": "5.1.1",
|
||||
"resolved": "https://registry.npmjs.org/@koa/bodyparser/-/bodyparser-5.1.1.tgz",
|
||||
"integrity": "sha512-ZBF49xqNVxnmJ+8iXegq+fXPQm9RSX8giNl/aXS5rW1VpNct92wnFbGR/47vfoRJVLARGQ4HVL4WaQ0u8IJVoA==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"co-body": "^6.1.0",
|
||||
"lodash.merge": "^4.6.2",
|
||||
"type-is": "^1.6.18"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">= 16"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"koa": "^2.14.1"
|
||||
}
|
||||
},
|
||||
"node_modules/@koa/router": {
|
||||
"version": "12.0.1",
|
||||
"resolved": "https://registry.npmjs.org/@koa/router/-/router-12.0.1.tgz",
|
||||
@ -1679,30 +1699,30 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.67.tgz",
|
||||
"integrity": "sha512-VA4Khm/5Kg2bQGx3jXotTC4MloOG8b1Ung80exafUK0k5u6yJmIz3Q2iXeeWZs5weV+LQOEB+CPKsYwEYaGAjw==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.68.tgz",
|
||||
"integrity": "sha512-LQESrePLEBLvhuFkXx9jjBXRC2ClYsO5mqQ1m/puth5z9SOuM3N/B3vDuqnC3RJFktDktyK9khGvo7dTkqO9uQ==",
|
||||
"license": "MIT",
|
||||
"engines": {
|
||||
"node": ">= 10"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"@napi-rs/canvas-android-arm64": "0.1.67",
|
||||
"@napi-rs/canvas-darwin-arm64": "0.1.67",
|
||||
"@napi-rs/canvas-darwin-x64": "0.1.67",
|
||||
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.67",
|
||||
"@napi-rs/canvas-linux-arm64-gnu": "0.1.67",
|
||||
"@napi-rs/canvas-linux-arm64-musl": "0.1.67",
|
||||
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.67",
|
||||
"@napi-rs/canvas-linux-x64-gnu": "0.1.67",
|
||||
"@napi-rs/canvas-linux-x64-musl": "0.1.67",
|
||||
"@napi-rs/canvas-win32-x64-msvc": "0.1.67"
|
||||
"@napi-rs/canvas-android-arm64": "0.1.68",
|
||||
"@napi-rs/canvas-darwin-arm64": "0.1.68",
|
||||
"@napi-rs/canvas-darwin-x64": "0.1.68",
|
||||
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.68",
|
||||
"@napi-rs/canvas-linux-arm64-gnu": "0.1.68",
|
||||
"@napi-rs/canvas-linux-arm64-musl": "0.1.68",
|
||||
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.68",
|
||||
"@napi-rs/canvas-linux-x64-gnu": "0.1.68",
|
||||
"@napi-rs/canvas-linux-x64-musl": "0.1.68",
|
||||
"@napi-rs/canvas-win32-x64-msvc": "0.1.68"
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-android-arm64": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.67.tgz",
|
||||
"integrity": "sha512-W+3DFG5h0WU8Vqqb3W5fNmm5/TPH5ECZRinQDK4CAKFSUkc4iZcDwrmyFG9sB4KdHazf1mFVHCpEeVMO6Mk6Zg==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.68.tgz",
|
||||
"integrity": "sha512-h1KcSR4LKLfRfzeBH65xMxbWOGa1OtMFQbCMVlxPCkN1Zr+2gK+70pXO5ktojIYcUrP6KDcOwoc8clho5ccM/w==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@ -1716,9 +1736,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-darwin-arm64": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.67.tgz",
|
||||
"integrity": "sha512-xzrv7QboI47yhIHR5P5u/9KGswokuOKLiKSukr1Ku03RRJxP6lGuVtrAZAgdRg7F9FsuF2REf2yK53YVb6pMlA==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.68.tgz",
|
||||
"integrity": "sha512-/VURlrAD4gDoxW1GT/b0nP3fRz/fhxmHI/xznTq2FTwkQLPOlLkDLCvTmQ7v6LtGKdc2Ed6rvYpRan+JXThInQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@ -1732,9 +1752,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-darwin-x64": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.67.tgz",
|
||||
"integrity": "sha512-SNk9lYBr84N0gW8MZ2IrjygFtbFBILr3SEqMdHzHHuph20SQmssFvJGPZwSSCMEyKAvyqhogbmlew0te5Z4w9Q==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.68.tgz",
|
||||
"integrity": "sha512-tEpvGR6vCLTo1Tx9wmDnoOKROpw57wiCWwCpDOuVlj/7rqEJOUYr9ixW4aRJgmeGBrZHgevI0EURys2ER6whmg==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@ -1748,9 +1768,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.67.tgz",
|
||||
"integrity": "sha512-qmBlSvUpl567bzH8tNXi82u5FrL4d0qINqd6K9O7GWGGGFmKMJdrgi2/SW3wwCTxqHBasIDdVWc4KSJfwyaoDQ==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.68.tgz",
|
||||
"integrity": "sha512-U9xbJsumPOiAYeAFZMlHf62b9dGs2HJ6Q5xt7xTB0uEyPeurwhgYBWGgabdsEidyj38YuzI/c3LGBbSQB3vagw==",
|
||||
"cpu": [
|
||||
"arm"
|
||||
],
|
||||
@ -1764,9 +1784,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.67.tgz",
|
||||
"integrity": "sha512-k3nAPQefkMeFuJ65Rqdnx92KX1JXQhEKjjWeKsCJB+7sIBgQUWtHo9c3etfVLv5pkWJJDFi/Zc2soNkH3E8dRA==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.68.tgz",
|
||||
"integrity": "sha512-KFkn8wEm3mPnWD4l8+OUUkxylSJuN5q9PnJRZJgv15RtCA1bgxIwTkBhI/+xuyVMcHqON9sXq7cDkEJtHm35dg==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@ -1780,9 +1800,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.67.tgz",
|
||||
"integrity": "sha512-lZwHWR1cCP408l86n3Qbs3X1oFeAYMjJIQvQl1VMZh6wo5PfI+jaZSKBUOd8x44TnVllX9yhLY9unNRztk/sUQ==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.68.tgz",
|
||||
"integrity": "sha512-IQzts91rCdOALXBWQxLZRCEDrfFTGDtNRJMNu+2SKZ1uT8cmPQkPwVk5rycvFpvgAcmiFiOSCp1aRrlfU8KPpQ==",
|
||||
"cpu": [
|
||||
"arm64"
|
||||
],
|
||||
@ -1796,9 +1816,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.67.tgz",
|
||||
"integrity": "sha512-PdBC9p6bLHA1W3OdA0vTHj701SB/kioGQ1uCFBRMs5KBCaMLb/H4aNi8uaIUIEvBWnxeAjoNcLU7//q0FxEosw==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.68.tgz",
|
||||
"integrity": "sha512-e9AS5UttoIKqXSmBzKZdd3NErSVyOEYzJfNOCGtafGk1//gibTwQXGlSXmAKuErqMp09pyk9aqQRSYzm1AQfBw==",
|
||||
"cpu": [
|
||||
"riscv64"
|
||||
],
|
||||
@ -1812,9 +1832,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.67.tgz",
|
||||
"integrity": "sha512-kJJX6eWzjipL/LdKOWCJctc88e5yzuXri8+s0V/lN06OwuLGW62TWS3lvi8qlUrGMOfRGabSWWlB4omhASSB8w==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.68.tgz",
|
||||
"integrity": "sha512-Pa/I36VE3j57I3Obhrr+J48KGFfkZk2cJN/2NmW/vCgmoF7kCP6aTVq5n+cGdGWLd/cN9CJ9JvNwEoMRDghu0g==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@ -1828,9 +1848,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-linux-x64-musl": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.67.tgz",
|
||||
"integrity": "sha512-jLKiPWGeN6ZzhnaLG7ex7eexsiHJ1mdtPK1qKvETIcu45dApMXyUIHvdL6XWB5gFFtj5ScHzLUxv1vkfPZsoxA==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.68.tgz",
|
||||
"integrity": "sha512-9c6rkc5195wNxuUHJdf4/mmnq433OQey9TNvQ9LspJazvHbfSkTij8wtKjASVQsJyPDva4fkWOeV/OQ7cLw0GQ==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@ -1844,9 +1864,9 @@
|
||||
}
|
||||
},
|
||||
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
|
||||
"version": "0.1.67",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.67.tgz",
|
||||
"integrity": "sha512-K/JmkOFbc4iRZYUqJhj0jwqfHA/wNQEmTiGNsgZ6d59yF/IBNp5T0D5eg3B8ghjI8GxDYCiSJ6DNX8mC3Oh2EQ==",
|
||||
"version": "0.1.68",
|
||||
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.68.tgz",
|
||||
"integrity": "sha512-Fc5Dez23u0FoSATurT6/w1oMytiRnKWEinHivdMvXpge6nG4YvhrASrtqMk8dGJMVQpHr8QJYF45rOrx2YU2Aw==",
|
||||
"cpu": [
|
||||
"x64"
|
||||
],
|
||||
@ -2238,6 +2258,16 @@
|
||||
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
|
||||
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
|
||||
},
|
||||
"node_modules/@types/accepts": {
|
||||
"version": "1.3.7",
|
||||
"resolved": "https://registry.npmjs.org/@types/accepts/-/accepts-1.3.7.tgz",
|
||||
"integrity": "sha512-Pay9fq2lM2wXPWbteBsRAGiWH2hig4ZE2asK+mm7kUzlxRTfL961rj89I6zV/E3PcIkDqyuBEcMxFT7rccugeQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/archiver": {
|
||||
"version": "5.3.4",
|
||||
"resolved": "https://registry.npmjs.org/@types/archiver/-/archiver-5.3.4.tgz",
|
||||
@ -2344,6 +2374,26 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/content-disposition": {
|
||||
"version": "0.5.8",
|
||||
"resolved": "https://registry.npmjs.org/@types/content-disposition/-/content-disposition-0.5.8.tgz",
|
||||
"integrity": "sha512-QVSSvno3dE0MgO76pJhmv4Qyi/j0Yk9pBp0Y7TJ2Tlj+KCgJWY6qX7nnxCOLkZ3VYRSIk1WTxCvwUSdx6CCLdg==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/cookies": {
|
||||
"version": "0.9.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/cookies/-/cookies-0.9.0.tgz",
|
||||
"integrity": "sha512-40Zk8qR147RABiQ7NQnBzWzDcjKzNrntB5BAmeGCb2p/MIyOE+4BVvc17wumsUqUw00bJYqoXFHYygQnEFh4/Q==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/connect": "*",
|
||||
"@types/express": "*",
|
||||
"@types/keygrip": "*",
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/cors": {
|
||||
"version": "2.8.17",
|
||||
"resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz",
|
||||
@ -2403,6 +2453,13 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/http-assert": {
|
||||
"version": "1.5.6",
|
||||
"resolved": "https://registry.npmjs.org/@types/http-assert/-/http-assert-1.5.6.tgz",
|
||||
"integrity": "sha512-TTEwmtjgVbYAzZYWyeHPrrtWnfVkm8tQkP8P21uQifPgMRgjrow3XDEYqucuC8SKZJT7pUnhU/JymvjggxO9vw==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/http-cache-semantics": {
|
||||
"version": "4.0.4",
|
||||
"resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
|
||||
@ -2460,6 +2517,13 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/keygrip": {
|
||||
"version": "1.0.6",
|
||||
"resolved": "https://registry.npmjs.org/@types/keygrip/-/keygrip-1.0.6.tgz",
|
||||
"integrity": "sha512-lZuNAY9xeJt7Bx4t4dx0rYCDqGPW8RXhQZK1td7d4H6E9zYbLoOtjBvfwdTKpsyxQI/2jv+armjX/RW+ZNpXOQ==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/@types/keyv": {
|
||||
"version": "3.1.4",
|
||||
"resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
|
||||
@ -2468,6 +2532,33 @@
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/koa": {
|
||||
"version": "2.15.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/koa/-/koa-2.15.0.tgz",
|
||||
"integrity": "sha512-7QFsywoE5URbuVnG3loe03QXuGajrnotr3gQkXcEBShORai23MePfFYdhz90FEtBBpkyIYQbVD+evKtloCgX3g==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/accepts": "*",
|
||||
"@types/content-disposition": "*",
|
||||
"@types/cookies": "*",
|
||||
"@types/http-assert": "*",
|
||||
"@types/http-errors": "*",
|
||||
"@types/keygrip": "*",
|
||||
"@types/koa-compose": "*",
|
||||
"@types/node": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/koa-compose": {
|
||||
"version": "3.2.8",
|
||||
"resolved": "https://registry.npmjs.org/@types/koa-compose/-/koa-compose-3.2.8.tgz",
|
||||
"integrity": "sha512-4Olc63RY+MKvxMwVknCUDhRQX1pFQoBZ/lXcRLP69PQkEpze/0cr8LNqJQe5NFb/b19DWi2a5bTi2VAlQzhJuA==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@types/koa": "*"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/lodash": {
|
||||
"version": "4.17.0",
|
||||
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
|
||||
@ -3836,7 +3927,6 @@
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/cache-content-type/-/cache-content-type-1.0.1.tgz",
|
||||
"integrity": "sha512-IKufZ1o4Ut42YUrZSo8+qnMTrFuKkvyoLXUywKz9GJ5BrhOFGhLdkx9sG4KAnVvbY6kEcSFjLQul+DVmBm2bgA==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"mime-types": "^2.1.18",
|
||||
"ylru": "^1.2.0"
|
||||
@ -4005,9 +4095,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/civkit": {
|
||||
"version": "0.8.3-3e69606",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.3-3e69606.tgz",
|
||||
"integrity": "sha512-niV5U11ySIiVNSnGpW49KJlExmIiuQQfnyQEXeYuKCE+B+wkqYCBG+3tlY3E882tmPkaQQKpDlF/yTeqEU2q2Q==",
|
||||
"version": "0.8.4-32482a3",
|
||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-32482a3.tgz",
|
||||
"integrity": "sha512-VQwRreeVKYEoSMlhwYrPGpAA5na6lrIavGKmYNrhsHVJEvSfgkWKEete/btZzer4+WBxnNRw+PpRPrq6xjt13Q==",
|
||||
"license": "AGPL",
|
||||
"dependencies": {
|
||||
"lodash": "^4.17.21",
|
||||
"tslib": "^2.5.0"
|
||||
@ -4138,7 +4229,6 @@
|
||||
"version": "4.6.0",
|
||||
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
|
||||
"integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==",
|
||||
"devOptional": true,
|
||||
"engines": {
|
||||
"iojs": ">= 1.0.0",
|
||||
"node": ">= 0.12.0"
|
||||
@ -4148,7 +4238,6 @@
|
||||
"version": "6.1.0",
|
||||
"resolved": "https://registry.npmjs.org/co-body/-/co-body-6.1.0.tgz",
|
||||
"integrity": "sha512-m7pOT6CdLN7FuXUcpuz/8lfQ/L77x8SchHCF4G0RBTJO20Wzmhn5Sp4/5WsKy8OSpifBSUrmg83qEqaDHdyFuQ==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"inflation": "^2.0.0",
|
||||
"qs": "^6.5.2",
|
||||
@ -4273,7 +4362,6 @@
|
||||
"version": "0.9.1",
|
||||
"resolved": "https://registry.npmjs.org/cookies/-/cookies-0.9.1.tgz",
|
||||
"integrity": "sha512-TG2hpqe4ELx54QER/S3HQ9SRVnQnGBtKUz5bLQWtYAQ+o6GpgMs6sYUvaiJjVxb+UXwhRhAEP3m7LbsIZ77Hmw==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"depd": "~2.0.0",
|
||||
"keygrip": "~1.1.0"
|
||||
@ -4582,8 +4670,7 @@
|
||||
"node_modules/deep-equal": {
|
||||
"version": "1.0.1",
|
||||
"resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz",
|
||||
"integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw==",
|
||||
"optional": true
|
||||
"integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw=="
|
||||
},
|
||||
"node_modules/deep-extend": {
|
||||
"version": "0.6.0",
|
||||
@ -6701,7 +6788,6 @@
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/http-assert/-/http-assert-1.5.0.tgz",
|
||||
"integrity": "sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"deep-equal": "~1.0.1",
|
||||
"http-errors": "~1.8.0"
|
||||
@ -6714,7 +6800,6 @@
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
|
||||
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
@ -6723,7 +6808,6 @@
|
||||
"version": "1.8.1",
|
||||
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
|
||||
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"depd": "~1.1.2",
|
||||
"inherits": "2.0.4",
|
||||
@ -6739,7 +6823,6 @@
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
|
||||
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
@ -6940,7 +7023,6 @@
|
||||
"version": "2.1.0",
|
||||
"resolved": "https://registry.npmjs.org/inflation/-/inflation-2.1.0.tgz",
|
||||
"integrity": "sha512-t54PPJHG1Pp7VQvxyVCJ9mBbjG3Hqryges9bXoOO6GExCPa+//i/d5GSuFtpx3ALLd7lgIAur6zrIlBQyJuMlQ==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 0.8.0"
|
||||
}
|
||||
@ -8316,7 +8398,6 @@
|
||||
"version": "1.1.0",
|
||||
"resolved": "https://registry.npmjs.org/keygrip/-/keygrip-1.1.0.tgz",
|
||||
"integrity": "sha512-iYSchDJ+liQ8iwbSI2QqsQOvqv58eJCEanyJPJi+Khyu8smkcKSFUCbPwzFcL7YVtZ6eONjqRX/38caJ7QjRAQ==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"tsscmp": "1.0.6"
|
||||
},
|
||||
@ -8354,10 +8435,10 @@
|
||||
}
|
||||
},
|
||||
"node_modules/koa": {
|
||||
"version": "2.15.3",
|
||||
"resolved": "https://registry.npmjs.org/koa/-/koa-2.15.3.tgz",
|
||||
"integrity": "sha512-j/8tY9j5t+GVMLeioLaxweJiKUayFhlGqNTzf2ZGwL0ZCQijd2RLHK0SLW5Tsko8YyyqCZC2cojIb0/s62qTAg==",
|
||||
"optional": true,
|
||||
"version": "2.16.0",
|
||||
"resolved": "https://registry.npmjs.org/koa/-/koa-2.16.0.tgz",
|
||||
"integrity": "sha512-Afhqq0Vq3W7C+/rW6IqHVBDLzqObwZ07JaUNUEF8yCQ6afiyFE3RAy+i7V0E46XOWlH7vPWn/x0vsZwNy6PWxw==",
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"accepts": "^1.3.5",
|
||||
"cache-content-type": "^1.0.0",
|
||||
@ -8404,14 +8485,12 @@
|
||||
"node_modules/koa-compose": {
|
||||
"version": "4.1.0",
|
||||
"resolved": "https://registry.npmjs.org/koa-compose/-/koa-compose-4.1.0.tgz",
|
||||
"integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw==",
|
||||
"optional": true
|
||||
"integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw=="
|
||||
},
|
||||
"node_modules/koa-convert": {
|
||||
"version": "2.0.0",
|
||||
"resolved": "https://registry.npmjs.org/koa-convert/-/koa-convert-2.0.0.tgz",
|
||||
"integrity": "sha512-asOvN6bFlSnxewce2e/DK3p4tltyfC4VM7ZwuTuepI7dEQVcvpyFuBcEARu1+Hxg8DIwytce2n7jrZtRlPrARA==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"co": "^4.6.0",
|
||||
"koa-compose": "^4.1.0"
|
||||
@ -8424,7 +8503,6 @@
|
||||
"version": "1.8.1",
|
||||
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
|
||||
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
|
||||
"optional": true,
|
||||
"dependencies": {
|
||||
"depd": "~1.1.2",
|
||||
"inherits": "2.0.4",
|
||||
@ -8440,7 +8518,6 @@
|
||||
"version": "1.1.2",
|
||||
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
|
||||
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
@ -8449,7 +8526,6 @@
|
||||
"version": "1.5.0",
|
||||
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
|
||||
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 0.6"
|
||||
}
|
||||
@ -8644,8 +8720,7 @@
|
||||
"node_modules/lodash.merge": {
|
||||
"version": "4.6.2",
|
||||
"resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
|
||||
"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==",
|
||||
"dev": true
|
||||
"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ=="
|
||||
},
|
||||
"node_modules/lodash.once": {
|
||||
"version": "4.1.1",
|
||||
@ -9853,8 +9928,7 @@
|
||||
"node_modules/only": {
|
||||
"version": "0.0.2",
|
||||
"resolved": "https://registry.npmjs.org/only/-/only-0.0.2.tgz",
|
||||
"integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ==",
|
||||
"optional": true
|
||||
"integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ=="
|
||||
},
|
||||
"node_modules/openai": {
|
||||
"version": "4.33.0",
|
||||
@ -10118,15 +10192,15 @@
|
||||
}
|
||||
},
|
||||
"node_modules/pdfjs-dist": {
|
||||
"version": "4.2.67",
|
||||
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz",
|
||||
"integrity": "sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==",
|
||||
"version": "4.10.38",
|
||||
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.10.38.tgz",
|
||||
"integrity": "sha512-/Y3fcFrXEAsMjJXeL9J8+ZG9U01LbuWaYypvDW2ycW1jL269L3js3DVBjDJ0Up9Np1uqDXsDrRihHANhZOlwdQ==",
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=18"
|
||||
"node": ">=20"
|
||||
},
|
||||
"optionalDependencies": {
|
||||
"canvas": "^2.11.2",
|
||||
"path2d": "^0.2.0"
|
||||
"@napi-rs/canvas": "^0.1.65"
|
||||
}
|
||||
},
|
||||
"node_modules/peek-stream": {
|
||||
@ -12443,7 +12517,6 @@
|
||||
"version": "1.0.6",
|
||||
"resolved": "https://registry.npmjs.org/tsscmp/-/tsscmp-1.0.6.tgz",
|
||||
"integrity": "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">=0.6.x"
|
||||
}
|
||||
@ -13136,7 +13209,6 @@
|
||||
"version": "1.4.0",
|
||||
"resolved": "https://registry.npmjs.org/ylru/-/ylru-1.4.0.tgz",
|
||||
"integrity": "sha512-2OQsPNEmBCvXuFlIni/a+Rn+R2pHW9INm0BxXJ4hVDA8TirqMj+J/Rp9ItLatT/5pZqWwefVrTQcHpixsxnVlA==",
|
||||
"optional": true,
|
||||
"engines": {
|
||||
"node": ">= 4.0.0"
|
||||
}
|
97
package.json
97
package.json
@ -1,15 +1,84 @@
|
||||
{
|
||||
"name": "reader",
|
||||
"version": "1.0.0",
|
||||
"description": "### Prerequisite - Node v18 (The build fails for Node version >18) - Yarn - Firebase CLI (`npm install -g firebase-tools`)",
|
||||
"main": "index.js",
|
||||
"scripts": {
|
||||
"test": "echo \"Error: no test specified\" && exit 1"
|
||||
},
|
||||
"author": "",
|
||||
"license": "ISC",
|
||||
"devDependencies": {
|
||||
"firebase-tools": "^13.6.2",
|
||||
"typescript": "^5.1.6"
|
||||
}
|
||||
}
|
||||
"name": "reader",
|
||||
"scripts": {
|
||||
"lint": "eslint --ext .js,.ts .",
|
||||
"build": "node ./integrity-check.cjs && tsc -p .",
|
||||
"build:watch": "tsc --watch",
|
||||
"build:clean": "rm -rf ./build",
|
||||
"serve": "npm run build && npm run start",
|
||||
"debug": "npm run build && npm run dev",
|
||||
"start": "npm run shell"
|
||||
},
|
||||
"engines": {
|
||||
"node": "20"
|
||||
},
|
||||
"main": "build/index.js",
|
||||
"dependencies": {
|
||||
"@esm2cjs/normalize-url": "^8.0.0",
|
||||
"@google-cloud/translate": "^8.2.0",
|
||||
"@koa/bodyparser": "^5.1.1",
|
||||
"@mozilla/readability": "^0.5.0",
|
||||
"@napi-rs/canvas": "^0.1.68",
|
||||
"@types/turndown": "^5.0.4",
|
||||
"@xmldom/xmldom": "^0.9.3",
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"busboy": "^1.6.0",
|
||||
"civkit": "^0.8.4-32482a3",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
"express": "^4.19.2",
|
||||
"firebase-admin": "^12.1.0",
|
||||
"firebase-functions": "^6.1.1",
|
||||
"htmlparser2": "^9.0.0",
|
||||
"jose": "^5.1.0",
|
||||
"langdetect": "^0.2.1",
|
||||
"linkedom": "^0.18.4",
|
||||
"maxmind": "^4.3.18",
|
||||
"minio": "^7.1.3",
|
||||
"node-libcurl": "^4.1.0",
|
||||
"openai": "^4.20.0",
|
||||
"pdfjs-dist": "^4.10.38",
|
||||
"puppeteer": "^23.3.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
|
||||
"puppeteer-page-proxy": "^1.3.0",
|
||||
"robots-parser": "^3.0.1",
|
||||
"set-cookie-parser": "^2.6.0",
|
||||
"simple-zstd": "^1.4.2",
|
||||
"stripe": "^11.11.0",
|
||||
"tiktoken": "^1.0.16",
|
||||
"tld-extract": "^2.1.0",
|
||||
"turndown": "^7.1.3",
|
||||
"turndown-plugin-gfm": "^1.0.2",
|
||||
"undici": "^5.24.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/archiver": "^5.3.4",
|
||||
"@types/bcrypt": "^5.0.0",
|
||||
"@types/busboy": "^1.5.4",
|
||||
"@types/cors": "^2.8.17",
|
||||
"@types/generic-pool": "^3.8.1",
|
||||
"@types/koa": "^2.15.0",
|
||||
"@types/node": "^20.14.13",
|
||||
"@types/set-cookie-parser": "^2.4.7",
|
||||
"@types/xmldom": "^0.1.34",
|
||||
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
||||
"@typescript-eslint/parser": "^5.12.0",
|
||||
"eslint": "^8.9.0",
|
||||
"eslint-config-google": "^0.14.0",
|
||||
"eslint-plugin-import": "^2.25.4",
|
||||
"firebase-functions-test": "^3.0.0",
|
||||
"koa": "^2.16.0",
|
||||
"pino-pretty": "^13.0.0",
|
||||
"replicate": "^0.16.1",
|
||||
"typescript": "^5.5.4"
|
||||
},
|
||||
"private": true,
|
||||
"exports": {
|
||||
".": "./build/index.js"
|
||||
}
|
||||
}
|
||||
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 14 KiB |
@ -1,30 +1,45 @@
|
||||
import {
|
||||
assignTransferProtocolMeta, marshalErrorLike,
|
||||
RPCHost, RPCReflection,
|
||||
AssertionFailureError, ParamValidationError, Defer,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
import { Request, Response } from 'express';
|
||||
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||
import { Crawled } from '../db/crawled';
|
||||
import { pathToFileURL } from 'url';
|
||||
import { randomUUID } from 'crypto';
|
||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||
import _ from 'lodash';
|
||||
|
||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
|
||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||
import {
|
||||
assignTransferProtocolMeta, RPCHost, RPCReflection,
|
||||
AssertionFailureError, ParamValidationError,
|
||||
RawString,
|
||||
ApplicationError,
|
||||
} from 'civkit/civ-rpc';
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
import { Defer } from 'civkit/defer';
|
||||
import { retryWith } from 'civkit/decorators';
|
||||
|
||||
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
|
||||
|
||||
import { Crawled } from '../db/crawled';
|
||||
import { DomainBlockade } from '../db/domain-blockade';
|
||||
import { DomainProfile } from '../db/domain-profile';
|
||||
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
import { JSDomControl } from '../services/jsdom';
|
||||
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||
import { CurlControl } from '../services/curl';
|
||||
import { LmControl } from '../services/lm';
|
||||
import { tryDecodeURIComponent } from '../utils/misc';
|
||||
import { CFBrowserRendering } from '../services/cf-browser-rendering';
|
||||
|
||||
import { GlobalLogger } from '../services/logger';
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import { AsyncLocalContext } from '../services/async-context';
|
||||
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
||||
import { BudgetExceededError, InsufficientBalanceError, SecurityCompromiseError } from '../services/errors';
|
||||
|
||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||
import { ProxyProvider } from '../shared/services/proxy-provider';
|
||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||
import { RobotsTxtService } from '../services/robots-text';
|
||||
import { ServiceBadAttemptError } from '../shared/lib/errors';
|
||||
|
||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
withIframe?: boolean | 'quoted';
|
||||
@ -33,6 +48,8 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
removeSelector?: string | string[];
|
||||
keepImgDataUrl?: boolean;
|
||||
engine?: string;
|
||||
allocProxy?: string;
|
||||
private?: boolean;
|
||||
}
|
||||
|
||||
const indexProto = {
|
||||
@ -56,16 +73,18 @@ export class CrawlerHost extends RPCHost {
|
||||
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected globalLogger: GlobalLogger,
|
||||
protected puppeteerControl: PuppeteerControl,
|
||||
protected curlControl: CurlControl,
|
||||
protected cfBrowserRendering: CFBrowserRendering,
|
||||
protected proxyProvider: ProxyProvider,
|
||||
protected lmControl: LmControl,
|
||||
protected jsdomControl: JSDomControl,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||
protected rateLimitControl: RateLimitControl,
|
||||
protected threadLocal: AsyncContext,
|
||||
protected fbHealthCheck: FirebaseRoundTripChecker,
|
||||
protected threadLocal: AsyncLocalContext,
|
||||
protected robotsTxtService: RobotsTxtService,
|
||||
) {
|
||||
super(...arguments);
|
||||
|
||||
@ -73,7 +92,7 @@ export class CrawlerHost extends RPCHost {
|
||||
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
|
||||
return;
|
||||
}
|
||||
if (options.cookies?.length) {
|
||||
if (options.cookies?.length || options.private) {
|
||||
// Potential privacy issue: don't cache when cookies are used or the request is marked private
|
||||
return;
|
||||
}
|
||||
@ -84,9 +103,14 @@ export class CrawlerHost extends RPCHost {
|
||||
if (options.locale) {
|
||||
Reflect.set(snapshot, 'locale', options.locale);
|
||||
}
|
||||
await this.setToCache(options.url, snapshot);
|
||||
|
||||
await this.exploreDirectEngine(snapshot).catch(() => undefined);
|
||||
const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
|
||||
if (analyzed.tokens < 200) {
|
||||
// Does not contain enough content
|
||||
return;
|
||||
}
|
||||
|
||||
await this.setToCache(options.url, snapshot);
|
||||
});
|
||||
|
||||
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
|
||||
@ -108,12 +132,19 @@ export class CrawlerHost extends RPCHost {
|
||||
override async init() {
|
||||
await this.dependencyReady();
|
||||
|
||||
this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, ''));
|
||||
|
||||
this.emit('ready');
|
||||
}
|
||||
|
||||
getIndex(user?: JinaEmbeddingsTokenAccount) {
|
||||
async getIndex(auth?: JinaEmbeddingsAuthDTO) {
|
||||
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
|
||||
|
||||
// Object.assign(indexObject, {
|
||||
// usage1: `${ctx.origin}/YOUR_URL`,
|
||||
// usage2: `${ctx.origin}/search/YOUR_SEARCH_QUERY`,
|
||||
// homepage: 'https://jina.ai/reader',
|
||||
// sourceCode: 'https://github.com/jina-ai/reader',
|
||||
// });
|
||||
Object.assign(indexObject, {
|
||||
usage1: 'https://r.jina.ai/YOUR_URL',
|
||||
usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
|
||||
@ -121,71 +152,83 @@ export class CrawlerHost extends RPCHost {
|
||||
sourceCode: 'https://github.com/jina-ai/reader',
|
||||
});
|
||||
|
||||
if (user) {
|
||||
await auth?.solveUID();
|
||||
if (auth && auth.user) {
|
||||
indexObject[''] = undefined;
|
||||
indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
|
||||
indexObject.balanceLeft = user.wallet.total_balance;
|
||||
indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
|
||||
indexObject.balanceLeft = auth.user.wallet.total_balance;
|
||||
}
|
||||
|
||||
return indexObject;
|
||||
}
|
||||
|
||||
@CloudHTTPv2({
|
||||
name: 'crawl2',
|
||||
runtime: {
|
||||
memory: '4GiB',
|
||||
timeoutSeconds: 300,
|
||||
concurrency: 22,
|
||||
@Method({
|
||||
name: 'getIndex',
|
||||
description: 'Index of the service',
|
||||
proto: {
|
||||
http: {
|
||||
action: 'get',
|
||||
path: '/',
|
||||
}
|
||||
},
|
||||
tags: ['Crawler'],
|
||||
httpMethod: ['get', 'post'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
exposeRoot: true,
|
||||
tags: ['misc', 'crawl'],
|
||||
returnType: [String, Object],
|
||||
})
|
||||
@CloudHTTPv2({
|
||||
runtime: {
|
||||
memory: '4GiB',
|
||||
cpu: 2,
|
||||
timeoutSeconds: 300,
|
||||
concurrency: 10,
|
||||
maxInstances: 1000,
|
||||
minInstances: 1,
|
||||
async getIndexCtrl(@Ctx() ctx: Context, @Param({ required: false }) auth?: JinaEmbeddingsAuthDTO) {
|
||||
const indexObject = await this.getIndex(auth);
|
||||
|
||||
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
||||
return indexObject;
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${indexObject}`,
|
||||
{ contentType: 'text/plain; charset=utf-8', envelope: null }
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@Method({
|
||||
name: 'crawlByPostingToIndex',
|
||||
description: 'Crawl any url into markdown',
|
||||
proto: {
|
||||
http: {
|
||||
action: 'POST',
|
||||
path: '/',
|
||||
}
|
||||
},
|
||||
tags: ['Crawler'],
|
||||
httpMethod: ['get', 'post'],
|
||||
tags: ['crawl'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
exposeRoot: true,
|
||||
})
|
||||
@Method({
|
||||
description: 'Crawl any url into markdown',
|
||||
proto: {
|
||||
http: {
|
||||
action: ['GET', 'POST'],
|
||||
path: '::url',
|
||||
}
|
||||
},
|
||||
tags: ['crawl'],
|
||||
returnType: [String, OutputServerEventStream, RawString],
|
||||
})
|
||||
async crawl(
|
||||
@RPCReflect() rpcReflect: RPCReflection,
|
||||
@Ctx() ctx: {
|
||||
req: Request,
|
||||
res: Response,
|
||||
},
|
||||
@Ctx() ctx: Context,
|
||||
auth: JinaEmbeddingsAuthDTO,
|
||||
crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
|
||||
crawlerOptionsParamsAllowed: CrawlerOptions,
|
||||
) {
|
||||
const uid = await auth.solveUID();
|
||||
let chargeAmount = 0;
|
||||
const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
|
||||
const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
|
||||
|
||||
// Note req.url in express is actually unparsed `path`, e.g. `/some-path?abc`. Instead of a real url.
|
||||
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.req.url), crawlerOptions);
|
||||
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.path), crawlerOptions);
|
||||
if (!targetUrl) {
|
||||
const latestUser = uid ? await auth.assertUser() : undefined;
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
return this.getIndex(latestUser);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
|
||||
{ contentType: 'text/plain', envelope: null }
|
||||
);
|
||||
return await this.getIndex(auth);
|
||||
}
|
||||
|
||||
// Prevent circular crawling
|
||||
this.puppeteerControl.circuitBreakerHosts.add(
|
||||
ctx.req.hostname.toLowerCase()
|
||||
ctx.hostname.toLowerCase()
|
||||
);
|
||||
|
||||
if (uid) {
|
||||
@ -222,8 +265,8 @@ export class CrawlerHost extends RPCHost {
|
||||
apiRoll.chargeAmount = chargeAmount;
|
||||
}
|
||||
});
|
||||
} else if (ctx.req.ip) {
|
||||
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, [rpcReflect.name.toUpperCase()],
|
||||
} else if (ctx.ip) {
|
||||
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, [rpcReflect.name.toUpperCase()],
|
||||
[
|
||||
// 20 requests per minute
|
||||
new Date(Date.now() - 60 * 1000), 20
|
||||
@ -254,9 +297,12 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
}
|
||||
|
||||
if (crawlerOptions.robotsTxt) {
|
||||
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
|
||||
}
|
||||
|
||||
const crawlOpts = await this.configure(crawlerOptions);
|
||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
||||
const sseStream = new OutputServerEventStream();
|
||||
rpcReflect.return(sseStream);
|
||||
|
||||
@ -265,8 +311,11 @@ export class CrawlerHost extends RPCHost {
|
||||
if (!scrapped) {
|
||||
continue;
|
||||
}
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -293,17 +342,20 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
let lastScrapped;
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
||||
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||
continue;
|
||||
}
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
@ -324,7 +376,7 @@ export class CrawlerHost extends RPCHost {
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -342,16 +394,18 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -370,7 +424,7 @@ export class CrawlerHost extends RPCHost {
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
|
||||
}
|
||||
|
||||
if (!lastScrapped) {
|
||||
@ -380,7 +434,7 @@ export class CrawlerHost extends RPCHost {
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -399,7 +453,7 @@ export class CrawlerHost extends RPCHost {
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
|
||||
|
||||
}
|
||||
|
||||
@ -419,7 +473,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
let result: URL;
|
||||
const normalizeUrl = (await pNormalizeUrl).default;
|
||||
const normalizeUrl = require('@esm2cjs/normalize-url').default;
|
||||
try {
|
||||
result = new URL(
|
||||
normalizeUrl(
|
||||
@ -638,7 +692,25 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
|
||||
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
||||
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
||||
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
|
||||
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
|
||||
if (!sideLoaded.file) {
|
||||
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
||||
}
|
||||
const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
||||
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
|
||||
return;
|
||||
}
|
||||
if (crawlOpts?.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) {
|
||||
const html = await this.cfBrowserRendering.fetchContent(urlToCrawl.href);
|
||||
const snapshot = {
|
||||
href: urlToCrawl.toString(),
|
||||
html,
|
||||
title: '',
|
||||
text: '',
|
||||
} as PageSnapshot;
|
||||
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -653,26 +725,68 @@ export class CrawlerHost extends RPCHost {
|
||||
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
|
||||
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
|
||||
) {
|
||||
if (cache.snapshot) {
|
||||
cache.snapshot.isFromCache = true;
|
||||
}
|
||||
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
|
||||
const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
|
||||
const domainProfile = await DomainProfile.fromFirestore(digest);
|
||||
if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
|
||||
try {
|
||||
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
|
||||
try {
|
||||
const altOpts = { ...crawlOpts };
|
||||
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
|
||||
await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
|
||||
await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
|
||||
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
||||
|
||||
// Expect downstream code to "break" here if it's satisfied with the direct engine
|
||||
yield snapshot;
|
||||
if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
|
||||
return;
|
||||
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
||||
return Promise.reject(err);
|
||||
}
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
|
||||
|
||||
return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
||||
});
|
||||
if (!sideLoaded.file) {
|
||||
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
||||
}
|
||||
let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
|
||||
if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
|
||||
yield draftSnapshot;
|
||||
return;
|
||||
}
|
||||
|
||||
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
|
||||
draftSnapshot.title ??= analyzed.title;
|
||||
let fallbackProxyIsUsed = false;
|
||||
if ((!crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200)) {
|
||||
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
|
||||
if (!proxyLoaded.file) {
|
||||
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
|
||||
}
|
||||
const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName);
|
||||
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
|
||||
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
|
||||
draftSnapshot = proxySnapshot;
|
||||
sideLoaded = proxyLoaded;
|
||||
fallbackProxyIsUsed = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
|
||||
yield draftSnapshot;
|
||||
}
|
||||
|
||||
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
|
||||
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
|
||||
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
|
||||
if (fallbackProxyIsUsed) {
|
||||
this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
|
||||
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
|
||||
throw err;
|
||||
}
|
||||
}
|
||||
|
||||
@ -782,6 +896,8 @@ export class CrawlerHost extends RPCHost {
|
||||
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
||||
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
|
||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||
this.threadLocal.set('withIframe', opts.withIframe);
|
||||
this.threadLocal.set('withShadowDom', opts.withShadowDom);
|
||||
this.threadLocal.set('userAgent', opts.userAgent);
|
||||
if (opts.timeout) {
|
||||
this.threadLocal.set('timeout', opts.timeout * 1000);
|
||||
@ -804,6 +920,9 @@ export class CrawlerHost extends RPCHost {
|
||||
referer: opts.referer,
|
||||
viewport: opts.viewport,
|
||||
engine: opts.engine,
|
||||
allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy,
|
||||
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
|
||||
private: Boolean(opts.doNotTrack),
|
||||
};
|
||||
|
||||
if (opts.locale) {
|
||||
@ -842,14 +961,15 @@ export class CrawlerHost extends RPCHost {
|
||||
return crawlOpts;
|
||||
}
|
||||
|
||||
formatSnapshot(
|
||||
protected async formatSnapshot(
|
||||
crawlerOptions: CrawlerOptions,
|
||||
snapshot: PageSnapshot & {
|
||||
screenshotUrl?: string;
|
||||
pageshotUrl?: string;
|
||||
},
|
||||
nominalUrl?: URL,
|
||||
urlValidMs?: number
|
||||
urlValidMs?: number,
|
||||
scrappingOptions?: ScrappingOptions
|
||||
) {
|
||||
const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
|
||||
|
||||
@ -870,7 +990,29 @@ export class CrawlerHost extends RPCHost {
|
||||
return output;
|
||||
}
|
||||
|
||||
return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs);
|
||||
return this.formatSnapshotWithPDFSideLoad(respondWith, snapshot, presumedURL, urlValidMs, scrappingOptions);
|
||||
}
|
||||
|
||||
/**
 * Formats a snapshot after trying to swap its first PDF link for a local file URL.
 *
 * Works on a deep copy, so the caller's snapshot is never mutated. Only the
 * first entry of `pdfs` is considered, and only when it starts with `http`.
 * A body already present in `scrappingOptions.sideLoad.impersonate` for that
 * URL is preferred; otherwise the PDF is side-loaded through `curlControl`.
 *
 * @param mode - Output mode forwarded to the snapshot formatter.
 * @param snapshot - Snapshot to format; left untouched.
 * @param nominalUrl - Forwarded to the snapshot formatter.
 * @param urlValidMs - Forwarded to the snapshot formatter.
 * @param scrappingOptions - Options carrying any already side-loaded resources;
 *   also forwarded to `curlControl.sideLoad`.
 * @returns Whatever `snapshotFormatter.formatSnapshot` returns for the copy.
 */
async formatSnapshotWithPDFSideLoad(mode: string, snapshot: PageSnapshot, nominalUrl?: URL, urlValidMs?: number, scrappingOptions?: ScrappingOptions) {
    const working = _.cloneDeep(snapshot);
    const render = () => this.snapshotFormatter.formatSnapshot(mode, working, nominalUrl, urlValidMs);

    if (!working.pdfs?.length) {
        return render();
    }

    const pdfUrl = working.pdfs[0];
    if (!pdfUrl.startsWith('http')) {
        return render();
    }

    // Reuse the body captured during the crawl, if there is one for this URL.
    const preloaded = scrappingOptions?.sideLoad?.impersonate[pdfUrl];
    if (preloaded?.body) {
        working.pdfs[0] = pathToFileURL(await preloaded.body.filePath).href;

        return render();
    }

    // Nothing captured: fetch the PDF ourselves and point at the downloaded file.
    const fetched = await this.curlControl.sideLoad(new URL(pdfUrl), scrappingOptions);
    if (fetched.file) {
        working.pdfs[0] = pathToFileURL(await fetched.file.filePath).href;
    }

    return render();
}
|
||||
|
||||
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
|
||||
@ -967,6 +1109,26 @@ export class CrawlerHost extends RPCHost {
|
||||
return;
|
||||
}
|
||||
|
||||
/**
 * Decides whether a snapshot is too thin to be used as-is.
 *
 * Checks run in this order:
 *  1. any PDF entry          -> good enough (false)
 *  2. no title               -> not good enough (true)
 *  3. parsed content present -> good enough (false)
 *  4. html present           -> not good enough only if
 *     `jsdomControl.analyzeHTMLTextLite` reports fewer than 200 tokens
 *  5. otherwise              -> good enough (false)
 *
 * @param snapshot - Snapshot to evaluate.
 * @returns `true` when the snapshot should be considered insufficient.
 */
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
    const hasPdf = Boolean(snapshot.pdfs?.length);
    if (hasPdf) {
        return false;
    }
    if (!snapshot.title) {
        return true;
    }
    if (snapshot.parsed?.content || !snapshot.html) {
        return false;
    }

    const { tokens } = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);

    return tokens < 200;
}
|
||||
|
||||
getDomainProfileUrlDigest(url: URL) {
|
||||
const pathname = url.pathname;
|
||||
const pathVec = pathname.split('/');
|
||||
@ -981,4 +1143,29 @@ export class CrawlerHost extends RPCHost {
|
||||
path: finalPath,
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Side-loads `url` through a proxy freshly allocated from `proxyProvider`.
 *
 * The allocated proxy always overrides `opts.proxyUrl` for this request. When
 * the caller asked for allocation (`opts.allocProxy` is truthy) and has no
 * `proxyUrl` yet, the allocated proxy is also written back onto `opts`.
 *
 * Retry policy passed to `retryWith`: `ServiceBadAttemptError` -> `true`,
 * any other `ApplicationError` -> `false`, everything else -> `undefined`.
 * NOTE(review): `3` is presumably the maximum number of attempts and
 * `undefined` presumably defers to retryWith's default; confirm against civkit.
 *
 * @param url - Resource to side-load.
 * @param opts - Scrapping options; `proxyUrl` may be filled in as described above.
 * @returns The side-load result spread together with the allocated `proxy`.
 */
@retryWith((err) => {
    // Tested first so it wins even if ServiceBadAttemptError derives from ApplicationError.
    if (err instanceof ServiceBadAttemptError) {
        return true;
    }

    return err instanceof ApplicationError ? false : undefined;
}, 3)
async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) {
    const proxy = await this.proxyProvider.alloc(opts?.allocProxy);
    const requestOptions = { ...opts, proxyUrl: proxy.href };
    const loaded = await this.curlControl.sideLoad(url, requestOptions);

    if (opts?.allocProxy) {
        if (opts.proxyUrl === undefined || opts.proxyUrl === null) {
            opts.proxyUrl = proxy.href;
        }
    }

    return { ...loaded, proxy };
}
|
||||
}
|
@ -1,21 +1,25 @@
|
||||
import {
|
||||
assignTransferProtocolMeta, marshalErrorLike,
|
||||
RPCHost, RPCReflection,
|
||||
AssertionFailureError,
|
||||
objHashMd5B64Of,
|
||||
assignMeta,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import {
|
||||
assignTransferProtocolMeta, RPCHost, RPCReflection, AssertionFailureError, assignMeta, RawString,
|
||||
} from 'civkit/civ-rpc';
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
import { objHashMd5B64Of } from 'civkit/hash';
|
||||
import _ from 'lodash';
|
||||
import { Request, Response } from 'express';
|
||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
|
||||
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
||||
import { SerperSearchResult } from '../db/searched';
|
||||
import { CrawlerOptions } from '../dto/scrapping-options';
|
||||
import { CrawlerOptions } from '../dto/crawler-options';
|
||||
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
||||
import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
|
||||
|
||||
import { GlobalLogger } from '../services/logger';
|
||||
import { AsyncLocalContext } from '../services/async-context';
|
||||
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||
import { InsufficientBalanceError } from '../services/errors';
|
||||
import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
||||
|
||||
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
|
||||
@ -33,9 +37,9 @@ export class SearcherHost extends RPCHost {
|
||||
targetResultCount = 5;
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected globalLogger: GlobalLogger,
|
||||
protected rateLimitControl: RateLimitControl,
|
||||
protected threadLocal: AsyncContext,
|
||||
protected threadLocal: AsyncLocalContext,
|
||||
protected serperSearchService: SerperSearchService,
|
||||
protected crawler: CrawlerHost,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
@ -49,39 +53,30 @@ export class SearcherHost extends RPCHost {
|
||||
this.emit('ready');
|
||||
}
|
||||
|
||||
@CloudHTTPv2({
|
||||
name: 'search2',
|
||||
runtime: {
|
||||
cpu: 4,
|
||||
memory: '4GiB',
|
||||
timeoutSeconds: 300,
|
||||
concurrency: 4,
|
||||
@Method({
|
||||
name: 'searchIndex',
|
||||
ext: {
|
||||
http: {
|
||||
action: ['get', 'post'],
|
||||
path: '/search'
|
||||
}
|
||||
},
|
||||
tags: ['Searcher'],
|
||||
httpMethod: ['get', 'post'],
|
||||
tags: ['search'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
exposeRoot: true,
|
||||
})
|
||||
@CloudHTTPv2({
|
||||
runtime: {
|
||||
cpu: 4,
|
||||
memory: '16GiB',
|
||||
timeoutSeconds: 300,
|
||||
concurrency: 4,
|
||||
maxInstances: 200,
|
||||
minInstances: 1,
|
||||
@Method({
|
||||
ext: {
|
||||
http: {
|
||||
action: ['get', 'post'],
|
||||
path: '::q'
|
||||
}
|
||||
},
|
||||
tags: ['Searcher'],
|
||||
httpMethod: ['get', 'post'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
exposeRoot: true,
|
||||
tags: ['search'],
|
||||
returnType: [String, OutputServerEventStream, RawString],
|
||||
})
|
||||
async search(
|
||||
@RPCReflect() rpcReflect: RPCReflection,
|
||||
@Ctx() ctx: {
|
||||
req: Request,
|
||||
res: Response,
|
||||
},
|
||||
@Ctx() ctx: Context,
|
||||
auth: JinaEmbeddingsAuthDTO,
|
||||
crawlerOptions: CrawlerOptions,
|
||||
searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
|
||||
@ -102,19 +97,17 @@ export class SearcherHost extends RPCHost {
|
||||
|
||||
const uid = await auth.solveUID();
|
||||
// Return content by default
|
||||
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
|
||||
const crawlWithoutContent = respondWith.includes('no-content');
|
||||
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
|
||||
const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
|
||||
const withFavicon = Boolean(ctx.get('X-With-Favicons'));
|
||||
|
||||
let chargeAmount = 0;
|
||||
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
|
||||
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
|
||||
if (!noSlashPath && !q) {
|
||||
const latestUser = uid ? await auth.assertUser() : undefined;
|
||||
const index = this.crawler.getIndex(latestUser);
|
||||
const index = await this.crawler.getIndex(auth);
|
||||
if (!uid) {
|
||||
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
|
||||
}
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
||||
|
||||
return index;
|
||||
}
|
||||
@ -189,7 +182,7 @@ export class SearcherHost extends RPCHost {
|
||||
chargeAmount = 10000;
|
||||
}
|
||||
this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
|
||||
if ((!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) || count === 0) {
|
||||
if ((!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) || count === 0) {
|
||||
return lastScrapped;
|
||||
}
|
||||
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
|
||||
@ -201,7 +194,7 @@ export class SearcherHost extends RPCHost {
|
||||
withFavicon
|
||||
);
|
||||
|
||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
||||
const sseStream = new OutputServerEventStream();
|
||||
rpcReflect.return(sseStream);
|
||||
|
||||
@ -210,6 +203,9 @@ export class SearcherHost extends RPCHost {
|
||||
if (!scrapped) {
|
||||
continue;
|
||||
}
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
|
||||
chargeAmount = this.assignChargeAmount(scrapped);
|
||||
sseStream.write({
|
||||
@ -233,7 +229,7 @@ export class SearcherHost extends RPCHost {
|
||||
}
|
||||
|
||||
let earlyReturn = false;
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
||||
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
||||
const setEarlyReturnTimer = () => {
|
||||
if (earlyReturnTimer) {
|
||||
@ -251,6 +247,9 @@ export class SearcherHost extends RPCHost {
|
||||
|
||||
for await (const scrapped of it) {
|
||||
lastScrapped = scrapped;
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
||||
setEarlyReturnTimer();
|
||||
}
|
||||
@ -299,7 +298,9 @@ export class SearcherHost extends RPCHost {
|
||||
|
||||
for await (const scrapped of it) {
|
||||
lastScrapped = scrapped;
|
||||
|
||||
if (rpcReflect.signal.aborted) {
|
||||
break;
|
||||
}
|
||||
if (_.some(scrapped, (x) => this.pageQualified(x))) {
|
||||
setEarlyReturnTimer();
|
||||
}
|
||||
@ -367,8 +368,8 @@ export class SearcherHost extends RPCHost {
|
||||
const dataItems = [
|
||||
{ key: 'title', label: 'Title' },
|
||||
{ key: 'url', label: 'URL Source' },
|
||||
{ key: 'description', label: 'Description'},
|
||||
]
|
||||
{ key: 'description', label: 'Description' },
|
||||
];
|
||||
|
||||
if (withContent) {
|
||||
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
|
||||
@ -386,7 +387,7 @@ export class SearcherHost extends RPCHost {
|
||||
result.toString = function () {
|
||||
const self = this as any;
|
||||
return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
|
||||
}
|
||||
};
|
||||
return result;
|
||||
}));
|
||||
|
||||
@ -408,7 +409,6 @@ export class SearcherHost extends RPCHost {
|
||||
if (!searchResults) {
|
||||
return;
|
||||
}
|
||||
|
||||
const urls = searchResults.map((x) => new URL(x.link));
|
||||
const snapshotMap = new WeakMap();
|
||||
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
|
||||
@ -427,7 +427,7 @@ export class SearcherHost extends RPCHost {
|
||||
if (snapshotMap.has(x)) {
|
||||
return snapshotMap.get(x);
|
||||
}
|
||||
return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
|
||||
return this.crawler.formatSnapshotWithPDFSideLoad(mode, x, urls[i], undefined, options).then((r) => {
|
||||
r.title ??= upstreamSearchResult.title;
|
||||
r.description = upstreamSearchResult.snippet;
|
||||
snapshotMap.set(x, r);
|
@ -1,22 +1,30 @@
|
||||
import {
|
||||
assignTransferProtocolMeta, marshalErrorLike,
|
||||
RPCHost, RPCReflection,
|
||||
AssertionFailureError,
|
||||
objHashMd5B64Of,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { Request, Response } from 'express';
|
||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
||||
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
||||
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||
import { SearchResult } from '../db/searched';
|
||||
|
||||
import {
|
||||
assignTransferProtocolMeta, RPCHost, RPCReflection,
|
||||
AssertionFailureError,
|
||||
RawString,
|
||||
} from 'civkit/civ-rpc';
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
import { objHashMd5B64Of } from 'civkit/hash';
|
||||
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
||||
import { CrawlerOptions } from '../dto/scrapping-options';
|
||||
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||
|
||||
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
||||
import { SearchResult } from '../db/searched';
|
||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||
import { CrawlerOptions } from '../dto/crawler-options';
|
||||
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
||||
|
||||
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
||||
import { GlobalLogger } from '../services/logger';
|
||||
import { AsyncLocalContext } from '../services/async-context';
|
||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
||||
import { InsufficientBalanceError } from '../services/errors';
|
||||
|
||||
|
||||
@singleton()
|
||||
@ -32,9 +40,9 @@ export class SearcherHost extends RPCHost {
|
||||
targetResultCount = 5;
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected globalLogger: GlobalLogger,
|
||||
protected rateLimitControl: RateLimitControl,
|
||||
protected threadLocal: AsyncContext,
|
||||
protected threadLocal: AsyncLocalContext,
|
||||
protected braveSearchService: BraveSearchService,
|
||||
protected crawler: CrawlerHost,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
@ -48,39 +56,30 @@ export class SearcherHost extends RPCHost {
|
||||
this.emit('ready');
|
||||
}
|
||||
|
||||
@CloudHTTPv2({
|
||||
name: 'search2',
|
||||
runtime: {
|
||||
cpu: 4,
|
||||
memory: '4GiB',
|
||||
timeoutSeconds: 300,
|
||||
concurrency: 4,
|
||||
@Method({
|
||||
name: 'searchIndex',
|
||||
ext: {
|
||||
http: {
|
||||
action: ['get', 'post'],
|
||||
path: '/search'
|
||||
}
|
||||
},
|
||||
tags: ['Searcher'],
|
||||
httpMethod: ['get', 'post'],
|
||||
tags: ['search'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
exposeRoot: true,
|
||||
})
|
||||
@CloudHTTPv2({
|
||||
runtime: {
|
||||
cpu: 4,
|
||||
memory: '16GiB',
|
||||
timeoutSeconds: 300,
|
||||
concurrency: 4,
|
||||
maxInstances: 200,
|
||||
minInstances: 1,
|
||||
@Method({
|
||||
ext: {
|
||||
http: {
|
||||
action: ['get', 'post'],
|
||||
path: '::q'
|
||||
}
|
||||
},
|
||||
tags: ['Searcher'],
|
||||
httpMethod: ['get', 'post'],
|
||||
returnType: [String, OutputServerEventStream],
|
||||
exposeRoot: true,
|
||||
tags: ['search'],
|
||||
returnType: [String, OutputServerEventStream, RawString],
|
||||
})
|
||||
async search(
|
||||
@RPCReflect() rpcReflect: RPCReflection,
|
||||
@Ctx() ctx: {
|
||||
req: Request,
|
||||
res: Response,
|
||||
},
|
||||
@Ctx() ctx: Context,
|
||||
auth: JinaEmbeddingsAuthDTO,
|
||||
@Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
|
||||
count: number,
|
||||
@ -90,14 +89,13 @@ export class SearcherHost extends RPCHost {
|
||||
) {
|
||||
const uid = await auth.solveUID();
|
||||
let chargeAmount = 0;
|
||||
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
|
||||
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
|
||||
if (!noSlashPath && !q) {
|
||||
const latestUser = uid ? await auth.assertUser() : undefined;
|
||||
const index = this.crawler.getIndex(latestUser);
|
||||
const index = await this.crawler.getIndex(auth);
|
||||
if (!uid) {
|
||||
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
|
||||
}
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
||||
|
||||
return index;
|
||||
}
|
||||
@ -160,7 +158,7 @@ export class SearcherHost extends RPCHost {
|
||||
count,
|
||||
);
|
||||
|
||||
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
|
||||
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
|
||||
const sseStream = new OutputServerEventStream();
|
||||
rpcReflect.return(sseStream);
|
||||
|
||||
@ -193,7 +191,7 @@ export class SearcherHost extends RPCHost {
|
||||
|
||||
let lastScrapped: any[] | undefined;
|
||||
let earlyReturn = false;
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
|
||||
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
|
||||
const setEarlyReturnTimer = () => {
|
||||
if (earlyReturnTimer) {
|
@ -14,7 +14,7 @@ import robotsParser from 'robots-parser';
|
||||
import { DOMParser } from '@xmldom/xmldom';
|
||||
|
||||
import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
|
||||
import { CrawlerOptions } from '../dto/scrapping-options';
|
||||
import { CrawlerOptions } from '../dto/crawler-options';
|
||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||
import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
|
||||
import { getFunctions } from 'firebase-admin/functions';
|
@ -9,7 +9,7 @@ import {
|
||||
FirebaseStorageBucketControl, Logger, Param, TempFileManager
|
||||
} from '../shared';
|
||||
import _ from 'lodash';
|
||||
import { CrawlerHost } from './crawler';
|
||||
import { CrawlerHost } from '../api/crawler';
|
||||
|
||||
import { Crawled } from '../db/crawled';
|
||||
import dayjs from 'dayjs';
|
@ -1,6 +1,6 @@
|
||||
import { Also, Prop } from 'civkit';
|
||||
import { FirestoreRecord } from '../shared/lib/firestore';
|
||||
import { ENGINE_TYPE } from '../dto/scrapping-options';
|
||||
import { ENGINE_TYPE } from '../dto/crawler-options';
|
||||
|
||||
@Also({
|
||||
dictOf: Object
|
@ -1,6 +1,6 @@
|
||||
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
||||
import type { Request, Response } from 'express';
|
||||
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
import { Context } from '../services/registry';
|
||||
|
||||
export enum CONTENT_FORMAT {
|
||||
CONTENT = 'content',
|
||||
@ -19,6 +19,7 @@ export enum ENGINE_TYPE {
|
||||
DIRECT = 'direct',
|
||||
VLM = 'vlm',
|
||||
READER_LM = 'readerlm-v2',
|
||||
CF_BROWSER_RENDERING = 'cf-browser-rendering',
|
||||
}
|
||||
|
||||
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||
@ -125,6 +126,11 @@ class Viewport extends AutoCastable {
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Proxy': {
|
||||
description: `Use a proxy server provided by Jina AI.\n\nOptionally specify two-letter country code.`,
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Set-Cookie': {
|
||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
||||
`Syntax is the same with standard Set-Cookie`,
|
||||
@ -297,6 +303,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
proxyUrl?: string;
|
||||
|
||||
@Prop()
|
||||
proxy?: string;
|
||||
|
||||
@Prop()
|
||||
userAgent?: string;
|
||||
|
||||
@ -338,15 +347,18 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
jsonSchema?: object;
|
||||
|
||||
@Prop()
|
||||
robotsTxt?: string;
|
||||
|
||||
@Prop()
|
||||
doNotTrack?: number | null;
|
||||
|
||||
static override from(input: any) {
|
||||
const instance = super.from(input) as CrawlerOptions;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||
req: Request,
|
||||
res: Response,
|
||||
} | undefined;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
||||
|
||||
const customMode = ctx?.req.get('x-respond-with') || ctx?.req.get('x-return-format');
|
||||
if (customMode !== undefined) {
|
||||
const customMode = ctx?.get('x-respond-with') || ctx?.get('x-return-format');
|
||||
if (customMode) {
|
||||
instance.respondWith = customMode;
|
||||
}
|
||||
if (instance.respondWith) {
|
||||
@ -361,74 +373,74 @@ export class CrawlerOptions extends AutoCastable {
|
||||
}
|
||||
}
|
||||
|
||||
const locale = ctx?.req.get('x-locale');
|
||||
if (locale !== undefined) {
|
||||
const locale = ctx?.get('x-locale');
|
||||
if (locale) {
|
||||
instance.locale = locale;
|
||||
}
|
||||
|
||||
const referer = ctx?.req.get('x-referer');
|
||||
if (referer !== undefined) {
|
||||
const referer = ctx?.get('x-referer');
|
||||
if (referer) {
|
||||
instance.referer = referer;
|
||||
}
|
||||
|
||||
const withGeneratedAlt = ctx?.req.get('x-with-generated-alt');
|
||||
if (withGeneratedAlt !== undefined) {
|
||||
const withGeneratedAlt = ctx?.get('x-with-generated-alt');
|
||||
if (withGeneratedAlt) {
|
||||
instance.withGeneratedAlt = Boolean(withGeneratedAlt);
|
||||
}
|
||||
const withLinksSummary = ctx?.req.get('x-with-links-summary');
|
||||
if (withLinksSummary !== undefined) {
|
||||
const withLinksSummary = ctx?.get('x-with-links-summary');
|
||||
if (withLinksSummary) {
|
||||
if (withLinksSummary === 'all') {
|
||||
instance.withLinksSummary = withLinksSummary;
|
||||
} else {
|
||||
instance.withLinksSummary = Boolean(withLinksSummary);
|
||||
}
|
||||
}
|
||||
const withImagesSummary = ctx?.req.get('x-with-images-summary');
|
||||
if (withImagesSummary !== undefined) {
|
||||
const withImagesSummary = ctx?.get('x-with-images-summary');
|
||||
if (withImagesSummary) {
|
||||
instance.withImagesSummary = Boolean(withImagesSummary);
|
||||
}
|
||||
const retainImages = ctx?.req.get('x-retain-images');
|
||||
const retainImages = ctx?.get('x-retain-images');
|
||||
if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
|
||||
instance.retainImages = retainImages as any;
|
||||
}
|
||||
if (instance.withGeneratedAlt) {
|
||||
instance.retainImages = 'all_p';
|
||||
}
|
||||
const noCache = ctx?.req.get('x-no-cache');
|
||||
if (noCache !== undefined) {
|
||||
const noCache = ctx?.get('x-no-cache');
|
||||
if (noCache) {
|
||||
instance.noCache = Boolean(noCache);
|
||||
}
|
||||
if (instance.noCache && instance.cacheTolerance === undefined) {
|
||||
instance.cacheTolerance = 0;
|
||||
}
|
||||
let cacheTolerance = parseInt(ctx?.req.get('x-cache-tolerance') || '');
|
||||
let cacheTolerance = parseInt(ctx?.get('x-cache-tolerance') || '');
|
||||
if (!isNaN(cacheTolerance)) {
|
||||
instance.cacheTolerance = cacheTolerance;
|
||||
}
|
||||
|
||||
const noGfm = ctx?.req.get('x-no-gfm');
|
||||
const noGfm = ctx?.get('x-no-gfm');
|
||||
if (noGfm) {
|
||||
instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
|
||||
}
|
||||
|
||||
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
|
||||
let timeoutSeconds = parseInt(ctx?.get('x-timeout') || '');
|
||||
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
|
||||
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
|
||||
} else if (ctx?.req.get('x-timeout')) {
|
||||
} else if (ctx?.get('x-timeout')) {
|
||||
instance.timeout = null;
|
||||
}
|
||||
|
||||
const removeSelector = ctx?.req.get('x-remove-selector')?.split(', ');
|
||||
instance.removeSelector ??= removeSelector;
|
||||
const targetSelector = ctx?.req.get('x-target-selector')?.split(', ');
|
||||
instance.targetSelector ??= targetSelector;
|
||||
const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
|
||||
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
||||
const removeSelector = ctx?.get('x-remove-selector')?.split(', ').filter(Boolean);
|
||||
instance.removeSelector ??= removeSelector?.length ? removeSelector : undefined;
|
||||
const targetSelector = ctx?.get('x-target-selector')?.split(', ').filter(Boolean);
|
||||
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
|
||||
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
|
||||
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
|
||||
instance.targetSelector = filterSelector(instance.targetSelector);
|
||||
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
|
||||
instance.userAgent ??= overrideUserAgent;
|
||||
|
||||
const engine = ctx?.req.get('x-engine');
|
||||
const engine = ctx?.get('x-engine');
|
||||
if (engine) {
|
||||
instance.engine = engine;
|
||||
}
|
||||
@ -443,18 +455,18 @@ export class CrawlerOptions extends AutoCastable {
|
||||
instance.respondWith = CONTENT_FORMAT.READER_LM;
|
||||
}
|
||||
|
||||
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
|
||||
if (keepImgDataUrl !== undefined) {
|
||||
const keepImgDataUrl = ctx?.get('x-keep-img-data-url');
|
||||
if (keepImgDataUrl) {
|
||||
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
||||
}
|
||||
const withIframe = ctx?.req.get('x-with-iframe');
|
||||
if (withIframe !== undefined) {
|
||||
const withIframe = ctx?.get('x-with-iframe');
|
||||
if (withIframe) {
|
||||
instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
|
||||
}
|
||||
if (instance.withIframe) {
|
||||
instance.timeout ??= null;
|
||||
}
|
||||
const withShadowDom = ctx?.req.get('x-with-shadow-dom');
|
||||
const withShadowDom = ctx?.get('x-with-shadow-dom');
|
||||
if (withShadowDom) {
|
||||
instance.withShadowDom = Boolean(withShadowDom);
|
||||
}
|
||||
@ -463,7 +475,7 @@ export class CrawlerOptions extends AutoCastable {
|
||||
}
|
||||
|
||||
const cookies: Cookie[] = [];
|
||||
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
||||
const setCookieHeaders = (ctx?.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[])).filter(Boolean);
|
||||
if (Array.isArray(setCookieHeaders)) {
|
||||
for (const setCookie of setCookieHeaders) {
|
||||
cookies.push({
|
||||
@ -477,21 +489,24 @@ export class CrawlerOptions extends AutoCastable {
|
||||
}
|
||||
instance.setCookies = cookies;
|
||||
|
||||
const proxyUrl = ctx?.req.get('x-proxy-url');
|
||||
instance.proxyUrl ??= proxyUrl;
|
||||
const proxyUrl = ctx?.get('x-proxy-url');
|
||||
instance.proxyUrl ??= proxyUrl || undefined;
|
||||
const proxy = ctx?.get('x-proxy');
|
||||
instance.proxy ??= proxy || undefined;
|
||||
const robotsTxt = ctx?.get('x-robots-txt');
|
||||
instance.robotsTxt ??= robotsTxt || undefined;
|
||||
|
||||
if (instance.cacheTolerance) {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
}
|
||||
|
||||
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
|
||||
const tokenBudget = ctx?.get('x-token-budget');
|
||||
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
||||
|
||||
const baseMode = ctx?.req.get('x-base') || undefined;
|
||||
const baseMode = ctx?.get('x-base');
|
||||
if (baseMode) {
|
||||
instance.base = baseMode as any;
|
||||
}
|
||||
|
||||
const dnt = ctx?.get('dnt');
|
||||
instance.doNotTrack ??= (parseInt(dnt || '') || null);
|
||||
|
||||
if (instance.cacheTolerance) {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
}
|
216
src/dto/jina-embeddings-auth.ts
Normal file
216
src/dto/jina-embeddings-auth.ts
Normal file
@ -0,0 +1,216 @@
|
||||
import _ from 'lodash';
|
||||
import {
|
||||
Also, AuthenticationFailedError, AuthenticationRequiredError,
|
||||
DownstreamServiceFailureError, RPC_CALL_ENVIRONMENT,
|
||||
AutoCastable,
|
||||
} from 'civkit/civ-rpc';
|
||||
import { htmlEscape } from 'civkit/escape';
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
|
||||
import type { Context } from 'koa';
|
||||
|
||||
import logger from '../services/logger';
|
||||
import { InjectProperty } from '../services/registry';
|
||||
import { AsyncLocalContext } from '../services/async-context';
|
||||
|
||||
import envConfig from '../shared/services/secrets';
|
||||
import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings';
|
||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||
|
||||
|
||||
const authDtoLogger = logger.child({ service: 'JinaAuthDTO' });
|
||||
|
||||
const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY);
|
||||
|
||||
@Also({
|
||||
openapi: {
|
||||
operation: {
|
||||
parameters: {
|
||||
'Authorization': {
|
||||
description: htmlEscape`Jina Token for authentication.\n\n` +
|
||||
htmlEscape`- Member of <JinaEmbeddingsAuthDTO>\n\n` +
|
||||
`- Authorization: Bearer {YOUR_JINA_TOKEN}`
|
||||
,
|
||||
in: 'header',
|
||||
schema: {
|
||||
anyOf: [
|
||||
{ type: 'string', format: 'token' }
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
export class JinaEmbeddingsAuthDTO extends AutoCastable {
|
||||
uid?: string;
|
||||
bearerToken?: string;
|
||||
user?: JinaEmbeddingsTokenAccount;
|
||||
|
||||
@InjectProperty(AsyncLocalContext)
|
||||
ctxMgr!: AsyncLocalContext;
|
||||
|
||||
jinaEmbeddingsDashboard = THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT;
|
||||
|
||||
/**
 * Builds the auth DTO from RPC input and extracts the bearer token.
 *
 * Token sources, in priority order:
 *  1. the `authorization` header of the call context stored under
 *     `RPC_CALL_ENVIRONMENT`: the second space-separated segment
 *     (e.g. the value after `Bearer`), or the whole header when that
 *     segment is missing or empty;
 *  2. `input._token`, used only when no token was obtained from the header.
 *
 * @param input - Raw RPC input, optionally carrying the call context.
 * @returns The populated DTO instance.
 */
static override from(input: any) {
    const dto = super.from(input) as JinaEmbeddingsAuthDTO;

    const callCtx = input[RPC_CALL_ENVIRONMENT] as Context;
    const header = callCtx ? callCtx.get('authorization') : undefined;

    if (header) {
        const [, credential] = header.split(' ');
        dto.bearerToken = credential || header;
    }

    if (dto.bearerToken) {
        return dto;
    }

    if (input._token) {
        dto.bearerToken = input._token;
    }

    return dto;
}
|
||||
|
||||
/**
 * Resolves the account behind the bearer token and stores it on `this.user` / `this.uid`.
 *
 * Flow:
 *  1. Load the cached account via `JinaEmbeddingsTokenAccount.fromFirestore`;
 *     a failure there is treated as a cache miss.
 *  2. Unless `ignoreCache` is truthy, return the cached account when it was
 *     synced less than 180 000 ms ago.
 *  3. Otherwise validate the token with the dashboard client, merge the
 *     response over the cached record, save it, and return the merged account.
 *  4. If step 3 fails with anything but a 401, fall back to the cached
 *     account (however old) when one exists.
 *
 * @param ignoreCache - Truthy to skip the freshness shortcut and always revalidate.
 * @returns The cached or freshly validated account.
 * @throws AuthenticationRequiredError when no bearer token is set.
 * @throws AuthenticationFailedError when validation fails with status 401.
 * @throws DownstreamServiceFailureError when validation fails and no cached account exists.
 */
async getBrief(ignoreCache?: boolean | string) {
    if (!this.bearerToken) {
        throw new AuthenticationRequiredError({
            message: 'Jina API key is required to authenticate. Please get one from https://jina.ai'
        });
    }

    let cachedAccount;
    try {
        cachedAccount = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
    } catch {
        // Firestore does not accept every string as input and may throw; treat that as a cache miss.
    }

    const lastSync = cachedAccount?.lastSyncedAt;
    const staleness = lastSync ? Date.now() - lastSync.getTime() : Infinity;

    if (cachedAccount && !ignoreCache && staleness < 180_000) {
        this.user = cachedAccount;
        this.uid = this.user?.user_id;

        return cachedAccount;
    }

    try {
        const { data: brief } = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken);
        const refreshed = JinaEmbeddingsTokenAccount.from({
            ...cachedAccount,
            ...brief,
            _id: this.bearerToken,
            lastSyncedAt: new Date()
        });
        await JinaEmbeddingsTokenAccount.save(refreshed.degradeForFireStore(), undefined, { merge: true });

        this.user = refreshed;
        this.uid = this.user?.user_id;

        return refreshed;
    } catch (err: any) {
        authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) });

        if (err?.status === 401) {
            throw new AuthenticationFailedError({
                message: 'Invalid API key, please get a new one from https://jina.ai'
            });
        }

        if (!cachedAccount) {
            throw new DownstreamServiceFailureError(`Failed to authenticate: ${err}`);
        }

        // Validation is unavailable: serve the cached account regardless of its age.
        this.user = cachedAccount;
        this.uid = this.user?.user_id;

        return cachedAccount;
    }
}
|
||||
|
||||
/**
 * Reports token consumption to the dashboard and mirrors it in the cached balance.
 *
 * The in-memory wallet balance is decremented up front and restored if reporting
 * fails. On success the cached document's balance is decremented in the background;
 * a failure of that background update is only logged.
 *
 * @param tokenCount - Number of tokens consumed.
 * @param mdl - Model name recorded with the usage.
 * @param endpoint - API endpoint recorded with the usage.
 * @returns The dashboard response, or `undefined` when reporting failed.
 */
async reportUsage(tokenCount: number, mdl: string, endpoint: string = '/encode') {
    const user = await this.assertUser();
    const uid = user.user_id;
    user.wallet.total_balance -= tokenCount;

    const usageReport = {
        model_name: mdl,
        api_endpoint: endpoint,
        consumer: {
            id: uid,
            user_id: uid,
        },
        usage: {
            total_tokens: tokenCount
        },
        labels: {
            model_name: mdl
        }
    };

    return this.jinaEmbeddingsDashboard.reportUsage(this.bearerToken!, usageReport).then((response) => {
        // Fire-and-forget: keep the cached balance roughly in sync.
        JinaEmbeddingsTokenAccount.COLLECTION.doc(this.bearerToken!)
            .update({ 'wallet.total_balance': JinaEmbeddingsTokenAccount.OPS.increment(-tokenCount) })
            .catch((cacheErr) => {
                authDtoLogger.warn(`Failed to update cache for ${uid}: ${cacheErr}`, { err: marshalErrorLike(cacheErr) });
            });

        return response;
    }).catch((reportErr) => {
        // Undo the optimistic local deduction.
        user.wallet.total_balance += tokenCount;
        authDtoLogger.warn(`Failed to report usage for ${uid}: ${reportErr}`, { err: marshalErrorLike(reportErr) });
    });
}
|
||||
|
||||
/**
 * Resolves the caller's uid, fetching the account brief when only a bearer token
 * is known, and publishes the result to the context manager under `'uid'`.
 *
 * @returns The uid, or `undefined` when there is neither a uid nor a bearer token.
 */
async solveUID() {
    if (!this.uid) {
        if (!this.bearerToken) {
            return undefined;
        }

        await this.getBrief();
    }

    this.ctxMgr.set('uid', this.uid);

    return this.uid;
}
|
||||
|
||||
/**
 * Same as `solveUID`, but a missing uid is an error.
 *
 * @throws AuthenticationRequiredError when no uid could be resolved.
 */
async assertUID() {
    const resolved = await this.solveUID();

    if (resolved) {
        return resolved;
    }

    throw new AuthenticationRequiredError('Authentication failed');
}
|
||||
|
||||
/**
 * Returns the already-loaded user, loading it through `getBrief` first when needed.
 * Any error raised by `getBrief` propagates to the caller.
 */
async assertUser() {
    if (!this.user) {
        await this.getBrief();
    }

    return this.user!;
}
|
||||
|
||||
/**
 * Collects the user's custom rate limit descriptors for the given tags,
 * keeping only those whose `isEffective()` returns true.
 *
 * @param tags - Rate limit tags to look up in `user.customRateLimits`.
 * @returns The effective descriptors, or `undefined` when there are none.
 */
getRateLimits(...tags: string[]) {
    const effective = tags
        .flatMap((tag) => this.user?.customRateLimits?.[tag] || [])
        .filter((desc) => desc.isEffective());

    return effective.length ? effective : undefined;
}
|
||||
}
|
169
src/lib/transform-server-event-stream.ts
Normal file
169
src/lib/transform-server-event-stream.ts
Normal file
@ -0,0 +1,169 @@
|
||||
import { TPM, parseJSONText } from 'civkit';
|
||||
import { Transform, TransformCallback, TransformOptions } from 'stream';
|
||||
|
||||
/**
 * Transform stream that parses `text/event-stream` input into event objects.
 *
 * The writable side accepts bytes or strings; the readable side is in object mode
 * and emits `{ event?, data?, id?, retry? }` records, one per blank-line-terminated
 * block. `data` is replaced by the parsed value when it is JSON text that decodes
 * to an object. Text after the last blank line stays buffered until more input
 * arrives; an unterminated trailing block is never emitted.
 */
export class InputServerEventStream extends Transform {
    /** Text received so far that does not yet form a complete event block. */
    cache: string[] = [];

    /** Streaming decoder, so multi-byte characters split across chunks survive. */
    protected decoder = new TextDecoder('utf-8');

    constructor(options?: TransformOptions) {
        super({
            ...options,
            readableObjectMode: true
        });
    }

    /**
     * Parses every complete event block currently buffered and pushes the
     * resulting objects downstream.
     */
    decodeRoutine() {
        if (!this.cache.length) {
            return;
        }

        const vecs = this.cache.join('').split(/\r?\n\r?\n/);
        this.cache.length = 0;
        // The last piece has not been terminated by a blank line yet: keep it.
        const lastVec = vecs.pop();
        if (lastVec) {
            this.cache.push(lastVec);
        }

        for (const x of vecs) {
            const event: {
                event?: string;
                data?: unknown;
                id?: string;
                retry?: number;
            } = {};
            const dataLines: string[] = [];

            for (const l of x.split(/\r?\n/)) {
                const columnPos = l.indexOf(':');
                // Skips comment lines (leading ':') and lines without a colon.
                if (columnPos <= 0) {
                    continue;
                }
                const key = l.substring(0, columnPos);
                const rawValue = l.substring(columnPos + 1);
                // Exactly one leading space after the colon is not part of the value.
                const value = rawValue.startsWith(' ') ? rawValue.slice(1) : rawValue;

                if (key === 'data') {
                    dataLines.push(value);
                } else if (key === 'retry') {
                    // Only plain digit strings are valid; anything else is ignored.
                    if (/^\d+$/.test(value)) {
                        event.retry = parseInt(value, 10);
                    }
                } else {
                    Reflect.set(event, key, value);
                }
            }

            if (dataLines.length) {
                // Consecutive data lines form one payload joined by newlines.
                const text = dataLines.join('\n');
                event.data = text;
                if (text) {
                    const parsed = parseJSONText(text);
                    if (parsed && typeof parsed === 'object') {
                        event.data = parsed;
                    }
                }
            }

            if (Object.keys(event).length) {
                this.push(event);
            }
        }
    }

    /** Converts an incoming chunk to text without breaking multi-byte sequences. */
    protected chunkToText(chunk: unknown): string {
        if (typeof chunk === 'string') {
            return chunk;
        }
        if (chunk instanceof Uint8Array) {
            return this.decoder.decode(chunk, { stream: true });
        }

        return String(chunk);
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null || chunk === undefined) {
            // Nothing to decode; treat it as end of input instead of crashing.
            this.push(null);
            callback();

            return;
        }

        this.cache.push(this.chunkToText(chunk));
        this.decodeRoutine();

        callback();
    }

    override _final(callback: (error?: Error | null | undefined) => void): void {
        // Flush whatever the streaming decoder is still holding.
        const tail = this.decoder.decode();
        if (tail) {
            this.cache.push(tail);
        }
        this.decodeRoutine();
        callback();
    }
}
|
||||
|
||||
@TPM({
|
||||
contentType: 'text/event-stream',
|
||||
})
|
||||
/**
 * Transform stream that serialises event objects or plain strings into
 * `text/event-stream` frames.
 *
 * The writable side is in object mode; the readable side yields UTF-8 text.
 * Each accepted chunk produces exactly one frame terminated by a blank line.
 */
export class OutputServerEventStream extends Transform {
    /** Number of frames emitted so far. */
    n: number = 0;

    constructor(options?: TransformOptions) {
        super({
            ...options, writableObjectMode: true, encoding: 'utf-8'
        });
    }

    /**
     * Encodes one chunk as a frame and pushes it downstream.
     *
     * - A string becomes one `data:` line per input line.
     * - An object contributes `event`, `data`, `id` and `retry` lines. String data is
     *   split per line, other data is JSON-encoded. If no field produced a line,
     *   the whole object is JSON-encoded as the data payload.
     * - Any other value is ignored.
     */
    encodeRoutine(chunk: {
        event?: string;
        data?: any;
        id?: string;
        retry?: number;
    } | string) {
        if (chunk === null || chunk === undefined) {
            return;
        }

        const lines: string[] = [];

        if (typeof chunk === 'string') {
            for (const x of chunk.split(/\r?\n/)) {
                lines.push(`data: ${x}`);
            }
        } else if (typeof chunk === 'object') {
            if (chunk.event) {
                lines.push(`event: ${chunk.event}`);
            }
            // Presence check rather than truthiness, so 0, false and '' are sent as data.
            if (chunk.data !== undefined && chunk.data !== null) {
                if (typeof chunk.data === 'string') {
                    for (const x of chunk.data.split(/\r?\n/)) {
                        lines.push(`data: ${x}`);
                    }
                } else {
                    lines.push(`data: ${JSON.stringify(chunk.data)}`);
                }
            }
            if (chunk.id) {
                lines.push(`id: ${chunk.id}`);
            }
            // `retry: 0` is a meaningful value and must not be dropped.
            if (chunk.retry !== undefined && chunk.retry !== null) {
                lines.push(`retry: ${chunk.retry}`);
            }
            if (!lines.length) {
                lines.push(`data: ${JSON.stringify(chunk)}`);
            }
        } else {
            return;
        }

        // One push per frame, so a frame is never delivered in two pieces.
        this.push(`${lines.join('\n')}\n\n`);
        this.n++;
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null || chunk === undefined) {
            // Nothing to encode; treat it as end of output instead of crashing.
            this.push(null);
            callback();

            return;
        }

        this.encodeRoutine(chunk);

        callback();
    }
}
|
||||
|
||||
/**
 * Declaration merge that gives `write()` a typed overload for event chunks
 * (a plain string or an `{ event, data, id, retry }` record) while keeping
 * the generic `Transform` overloads.
 */
export interface OutputServerEventStream extends Transform {
    write(
        chunk: string | {
            event?: string;
            data?: any;
            id?: string;
            retry?: number;
        },
        callback?: (error: Error | null | undefined) => void
    ): boolean;
    write(
        chunk: any,
        callback?: (error: Error | null | undefined) => void
    ): boolean;
    write(
        chunk: any,
        encoding: BufferEncoding,
        callback?: (error: Error | null | undefined) => void
    ): boolean;
}
|
10
src/services/async-context.ts
Normal file
10
src/services/async-context.ts
Normal file
@ -0,0 +1,10 @@
|
||||
import { GlobalAsyncContext } from 'civkit/async-context';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
|
||||
/** Application-wide async context, registered as a tsyringe singleton. */
@singleton()
export class AsyncLocalContext extends GlobalAsyncContext { }

// Resolve the singleton once and expose it on `process` under `asyncLocalContext`.
const asyncLocalContext = container.resolve(AsyncLocalContext);
Reflect.set(process, 'asyncLocalContext', asyncLocalContext);

export default asyncLocalContext;
|
72
src/services/blackhole-detector.ts
Normal file
72
src/services/blackhole-detector.ts
Normal file
@ -0,0 +1,72 @@
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { GlobalLogger } from './logger';
|
||||
|
||||
|
||||
/**
 * Watchdog that detects a "black hole": requests keep arriving and are still in
 * flight, but nothing has reported success for a long time.
 *
 * Callers report activity through `incomingRequest()`, `doneWithRequest()` and
 * `itWorked()`. When `NODE_ENV` starts with `prod`, `routine()` runs every 15
 * seconds. Each detection adds a strike, and from three strikes on an `'error'`
 * event is emitted on every run until `itWorked()` resets the count.
 */
@singleton()
export class BlackHoleDetector extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Timestamp (ms) of the last reported success. */
    lastWorkedTs?: number;
    /** Timestamp (ms) of the last finished request. */
    lastDoneRequestTs?: number;
    /** Timestamp (ms) of the last incoming request. */
    lastIncomingRequestTs?: number;

    /** Base delay (ms) without success before a strike; grows with each strike. */
    maxDelay = 1000 * 30;
    /** Requests currently in flight. */
    concurrentRequests = 0;

    /** Consecutive detections since the last success. */
    strikes = 0;

    constructor(protected globalLogger: GlobalLogger) {
        super(...arguments);

        if (process.env.NODE_ENV?.startsWith('prod')) {
            // unref() so the watchdog timer never keeps the process alive.
            setInterval(() => {
                this.routine();
            }, 1000 * 15).unref();
        }
    }

    override async init() {
        await this.dependencyReady();
        this.logger.debug('BlackHoleDetector started');
        this.emit('ready');
    }

    /** One watchdog pass: record a strike if stalled, raise an error at three strikes. */
    routine() {
        const now = Date.now();
        const lastWorked = this.lastWorkedTs;
        if (!lastWorked) {
            // No request has been seen yet, so there is nothing to compare against.
            return;
        }
        const dt = (now - lastWorked);
        const elapsedSeconds = Math.ceil(dt / 1000);

        const stalled = this.concurrentRequests > 0 &&
            !!this.lastIncomingRequestTs &&
            this.lastIncomingRequestTs >= lastWorked &&
            // The threshold grows with each strike so one stall is not counted repeatedly.
            dt > (this.maxDelay * (this.strikes + 1));

        if (stalled) {
            this.logger.warn(`BlackHole detected, last worked: ${elapsedSeconds}s ago, concurrentRequests: ${this.concurrentRequests}`);
            this.strikes += 1;
        }

        if (this.strikes >= 3) {
            const message = `BlackHole detected for ${this.strikes} strikes, last worked: ${elapsedSeconds}s ago, concurrentRequests: ${this.concurrentRequests}`;
            this.logger.error(message);
            this.emit('error', new Error(message));
        }
    }

    /** Records an incoming request. */
    incomingRequest() {
        // Use a single timestamp for both fields. With two Date.now() reads the very
        // first request could get lastIncomingRequestTs < lastWorkedTs across a
        // millisecond tick, and routine() would then never flag a process that is
        // stuck from its first request.
        const now = Date.now();
        this.lastIncomingRequestTs = now;
        this.lastWorkedTs ??= now;
        this.concurrentRequests++;
    }

    /** Records that a request finished, whether or not it succeeded. */
    doneWithRequest() {
        this.concurrentRequests--;
        this.lastDoneRequestTs = Date.now();
    }

    /** Records a success and clears accumulated strikes. */
    itWorked() {
        this.lastWorkedTs = Date.now();
        this.strikes = 0;
    }

}
|
@ -7,6 +7,7 @@ import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
||||
import { AsyncContext } from '../shared';
|
||||
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
|
||||
import type { Request, Response } from 'express';
|
||||
import { BlackHoleDetector } from './blackhole-detector';
|
||||
|
||||
@singleton()
|
||||
export class BraveSearchService extends AsyncService {
|
||||
@ -20,6 +21,7 @@ export class BraveSearchService extends AsyncService {
|
||||
protected secretExposer: SecretExposer,
|
||||
protected geoipControl: GeoIPService,
|
||||
protected threadLocal: AsyncContext,
|
||||
protected blackHoleDetector: BlackHoleDetector,
|
||||
) {
|
||||
super(...arguments);
|
||||
}
|
||||
@ -69,6 +71,7 @@ export class BraveSearchService extends AsyncService {
|
||||
while (maxTries--) {
|
||||
try {
|
||||
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
|
||||
this.blackHoleDetector.itWorked();
|
||||
|
||||
return r.parsed;
|
||||
} catch (err: any) {
|
38
src/services/cf-browser-rendering.ts
Normal file
38
src/services/cf-browser-rendering.ts
Normal file
@ -0,0 +1,38 @@
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { Logger, SecretExposer } from '../shared';
|
||||
import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare';
|
||||
|
||||
/**
 * Thin wrapper around the Cloudflare client that fetches browser-rendered HTML.
 *
 * The credential is read from `secretExposer.CLOUD_FLARE_API_KEY` and is expected
 * in the form `<account>:<key>`.
 */
@singleton()
export class CFBrowserRendering extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    client!: CloudFlareHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
    ) {
        super(...arguments);
    }

    /**
     * Builds the Cloudflare client once dependencies are ready.
     *
     * @throws Error when `CLOUD_FLARE_API_KEY` is not configured.
     */
    override async init() {
        await this.dependencyReady();

        const credential = this.secretExposer.CLOUD_FLARE_API_KEY;
        if (!credential) {
            // Previously this surfaced as an opaque "undefined is not iterable" TypeError.
            throw new Error('CLOUD_FLARE_API_KEY is not configured, expected "<account>:<key>"');
        }
        const [account, key] = credential.split(':');
        this.client = new CloudFlareHTTP(account, key);

        this.emit('ready');
    }

    /**
     * Fetches the browser-rendered HTML of a page.
     *
     * @param url - Page to render.
     * @returns The `result` field of the parsed response.
     */
    async fetchContent(url: string) {
        const r = await this.client.fetchBrowserRenderedHTML({ url });

        return r.parsed.result;
    }

}

const instance = container.resolve(CFBrowserRendering);

export default instance;
|
387
src/services/curl.ts
Normal file
387
src/services/curl.ts
Normal file
@ -0,0 +1,387 @@
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { singleton } from 'tsyringe';
|
||||
|
||||
import { Curl, CurlCode, CurlFeature, HeaderInfo } from 'node-libcurl';
|
||||
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
|
||||
import { ScrappingOptions } from './puppeteer';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { AssertionFailureError, FancyFile } from 'civkit';
|
||||
import { ServiceBadAttemptError, TempFileManager } from '../shared';
|
||||
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
|
||||
import { ZSTDDecompress } from 'simple-zstd';
|
||||
import _ from 'lodash';
|
||||
import { Readable } from 'stream';
|
||||
import { AsyncLocalContext } from './async-context';
|
||||
|
||||
export interface CURLScrappingOptions extends ScrappingOptions {
|
||||
method?: string;
|
||||
body?: string | Buffer;
|
||||
}
|
||||
|
||||
@singleton()
|
||||
export class CurlControl extends AsyncService {
|
||||
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
chromeVersion: string = `132`;
|
||||
safariVersion: string = `537.36`;
|
||||
platform: string = `Linux`;
|
||||
ua: string = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`;
|
||||
|
||||
lifeCycleTrack = new WeakMap();
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected tempFileManager: TempFileManager,
|
||||
protected asyncLocalContext: AsyncLocalContext,
|
||||
) {
|
||||
super(...arguments);
|
||||
}
|
||||
|
||||
/**
 * Waits for dependencies, then picks the platform name reported in the
 * impersonation headers from the host OS (the existing value is kept for any
 * platform other than macOS and Windows).
 */
override async init() {
    await this.dependencyReady();

    const platformNames = new Map<string, string>([
        ['darwin', `macOS`],
        ['win32', `Windows`],
    ]);
    const detected = platformNames.get(process.platform);
    if (detected) {
        this.platform = detected;
    }

    this.emit('ready');
}
|
||||
|
||||
/**
 * Adopts the given user-agent string for later requests and derives the Chrome
 * and AppleWebKit versions from it.
 *
 * A version that cannot be found in `ua` keeps its current value. Previously a
 * non-Chrome user agent threw a TypeError, possibly after `chromeVersion` had
 * already been changed.
 *
 * @param ua - Full user-agent string to impersonate.
 */
impersonateChrome(ua: string) {
    const chromeVersion = ua.match(/Chrome\/(\d+)/)?.[1];
    const webkitVersion = ua.match(/AppleWebKit\/([\d\.]+)/)?.[1];

    if (chromeVersion) {
        this.chromeVersion = chromeVersion;
    }
    if (webkitVersion) {
        this.safariVersion = webkitVersion;
    }
    this.ua = ua;
}
|
||||
|
||||
/**
 * Sets a Chrome-like set of request headers on a curl handle, merged with the
 * caller-supplied headers.
 *
 * A caller header whose lower-case name matches a default replaces that default
 * in place, keeping the default's position and casing. Remaining caller headers
 * are appended. Array values become one header line per element.
 *
 * @param curl - Handle to configure.
 * @param headers - Extra headers; names matching a default are expected in lower case.
 * @returns The same curl handle.
 */
curlImpersonateHeader(curl: Curl, headers?: object) {
    const mixinHeaders: Record<string, string> = {
        // Fixed: the name was misspelt `sch-ch-ua` and the brand list lacked its opening quote.
        'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
        'sec-ch-ua-mobile': '?0',
        // Fixed: the platform is sent as a quoted string, the way Chrome sends it.
        'sec-ch-ua-platform': `"${this.platform}"`,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': this.ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.9',
    };
    const headersCopy: Record<string, string | undefined> = { ...headers };

    // Let caller values replace the defaults in place, so header order is preserved.
    for (const k of Object.keys(mixinHeaders)) {
        const lowerK = k.toLowerCase();
        const override = headersCopy[lowerK];
        if (override) {
            mixinHeaders[k] = override;
            delete headersCopy[lowerK];
        }
    }
    Object.assign(mixinHeaders, headersCopy);

    curl.setOpt(Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => {
        if (Array.isArray(v) && v.length) {
            return v.map((v2) => `${k}: ${v2}`);
        }

        return [`${k}: ${v}`];
    }));

    return curl;
}
|
||||
|
||||
/**
 * Performs a single HTTP request with curl, without following redirects.
 *
 * The promise resolves as soon as response headers are available. For 301, 302,
 * 307 and 308, or when there is no body stream, `data` is `undefined`. Otherwise
 * the body, decompressed according to `Content-Encoding`, is spooled into a temp
 * file wrapped by a `FancyFile`.
 *
 * @param urlToCrawl - Target URL.
 * @param crawlOpts - Method, body, headers, cookies, referer, user agent, proxy and timeout.
 * @returns Status code, optional body file, and the header sets reported by curl.
 */
urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
    return new Promise<{
        statusCode: number,
        data?: FancyFile,
        headers: HeaderInfo[],
    }>((resolve, reject) => {
        let contentType = '';
        const curl = new Curl();
        curl.enable(CurlFeature.StreamResponse);
        curl.setOpt('URL', urlToCrawl.toString());
        curl.setOpt(Curl.option.FOLLOWLOCATION, false);
        curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
        curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(30_000, crawlOpts?.timeoutMs || 30_000));
        curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
        if (crawlOpts?.method) {
            curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
        }
        if (crawlOpts?.body) {
            curl.setOpt(Curl.option.POSTFIELDS, crawlOpts.body.toString());
        }

        // Explicit extra headers win over the cookie/referer/user-agent options.
        const headersToSet = { ...crawlOpts?.extraHeaders };
        if (crawlOpts?.cookies?.length) {
            headersToSet.cookie ??= crawlOpts.cookies
                .map((cookie) => `${cookie.name}=${encodeURIComponent(cookie.value)}`)
                .join('; ');
        }
        if (crawlOpts?.referer) {
            headersToSet.referer ??= crawlOpts.referer;
        }
        if (crawlOpts?.overrideUserAgent) {
            headersToSet['user-agent'] ??= crawlOpts.overrideUserAgent;
        }

        this.curlImpersonateHeader(curl, headersToSet);

        if (crawlOpts?.proxyUrl) {
            curl.setOpt(Curl.option.PROXY, new URL(crawlOpts.proxyUrl).href);
        }

        let curlStream: Readable | undefined;
        curl.on('error', (err, errCode) => {
            curl.close();
            this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err: marshalErrorLike(err), urlToCrawl });
            if (curlStream) {
                // For some reason, manually emitting error event is required for curlStream.
                curlStream.emit('error', err);
                curlStream.destroy(err);
            }
            const digested = this.digestCurlCode(errCode, err.message);
            reject(digested ?? new AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`));
        });
        curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
        let status = -1;
        let contentEncoding = '';
        curl.once('end', () => {
            if (!curlStream) {
                curl.close();

                return;
            }
            // Keep the handle open until the body has been fully consumed.
            curlStream.once('end', () => curl.close());
        });
        curl.on('stream', (stream, statusCode, headers) => {
            this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode });
            status = statusCode;
            curlStream = stream;

            // Normalise header sets: strip a trailing ':' from names and turn
            // undefined values into empty strings.
            for (const headerSet of (headers as HeaderInfo[])) {
                for (const [name, value] of Object.entries(headerSet)) {
                    if (name.trim().endsWith(':')) {
                        Reflect.set(headerSet, name.slice(0, name.indexOf(':')), value || '');
                        Reflect.deleteProperty(headerSet, name);
                        continue;
                    }
                    if (value === undefined) {
                        Reflect.set(headerSet, name, '');
                        continue;
                    }
                    if (name.toLowerCase() === 'content-type' && typeof value === 'string') {
                        contentType = value.toLowerCase();
                    }
                }
            }

            const lastResHeaders = headers[headers.length - 1];
            for (const [name, value] of Object.entries(lastResHeaders)) {
                const lowerName = name.toLowerCase();
                if (lowerName === 'content-type') {
                    contentType = value.toLowerCase();
                }
                if (lowerName === 'content-encoding') {
                    contentEncoding = value.toLowerCase();
                }
                if (contentType && contentEncoding) {
                    break;
                }
            }

            const isRedirect = [301, 302, 307, 308].includes(statusCode);
            if (isRedirect && stream) {
                // Drain the redirect body so the transfer can finish.
                stream.resume();
            }
            if (isRedirect || !stream) {
                resolve({
                    statusCode: status,
                    data: undefined,
                    headers: headers as HeaderInfo[],
                });

                return;
            }

            const decompressor = (() => {
                switch (contentEncoding) {
                    case 'gzip': return createGunzip();
                    case 'deflate': return createInflate();
                    case 'br': return createBrotliDecompress();
                    case 'zstd': return ZSTDDecompress();
                    default: return undefined;
                }
            })();

            let body: Readable = stream;
            if (decompressor) {
                stream.pipe(decompressor);
                // pipe() does not forward errors, so propagate them by hand.
                stream.once('error', (err) => {
                    decompressor.destroy(err);
                });
                body = decompressor;
            }

            const fpath = this.tempFileManager.alloc();
            const fancyFile = FancyFile.auto(body, fpath);
            this.tempFileManager.bindPathTo(fancyFile, fpath);
            resolve({
                statusCode: status,
                data: fancyFile,
                headers: headers as HeaderInfo[],
            });
        });

        curl.perform();
    });
}
|
||||
|
||||
/**
 * Fetches a URL with curl, following up to 10 redirects by hand.
 *
 * Cookies set by redirect responses are carried over to the next hop. The
 * returned `headers` holds the header sets of every hop in order, ending with
 * those of the final response.
 *
 * @param urlToCrawl - Initial URL.
 * @param crawlOpts - Options forwarded to every hop.
 * @throws AssertionFailureError on a redirect without a Location header, or after too many redirects.
 */
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
    const opts = { ...crawlOpts };
    const headerChain: HeaderInfo[] = [];
    let nextHopUrl = urlToCrawl;

    for (let hopsLeft = 10; hopsLeft > 0; hopsLeft -= 1) {
        const hop = await this.urlToFile1Shot(nextHopUrl, opts);

        if (![301, 302, 307, 308].includes(hop.statusCode)) {
            return {
                statusCode: hop.statusCode,
                data: hop.data,
                headers: headerChain.concat(hop.headers),
            };
        }

        const hopHeaders = hop.headers[hop.headers.length - 1];
        const location = hopHeaders.Location || hopHeaders.location;
        if (!location) {
            throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Bad redirection from ${nextHopUrl}`);
        }

        const setCookieHeader = hopHeaders['Set-Cookie'] || hopHeaders['set-cookie'];
        if (setCookieHeader) {
            const rawCookies = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
            const newCookies = rawCookies
                .filter(Boolean)
                .map((raw) => parseSetCookieString(raw, { decodeValues: true }));
            if (newCookies.length) {
                opts.cookies = [...(opts.cookies || []), ...newCookies];
            }
        }

        // Location may be relative; resolve it against the URL that issued it.
        nextHopUrl = new URL(location, nextHopUrl);
        headerChain.push(...hop.headers);
    }

    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Too many redirections.`);
}
|
||||
|
||||
/**
 * Fetches a URL with curl and packages the result so it can be replayed later:
 * one `impersonate` entry per hop of the redirect chain keyed by URL, and a
 * proxy mapping per origin when a proxy was used.
 *
 * @param targetUrl - URL to fetch.
 * @param crawlOpts - Options forwarded to `urlToFile`.
 * @returns Final URL, side-load options, header chain, final status and headers,
 *          content type, content disposition, file name and the body file.
 * @throws Error when a redirect in the chain carries no Location header.
 */
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
    const curlResult = await this.urlToFile(targetUrl, crawlOpts);

    let finalURL = targetUrl;
    const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
        impersonate: {},
        proxyOrigin: {},
    };
    // Walk the redirect chain, recording each hop under the URL it was served from.
    for (const headers of curlResult.headers) {
        sideLoadOpts.impersonate[finalURL.href] = {
            status: headers.result?.code || -1,
            headers: _.omit(headers, 'result'),
            contentType: headers['Content-Type'] || headers['content-type'],
        };
        if (crawlOpts?.proxyUrl) {
            sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
        }
        if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
            const location = headers.Location || headers.location;
            if (!location) {
                throw new Error(`Bad redirection: ${curlResult.headers.length} times`);
            }
            finalURL = new URL(location, finalURL);
        }
    }
    const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
    // Fixed: a response without Content-Type used to throw on `.toLowerCase()`,
    // so the sniffed mime type and the octet-stream default were never reached.
    const declaredContentType = lastHeaders['Content-Type'] || lastHeaders['content-type'];
    const contentType = declaredContentType?.toLowerCase() || (await curlResult.data?.mimeType) || 'application/octet-stream';
    const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
    const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();

    // Attach the body only when it is non-empty.
    if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) {
        sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
    }

    // This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
    this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);

    return {
        finalURL,
        sideLoadOpts,
        chain: curlResult.headers,
        status: curlResult.statusCode,
        headers: lastHeaders,
        contentType,
        contentDisposition,
        fileName,
        file: curlResult.data
    };
}
|
||||
|
||||
/**
 * Maps a curl error code to an application error.
 *
 * @param code - Error code reported by curl.
 * @param msg - Message to attach to the resulting error.
 * @returns `AssertionFailureError` for codes treated as the caller's fault,
 *          `ServiceBadAttemptError` for codes treated as retryable,
 *          `undefined` for everything else.
 */
digestCurlCode(code: CurlCode, msg: string) {
    // 400 User errors
    const userErrors = [
        CurlCode.CURLE_GOT_NOTHING,
        CurlCode.CURLE_COULDNT_RESOLVE_HOST,
        CurlCode.CURLE_REMOTE_ACCESS_DENIED,
    ];
    if (userErrors.includes(code)) {
        return new AssertionFailureError(msg);
    }

    // Retryable errors
    const retryableErrors = [
        CurlCode.CURLE_SSL_CONNECT_ERROR,
        CurlCode.CURLE_QUIC_CONNECT_ERROR,
        CurlCode.CURLE_COULDNT_RESOLVE_PROXY,
        CurlCode.CURLE_COULDNT_CONNECT,
        CurlCode.CURLE_PARTIAL_FILE,
        CurlCode.CURLE_OPERATION_TIMEDOUT,
    ];
    if (retryableErrors.includes(code)) {
        return new ServiceBadAttemptError(msg);
    }

    return undefined;
}
|
||||
}
|
70
src/services/errors.ts
Normal file
70
src/services/errors.ts
Normal file
@ -0,0 +1,70 @@
|
||||
import { ApplicationError, Prop, RPC_TRANSFER_PROTOCOL_META_SYMBOL, StatusCode } from 'civkit/civ-rpc';
|
||||
import _ from 'lodash';
|
||||
import dayjs from 'dayjs';
|
||||
import utc from 'dayjs/plugin/utc';
|
||||
|
||||
dayjs.extend(utc);
|
||||
|
||||
// Application error types. Each class only binds a status code to a name.
// NOTE(review): the codes look like an HTTP status followed by a two-digit
// sub-code (50301 -> 503/01); confirm against civkit's StatusCode decorator.

/** Status 50301. */
@StatusCode(50301)
export class ServiceDisabledError extends ApplicationError {}

/** Status 50302. */
@StatusCode(50302)
export class ServiceCrashedError extends ApplicationError {}

/** Status 50303. */
@StatusCode(50303)
export class ServiceNodeResourceDrainError extends ApplicationError {}

/** Status 40104. */
@StatusCode(40104)
export class EmailUnverifiedError extends ApplicationError {}

/** Status 40201. */
@StatusCode(40201)
export class InsufficientCreditsError extends ApplicationError {}

/** Status 40202. */
@StatusCode(40202)
export class FreeFeatureLimitError extends ApplicationError {}

/** Status 40203. */
@StatusCode(40203)
export class InsufficientBalanceError extends ApplicationError {}

/** Status 40903. */
@StatusCode(40903)
export class LockConflictError extends ApplicationError {}

/** Status 40904. */
@StatusCode(40904)
export class BudgetExceededError extends ApplicationError {}

/** Status 45101. */
@StatusCode(45101)
export class HarmfulContentError extends ApplicationError {}

/** Status 45102. */
@StatusCode(45102)
export class SecurityCompromiseError extends ApplicationError {}

/** Status 41201. */
@StatusCode(41201)
export class BatchSizeTooLargeError extends ApplicationError {}
|
||||
|
||||
|
||||
/**
 * Raised when a rate limit is hit (status 42903).
 *
 * When `retryAfter` or `retryAfterDate` is set, the transfer-protocol metadata
 * gains a `Retry-After` header: the number of seconds as-is, or the date
 * formatted as `ddd, DD MMM YYYY HH:mm:ss GMT` in UTC.
 */
@StatusCode(42903)
export class RateLimitTriggeredError extends ApplicationError {

    @Prop({
        desc: 'Retry after seconds',
    })
    retryAfter?: number;

    @Prop({
        desc: 'Retry after date',
    })
    retryAfterDate?: Date;

    protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() {
        const baseMeta = super[RPC_TRANSFER_PROTOCOL_META_SYMBOL];
        // A non-zero seconds value takes precedence over the date.
        const hint = this.retryAfter || this.retryAfterDate;
        if (!hint) {
            return baseMeta;
        }

        const headerValue = hint instanceof Date
            ? dayjs(hint).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]')
            : hint;

        // Clone first so the inherited metadata object is never mutated.
        return _.merge(_.cloneDeep(baseMeta), {
            headers: {
                'Retry-After': `${headerValue}`,
            }
        });
    }
}
|
24
src/services/finalizer.ts
Normal file
24
src/services/finalizer.ts
Normal file
@ -0,0 +1,24 @@
|
||||
import { AbstractFinalizerService } from 'civkit/finalizer';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { isMainThread } from 'worker_threads';
|
||||
import { GlobalLogger } from './logger';
|
||||
|
||||
/** Finalizer service bound to the tsyringe container and the global logger. */
@singleton()
export class FinalizerService extends AbstractFinalizerService {

    container = container;
    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(protected globalLogger: GlobalLogger) {
        super(...arguments);
    }

}

const finalizerService = container.resolve(FinalizerService);

/** `Finalizer` decorator taken from the shared service instance. */
export const { Finalizer } = finalizerService.decorators();

export default finalizerService;

// Only the main thread starts the service.
if (isMainThread) {
    finalizerService.serviceReady();
}
|
@ -4,9 +4,10 @@ import { Logger } from '../shared/services/logger';
|
||||
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import TurndownService from 'turndown';
|
||||
import { Threaded } from '../shared/services/threaded';
|
||||
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
|
||||
import { Threaded } from '../services/threaded';
|
||||
import type { ExtraScrappingOptions } from '../api/crawler';
|
||||
import { tailwindClasses } from '../utils/tailwind-classes';
|
||||
import { countGPTToken } from '../shared';
|
||||
|
||||
const pLinkedom = import('linkedom');
|
||||
|
||||
@ -37,7 +38,8 @@ export class JSDomControl extends AsyncService {
|
||||
return snapshot;
|
||||
}
|
||||
|
||||
return this.actualNarrowSnapshot(snapshot, options);
|
||||
// SideLoad contains native objects that cannot go through thread boundaries.
|
||||
return this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
|
||||
}
|
||||
|
||||
@Threaded()
|
||||
@ -348,6 +350,22 @@ export class JSDomControl extends AsyncService {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Lightweight HTML analysis: parses the markup with linkedom, removes
 * `script`, `style`, `link` and `svg` elements, and reports the title, the
 * body text and a token count of the whitespace-collapsed text.
 *
 * @param sourceHTML - Raw HTML; wrapped in `<html><body>` when parsing yields no document element.
 */
@Threaded()
async analyzeHTMLTextLite(sourceHTML: string) {
    let dom = this.linkedom.parseHTML(sourceHTML);
    if (!dom.window.document.documentElement) {
        dom = this.linkedom.parseHTML(`<html><body>${sourceHTML}</body></html>`);
    }

    const doc = dom.window.document;
    doc.querySelectorAll('script,style,link,svg').forEach((node) => node.remove());

    const text = doc.body.innerText || '';
    const collapsed = text.replaceAll(/[\s\r\n\t]+/g, ' ');

    return {
        title: doc.title,
        text,
        tokens: countGPTToken(collapsed),
    };
}
|
||||
}
|
||||
|
||||
const jsdomControl = container.resolve(JSDomControl);
|
57
src/services/logger.ts
Normal file
57
src/services/logger.ts
Normal file
@ -0,0 +1,57 @@
|
||||
import { AbstractPinoLogger } from 'civkit/pino-logger';
|
||||
import { singleton, container } from 'tsyringe';
|
||||
import { threadId } from 'node:worker_threads';
|
||||
import { getTraceCtx } from 'civkit/async-context';
|
||||
|
||||
|
||||
const levelToSeverityMap: { [k: string]: string | undefined; } = {
|
||||
trace: 'DEFAULT',
|
||||
debug: 'DEBUG',
|
||||
info: 'INFO',
|
||||
warn: 'WARNING',
|
||||
error: 'ERROR',
|
||||
fatal: 'CRITICAL',
|
||||
};
|
||||
|
||||
/**
 * Process-wide pino logger.
 *
 * When `NODE_ENV` starts with `prod` it writes to stdout; otherwise it writes
 * through pino-pretty. Every record gets a `severity` field mapped from its
 * level. When a trace id and `GCLOUD_PROJECT` are both available, the record
 * also gets a `logging.googleapis.com/trace` field.
 */
@singleton()
export class GlobalLogger extends AbstractPinoLogger {
    loggerOptions = {
        level: 'debug',
        base: {
            tid: threadId,
        }
    };

    override init(): void {
        const isProduction = process.env['NODE_ENV']?.startsWith('prod');

        if (!isProduction) {
            // Loaded lazily so pino-pretty is only required outside production.
            const PinoPretty = require('pino-pretty').PinoPretty;
            super.init(PinoPretty({
                singleLine: true,
                colorize: true,
                messageFormat(log: any, messageKey: any) {
                    return `${log['tid'] ? `[${log['tid']}]` : ''}[${log['service'] || 'ROOT'}] ${log[messageKey]}`;
                },
            }));
        } else {
            super.init(process.stdout);
        }

        this.emit('ready');
    }

    override log(...args: any[]) {
        const [levelObj, ...rest] = args;
        const severity = levelToSeverityMap[levelObj?.level];
        const traceCtx = getTraceCtx();
        const patched: any = { ...levelObj, severity };

        const traceId = traceCtx?.googleTraceId || traceCtx?.traceId;
        const project = process.env['GCLOUD_PROJECT'];
        if (traceId && project) {
            patched['logging.googleapis.com/trace'] = `projects/${process.env['GCLOUD_PROJECT']}/traces/${traceId}`;
        }

        return super.log(patched, ...rest);
    }
}

const instance = container.resolve(GlobalLogger);
export default instance;
|
@ -8,14 +8,15 @@ import { PDFContent } from '../db/pdf';
|
||||
import dayjs from 'dayjs';
|
||||
import { FirebaseStorageBucketControl } from '../shared';
|
||||
import { randomUUID } from 'crypto';
|
||||
import { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
||||
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
|
||||
import path from 'path';
|
||||
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
|
||||
dayjs.extend(utc); // Extend dayjs with the UTC plugin
|
||||
const timezone = require('dayjs/plugin/timezone');
|
||||
dayjs.extend(timezone);
|
||||
|
||||
const pPdfjs = import('pdfjs-dist');
|
||||
|
||||
const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
|
||||
const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';
|
||||
|
||||
const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
@ -26,7 +27,10 @@ function stdDev(numbers: number[]) {
|
||||
return Math.sqrt(avgSquareDiff);
|
||||
}
|
||||
|
||||
function isRotatedByAtLeast35Degrees(transform: [number, number, number, number, number, number]): boolean {
|
||||
function isRotatedByAtLeast35Degrees(transform?: [number, number, number, number, number, number]): boolean {
|
||||
if (!transform) {
|
||||
return false;
|
||||
}
|
||||
const [a, b, c, d, _e, _f] = transform;
|
||||
|
||||
// Calculate the rotation angles using arctan(b/a) and arctan(-c/d)
|
||||
@ -94,13 +98,15 @@ export class PDFExtractor extends AsyncService {
|
||||
loadingTask = this.pdfjs.getDocument({
|
||||
data: binary,
|
||||
disableFontFace: true,
|
||||
verbosity: 0
|
||||
verbosity: 0,
|
||||
cMapUrl: nodeCmapUrl,
|
||||
});
|
||||
} else {
|
||||
loadingTask = this.pdfjs.getDocument({
|
||||
url,
|
||||
disableFontFace: true,
|
||||
verbosity: 0
|
||||
verbosity: 0,
|
||||
cMapUrl: nodeCmapUrl,
|
||||
});
|
||||
}
|
||||
|
||||
@ -112,7 +118,7 @@ export class PDFExtractor extends AsyncService {
|
||||
|
||||
for (const pg of _.range(0, doc.numPages)) {
|
||||
const page = await doc.getPage(pg + 1);
|
||||
const textContent = await page.getTextContent();
|
||||
const textContent = await page.getTextContent({ includeMarkedContent: true });
|
||||
textItems.push((textContent.items as TextItem[]));
|
||||
}
|
||||
|
||||
@ -335,6 +341,7 @@ export class PDFExtractor extends AsyncService {
|
||||
});
|
||||
} catch (err) {
|
||||
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
|
||||
throw err;
|
||||
}
|
||||
|
||||
return extracted;
|
65
src/services/pseudo-transfer.ts
Normal file
65
src/services/pseudo-transfer.ts
Normal file
@ -0,0 +1,65 @@
|
||||
import { marshalErrorLike } from 'civkit';
|
||||
import { AbstractPseudoTransfer, SYM_PSEUDO_TRANSFERABLE } from 'civkit/pseudo-transfer';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
|
||||
|
||||
/**
 * Concrete, container-managed `AbstractPseudoTransfer`.
 * It adds no behaviour of its own: it becomes ready as soon as its dependencies are.
 */
@singleton()
export class PseudoTransfer extends AbstractPseudoTransfer {

    /** Waits for dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

}
|
||||
|
||||
const instance = container.resolve(PseudoTransfer);

// --- Error -----------------------------------------------------------------
// Errors are marshalled with `marshalErrorLike`; on the receiving side the plain
// object gets its prototype re-attached instead of being rebuilt.
Object.defineProperty(Error.prototype, SYM_PSEUDO_TRANSFERABLE, {
    enumerable: false,
    value: function () {
        // NOTE(review): `this` is whatever object the transfer machinery invokes this
        // factory on — presumably the prototype of the transferred type; confirm in civkit.
        const prototype = this;

        return {
            copyOwnProperty: 'all',
            marshall: (err: Error) => marshalErrorLike(err),
            unMarshall: (plain: object) => {
                Object.setPrototypeOf(plain, prototype);

                return plain;
            },
        };
    },
});
instance.expectPseudoTransferableType(Error);

// Register every Error subclass exported by the local errors module and by civ-rpc.
const exportedValues = [
    ...Object.values(require('./errors')),
    ...Object.values(require('civkit/civ-rpc')),
];
for (const candidate of exportedValues) {
    const isErrorClass = typeof candidate === 'function' && candidate.prototype instanceof Error;
    if (!isErrorClass) {
        continue;
    }
    instance.expectPseudoTransferableType(candidate as any);
}

// --- URL -------------------------------------------------------------------
// A URL travels as `{ href }` and is re-created with `new URL` on arrival.
Object.defineProperty(URL.prototype, SYM_PSEUDO_TRANSFERABLE, {
    enumerable: false,
    value: function () {
        return {
            copyOwnProperty: 'none',
            marshall: (url: URL) => ({ href: url.href }),
            unMarshall: (plain: { href: string; }) => new URL(plain.href),
        };
    },
});
instance.expectPseudoTransferableType(URL);

// --- Buffer ----------------------------------------------------------------
// Buffers are passed through untouched; anything that arrives as a non-Buffer
// is wrapped with `Buffer.from`.
Object.defineProperty(Buffer.prototype, SYM_PSEUDO_TRANSFERABLE, {
    enumerable: false,
    value: function () {
        return {
            copyOwnProperty: 'none',
            unMarshall: (bytes: Uint8Array | Buffer) => Buffer.isBuffer(bytes) ? bytes : Buffer.from(bytes),
            marshall: (bytes: Uint8Array | Buffer) => bytes,
        };
    },
});
instance.expectPseudoTransferableType(Buffer);

export default instance;
|
@ -1,7 +1,7 @@
|
||||
import os from 'os';
|
||||
import fs from 'fs';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError } from 'civkit';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
|
||||
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page, Viewport } from 'puppeteer';
|
||||
@ -14,6 +14,9 @@ import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainE
|
||||
import { TimeoutError } from 'puppeteer';
|
||||
import _ from 'lodash';
|
||||
import { isIP } from 'net';
|
||||
import { CurlControl } from './curl';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { BlackHoleDetector } from './blackhole-detector';
|
||||
const tldExtract = require('tld-extract');
|
||||
|
||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||
@ -53,6 +56,8 @@ export interface PageSnapshot {
|
||||
text: string;
|
||||
status?: number;
|
||||
statusText?: string;
|
||||
isIntermediate?: boolean;
|
||||
isFromCache?: boolean;
|
||||
parsed?: Partial<ReadabilityParsed> | null;
|
||||
screenshot?: Buffer;
|
||||
pageshot?: Buffer;
|
||||
@ -82,17 +87,22 @@ export interface ScrappingOptions {
|
||||
injectFrameScripts?: string[];
|
||||
injectPageScripts?: string[];
|
||||
viewport?: Viewport;
|
||||
proxyResources?: boolean;
|
||||
|
||||
sideLoad?: {
|
||||
impersonate: {
|
||||
[url: string]: {
|
||||
status: number;
|
||||
headers: { [k: string]: string | string[]; };
|
||||
contentType?: string;
|
||||
body?: FancyFile;
|
||||
};
|
||||
};
|
||||
proxyOrigin: { [origin: string]: string; };
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
|
||||
// const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
|
||||
// puppeteer.use(puppeteerStealth());
|
||||
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
|
||||
// puppeteer.use(puppeteerUAOverride({
|
||||
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
|
||||
// platform: `Linux`,
|
||||
// }))
|
||||
|
||||
puppeteer.use(puppeteerBlockResources({
|
||||
blockedTypes: new Set(['media']),
|
||||
interceptResolutionPriority: 1,
|
||||
@ -460,6 +470,8 @@ export class PuppeteerControl extends AsyncService {
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
protected curlControl: CurlControl,
|
||||
protected blackHoleDetector: BlackHoleDetector,
|
||||
) {
|
||||
super(...arguments);
|
||||
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
||||
@ -514,10 +526,11 @@ export class PuppeteerControl extends AsyncService {
|
||||
});
|
||||
this.ua = await this.browser.userAgent();
|
||||
this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
|
||||
this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, ''));
|
||||
|
||||
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
|
||||
|
||||
this.emit('ready');
|
||||
|
||||
this.newPage().then((r) => this.__loadedPage.push(r));
|
||||
}
|
||||
|
||||
@perNextTick()
|
||||
@ -538,8 +551,10 @@ export class PuppeteerControl extends AsyncService {
|
||||
}
|
||||
}
|
||||
|
||||
async newPage() {
|
||||
await this.serviceReady();
|
||||
async newPage(bewareDeadLock: any = false) {
|
||||
if (!bewareDeadLock) {
|
||||
await this.serviceReady();
|
||||
}
|
||||
const sn = this._sn++;
|
||||
let page;
|
||||
try {
|
||||
@ -687,7 +702,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
`);
|
||||
|
||||
this.snMap.set(page, sn);
|
||||
this.logger.info(`Page ${sn} created.`);
|
||||
this.logger.debug(`Page ${sn} created.`);
|
||||
this.lastPageCratedAt = Date.now();
|
||||
this.livePages.add(page);
|
||||
this.pagePhase.set(page, 'idle');
|
||||
@ -731,7 +746,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
return;
|
||||
}
|
||||
const sn = this.snMap.get(page);
|
||||
this.logger.info(`Closing page ${sn}`);
|
||||
this.logger.debug(`Closing page ${sn}`);
|
||||
await Promise.race([
|
||||
(async () => {
|
||||
const ctx = page.browserContext();
|
||||
@ -749,7 +764,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
this.pagePhase.delete(page);
|
||||
}
|
||||
|
||||
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
||||
async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
|
||||
// parsedUrl.search = '';
|
||||
const url = parsedUrl.toString();
|
||||
|
||||
@ -761,7 +776,9 @@ export class PuppeteerControl extends AsyncService {
|
||||
const page = await this.getNextPage();
|
||||
this.pagePhase.set(page, 'active');
|
||||
page.on('response', (resp) => {
|
||||
if (resp.request().isNavigationRequest()) {
|
||||
this.blackHoleDetector.itWorked();
|
||||
const req = resp.request();
|
||||
if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
|
||||
navigationResponse = resp;
|
||||
}
|
||||
if (!resp.ok()) {
|
||||
@ -774,7 +791,111 @@ export class PuppeteerControl extends AsyncService {
|
||||
pdfUrls.push(url);
|
||||
}
|
||||
});
|
||||
if (options?.extraHeaders) {
|
||||
page.on('request', async (req) => {
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
const reqUrlParsed = new URL(req.url());
|
||||
if (!reqUrlParsed.protocol.startsWith('http')) {
|
||||
const overrides = req.continueRequestOverrides();
|
||||
|
||||
return req.continue(overrides, 0);
|
||||
}
|
||||
const typ = req.resourceType();
|
||||
if (!options.proxyResources) {
|
||||
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
|
||||
if (!isDocRequest) {
|
||||
const overrides = req.continueRequestOverrides();
|
||||
|
||||
return req.continue(overrides, 0);
|
||||
}
|
||||
}
|
||||
const sideload = options.sideLoad;
|
||||
|
||||
const impersonate = sideload?.impersonate[reqUrlParsed.href];
|
||||
if (impersonate) {
|
||||
let body;
|
||||
if (impersonate.body) {
|
||||
body = await readFile(await impersonate.body.filePath);
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
return req.respond({
|
||||
status: impersonate.status,
|
||||
headers: impersonate.headers,
|
||||
contentType: impersonate.contentType,
|
||||
body: body ? Uint8Array.from(body) : undefined,
|
||||
}, 999);
|
||||
}
|
||||
|
||||
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
|
||||
|
||||
if (proxy) {
|
||||
try {
|
||||
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
|
||||
...options,
|
||||
method: req.method(),
|
||||
body: req.postData(),
|
||||
extraHeaders: {
|
||||
...req.headers(),
|
||||
...options.extraHeaders,
|
||||
},
|
||||
proxyUrl: proxy
|
||||
});
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
|
||||
if (curled.chain.length === 1) {
|
||||
if (!curled.file) {
|
||||
return req.respond({
|
||||
status: curled.status,
|
||||
headers: _.omit(curled.headers, 'result'),
|
||||
contentType: curled.contentType,
|
||||
}, 999);
|
||||
}
|
||||
const body = await readFile(await curled.file.filePath);
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
return req.respond({
|
||||
status: curled.status,
|
||||
headers: _.omit(curled.headers, 'result'),
|
||||
contentType: curled.contentType,
|
||||
body: Uint8Array.from(body),
|
||||
}, 999);
|
||||
}
|
||||
options.sideLoad ??= curled.sideLoadOpts;
|
||||
_.merge(options.sideLoad, curled.sideLoadOpts);
|
||||
const firstReq = curled.chain[0];
|
||||
|
||||
return req.respond({
|
||||
status: firstReq.result!.code,
|
||||
headers: _.omit(firstReq, 'result'),
|
||||
}, 999);
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
const overrides = req.continueRequestOverrides();
|
||||
const continueArgs = [{
|
||||
...overrides,
|
||||
headers: {
|
||||
...req.headers(),
|
||||
...overrides?.headers,
|
||||
...options.extraHeaders,
|
||||
}
|
||||
}, 1] as const;
|
||||
|
||||
return req.continue(continueArgs[0], continueArgs[1]);
|
||||
});
|
||||
if (options.extraHeaders) {
|
||||
page.on('request', async (req) => {
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
@ -795,7 +916,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
}
|
||||
let pageScriptEvaluations: Promise<unknown>[] = [];
|
||||
let frameScriptEvaluations: Promise<unknown>[] = [];
|
||||
if (options?.injectPageScripts?.length) {
|
||||
if (options.injectPageScripts?.length) {
|
||||
page.on('framenavigated', (frame) => {
|
||||
if (frame !== page.mainFrame()) {
|
||||
return;
|
||||
@ -808,7 +929,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
);
|
||||
});
|
||||
}
|
||||
if (options?.injectFrameScripts?.length) {
|
||||
if (options.injectFrameScripts?.length) {
|
||||
page.on('framenavigated', (frame) => {
|
||||
frameScriptEvaluations.push(
|
||||
Promise.allSettled(options.injectFrameScripts!.map((x) => frame.evaluate(x).catch((err) => {
|
||||
@ -819,34 +940,28 @@ export class PuppeteerControl extends AsyncService {
|
||||
}
|
||||
const sn = this.snMap.get(page);
|
||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||
if (options?.locale) {
|
||||
if (options.locale) {
|
||||
// Add headers via request interception to work around this bug
|
||||
// https://github.com/puppeteer/puppeteer/issues/10235
|
||||
// await page.setExtraHTTPHeaders({
|
||||
// 'Accept-Language': options?.locale
|
||||
// 'Accept-Language': options.locale
|
||||
// });
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, "language", {
|
||||
get: function () {
|
||||
return options?.locale;
|
||||
return options.locale;
|
||||
}
|
||||
});
|
||||
Object.defineProperty(navigator, "languages", {
|
||||
get: function () {
|
||||
return [options?.locale];
|
||||
return [options.locale];
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
if (options?.proxyUrl) {
|
||||
await page.useProxy(options.proxyUrl, {
|
||||
headers: options.extraHeaders,
|
||||
interceptResolutionPriority: 2,
|
||||
});
|
||||
}
|
||||
if (options?.cookies) {
|
||||
if (options.cookies) {
|
||||
const mapped = options.cookies.map((x) => {
|
||||
const draft: CookieParam = {
|
||||
name: x.name,
|
||||
@ -876,10 +991,10 @@ export class PuppeteerControl extends AsyncService {
|
||||
});
|
||||
}
|
||||
}
|
||||
if (options?.overrideUserAgent) {
|
||||
if (options.overrideUserAgent) {
|
||||
await page.setUserAgent(options.overrideUserAgent);
|
||||
}
|
||||
if (options?.viewport) {
|
||||
if (options.viewport) {
|
||||
await page.setViewport(options.viewport);
|
||||
}
|
||||
|
||||
@ -921,13 +1036,13 @@ export class PuppeteerControl extends AsyncService {
|
||||
);
|
||||
});
|
||||
|
||||
const timeout = options?.timeoutMs || 30_000;
|
||||
const timeout = options.timeoutMs || 30_000;
|
||||
const goToOptions: GoToOptions = {
|
||||
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
|
||||
timeout,
|
||||
};
|
||||
|
||||
if (options?.referer) {
|
||||
if (options.referer) {
|
||||
goToOptions.referer = options.referer;
|
||||
}
|
||||
|
||||
@ -1019,7 +1134,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
});
|
||||
gotoPromise.catch(() => 'just dont crash anything');
|
||||
let waitForPromise: Promise<any> | undefined;
|
||||
if (options?.waitForSelector) {
|
||||
if (options.waitForSelector) {
|
||||
const t0 = Date.now();
|
||||
waitForPromise = nextSnapshotDeferred.promise.then(() => {
|
||||
const t1 = Date.now();
|
||||
@ -1054,7 +1169,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
if (waitForPromise) {
|
||||
ckpt.push(waitForPromise);
|
||||
}
|
||||
if (options?.minIntervalMs) {
|
||||
if (options.minIntervalMs) {
|
||||
ckpt.push(delay(options.minIntervalMs));
|
||||
}
|
||||
let error;
|
||||
@ -1074,7 +1189,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
} as PageSnapshot;
|
||||
break;
|
||||
}
|
||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||
screenshot = Buffer.from(await page.screenshot());
|
||||
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
||||
lastHTML = snapshot.html;
|
||||
@ -1084,7 +1199,8 @@ export class PuppeteerControl extends AsyncService {
|
||||
...snapshot,
|
||||
status: navigationResponse?.status(),
|
||||
statusText: navigationResponse?.statusText(),
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot
|
||||
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
|
||||
isIntermediate: true,
|
||||
} as PageSnapshot;
|
||||
}
|
||||
if (error) {
|
60
src/services/registry.ts
Normal file
60
src/services/registry.ts
Normal file
@ -0,0 +1,60 @@
|
||||
import { propertyInjectorFactory } from 'civkit/property-injector';
|
||||
import { KoaRPCRegistry } from 'civkit/civ-rpc/koa';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { IntegrityEnvelope } from 'civkit/civ-rpc';
|
||||
import bodyParser from '@koa/bodyparser';
|
||||
|
||||
import { GlobalLogger } from './logger';
|
||||
import { TempFileManager } from './temp-file';
|
||||
import { AsyncLocalContext } from './async-context';
|
||||
import { BlackHoleDetector } from './blackhole-detector';
|
||||
// `Context` is only a type in koa's declarations, so it is re-exported as a type.
// A plain `export { Context }` fails under `isolatedModules` and leaves a dangling
// runtime re-export in transpile-only builds.
export type { Context } from 'koa';

/** Property-injection decorator bound to the shared tsyringe container. */
export const InjectProperty = propertyInjectorFactory(container);
|
||||
|
||||
/**
 * Koa-backed RPC registry for the Reader API.
 *
 * Configures the response envelope, body-size limit and middleware chain, and
 * reports request start/finish to the black hole detector.
 */
@singleton()
export class RPCRegistry extends KoaRPCRegistry {

    static override envelope = IntegrityEnvelope;

    title = 'Jina Reader API';
    container = container;
    logger = this.globalLogger.child({ service: this.constructor.name });

    override _BODY_PARSER_LIMIT = '102mb';
    override _RESPONSE_STREAM_MODE = 'koa' as const;

    // Order matters: CORS first, then JSON/form body parsing, then multipart and binary parsing.
    override koaMiddlewares = [
        this.__CORSAllowAllMiddleware.bind(this),
        bodyParser({
            encoding: 'utf-8',
            enableTypes: ['json', 'form'],
            jsonLimit: this._BODY_PARSER_LIMIT,
            xmlLimit: this._BODY_PARSER_LIMIT,
            formLimit: this._BODY_PARSER_LIMIT,
        }),
        this.__multiParse.bind(this),
        this.__binaryParse.bind(this),
    ];

    constructor(
        protected globalLogger: GlobalLogger,
        protected ctxMgr: AsyncLocalContext,
        protected tempFileManager: TempFileManager,
        protected blackHoleDetector: BlackHoleDetector,
    ) {
        super(...arguments);

        // A request counts as finished whether it succeeded ('ran') or failed ('fail').
        const onRequestSettled = () => this.blackHoleDetector.doneWithRequest();

        this.on('run', () => this.blackHoleDetector.incomingRequest());
        this.on('ran', onRequestSettled);
        this.on('fail', onRequestSettled);
    }

    /** Waits for dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

}

// Shared singleton instance and the decorators bound to it.
const instance = container.resolve(RPCRegistry);

export default instance;

export const { Method, RPCMethod, RPCReflect, Param, Ctx, } = instance.decorators();
|
129
src/services/robots-text.ts
Normal file
129
src/services/robots-text.ts
Normal file
@ -0,0 +1,129 @@
|
||||
import { singleton } from 'tsyringe';
|
||||
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { HashManager } from 'civkit/hash';
|
||||
import { marshalErrorLike } from 'civkit/lang';
|
||||
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { BraveSearchHTTP } from '../shared/3rd-party/brave-search';
|
||||
import { FirebaseStorageBucketControl } from '../shared';
|
||||
import { URL } from 'url';
|
||||
import { Threaded } from '../services/threaded';
|
||||
|
||||
|
||||
export const md5Hasher = new HashManager('md5', 'hex');

/**
 * Tests a robots.txt path pattern against a request target.
 *
 * - `*` matches any run of characters (including none).
 * - A trailing `$` anchors the pattern to the end of the target.
 * - Without `$` the pattern is a prefix match.
 *
 * Implemented as a linear left-to-right scan instead of a generated RegExp, so a
 * robots.txt with many `*` cannot trigger expensive backtracking.
 *
 * @param pattern Rule value from an `Allow` / `Disallow` line.
 * @param target Path (optionally with query string) to test.
 * @returns Whether the pattern matches the target.
 */
function matchesRobotsPattern(pattern: string, target: string): boolean {
    const anchored = pattern.endsWith('$');
    const segments = (anchored ? pattern.slice(0, -1) : pattern).split('*');

    const head = segments.shift() ?? '';
    if (!target.startsWith(head)) {
        return false;
    }
    if (!segments.length) {
        // No wildcard at all: plain prefix, or exact match when anchored.
        return !anchored || target.length === head.length;
    }

    // When anchored, the last literal segment must sit at the very end of the target.
    const tail = anchored ? (segments.pop() ?? '') : undefined;

    let cursor = head.length;
    for (const segment of segments) {
        // Taking the earliest occurrence is sufficient for `*`-only globs.
        const at = target.indexOf(segment, cursor);
        if (at === -1) {
            return false;
        }
        cursor = at + segment.length;
    }

    if (tail === undefined) {
        return true;
    }

    // The tail must not overlap text already consumed by earlier segments.
    return target.length - tail.length >= cursor && target.endsWith(tail);
}

/**
 * Fetches (with storage-bucket caching) and evaluates a site's robots.txt.
 */
@singleton()
export class RobotsTxtService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: Logger,
        protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    /** Waits for dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Returns the robots.txt body for an origin.
     *
     * The storage bucket is consulted first, keyed by the MD5 of the lower-cased
     * origin. On a miss the file is fetched with a 5 second timeout and written
     * back to the bucket in the background; a failed write is only logged.
     *
     * @param origin URL origin, e.g. `https://example.com`.
     * @returns The robots.txt text.
     * @throws DownstreamServiceFailureError when the site answers with a non-OK status.
     */
    async getCachedRobotTxt(origin: string) {
        const digest = md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `/robot-txt/${digest}`;
        let buff;
        // Any cache read failure is treated as a miss.
        buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (buff) {
            return buff.toString();
        }

        const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`);
        }
        buff = Buffer.from(await r.arrayBuffer());

        // Deliberately not awaited: caching must not delay or fail the caller.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) });
        });

        return buff.toString();
    }

    /**
     * Checks a URL against the site's robots.txt for the given user agent.
     *
     * Rules are evaluated top to bottom and the first matching `Allow` or
     * `Disallow` of an applicable group decides. Groups addressed to `*` apply to
     * every agent. If robots.txt cannot be obtained because the site answered with
     * a non-OK status, access is allowed.
     *
     * Rule values may contain `*` wildcards and a trailing `$` anchor; such rules
     * are matched against path plus query string. Plain rules are prefix-matched
     * against the path followed by `?`. Text after `#` on a line is ignored.
     *
     * @param url Target URL.
     * @param inputMyUa User agent token to evaluate rules for; defaults to `*`.
     * @returns `true` when access is allowed.
     * @throws ResourcePolicyDenyError when a `Disallow` rule matches.
     */
    @Threaded()
    async assertAccessAllowed(url: URL, inputMyUa = '*') {
        let robotTxt: string = '';
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        } catch (err) {
            if (err instanceof DownstreamServiceFailureError) {
                // No usable robots.txt on the site: nothing restricts access.
                return true;
            }
            throw err;
        }
        const myUa = inputMyUa.toLowerCase();
        const lines = robotTxt.split(/\r?\n/g);

        let currentUa = myUa || '*';
        let uaLine = 'User-Agent: *';
        const pathNormalized = `${url.pathname}?`;
        const pathWithQuery = `${url.pathname}${url.search}`;

        // Pattern rules (`*` or trailing `$`) use the glob matcher; plain rules keep
        // the prefix comparison against the `?`-terminated path.
        const ruleApplies = (rule: string) => (rule.includes('*') || rule.endsWith('$'))
            ? matchesRobotsPattern(rule, pathWithQuery)
            : pathNormalized.startsWith(rule);

        for (const line of lines) {
            // Drop full-line and inline comments before parsing the directive.
            const trimmed = line.replace(/#.*$/, '').trim();
            if (!trimmed) {
                continue;
            }
            const [k, ...rest] = trimmed.split(':');
            const key = k.trim().toLowerCase();
            const value = rest.join(':').trim();

            if (key === 'user-agent') {
                currentUa = value.toLowerCase();
                if (value === '*') {
                    // A wildcard group applies to us as well.
                    currentUa = myUa;
                }
                uaLine = line;
                continue;
            }

            if (currentUa !== myUa) {
                continue;
            }

            if (key === 'disallow') {
                if (!value) {
                    // An empty Disallow restricts nothing.
                    return true;
                }
                if (ruleApplies(value)) {
                    throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
                }

                continue;
            }

            if (key === 'allow') {
                if (!value || ruleApplies(value)) {
                    return true;
                }
                continue;
            }
        }

        return true;
    }

}
|
@ -1,11 +1,12 @@
|
||||
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
|
||||
import type { Request, Response } from 'express';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { SecretExposer } from '../shared/services/secrets';
|
||||
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
|
||||
import { AsyncContext } from '../shared';
|
||||
import { SerperGoogleHTTP, SerperSearchQueryParams, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search';
|
||||
import { BlackHoleDetector } from './blackhole-detector';
|
||||
import { Context } from './registry';
|
||||
|
||||
@singleton()
|
||||
export class SerperSearchService extends AsyncService {
|
||||
@ -19,6 +20,7 @@ export class SerperSearchService extends AsyncService {
|
||||
protected secretExposer: SecretExposer,
|
||||
protected geoipControl: GeoIPService,
|
||||
protected threadLocal: AsyncContext,
|
||||
protected blackHoleDetector: BlackHoleDetector,
|
||||
) {
|
||||
super(...arguments);
|
||||
}
|
||||
@ -61,6 +63,7 @@ export class SerperSearchService extends AsyncService {
|
||||
try {
|
||||
this.logger.debug(`Doing external search`, query);
|
||||
const r = await this.serperSearchHTTP.webSearch(query);
|
||||
this.blackHoleDetector.itWorked();
|
||||
|
||||
return r.parsed;
|
||||
} catch (err: any) {
|
||||
@ -132,15 +135,12 @@ export class GoogleSearchExplicitOperatorsDto extends AutoCastable {
|
||||
|
||||
static override from(input: any) {
|
||||
const instance = super.from(input) as GoogleSearchExplicitOperatorsDto;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||
req: Request,
|
||||
res: Response,
|
||||
} | undefined;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
|
||||
|
||||
const params = ['ext', 'filetype', 'intitle', 'loc', 'site'];
|
||||
|
||||
for (const p of params) {
|
||||
const customValue = ctx?.req.get(`x-${p}`) || ctx?.req.get(`${p}`);
|
||||
const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`);
|
||||
if (!customValue) {
|
||||
continue;
|
||||
}
|
@ -1,19 +1,22 @@
|
||||
import { randomUUID } from 'crypto';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
|
||||
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
|
||||
import TurndownService, { Filter, Rule } from 'turndown';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { PageSnapshot } from './puppeteer';
|
||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||
import { AsyncContext } from '../shared/services/async-context';
|
||||
import { Threaded } from '../shared/services/threaded';
|
||||
import { Threaded } from '../services/threaded';
|
||||
import { JSDomControl } from './jsdom';
|
||||
import { AltTextService } from './alt-text';
|
||||
import { PDFExtractor } from './pdf-extract';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
import _ from 'lodash';
|
||||
import { STATUS_CODES } from 'http';
|
||||
import type { CrawlerOptions } from '../dto/scrapping-options';
|
||||
import type { CrawlerOptions } from '../dto/crawler-options';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { pathToFileURL } from 'url';
|
||||
import { countGPTToken } from '../shared';
|
||||
|
||||
|
||||
export interface FormattedPage {
|
||||
@ -189,7 +192,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
(!mode.includes('markdown') && !mode.includes('content')))
|
||||
) {
|
||||
const dt = Date.now() - t0;
|
||||
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||
this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||
|
||||
const formatted: FormattedPage = {
|
||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||
@ -401,7 +404,9 @@ export class SnapshotFormatter extends AsyncService {
|
||||
const n = code - 200;
|
||||
if (n < 0 || n >= 200) {
|
||||
const text = snapshot.statusText || STATUS_CODES[code];
|
||||
formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
||||
formatted.warning ??= '';
|
||||
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
}
|
||||
}
|
||||
|
||||
@ -428,7 +433,31 @@ export class SnapshotFormatter extends AsyncService {
|
||||
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
||||
formatted.links = links;
|
||||
} else {
|
||||
formatted.links = _.fromPairs(links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
|
||||
formatted.links = _(links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value();
|
||||
}
|
||||
}
|
||||
|
||||
if (countGPTToken(formatted.content) < 200) {
|
||||
formatted.warning ??= '';
|
||||
if (snapshot.isIntermediate) {
|
||||
const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
}
|
||||
if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
|
||||
const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
}
|
||||
if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
|
||||
const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
}
|
||||
if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
|
||||
const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
}
|
||||
if (snapshot.isFromCache) {
|
||||
const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
|
||||
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
|
||||
}
|
||||
}
|
||||
|
||||
@ -468,7 +497,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
}
|
||||
|
||||
if (this.warning) {
|
||||
mixins.push(`Warning: ${this.warning}`);
|
||||
mixins.push(this.warning.split('\n').map((v) => `Warning: ${v}`).join('\n'));
|
||||
}
|
||||
|
||||
if (mode.includes('markdown')) {
|
||||
@ -488,7 +517,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
Object.defineProperty(f, 'textRepresentation', { value: textRepresentation, enumerable: false });
|
||||
|
||||
const dt = Date.now() - t0;
|
||||
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||
this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||
|
||||
return f as FormattedPage;
|
||||
}
|
||||
@ -526,7 +555,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
if (this.threadLocal.get('withLinksSummary') === 'all') {
|
||||
mixin.links = inferred.links;
|
||||
} else {
|
||||
mixin.links = _.fromPairs(inferred.links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
|
||||
mixin.links = _(inferred.links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value();
|
||||
}
|
||||
}
|
||||
if (snapshot.status) {
|
||||
@ -534,7 +563,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
const n = code - 200;
|
||||
if (n < 0 || n >= 200) {
|
||||
const text = snapshot.statusText || STATUS_CODES[code];
|
||||
mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
||||
mixin.warning ??= '';
|
||||
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
|
||||
mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`;
|
||||
}
|
||||
}
|
||||
|
||||
@ -697,6 +728,52 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
/**
 * Builds a synthetic {@link PageSnapshot} for a resource that was downloaded to a local file.
 *
 * The lower-cased content type decides the shape of the snapshot:
 * - `image/*`: a minimal HTML page embedding the image by its original URL; the title is the file name.
 * - `text/html`: the file content is used verbatim as the snapshot HTML.
 * - other `text/*` and `application/json`: the content is stored in `snapshot.text` and wrapped,
 *   HTML-escaped, in a `<pre>` page.
 * - `application/pdf`: the `file:` URL of the local file is recorded in `snapshot.pdfs`.
 *
 * @param url Original URL of the resource; used for `href`, the image `src` and the default file name.
 * @param file Local file handle; its `mimeType`, `size` and `filePath` are awaited on demand.
 * @param overrideContentType Content type to use instead of the detected one.
 *        `application/octet-stream` carries no information and is ignored.
 * @param overrideFileName Name used as the image page title; defaults to origin + pathname of `url`.
 * @returns The populated snapshot.
 * @throws AssertionFailureError when an HTML/text/JSON file is larger than 32 MiB,
 *         or when the content type is none of the supported ones.
 */
async createSnapshotFromFile(url: URL, file: FancyFile, overrideContentType?: string, overrideFileName?: string) {
    // HTML/text/JSON files are read fully into memory, so they are capped at 32 MiB.
    const maxInMemoryFileSize = 1024 * 1024 * 32;

    const htmlEscapes: Record<string, string> = {
        '&': '&amp;',
        '<': '&lt;',
        '>': '&gt;',
        '"': '&quot;',
        '\'': '&#39;',
    };
    // File names, URLs and text bodies come from the remote side; they must not be able to
    // introduce markup into the HTML generated below.
    const escapeHtml = (raw: string) => raw.replace(/[&<>"']/g, (ch) => htmlEscapes[ch] ?? ch);

    const assertFitsInMemory = async () => {
        if ((await file.size) > maxInMemoryFileSize) {
            throw new AssertionFailureError(`Failed to access ${url}: file too large`);
        }
    };

    if (overrideContentType === 'application/octet-stream') {
        overrideContentType = undefined;
    }

    const contentType = (overrideContentType || await file.mimeType).toLowerCase();
    const fileName = overrideFileName || `${url.origin}${url.pathname}`;
    const snapshot: PageSnapshot = {
        title: '',
        href: url.href,
        html: '',
        text: ''
    };

    if (contentType.startsWith('image/')) {
        snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${escapeHtml(fileName)}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${escapeHtml(url.href)}"></body></html>`;
        // The title field is plain text, so it keeps the unescaped name.
        snapshot.title = fileName;

        return snapshot;
    }

    if (contentType.startsWith('text/html')) {
        await assertFitsInMemory();
        snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });

        return snapshot;
    }

    if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
        await assertFitsInMemory();
        const text = await readFile(await file.filePath, { encoding: 'utf-8' });
        snapshot.text = text;
        // Escape the body so that `<`, `>` and `&` in the file are shown literally instead of being parsed as markup.
        snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${escapeHtml(text)}</pre></body></html>`;

        return snapshot;
    }

    if (contentType.startsWith('application/pdf')) {
        snapshot.pdfs = [pathToFileURL(await file.filePath).href];

        return snapshot;
    }

    throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);
}
|
||||
}
|
||||
|
||||
const snapshotFormatter = container.resolve(SnapshotFormatter);
|
22
src/services/temp-file.ts
Normal file
22
src/services/temp-file.ts
Normal file
@ -0,0 +1,22 @@
|
||||
import { AbstractTempFileManger } from 'civkit/temp';
import { rm, unlink } from 'fs/promises';
import { singleton } from 'tsyringe';
|
||||
|
||||
/**
 * Singleton temp-file manager built on civkit's `AbstractTempFileManger`.
 *
 * It adds two things to the base class: it signals `ready` once the base initialization
 * has finished, and it removes its root location on stand-down.
 */
@singleton()
export class TempFileManager extends AbstractTempFileManger {

    // Starts empty; presumably the base class assigns the actual location during init() — TODO confirm against civkit/temp.
    rootDir = '';

    /**
     * Waits for dependencies, runs the base initialization, then signals readiness.
     */
    override async init() {
        await this.dependencyReady();
        await super.init();
        this.emit('ready');
    }

    /**
     * Runs the base stand-down and then deletes the root location together with anything left in it.
     */
    override async standDown() {
        await super.standDown();

        // Nothing was ever allocated (init() did not run or did not set a root): nothing to clean up.
        if (!this.rootDir) {
            return;
        }

        // unlink() cannot remove directories; rm() handles both files and directory trees,
        // and `force` makes an already-removed root a no-op instead of an error.
        await rm(this.rootDir, { recursive: true, force: true });
    }
}
|
66
src/services/threaded.ts
Normal file
66
src/services/threaded.ts
Normal file
@ -0,0 +1,66 @@
|
||||
import 'reflect-metadata';
|
||||
|
||||
import { singleton, container } from 'tsyringe';
|
||||
import { AbstractThreadedServiceRegistry } from 'civkit/threaded';
|
||||
import _ from 'lodash';
|
||||
|
||||
import { GlobalLogger } from './logger';
|
||||
import { AsyncLocalContext } from './async-context';
|
||||
import { PseudoTransfer } from './pseudo-transfer';
|
||||
import { cpus } from 'os';
|
||||
import { isMainThread } from 'worker_threads';
|
||||
|
||||
/**
 * Singleton registry for services that run in worker threads.
 *
 * On the main thread it sizes the worker pool from the CPU topology and pre-spawns two workers
 * before reporting `ready`.
 */
@singleton()
export class ThreadedServiceRegistry extends AbstractThreadedServiceRegistry {
    container = container;

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        public asyncContext: AsyncLocalContext,
        public pseudoTransfer: PseudoTransfer,
    ) {
        super(...arguments);
    }

    /**
     * Sets `maxWorkers` from the number of logical CPUs.
     *
     * Heuristic: the accumulated user+sys time of even-indexed and odd-indexed logical CPUs is compared.
     * When the odd ones did less than half the work of the even ones, they are assumed to be
     * hyper-threading siblings and only half of the logical CPUs are counted.
     * The result is always a whole number and at least 1.
     */
    setMaxWorkersByCpu() {
        const cpuStat = cpus();
        const cpuCount = cpuStat.length;

        const evenCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 0).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0);
        const oddCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 1).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0);

        // Sibling threads come in pairs, so the heuristic only applies to a non-zero even CPU count.
        // Without this guard a single-CPU host (no odd-indexed CPU, ratio 0) would be halved to 0.5 workers.
        const canBeHyperThreaded = cpuCount >= 2 && cpuCount % 2 === 0;
        const isLikelyHyperThreaded = canBeHyperThreaded && (oddCpuCycles / evenCpuCycles) < 0.5;

        // os.cpus() may return an empty list when CPU information is unavailable; never go below one worker.
        this.maxWorkers = Math.max(1, isLikelyHyperThreaded ? cpuCount / 2 : cpuCount);
    }

    /**
     * Initializes the base registry. On the main thread it additionally sizes the pool and
     * spawns two workers, waiting until each has sent its first message (or rejecting on a worker error).
     */
    override async init() {
        await this.dependencyReady();
        await super.init();

        if (isMainThread) {
            this.setMaxWorkersByCpu();
            await Promise.all(
                _.range(0, 2).map(
                    (_n) =>
                        new Promise<void>(
                            (resolve, reject) => {
                                this.createWorker()
                                    .once('message', resolve)
                                    .once('error', reject);
                            }
                        )
                )
            );
        }

        this.emit('ready');
    }

}
|
||||
|
||||
|
||||
// Resolve the process-wide registry once; it is the module's default export.
const threadedServiceRegistry = container.resolve(ThreadedServiceRegistry);

export default threadedServiceRegistry;

// Decorators bound to that registry, re-exported individually for use on threaded services.
export const {
    Method,
    Param,
    Ctx,
    RPCReflect,
    Threaded,
} = threadedServiceRegistry.decorators();
|
1
src/shared
Symbolic link
1
src/shared
Symbolic link
@ -0,0 +1 @@
|
||||
../thinapps-shared/backend
|
139
src/stand-alone/crawl.ts
Normal file
139
src/stand-alone/crawl.ts
Normal file
@ -0,0 +1,139 @@
|
||||
import 'reflect-metadata';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
|
||||
import { KoaServer } from 'civkit/civ-rpc/koa';
|
||||
import http2 from 'http2';
|
||||
import { CrawlerHost } from '../api/crawler';
|
||||
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import { mimeOfExt } from 'civkit/mime';
|
||||
import { Context, Next } from 'koa';
|
||||
import { RPCRegistry } from '../services/registry';
|
||||
import { AsyncResource } from 'async_hooks';
|
||||
import { runOnce } from 'civkit/decorators';
|
||||
import { randomUUID } from 'crypto';
|
||||
import { ThreadedServiceRegistry } from '../services/threaded';
|
||||
import globalLogger, { GlobalLogger } from '../services/logger';
|
||||
import { AsyncLocalContext } from '../services/async-context';
|
||||
|
||||
// Promise rejections nobody handled are only logged; the process keeps running.
const reportUnhandledRejection = (reason: unknown) => {
    globalLogger.warn('Unhandled rejection', reason);
};

// An uncaught exception leaves the process in an unknown state:
// log it, announce the shutdown, and exit with code 1 on the next tick.
const quitOnUncaughtException = (error: Error) => {
    globalLogger.error('Uncaught exception', error);
    globalLogger.error('Uncaught exception, process quit.');
    process.nextTick(() => process.exit(1));
};

process.on('unhandledRejection', reportUnhandledRejection);
process.on('uncaughtException', quitOnUncaughtException);
|
||||
|
||||
/**
 * Stand-alone Koa server for the crawler: serves the RPC registry routes plus the static files
 * found under the `public` directory, optionally over cleartext HTTP/2 (h2c).
 */
@singleton()
export class CrawlStandAloneServer extends KoaServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    // The server that was in place before h2c() swapped it out; listens on port + 1.
    httpAlternativeServer?: typeof this['httpServer'];
    // Static files keyed by their path relative to the `public` directory.
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: GlobalLogger,
        protected registry: RPCRegistry,
        protected crawlerHost: CrawlerHost,
        protected threadLocal: AsyncLocalContext,
        protected threads: ThreadedServiceRegistry,
    ) {
        super(...arguments);
    }

    /**
     * Replaces the primary server with an HTTP/2 server and keeps the previous one as the alternative.
     * Every HTTP/2 request is dispatched to the Koa callback inside its own AsyncResource scope.
     *
     * @returns this, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const koaCallback = this.koaApp.callback();
        this.httpServer = http2.createServer((req, res) => {
            const requestScope = new AsyncResource('HTTP2ServerRequest');
            requestScope.runInAsyncScope(koaCallback, this.koaApp, req, res);
        });
        // useResourceBasedDefaultTracker();

        return this;
    }

    /**
     * Indexes the static assets first, then runs the base initialization.
     */
    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    /**
     * Walks the `public` directory (two levels above this module) and records every regular file in `assets`.
     */
    async walkForAssets() {
        const publicDir = path.resolve(__dirname, '..', '..', 'public');
        const entries = await FsWalk.walkOut(publicDir);

        for (const entry of entries) {
            if (entry.type === 'file') {
                this.assets.set(entry.relativePath.toString(), entry);
            }
        }
    }

    /**
     * Starts the primary server on `port` and, when h2c() has set one aside, the alternative server on `port + 1`.
     *
     * @param port Port for the primary server.
     * @returns Whatever the base `listen` returns.
     */
    override listen(port: number) {
        const result = super.listen(port);
        if (!this.httpAlternativeServer) {
            return result;
        }

        const altPort = port + 1;
        this.httpAlternativeServer.listen(altPort, () => {
            this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
        });

        return result;
    }

    /**
     * Creates the middleware that answers requests matching an indexed asset
     * and passes every other request on to the next middleware.
     */
    makeAssetsServingController() {
        return (ctx: Context, next: Next) => {
            // Drop the leading slash; the root path can never name an asset.
            const relativePath = ctx.path.slice(1);
            const asset = relativePath ? this.assets.get(relativePath) : undefined;
            if (asset?.type !== 'file') {
                return next();
            }

            // Order matters: body first, then the explicit type and length.
            ctx.body = fs.createReadStream(asset.path);
            ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream';
            ctx.set('Content-Length', asset.stats.size.toString());

            return;
        };
    }

    /**
     * Mounts static asset serving ahead of the RPC shim controller.
     */
    registerRoutes(): void {
        const serveAssets = this.makeAssetsServingController();
        this.koaApp.use(serveAssets);

        const rpcShim = this.registry.makeShimController();
        this.koaApp.use(rpcShim);
    }

    // With h2c, several requests may arrive over one connection and carry the same x-cloud-trace-context.
    // traceId must be unique per request, so it is generated here and the Google trace id is stored separately.
    @runOnce()
    override insertAsyncHookMiddleware() {
        const asyncHookMiddleware = async (ctx: Context, next: () => Promise<void>) => {
            const [googleTraceId] = ctx.get('x-cloud-trace-context').split('/');
            this.threadLocal.setup({
                traceId: randomUUID(),
                traceT0: new Date(),
                googleTraceId,
            });

            return next();
        };

        this.koaApp.use(asyncHookMiddleware);
    }

}
|
||||
// The one crawl server of this process; also the module's default export.
const crawlServer = container.resolve(CrawlStandAloneServer);

export default crawlServer;

// Once the service is ready, switch to h2c and listen on $PORT, falling back to 3000.
crawlServer.serviceReady().then((readyServer) => {
    const h2cServer = readyServer.h2c();
    const port = parseInt(process.env.PORT || '') || 3000;

    return h2cServer.listen(port);
});
|
148
src/stand-alone/search.ts
Normal file
148
src/stand-alone/search.ts
Normal file
@ -0,0 +1,148 @@
|
||||
import 'reflect-metadata';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
|
||||
import { KoaServer } from 'civkit/civ-rpc/koa';
|
||||
import http2 from 'http2';
|
||||
import { SearcherHost } from '../api/searcher-serper';
|
||||
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
|
||||
import path from 'path';
|
||||
import fs from 'fs';
|
||||
import { mimeOfExt } from 'civkit/mime';
|
||||
import { Context, Next } from 'koa';
|
||||
import { RPCRegistry } from '../services/registry';
|
||||
import { AsyncResource } from 'async_hooks';
|
||||
import { runOnce } from 'civkit/decorators';
|
||||
import { randomUUID } from 'crypto';
|
||||
import { ThreadedServiceRegistry } from '../services/threaded';
|
||||
import globalLogger, { GlobalLogger } from '../services/logger';
|
||||
import { AsyncLocalContext } from '../services/async-context';
|
||||
|
||||
// Promise rejections nobody handled are only logged; the process keeps running.
const reportUnhandledRejection = (reason: unknown) => {
    globalLogger.warn('Unhandled rejection', reason);
};

// An uncaught exception leaves the process in an unknown state:
// log it, announce the shutdown, and exit with code 1 on the next tick.
const quitOnUncaughtException = (error: Error) => {
    globalLogger.error('Uncaught exception', error);
    globalLogger.error('Uncaught exception, process quit.');
    process.nextTick(() => process.exit(1));
};

process.on('unhandledRejection', reportUnhandledRejection);
process.on('uncaughtException', quitOnUncaughtException);
|
||||
|
||||
/**
 * Stand-alone Koa server for the searcher: serves the RPC registry routes (minus those tagged `crawl`)
 * plus the static files found under the `public` directory, optionally over cleartext HTTP/2 (h2c).
 */
@singleton()
export class SearchStandAloneServer extends KoaServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    // The server that was in place before h2c() swapped it out; listens on port + 1.
    httpAlternativeServer?: typeof this['httpServer'];
    // Static files keyed by their path relative to the `public` directory.
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: GlobalLogger,
        protected registry: RPCRegistry,
        protected searcherHost: SearcherHost,
        protected threadLocal: AsyncLocalContext,
        protected threads: ThreadedServiceRegistry,
    ) {
        super(...arguments);
    }

    /**
     * Replaces the primary server with an HTTP/2 server and keeps the previous one as the alternative.
     * Every HTTP/2 request is dispatched to the Koa callback inside its own AsyncResource scope.
     *
     * @returns this, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const koaCallback = this.koaApp.callback();
        this.httpServer = http2.createServer((req, res) => {
            const requestScope = new AsyncResource('HTTP2ServerRequest');
            requestScope.runInAsyncScope(koaCallback, this.koaApp, req, res);
        });
        // useResourceBasedDefaultTracker();

        return this;
    }

    /**
     * Indexes the static assets, waits for dependencies, removes every registry entry tagged `crawl`,
     * and only then runs the base initialization.
     */
    override async init() {
        await this.walkForAssets();
        await this.dependencyReady();

        for (const [routeKey, routeConf] of this.registry.conf.entries()) {
            if (!routeConf.tags?.includes('crawl')) {
                continue;
            }
            this.registry.conf.delete(routeKey);
        }

        await super.init();
    }

    /**
     * Walks the `public` directory (two levels above this module) and records every regular file in `assets`.
     */
    async walkForAssets() {
        const publicDir = path.resolve(__dirname, '..', '..', 'public');
        const entries = await FsWalk.walkOut(publicDir);

        for (const entry of entries) {
            if (entry.type === 'file') {
                this.assets.set(entry.relativePath.toString(), entry);
            }
        }
    }

    /**
     * Starts the primary server on `port` and, when h2c() has set one aside, the alternative server on `port + 1`.
     *
     * @param port Port for the primary server.
     * @returns Whatever the base `listen` returns.
     */
    override listen(port: number) {
        const result = super.listen(port);
        if (!this.httpAlternativeServer) {
            return result;
        }

        const altPort = port + 1;
        this.httpAlternativeServer.listen(altPort, () => {
            this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
        });

        return result;
    }

    /**
     * Creates the middleware that answers requests matching an indexed asset
     * and passes every other request on to the next middleware.
     */
    makeAssetsServingController() {
        return (ctx: Context, next: Next) => {
            // Drop the leading slash; the root path can never name an asset.
            const relativePath = ctx.path.slice(1);
            const asset = relativePath ? this.assets.get(relativePath) : undefined;
            if (asset?.type !== 'file') {
                return next();
            }

            // Order matters: body first, then the explicit type and length.
            ctx.body = fs.createReadStream(asset.path);
            ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream';
            ctx.set('Content-Length', asset.stats.size.toString());

            return;
        };
    }

    /**
     * Mounts static asset serving ahead of the RPC shim controller.
     */
    registerRoutes(): void {
        const serveAssets = this.makeAssetsServingController();
        this.koaApp.use(serveAssets);

        const rpcShim = this.registry.makeShimController();
        this.koaApp.use(rpcShim);
    }

    // With h2c, several requests may arrive over one connection and carry the same x-cloud-trace-context.
    // traceId must be unique per request, so it is generated here and the Google trace id is stored separately.
    @runOnce()
    override insertAsyncHookMiddleware() {
        const asyncHookMiddleware = async (ctx: Context, next: () => Promise<void>) => {
            const [googleTraceId] = ctx.get('x-cloud-trace-context').split('/');
            this.threadLocal.setup({
                traceId: randomUUID(),
                traceT0: new Date(),
                googleTraceId,
            });

            return next();
        };

        this.koaApp.use(asyncHookMiddleware);
    }

}
|
||||
// The one search server of this process; also the module's default export.
const searchServer = container.resolve(SearchStandAloneServer);

export default searchServer;

// Once the service is ready, switch to h2c and listen on $PORT, falling back to 3000.
searchServer.serviceReady().then((readyServer) => {
    const h2cServer = readyServer.h2c();
    const port = parseInt(process.env.PORT || '') || 3000;

    return h2cServer.listen(port);
});
|
@ -1 +1 @@
|
||||
Subproject commit b80a917835031da9ab7073b6b4005402eece0746
|
||||
Subproject commit 0c62acf45e4749ecf4bb7f4bfc7ed49533e239cb
|
Loading…
x
Reference in New Issue
Block a user