From 23a3b807c99b5804610faa2914cfa547595e7c9a Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Sat, 8 Mar 2025 00:46:52 +0800 Subject: [PATCH] restructure: nolonger a firebase application (#1160) * fix: fine allow redefining Function.prototype.toString * wip * wip * wip * wip * wip * wip * wip * fix: contentType encoding * wip * fix: error throwing * wip * fix * wip * fix * fix * fix: jsdom * wip * wip * fix: links summary uniqueness * wip * wip * robots-txt catch no robots.txt * deps: remove puppeteer-extra-plugin-stealth * fix: dont change waring type * fix: curl * fix: replace firebase-roundtrip-check with blackhole-detector * fix: black hole detection * sercher: black hole detecting * fix: no h2c for searcher * fix: bhd * fix: search and crawl conflict * fix: bhd * fix * fix: server script * canvas: fixed avif issue * logging: move some to debug * fix * fix: pptr declare ready only when page can be created without issues * fix: bhd * cd: cloud run deploy-health-check cannot complete pptr newPage * cd: fix * fix: curl body can be null * fix * fix * fix: major fix regarding TC pdfs * fix * fix * deps: fix civkit trie router issue * fix * boom: total restructure * cd: fix docker ctx * fix * fix: switch to h2c * cd: ensure http2 --- .github/workflows/cd.yml | 12 +- .gitignore | 79 +++- .vscode/launch.json | 59 +-- .vscode/tasks.json | 132 +----- backend/functions/Dockerfile => Dockerfile | 0 README.md | 4 - backend/.firebaserc | 5 - backend/.gitignore | 79 ---- backend/firebase.json | 43 -- backend/firestore.indexes.json | 19 - backend/firestore.rules | 32 -- backend/functions/.dockerignore | 1 - backend/functions/.editorconfig | 36 -- backend/functions/.env.example | 0 backend/functions/.puppeteerrc.cjs | 9 - backend/functions/package.json | 93 ----- backend/functions/src/services/curl.ts | 218 ---------- backend/functions/src/shared | 1 - backend/functions/src/stand-alone/crawl.ts | 168 -------- backend/functions/src/stand-alone/search.ts | 168 -------- 
backend/storage.rules | 8 - ...integrity-check.cjs => integrity-check.cjs | 0 .../package-lock.json => package-lock.json | 238 +++++++---- package.json | 97 ++++- .../functions/public => public}/favicon.ico | Bin .../cloud-functions => src/api}/crawler.ts | 379 ++++++++++++----- .../api}/searcher-serper.ts | 112 ++--- .../cloud-functions => src/api}/searcher.ts | 96 +++-- .../cloud-functions/adaptive-crawler.ts | 2 +- .../cloud-functions/data-crunching.ts | 2 +- .../src => src}/db/adaptive-crawl-task.ts | 0 {backend/functions/src => src}/db/crawled.ts | 0 .../src => src}/db/domain-blockade.ts | 0 .../src => src}/db/domain-profile.ts | 2 +- {backend/functions/src => src}/db/img-alt.ts | 0 {backend/functions/src => src}/db/pdf.ts | 0 {backend/functions/src => src}/db/searched.ts | 0 .../dto/adaptive-crawler-options.ts | 0 .../dto/crawler-options.ts | 107 ++--- src/dto/jina-embeddings-auth.ts | 216 ++++++++++ {backend/functions/src => src}/fetch.d.ts | 0 {backend/functions/src => src}/index.ts | 0 src/lib/transform-server-event-stream.ts | 169 ++++++++ .../src => src}/services/alt-text.ts | 0 src/services/async-context.ts | 10 + src/services/blackhole-detector.ts | 72 ++++ .../src => src}/services/brave-search.ts | 3 + src/services/cf-browser-rendering.ts | 38 ++ src/services/curl.ts | 387 ++++++++++++++++++ src/services/errors.ts | 70 ++++ src/services/finalizer.ts | 24 ++ .../functions/src => src}/services/geoip.ts | 0 .../functions/src => src}/services/jsdom.ts | 24 +- {backend/functions/src => src}/services/lm.ts | 0 src/services/logger.ts | 57 +++ .../src => src}/services/pdf-extract.ts | 21 +- src/services/pseudo-transfer.ts | 65 +++ .../src => src}/services/puppeteer.ts | 196 +++++++-- src/services/registry.ts | 60 +++ src/services/robots-text.ts | 129 ++++++ .../src => src}/services/serper-search.ts | 12 +- .../services/snapshot-formatter.ts | 97 ++++- src/services/temp-file.ts | 22 + src/services/threaded.ts | 66 +++ src/shared | 1 + src/stand-alone/crawl.ts 
| 139 +++++++ src/stand-alone/search.ts | 148 +++++++ {backend/functions/src => src}/types.d.ts | 0 .../src => src}/utils/get-function-url.ts | 0 .../functions/src => src}/utils/markdown.ts | 0 {backend/functions/src => src}/utils/misc.ts | 0 .../src => src}/utils/tailwind-classes.ts | 0 thinapps-shared | 2 +- .../functions/tsconfig.json => tsconfig.json | 0 74 files changed, 2765 insertions(+), 1464 deletions(-) rename backend/functions/Dockerfile => Dockerfile (100%) delete mode 100644 backend/.firebaserc delete mode 100644 backend/.gitignore delete mode 100644 backend/firebase.json delete mode 100644 backend/firestore.indexes.json delete mode 100644 backend/firestore.rules delete mode 100644 backend/functions/.dockerignore delete mode 100644 backend/functions/.editorconfig delete mode 100644 backend/functions/.env.example delete mode 100644 backend/functions/.puppeteerrc.cjs delete mode 100644 backend/functions/package.json delete mode 100644 backend/functions/src/services/curl.ts delete mode 120000 backend/functions/src/shared delete mode 100644 backend/functions/src/stand-alone/crawl.ts delete mode 100644 backend/functions/src/stand-alone/search.ts delete mode 100644 backend/storage.rules rename backend/functions/integrity-check.cjs => integrity-check.cjs (100%) rename backend/functions/package-lock.json => package-lock.json (98%) rename {backend/functions/public => public}/favicon.ico (100%) rename {backend/functions/src/cloud-functions => src/api}/crawler.ts (72%) rename {backend/functions/src/cloud-functions => src/api}/searcher-serper.ts (89%) rename {backend/functions/src/cloud-functions => src/api}/searcher.ts (90%) rename {backend/functions/src => src}/cloud-functions/adaptive-crawler.ts (99%) rename {backend/functions/src => src}/cloud-functions/data-crunching.ts (99%) rename {backend/functions/src => src}/db/adaptive-crawl-task.ts (100%) rename {backend/functions/src => src}/db/crawled.ts (100%) rename {backend/functions/src => 
src}/db/domain-blockade.ts (100%) rename {backend/functions/src => src}/db/domain-profile.ts (90%) rename {backend/functions/src => src}/db/img-alt.ts (100%) rename {backend/functions/src => src}/db/pdf.ts (100%) rename {backend/functions/src => src}/db/searched.ts (100%) rename {backend/functions/src => src}/dto/adaptive-crawler-options.ts (100%) rename backend/functions/src/dto/scrapping-options.ts => src/dto/crawler-options.ts (85%) create mode 100644 src/dto/jina-embeddings-auth.ts rename {backend/functions/src => src}/fetch.d.ts (100%) rename {backend/functions/src => src}/index.ts (100%) create mode 100644 src/lib/transform-server-event-stream.ts rename {backend/functions/src => src}/services/alt-text.ts (100%) create mode 100644 src/services/async-context.ts create mode 100644 src/services/blackhole-detector.ts rename {backend/functions/src => src}/services/brave-search.ts (97%) create mode 100644 src/services/cf-browser-rendering.ts create mode 100644 src/services/curl.ts create mode 100644 src/services/errors.ts create mode 100644 src/services/finalizer.ts rename {backend/functions/src => src}/services/geoip.ts (100%) rename {backend/functions/src => src}/services/jsdom.ts (93%) rename {backend/functions/src => src}/services/lm.ts (100%) create mode 100644 src/services/logger.ts rename {backend/functions/src => src}/services/pdf-extract.ts (95%) create mode 100644 src/services/pseudo-transfer.ts rename {backend/functions/src => src}/services/puppeteer.ts (86%) create mode 100644 src/services/registry.ts create mode 100644 src/services/robots-text.ts rename {backend/functions/src => src}/services/serper-search.ts (94%) rename {backend/functions/src => src}/services/snapshot-formatter.ts (84%) create mode 100644 src/services/temp-file.ts create mode 100644 src/services/threaded.ts create mode 120000 src/shared create mode 100644 src/stand-alone/crawl.ts create mode 100644 src/stand-alone/search.ts rename {backend/functions/src => src}/types.d.ts (100%) 
rename {backend/functions/src => src}/utils/get-function-url.ts (100%) rename {backend/functions/src => src}/utils/markdown.ts (100%) rename {backend/functions/src => src}/utils/misc.ts (100%) rename {backend/functions/src => src}/utils/tailwind-classes.ts (100%) rename backend/functions/tsconfig.json => tsconfig.json (100%) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index 15d30dd..bd0a08f 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -14,9 +14,6 @@ jobs: concurrency: group: ${{ github.ref_type == 'branch' && github.ref }} cancel-in-progress: true - defaults: - run: - working-directory: backend/functions permissions: contents: read steps: @@ -30,6 +27,8 @@ jobs: credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}' - name: 'Set up Cloud SDK' uses: 'google-github-actions/setup-gcloud@v2' + with: + install_components: beta - name: "Docker auth" run: |- gcloud auth configure-docker us-docker.pkg.dev --quiet @@ -40,7 +39,6 @@ jobs: with: node-version: 22.12.0 cache: npm - cache-dependency-path: backend/functions/package-lock.json - name: npm install run: npm ci @@ -65,13 +63,13 @@ jobs: id: container uses: docker/build-push-action@v6 with: - context: backend/functions + context: . 
push: true tags: ${{ steps.meta.outputs.tags }} labels: ${{ steps.meta.outputs.labels }} - name: Deploy CRAWL with Tag run: | - gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 + gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 - name: Deploy SEARCH with Tag run: | - gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 \ No newline at end of file + gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 437330c..b1b850b 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,79 @@ +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +firebase-debug.log* +firebase-debug.*.log* + +# Firebase cache +.firebase/ + +# Firebase config + +# Uncomment this if you'd like others to create their own Firebase project. +# For a team working on the same Firebase project(s), it is recommended to leave +# it commented so all members can deploy to the same project(s) in .firebaserc. 
+# .firebaserc + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (http://nodejs.org/api/addons.html) +build/Release + +# Dependency directories node_modules/ + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.secret.local + +toy*.ts + .DS_Store -/package-lock.json -backend/functions/test.js +build/ +.firebase-emu/ +*.log +.DS_Store + +*.local +.secret.* +licensed/ \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index c7cab1b..c05221a 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -1,26 +1,6 @@ { "version": "0.2.0", "configurations": [ - { - "name": "Debug Fullstack: attach", - "request": "attach", - "cwd": "${workspaceFolder}/backend/functions", - "skipFiles": [ - "/**" - ], - "type": "node", - "preLaunchTask": "Fullstack:debug" - }, - { - "name": "Debug Fullstack: attach: with proxy", - "request": "attach", - "cwd": "${workspaceFolder}/backend/functions", - "skipFiles": [ - "/**" - ], - "type": "node", - "preLaunchTask": "Fullstack:debug:with-proxy" - }, { "name": "Attach", "port": 9229, @@ -40,21 +20,44 @@ "type": "node" }, { - "name": "Debug Fullstack", + "name": "Debug Stand Alone Crawl", "request": "launch", "runtimeArgs": [ - "emulators:start", - "--import=../.firebase-emu", - "--export-on-exit=../.firebase-emu", + "--env-file=.secret.local", ], - "cwd": "${workspaceFolder}/backend/functions", - 
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase", + "env": { + "GCLOUD_PROJECT": "reader-6b7dc", + "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" + }, + "cwd": "${workspaceFolder}", + "program": "build/stand-alone/crawl.js", "skipFiles": [ "/**" ], "type": "node", - "preLaunchTask": "Fullstack:prepare", - "killBehavior": "polite" + "outputCapture": "std", + "preLaunchTask": "Backend:build:watch", + "killBehavior": "forceful" + }, + { + "name": "Debug Stand Alone Search", + "request": "launch", + "runtimeArgs": [ + "--env-file=.secret.local", + ], + "env": { + "GCLOUD_PROJECT": "reader-6b7dc", + "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" + }, + "cwd": "${workspaceFolder}", + "program": "build/stand-alone/search.js", + "skipFiles": [ + "/**" + ], + "type": "node", + "outputCapture": "std", + "preLaunchTask": "Backend:build:watch", + "killBehavior": "forceful" }, ] } \ No newline at end of file diff --git a/.vscode/tasks.json b/.vscode/tasks.json index fc4489b..5cba448 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -6,29 +6,18 @@ "script": "build", "group": "build", "options": { - "cwd": "${workspaceFolder}/backend/functions" + "cwd": "${workspaceFolder}" }, "problemMatcher": [], "label": "Backend:rebuild", "detail": "Backend:rebuild" }, - { - "type": "npm", - "script": "emu:reset", - "group": "build", - "options": { - "cwd": "${workspaceFolder}/backend/functions" - }, - "problemMatcher": [], - "label": "Backend:reset-emulator", - "detail": "Backend:reset-emulator" - }, { "type": "typescript", "options": { - "cwd": "${workspaceFolder}/backend/functions" + "cwd": "${workspaceFolder}" }, - "tsconfig": "backend/functions/tsconfig.json", + "tsconfig": "tsconfig.json", "option": "watch", "isBackground": true, "problemMatcher": [ @@ -36,121 +25,6 @@ ], "group": "build", "label": "Backend:build:watch" - }, - { - "type": "npm", - "script": "emu:debug", - "group": "none", - "options": { - "cwd": 
"${workspaceFolder}/backend/functions" - }, - "problemMatcher": [ - { - "base": "$tsc", - "background": { - "activeOnStart": false, - "beginsPattern": "shutdown requested|Starting emulators", - "endsPattern": "Debugger listening" - } - } - ], - "label": "Backend:start-emulator-debug", - "detail": "Backend:start-emulator-debug", - "dependsOn": [ - "Backend:build:watch" - ], - "isBackground": true, - }, - { - "type": "npm", - "script": "dev", - "options": { - "cwd": "${workspaceFolder}/webapp", - }, - "group": "build", - "label": "Frontend:start:dev", - "detail": "Frontend:start:dev", - "isBackground": true, - "problemMatcher": { - "base": "$vite", - "background": { - "activeOnStart": true, - "endsPattern": "OK", - "beginsPattern": "vite" - } - }, - }, - { - "type": "npm", - "script": "dev", - "options": { - "cwd": "${workspaceFolder}/webapp", - "env": { - "FIREBASE_EMULATE": "true", - } - }, - "group": "build", - "label": "Frontend:start:emu", - "detail": "Frontend:start:emu", - "isBackground": true, - "problemMatcher": { - "base": "$vite", - "background": { - "activeOnStart": true, - "endsPattern": "OK", - "beginsPattern": "vite" - } - }, - }, - { - "type": "npm", - "script": "emu:debug2", - "group": "none", - "options": { - "cwd": "${workspaceFolder}/backend/functions", - "env": { - "https_proxy": "http://127.0.0.1:7890", - "http_proxy": "http://127.0.0.1:7890", - "all_proxy": "socks5://127.0.0.1:7890" - } - }, - "problemMatcher": [ - { - "base": "$tsc", - "background": { - "activeOnStart": false, - "beginsPattern": "shutdown requested|Starting emulators", - "endsPattern": "Debugger listening" - } - } - ], - "label": "Backend:start-emulator-debug:with-proxy", - "detail": "Backend:start-emulator-debug:with-proxy", - "dependsOn": [ - "Backend:build:watch" - ], - "isBackground": true, - }, - { - "label": "Fullstack:prepare", - "dependsOn": [ - "Frontend:start:emu", - "Backend:build:watch", - ], - }, - { - "label": "Fullstack:debug", - "dependsOn": [ - // 
"Frontend:start:emu", - "Backend:start-emulator-debug", - ], - }, - { - "label": "Fullstack:debug:with-proxy", - "dependsOn": [ - "Frontend:start:emu", - "Backend:start-emulator-debug:with-proxy", - ], } ] } \ No newline at end of file diff --git a/backend/functions/Dockerfile b/Dockerfile similarity index 100% rename from backend/functions/Dockerfile rename to Dockerfile diff --git a/README.md b/README.md index 6b7a612..bd0f648 100644 --- a/README.md +++ b/README.md @@ -158,13 +158,9 @@ curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.or You will need the following tools to run the project: - Node v18 (The build fails for Node version >18) -- Firebase CLI (`npm install -g firebase-tools`) - -For backend, go to the `backend/functions` directory and install the npm dependencies. ```bash git clone git@github.com:jina-ai/reader.git -cd backend/functions npm install ``` diff --git a/backend/.firebaserc b/backend/.firebaserc deleted file mode 100644 index f585142..0000000 --- a/backend/.firebaserc +++ /dev/null @@ -1,5 +0,0 @@ -{ - "projects": { - "default": "reader-6b7dc" - } -} diff --git a/backend/.gitignore b/backend/.gitignore deleted file mode 100644 index b1b850b..0000000 --- a/backend/.gitignore +++ /dev/null @@ -1,79 +0,0 @@ -# Logs -logs -*.log -npm-debug.log* -yarn-debug.log* -yarn-error.log* -firebase-debug.log* -firebase-debug.*.log* - -# Firebase cache -.firebase/ - -# Firebase config - -# Uncomment this if you'd like others to create their own Firebase project. -# For a team working on the same Firebase project(s), it is recommended to leave -# it commented so all members can deploy to the same project(s) in .firebaserc. 
-# .firebaserc - -# Runtime data -pids -*.pid -*.seed -*.pid.lock - -# Directory for instrumented libs generated by jscoverage/JSCover -lib-cov - -# Coverage directory used by tools like istanbul -coverage - -# nyc test coverage -.nyc_output - -# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files) -.grunt - -# Bower dependency directory (https://bower.io/) -bower_components - -# node-waf configuration -.lock-wscript - -# Compiled binary addons (http://nodejs.org/api/addons.html) -build/Release - -# Dependency directories -node_modules/ - -# Optional npm cache directory -.npm - -# Optional eslint cache -.eslintcache - -# Optional REPL history -.node_repl_history - -# Output of 'npm pack' -*.tgz - -# Yarn Integrity file -.yarn-integrity - -# dotenv environment variables file -.env -.secret.local - -toy*.ts - -.DS_Store -build/ -.firebase-emu/ -*.log -.DS_Store - -*.local -.secret.* -licensed/ \ No newline at end of file diff --git a/backend/firebase.json b/backend/firebase.json deleted file mode 100644 index 2240bfd..0000000 --- a/backend/firebase.json +++ /dev/null @@ -1,43 +0,0 @@ -{ - "firestore": { - "rules": "firestore.rules", - "indexes": "firestore.indexes.json" - }, - "functions": [ - { - "source": "functions", - "codebase": "default", - "ignore": [ - "node_modules", - "src", - ".git", - "*.log", - "*.local", - ".secret.*", - ".firebase-emu" - ], - "predeploy": [ - "npm --prefix \"$RESOURCE_DIR\" run build:clean", - "npm --prefix \"$RESOURCE_DIR\" run build" - ] - } - ], - "storage": { - "rules": "storage.rules" - }, - "emulators": { - "ui": { - "enabled": true - }, - "singleProjectMode": true, - "functions": { - "port": 5001 - }, - "firestore": { - "port": 9098 - }, - "storage": { - "port": 9097 - } - } -} \ No newline at end of file diff --git a/backend/firestore.indexes.json b/backend/firestore.indexes.json deleted file mode 100644 index a4752ee..0000000 --- a/backend/firestore.indexes.json +++ /dev/null @@ -1,19 +0,0 @@ -{ 
- "indexes": [ - { - "collectionGroup": "prompts", - "queryScope": "COLLECTION_GROUP", - "fields": [ - { - "fieldPath": "id", - "order": "ASCENDING" - }, - { - "fieldPath": "isPublic", - "order": "ASCENDING" - } - ] - } - ], - "fieldOverrides": [] -} \ No newline at end of file diff --git a/backend/firestore.rules b/backend/firestore.rules deleted file mode 100644 index 09d4ede..0000000 --- a/backend/firestore.rules +++ /dev/null @@ -1,32 +0,0 @@ -rules_version = '2'; -service cloud.firestore { - match /databases/{database}/documents { - // match /questions/{document=**} { - // allow read: if request.auth != null - // } - - // match /answers/{userId}/profiles/default { - // allow read, write: if request.auth != null && request.auth.uid == userId - // } - - match /credits/{userId}/{document=**} { - allow read: if request.auth != null && request.auth.uid == userId - } - - match /users/{userId}/prompts/{document=**} { - allow read: if request.auth != null && request.auth.uid == userId - } - - // match /users/{userId}/profiles/{document=**} { - // allow read: if request.auth != null && request.auth.uid == userId - // } - - match /users/{userId}/creditHistory/{document=**} { - allow read: if request.auth != null && request.auth.uid == userId - } - - match /{document=**} { - allow read, write: if false; - } - } -} diff --git a/backend/functions/.dockerignore b/backend/functions/.dockerignore deleted file mode 100644 index c2658d7..0000000 --- a/backend/functions/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -node_modules/ diff --git a/backend/functions/.editorconfig b/backend/functions/.editorconfig deleted file mode 100644 index 17d2fbb..0000000 --- a/backend/functions/.editorconfig +++ /dev/null @@ -1,36 +0,0 @@ -root = true - -[*] -end_of_line = lf -charset = utf-8 -indent_style = space -insert_final_newline = true -trim_trailing_whitespace = true -indent_size = 4 -quote_type = single -max_line_length = 120 - -[*.py] -indent_size = 4 - -[*.ts] -indent_size = 4 - -[*.js] 
-indent_size = 2 - -[*.vue] -indent_size = 2 - -[*.*sx] -indent_size = 2 - -[*.*ml] -indent_size = 2 - -[*.json] -indent_size = 2 - -[*.md] -indent_size = 2 -trim_trailing_whitespace = false diff --git a/backend/functions/.env.example b/backend/functions/.env.example deleted file mode 100644 index e69de29..0000000 diff --git a/backend/functions/.puppeteerrc.cjs b/backend/functions/.puppeteerrc.cjs deleted file mode 100644 index 574a6bd..0000000 --- a/backend/functions/.puppeteerrc.cjs +++ /dev/null @@ -1,9 +0,0 @@ -const { join } = require('path'); - -/** - * @type {import("puppeteer").Configuration} - */ -module.exports = { - // Changes the cache location for Puppeteer. - cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'), -}; diff --git a/backend/functions/package.json b/backend/functions/package.json deleted file mode 100644 index 193de8a..0000000 --- a/backend/functions/package.json +++ /dev/null @@ -1,93 +0,0 @@ -{ - "name": "reader", - "scripts": { - "lint": "eslint --ext .js,.ts .", - "build": "node ./integrity-check.cjs && tsc -p .", - "build:watch": "tsc --watch", - "build:clean": "rm -rf ./build", - "shell": "npm run build && firebase functions:shell", - "emu:stage": "cd .. 
&& tar -czvf firebase-emu-preset.tgz .firebase-emu", - "emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../", - "emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit", - "emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions", - "emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions", - "emu:kill": "killall java", - "serve": "npm run build && npm run emu:start", - "debug": "npm run build && npm run emu:start -- --inspect-functions", - "from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit", - "from-preset": "npm run build && npm run emu:reset && npm run emu:start", - "start": "npm run shell", - "deploy": "firebase deploy --only functions", - "logs": "firebase functions:log", - "gcp-build": "node node_modules/puppeteer/install.mjs" - }, - "engines": { - "node": "20" - }, - "main": "build/index.js", - "dependencies": { - "@esm2cjs/normalize-url": "^8.0.0", - "@google-cloud/translate": "^8.2.0", - "@mozilla/readability": "^0.5.0", - "@napi-rs/canvas": "^0.1.67", - "@types/turndown": "^5.0.4", - "@xmldom/xmldom": "^0.9.3", - "archiver": "^6.0.1", - "axios": "^1.3.3", - "bcrypt": "^5.1.0", - "busboy": "^1.6.0", - "civkit": "^0.8.3-3e69606", - "core-js": "^3.37.1", - "cors": "^2.8.5", - "dayjs": "^1.11.9", - "express": "^4.19.2", - "firebase-admin": "^12.1.0", - "firebase-functions": "^6.1.1", - "htmlparser2": "^9.0.0", - "jose": "^5.1.0", - "langdetect": "^0.2.1", - "linkedom": "^0.18.4", - "maxmind": "^4.3.18", - "minio": "^7.1.3", - "node-libcurl": "^4.1.0", - "openai": "^4.20.0", - "pdfjs-dist": "^4.2.67", - "puppeteer": "^23.3.0", - "puppeteer-extra": "^3.3.6", - "puppeteer-extra-plugin-block-resources": "^2.4.3", - "puppeteer-extra-plugin-page-proxy": "^1.3.1", - "puppeteer-page-proxy": "^1.3.0", - "robots-parser": "^3.0.1", - "set-cookie-parser": "^2.6.0", 
- "simple-zstd": "^1.4.2", - "stripe": "^11.11.0", - "tiktoken": "^1.0.16", - "tld-extract": "^2.1.0", - "turndown": "^7.1.3", - "turndown-plugin-gfm": "^1.0.2", - "undici": "^5.24.0" - }, - "devDependencies": { - "@types/archiver": "^5.3.4", - "@types/bcrypt": "^5.0.0", - "@types/busboy": "^1.5.4", - "@types/cors": "^2.8.17", - "@types/generic-pool": "^3.8.1", - "@types/node": "^20.14.13", - "@types/set-cookie-parser": "^2.4.7", - "@types/xmldom": "^0.1.34", - "@typescript-eslint/eslint-plugin": "^5.12.0", - "@typescript-eslint/parser": "^5.12.0", - "eslint": "^8.9.0", - "eslint-config-google": "^0.14.0", - "eslint-plugin-import": "^2.25.4", - "firebase-functions-test": "^3.0.0", - "pino-pretty": "^13.0.0", - "replicate": "^0.16.1", - "typescript": "^5.5.4" - }, - "private": true, - "exports": { - ".": "./build/index.js" - } -} diff --git a/backend/functions/src/services/curl.ts b/backend/functions/src/services/curl.ts deleted file mode 100644 index d1ab22e..0000000 --- a/backend/functions/src/services/curl.ts +++ /dev/null @@ -1,218 +0,0 @@ -import { marshalErrorLike } from 'civkit/lang'; -import { AsyncService } from 'civkit/async-service'; -import { singleton } from 'tsyringe'; - -import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl'; -import { PageSnapshot, ScrappingOptions } from './puppeteer'; -import { Logger } from '../shared/services/logger'; -import { JSDomControl } from './jsdom'; -import { AssertionFailureError, FancyFile } from 'civkit'; -import { TempFileManager } from '../shared'; -import { readFile } from 'fs/promises'; -import { pathToFileURL } from 'url'; -import { createBrotliDecompress, createInflate, createGunzip } from 'zlib'; -import { ZSTDDecompress } from 'simple-zstd'; - -@singleton() -export class CurlControl extends AsyncService { - - logger = this.globalLogger.child({ service: this.constructor.name }); - - constructor( - protected globalLogger: Logger, - protected jsdomControl: JSDomControl, - protected tempFileManager: 
TempFileManager, - ) { - super(...arguments); - } - - override async init() { - await this.dependencyReady(); - - this.emit('ready'); - } - - curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) { - const mixinHeaders = { - 'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`, - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': 'Windows', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`, - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', - 'Sec-Fetch-Site': 'none', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-User': '?1', - 'Sec-Fetch-Dest': 'document', - 'Accept-Encoding': 'gzip, deflate, br, zstd', - 'Accept-Language': 'en-US,en;q=0.9', - }; - - curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`)); - - return curl; - } - - async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise { - const snapshot = { - href: urlToCrawl.toString(), - html: '', - title: '', - text: '', - } as PageSnapshot; - - let contentType = ''; - const result = await new Promise<{ - statusCode: number, - data?: FancyFile, - headers: Buffer | HeaderInfo[], - }>((resolve, reject) => { - const curl = new Curl(); - curl.enable(CurlFeature.StreamResponse); - curl.setOpt('URL', urlToCrawl.toString()); - curl.setOpt(Curl.option.FOLLOWLOCATION, true); - - curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000)); - - if (crawlOpts?.overrideUserAgent) { - curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent); - } - - this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders); - // if (crawlOpts?.extraHeaders) { - // curl.setOpt(Curl.option.HTTPHEADER, 
Object.entries(crawlOpts.extraHeaders).map(([k, v]) => `${k}: ${v}`)); - // } - if (crawlOpts?.proxyUrl) { - curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl); - } - if (crawlOpts?.cookies?.length) { - const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`); - curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; ')); - } - if (crawlOpts?.referer) { - curl.setOpt(Curl.option.REFERER, crawlOpts.referer); - } - - curl.on('end', (statusCode, _data, headers) => { - this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers }); - curl.close(); - }); - - curl.on('error', (err) => { - curl.close(); - this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) }); - reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`)); - }); - curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB - let status = -1; - let contentEncoding = ''; - curl.on('stream', (stream, statusCode, headers) => { - status = statusCode; - const lastResHeaders = headers[headers.length - 1]; - for (const [k, v] of Object.entries(lastResHeaders)) { - const kl = k.toLowerCase(); - if (kl === 'content-type') { - contentType = v.toLowerCase(); - } - if (kl === 'content-encoding') { - contentEncoding = v.toLowerCase(); - } - if (contentType && contentEncoding) { - break; - } - } - - if (!contentType) { - reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`)); - stream.destroy(); - return; - } - if (contentType.startsWith('image/')) { - snapshot.html = `${urlToCrawl.origin}${urlToCrawl.pathname}`; - stream.destroy(); - resolve({ - statusCode: status, - headers, - }); - return; - } - - switch (contentEncoding) { - case 'gzip': { - const decompressed = createGunzip(); - stream.pipe(decompressed); - stream = decompressed; - break; - } - case 'deflate': { - const decompressed = createInflate(); - stream.pipe(decompressed); - stream = 
decompressed; - break; - } - case 'br': { - const decompressed = createBrotliDecompress(); - stream.pipe(decompressed); - stream = decompressed; - break; - } - case 'zstd': { - const decompressed = ZSTDDecompress(); - stream.pipe(decompressed); - stream = decompressed; - break; - } - default: { - break; - } - } - - const fpath = this.tempFileManager.alloc(); - const fancyFile = FancyFile.auto(stream, fpath); - this.tempFileManager.bindPathTo(fancyFile, fpath); - resolve({ - statusCode: status, - data: fancyFile, - headers, - }); - }); - - curl.perform(); - }); - - if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) { - throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`); - } - - if (contentType === 'application/octet-stream') { - // Content declared as binary is same as unknown. - contentType = ''; - } - - if (result.data) { - const mimeType: string = contentType || await result.data.mimeType; - if (mimeType.startsWith('text/html')) { - if ((await result.data.size) > 1024 * 1024 * 32) { - throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`); - } - snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' }); - } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) { - if ((await result.data.size) > 1024 * 1024 * 32) { - throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`); - } - snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' }); - snapshot.html = `
${snapshot.text}
`; - } else if (mimeType.startsWith('application/pdf')) { - snapshot.pdfs = [pathToFileURL(await result.data.filePath).href]; - } else { - throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`); - } - } - - const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); - - return curlSnapshot!; - } - - -} diff --git a/backend/functions/src/shared b/backend/functions/src/shared deleted file mode 120000 index c8c3836..0000000 --- a/backend/functions/src/shared +++ /dev/null @@ -1 +0,0 @@ -../../../thinapps-shared/backend \ No newline at end of file diff --git a/backend/functions/src/stand-alone/crawl.ts b/backend/functions/src/stand-alone/crawl.ts deleted file mode 100644 index 0e2f75d..0000000 --- a/backend/functions/src/stand-alone/crawl.ts +++ /dev/null @@ -1,168 +0,0 @@ -import 'reflect-metadata'; -import { container, singleton } from 'tsyringe'; -import { initializeApp, applicationDefault } from 'firebase-admin/app'; - -process.env['FIREBASE_CONFIG'] ??= JSON.stringify({ - projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc', - storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`, - credential: applicationDefault(), -}); - -initializeApp(); - - -import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared'; -import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc'; -import { ExpressServer } from 'civkit/civ-rpc/express'; -import http2 from 'http2'; -import { CrawlerHost } from '../cloud-functions/crawler'; -import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; -import path from 'path'; -import fs from 'fs'; -import { mimeOfExt } from 'civkit/mime'; -import { NextFunction, Request, Response } from 'express'; - -process.on('unhandledRejection', (err) => { - console.error('Unhandled rejection', err); -}); - -process.on('uncaughtException', (err) => { - console.log('Uncaught exception', err); - - // Looks like Firebase runtime does not handle error 
properly. - // Make sure to quit the process. - console.error('Uncaught exception, process quit.'); - process.nextTick(() => process.exit(1)); -}); - -@singleton() -export class CrawlStandAloneServer extends ExpressServer { - logger = this.globalLogger.child({ service: this.constructor.name }); - - httpAlternativeServer?: typeof this['httpServer']; - assets = new Map(); - - constructor( - protected globalLogger: Logger, - protected registry: CloudFunctionRegistry, - protected crawlerHost: CrawlerHost, - protected threadLocal: AsyncContext, - ) { - super(...arguments); - - registry.allHandsOnDeck().catch(() => void 0); - registry.title = 'reader'; - registry.version = '0.1.0'; - } - - h2c() { - this.httpAlternativeServer = this.httpServer; - this.httpServer = http2.createServer(this.expressApp); - // useResourceBasedDefaultTracker(); - - return this; - } - - override async init() { - await this.walkForAssets(); - await super.init(); - } - - async walkForAssets() { - const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); - - for (const file of files) { - if (file.type !== 'file') { - continue; - } - this.assets.set(file.relativePath.toString(), file); - } - } - - makeAssetsServingController() { - return (req: Request, res: Response, next: NextFunction) => { - const requestPath = req.url; - const file = requestPath.slice(1); - if (!file) { - return next(); - } - - const asset = this.assets.get(file); - if (asset?.type !== 'file') { - return next(); - } - res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'); - res.set('Content-Length', asset.stats.size.toString()); - fs.createReadStream(asset.path).pipe(res); - - return; - }; - } - - makeMiscMiddleware() { - return (req: Request, res: Response, next: NextFunction) => { - if (req.method === 'OPTIONS') { - return res.status(200).end(); - } - this.threadLocal.set('ip', req.ip); - - return next(); - }; - } - - override listen(port: number) { - const r = 
super.listen(port); - if (this.httpAlternativeServer) { - const altPort = port + 1; - this.httpAlternativeServer.listen(altPort, () => { - this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); - }); - } - - return r; - } - - override registerRoutes(): void { - - const openAPIManager = new OpenAPIManager(); - openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!); - const openapiJsonPath = '/openapi.json'; - this.expressRootRouter.get(openapiJsonPath, (req, res) => { - const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`); - baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, ''); - baseURL.search = ''; - const content = openAPIManager.createOpenAPIObject(baseURL.toString(), { - info: { - title: this.registry.title, - description: `${this.registry.title} openAPI documentations`, - 'x-logo': { - url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png` - } - } - }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any); - res.statusCode = 200; - res.end(JSON.stringify(content)); - }); - - this.expressRootRouter.use('/', - ...this.registry.expressMiddlewares, - this.makeAssetsServingController(), - this.makeMiscMiddleware(), - this.registry.makeShimController('crawl') - ); - } - - protected override featureSelect(): void { - this.insertAsyncHookMiddleware(); - this.insertHealthCheckMiddleware(this.healthCheckEndpoint); - this.insertLogRequestsMiddleware(); - this.registerOpenAPIDocsRoutes('/docs'); - - this.registerRoutes(); - } -} -const instance = container.resolve(CrawlStandAloneServer); - -export default instance; - -instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000)); diff --git a/backend/functions/src/stand-alone/search.ts b/backend/functions/src/stand-alone/search.ts deleted file 
mode 100644 index 2f53554..0000000 --- a/backend/functions/src/stand-alone/search.ts +++ /dev/null @@ -1,168 +0,0 @@ -import 'reflect-metadata'; -import { container, singleton } from 'tsyringe'; -import { initializeApp, applicationDefault } from 'firebase-admin/app'; - -process.env['FIREBASE_CONFIG'] ??= JSON.stringify({ - projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc', - storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`, - credential: applicationDefault(), -}); - -initializeApp(); - - -import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared'; -import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc'; -import { ExpressServer } from 'civkit/civ-rpc/express'; -import http2 from 'http2'; -import { SearcherHost } from '../cloud-functions/searcher-serper'; -import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; -import path from 'path'; -import fs from 'fs'; -import { mimeOfExt } from 'civkit/mime'; -import { NextFunction, Request, Response } from 'express'; - -process.on('unhandledRejection', (err) => { - console.error('Unhandled rejection', err); -}); - -process.on('uncaughtException', (err) => { - console.log('Uncaught exception', err); - - // Looks like Firebase runtime does not handle error properly. - // Make sure to quit the process. 
- console.error('Uncaught exception, process quit.'); - process.nextTick(() => process.exit(1)); -}); - -@singleton() -export class SearchStandAloneServer extends ExpressServer { - logger = this.globalLogger.child({ service: this.constructor.name }); - - httpAlternativeServer?: typeof this['httpServer']; - assets = new Map(); - - constructor( - protected globalLogger: Logger, - protected registry: CloudFunctionRegistry, - protected searcherHost: SearcherHost, - protected threadLocal: AsyncContext, - ) { - super(...arguments); - - registry.allHandsOnDeck().catch(() => void 0); - registry.title = 'reader'; - registry.version = '0.1.0'; - } - - h2c() { - this.httpAlternativeServer = this.httpServer; - this.httpServer = http2.createServer(this.expressApp); - // useResourceBasedDefaultTracker(); - - return this; - } - - override async init() { - await this.walkForAssets(); - await super.init(); - } - - async walkForAssets() { - const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); - - for (const file of files) { - if (file.type !== 'file') { - continue; - } - this.assets.set(file.relativePath.toString(), file); - } - } - - makeAssetsServingController() { - return (req: Request, res: Response, next: NextFunction) => { - const requestPath = req.url; - const file = requestPath.slice(1); - if (!file) { - return next(); - } - - const asset = this.assets.get(file); - if (asset?.type !== 'file') { - return next(); - } - res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'); - res.set('Content-Length', asset.stats.size.toString()); - fs.createReadStream(asset.path).pipe(res); - - return; - }; - } - - makeMiscMiddleware() { - return (req: Request, res: Response, next: NextFunction) => { - if (req.method === 'OPTIONS') { - return res.status(200).end(); - } - this.threadLocal.set('ip', req.ip); - - return next(); - }; - } - - override listen(port: number) { - const r = super.listen(port); - if 
(this.httpAlternativeServer) { - const altPort = port + 1; - this.httpAlternativeServer.listen(altPort, () => { - this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); - }); - } - - return r; - } - - override registerRoutes(): void { - - const openAPIManager = new OpenAPIManager(); - openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!); - const openapiJsonPath = '/openapi.json'; - this.expressRootRouter.get(openapiJsonPath, (req, res) => { - const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`); - baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, ''); - baseURL.search = ''; - const content = openAPIManager.createOpenAPIObject(baseURL.toString(), { - info: { - title: this.registry.title, - description: `${this.registry.title} openAPI documentations`, - 'x-logo': { - url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png` - } - } - }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any); - res.statusCode = 200; - res.end(JSON.stringify(content)); - }); - - this.expressRootRouter.use('/', - ...this.registry.expressMiddlewares, - this.makeMiscMiddleware(), - this.makeAssetsServingController(), - this.registry.makeShimController('search') - ); - } - - protected override featureSelect(): void { - this.insertAsyncHookMiddleware(); - this.insertHealthCheckMiddleware(this.healthCheckEndpoint); - this.insertLogRequestsMiddleware(); - this.registerOpenAPIDocsRoutes('/docs'); - - this.registerRoutes(); - } -} -const instance = container.resolve(SearchStandAloneServer); - -export default instance; - -instance.serviceReady().then((s) => s.listen(parseInt(process.env.PORT || '') || 3000)); diff --git a/backend/storage.rules b/backend/storage.rules deleted file mode 100644 index 9f33d22..0000000 --- a/backend/storage.rules +++ 
/dev/null @@ -1,8 +0,0 @@ -rules_version = '2'; -service firebase.storage { - match /b/{bucket}/o { - match /{allPaths=**} { - allow read, write: if false; - } - } -} diff --git a/backend/functions/integrity-check.cjs b/integrity-check.cjs similarity index 100% rename from backend/functions/integrity-check.cjs rename to integrity-check.cjs diff --git a/backend/functions/package-lock.json b/package-lock.json similarity index 98% rename from backend/functions/package-lock.json rename to package-lock.json index c227ec1..df4fa2f 100644 --- a/backend/functions/package-lock.json +++ b/package-lock.json @@ -8,15 +8,16 @@ "dependencies": { "@esm2cjs/normalize-url": "^8.0.0", "@google-cloud/translate": "^8.2.0", + "@koa/bodyparser": "^5.1.1", "@mozilla/readability": "^0.5.0", - "@napi-rs/canvas": "^0.1.67", + "@napi-rs/canvas": "^0.1.68", "@types/turndown": "^5.0.4", "@xmldom/xmldom": "^0.9.3", "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.3-3e69606", + "civkit": "^0.8.4-32482a3", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -31,7 +32,7 @@ "minio": "^7.1.3", "node-libcurl": "^4.1.0", "openai": "^4.20.0", - "pdfjs-dist": "^4.2.67", + "pdfjs-dist": "^4.10.38", "puppeteer": "^23.3.0", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-block-resources": "^2.4.3", @@ -53,6 +54,7 @@ "@types/busboy": "^1.5.4", "@types/cors": "^2.8.17", "@types/generic-pool": "^3.8.1", + "@types/koa": "^2.15.0", "@types/node": "^20.14.13", "@types/set-cookie-parser": "^2.4.7", "@types/xmldom": "^0.1.34", @@ -62,6 +64,7 @@ "eslint-config-google": "^0.14.0", "eslint-plugin-import": "^2.25.4", "firebase-functions-test": "^3.0.0", + "koa": "^2.16.0", "pino-pretty": "^13.0.0", "replicate": "^0.16.1", "typescript": "^5.5.4" @@ -1626,6 +1629,23 @@ "url": "https://opencollective.com/js-sdsl" } }, + "node_modules/@koa/bodyparser": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/@koa/bodyparser/-/bodyparser-5.1.1.tgz", 
+ "integrity": "sha512-ZBF49xqNVxnmJ+8iXegq+fXPQm9RSX8giNl/aXS5rW1VpNct92wnFbGR/47vfoRJVLARGQ4HVL4WaQ0u8IJVoA==", + "license": "MIT", + "dependencies": { + "co-body": "^6.1.0", + "lodash.merge": "^4.6.2", + "type-is": "^1.6.18" + }, + "engines": { + "node": ">= 16" + }, + "peerDependencies": { + "koa": "^2.14.1" + } + }, "node_modules/@koa/router": { "version": "12.0.1", "resolved": "https://registry.npmjs.org/@koa/router/-/router-12.0.1.tgz", @@ -1679,30 +1699,30 @@ } }, "node_modules/@napi-rs/canvas": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.67.tgz", - "integrity": "sha512-VA4Khm/5Kg2bQGx3jXotTC4MloOG8b1Ung80exafUK0k5u6yJmIz3Q2iXeeWZs5weV+LQOEB+CPKsYwEYaGAjw==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.68.tgz", + "integrity": "sha512-LQESrePLEBLvhuFkXx9jjBXRC2ClYsO5mqQ1m/puth5z9SOuM3N/B3vDuqnC3RJFktDktyK9khGvo7dTkqO9uQ==", "license": "MIT", "engines": { "node": ">= 10" }, "optionalDependencies": { - "@napi-rs/canvas-android-arm64": "0.1.67", - "@napi-rs/canvas-darwin-arm64": "0.1.67", - "@napi-rs/canvas-darwin-x64": "0.1.67", - "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.67", - "@napi-rs/canvas-linux-arm64-gnu": "0.1.67", - "@napi-rs/canvas-linux-arm64-musl": "0.1.67", - "@napi-rs/canvas-linux-riscv64-gnu": "0.1.67", - "@napi-rs/canvas-linux-x64-gnu": "0.1.67", - "@napi-rs/canvas-linux-x64-musl": "0.1.67", - "@napi-rs/canvas-win32-x64-msvc": "0.1.67" + "@napi-rs/canvas-android-arm64": "0.1.68", + "@napi-rs/canvas-darwin-arm64": "0.1.68", + "@napi-rs/canvas-darwin-x64": "0.1.68", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.68", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.68", + "@napi-rs/canvas-linux-arm64-musl": "0.1.68", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.68", + "@napi-rs/canvas-linux-x64-gnu": "0.1.68", + "@napi-rs/canvas-linux-x64-musl": "0.1.68", + "@napi-rs/canvas-win32-x64-msvc": "0.1.68" } }, "node_modules/@napi-rs/canvas-android-arm64": { - 
"version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.67.tgz", - "integrity": "sha512-W+3DFG5h0WU8Vqqb3W5fNmm5/TPH5ECZRinQDK4CAKFSUkc4iZcDwrmyFG9sB4KdHazf1mFVHCpEeVMO6Mk6Zg==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.68.tgz", + "integrity": "sha512-h1KcSR4LKLfRfzeBH65xMxbWOGa1OtMFQbCMVlxPCkN1Zr+2gK+70pXO5ktojIYcUrP6KDcOwoc8clho5ccM/w==", "cpu": [ "arm64" ], @@ -1716,9 +1736,9 @@ } }, "node_modules/@napi-rs/canvas-darwin-arm64": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.67.tgz", - "integrity": "sha512-xzrv7QboI47yhIHR5P5u/9KGswokuOKLiKSukr1Ku03RRJxP6lGuVtrAZAgdRg7F9FsuF2REf2yK53YVb6pMlA==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.68.tgz", + "integrity": "sha512-/VURlrAD4gDoxW1GT/b0nP3fRz/fhxmHI/xznTq2FTwkQLPOlLkDLCvTmQ7v6LtGKdc2Ed6rvYpRan+JXThInQ==", "cpu": [ "arm64" ], @@ -1732,9 +1752,9 @@ } }, "node_modules/@napi-rs/canvas-darwin-x64": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.67.tgz", - "integrity": "sha512-SNk9lYBr84N0gW8MZ2IrjygFtbFBILr3SEqMdHzHHuph20SQmssFvJGPZwSSCMEyKAvyqhogbmlew0te5Z4w9Q==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.68.tgz", + "integrity": "sha512-tEpvGR6vCLTo1Tx9wmDnoOKROpw57wiCWwCpDOuVlj/7rqEJOUYr9ixW4aRJgmeGBrZHgevI0EURys2ER6whmg==", "cpu": [ "x64" ], @@ -1748,9 +1768,9 @@ } }, "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.67.tgz", - "integrity": "sha512-qmBlSvUpl567bzH8tNXi82u5FrL4d0qINqd6K9O7GWGGGFmKMJdrgi2/SW3wwCTxqHBasIDdVWc4KSJfwyaoDQ==", + 
"version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.68.tgz", + "integrity": "sha512-U9xbJsumPOiAYeAFZMlHf62b9dGs2HJ6Q5xt7xTB0uEyPeurwhgYBWGgabdsEidyj38YuzI/c3LGBbSQB3vagw==", "cpu": [ "arm" ], @@ -1764,9 +1784,9 @@ } }, "node_modules/@napi-rs/canvas-linux-arm64-gnu": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.67.tgz", - "integrity": "sha512-k3nAPQefkMeFuJ65Rqdnx92KX1JXQhEKjjWeKsCJB+7sIBgQUWtHo9c3etfVLv5pkWJJDFi/Zc2soNkH3E8dRA==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.68.tgz", + "integrity": "sha512-KFkn8wEm3mPnWD4l8+OUUkxylSJuN5q9PnJRZJgv15RtCA1bgxIwTkBhI/+xuyVMcHqON9sXq7cDkEJtHm35dg==", "cpu": [ "arm64" ], @@ -1780,9 +1800,9 @@ } }, "node_modules/@napi-rs/canvas-linux-arm64-musl": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.67.tgz", - "integrity": "sha512-lZwHWR1cCP408l86n3Qbs3X1oFeAYMjJIQvQl1VMZh6wo5PfI+jaZSKBUOd8x44TnVllX9yhLY9unNRztk/sUQ==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.68.tgz", + "integrity": "sha512-IQzts91rCdOALXBWQxLZRCEDrfFTGDtNRJMNu+2SKZ1uT8cmPQkPwVk5rycvFpvgAcmiFiOSCp1aRrlfU8KPpQ==", "cpu": [ "arm64" ], @@ -1796,9 +1816,9 @@ } }, "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.67.tgz", - "integrity": "sha512-PdBC9p6bLHA1W3OdA0vTHj701SB/kioGQ1uCFBRMs5KBCaMLb/H4aNi8uaIUIEvBWnxeAjoNcLU7//q0FxEosw==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.68.tgz", + "integrity": 
"sha512-e9AS5UttoIKqXSmBzKZdd3NErSVyOEYzJfNOCGtafGk1//gibTwQXGlSXmAKuErqMp09pyk9aqQRSYzm1AQfBw==", "cpu": [ "riscv64" ], @@ -1812,9 +1832,9 @@ } }, "node_modules/@napi-rs/canvas-linux-x64-gnu": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.67.tgz", - "integrity": "sha512-kJJX6eWzjipL/LdKOWCJctc88e5yzuXri8+s0V/lN06OwuLGW62TWS3lvi8qlUrGMOfRGabSWWlB4omhASSB8w==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.68.tgz", + "integrity": "sha512-Pa/I36VE3j57I3Obhrr+J48KGFfkZk2cJN/2NmW/vCgmoF7kCP6aTVq5n+cGdGWLd/cN9CJ9JvNwEoMRDghu0g==", "cpu": [ "x64" ], @@ -1828,9 +1848,9 @@ } }, "node_modules/@napi-rs/canvas-linux-x64-musl": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.67.tgz", - "integrity": "sha512-jLKiPWGeN6ZzhnaLG7ex7eexsiHJ1mdtPK1qKvETIcu45dApMXyUIHvdL6XWB5gFFtj5ScHzLUxv1vkfPZsoxA==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.68.tgz", + "integrity": "sha512-9c6rkc5195wNxuUHJdf4/mmnq433OQey9TNvQ9LspJazvHbfSkTij8wtKjASVQsJyPDva4fkWOeV/OQ7cLw0GQ==", "cpu": [ "x64" ], @@ -1844,9 +1864,9 @@ } }, "node_modules/@napi-rs/canvas-win32-x64-msvc": { - "version": "0.1.67", - "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.67.tgz", - "integrity": "sha512-K/JmkOFbc4iRZYUqJhj0jwqfHA/wNQEmTiGNsgZ6d59yF/IBNp5T0D5eg3B8ghjI8GxDYCiSJ6DNX8mC3Oh2EQ==", + "version": "0.1.68", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.68.tgz", + "integrity": "sha512-Fc5Dez23u0FoSATurT6/w1oMytiRnKWEinHivdMvXpge6nG4YvhrASrtqMk8dGJMVQpHr8QJYF45rOrx2YU2Aw==", "cpu": [ "x64" ], @@ -2238,6 +2258,16 @@ "resolved": 
"https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz", "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==" }, + "node_modules/@types/accepts": { + "version": "1.3.7", + "resolved": "https://registry.npmjs.org/@types/accepts/-/accepts-1.3.7.tgz", + "integrity": "sha512-Pay9fq2lM2wXPWbteBsRAGiWH2hig4ZE2asK+mm7kUzlxRTfL961rj89I6zV/E3PcIkDqyuBEcMxFT7rccugeQ==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@types/archiver": { "version": "5.3.4", "resolved": "https://registry.npmjs.org/@types/archiver/-/archiver-5.3.4.tgz", @@ -2344,6 +2374,26 @@ "@types/node": "*" } }, + "node_modules/@types/content-disposition": { + "version": "0.5.8", + "resolved": "https://registry.npmjs.org/@types/content-disposition/-/content-disposition-0.5.8.tgz", + "integrity": "sha512-QVSSvno3dE0MgO76pJhmv4Qyi/j0Yk9pBp0Y7TJ2Tlj+KCgJWY6qX7nnxCOLkZ3VYRSIk1WTxCvwUSdx6CCLdg==", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/cookies": { + "version": "0.9.0", + "resolved": "https://registry.npmjs.org/@types/cookies/-/cookies-0.9.0.tgz", + "integrity": "sha512-40Zk8qR147RABiQ7NQnBzWzDcjKzNrntB5BAmeGCb2p/MIyOE+4BVvc17wumsUqUw00bJYqoXFHYygQnEFh4/Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/connect": "*", + "@types/express": "*", + "@types/keygrip": "*", + "@types/node": "*" + } + }, "node_modules/@types/cors": { "version": "2.8.17", "resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz", @@ -2403,6 +2453,13 @@ "@types/node": "*" } }, + "node_modules/@types/http-assert": { + "version": "1.5.6", + "resolved": "https://registry.npmjs.org/@types/http-assert/-/http-assert-1.5.6.tgz", + "integrity": "sha512-TTEwmtjgVbYAzZYWyeHPrrtWnfVkm8tQkP8P21uQifPgMRgjrow3XDEYqucuC8SKZJT7pUnhU/JymvjggxO9vw==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/http-cache-semantics": { "version": "4.0.4", 
"resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz", @@ -2460,6 +2517,13 @@ "@types/node": "*" } }, + "node_modules/@types/keygrip": { + "version": "1.0.6", + "resolved": "https://registry.npmjs.org/@types/keygrip/-/keygrip-1.0.6.tgz", + "integrity": "sha512-lZuNAY9xeJt7Bx4t4dx0rYCDqGPW8RXhQZK1td7d4H6E9zYbLoOtjBvfwdTKpsyxQI/2jv+armjX/RW+ZNpXOQ==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/keyv": { "version": "3.1.4", "resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz", @@ -2468,6 +2532,33 @@ "@types/node": "*" } }, + "node_modules/@types/koa": { + "version": "2.15.0", + "resolved": "https://registry.npmjs.org/@types/koa/-/koa-2.15.0.tgz", + "integrity": "sha512-7QFsywoE5URbuVnG3loe03QXuGajrnotr3gQkXcEBShORai23MePfFYdhz90FEtBBpkyIYQbVD+evKtloCgX3g==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/accepts": "*", + "@types/content-disposition": "*", + "@types/cookies": "*", + "@types/http-assert": "*", + "@types/http-errors": "*", + "@types/keygrip": "*", + "@types/koa-compose": "*", + "@types/node": "*" + } + }, + "node_modules/@types/koa-compose": { + "version": "3.2.8", + "resolved": "https://registry.npmjs.org/@types/koa-compose/-/koa-compose-3.2.8.tgz", + "integrity": "sha512-4Olc63RY+MKvxMwVknCUDhRQX1pFQoBZ/lXcRLP69PQkEpze/0cr8LNqJQe5NFb/b19DWi2a5bTi2VAlQzhJuA==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/koa": "*" + } + }, "node_modules/@types/lodash": { "version": "4.17.0", "resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz", @@ -3836,7 +3927,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/cache-content-type/-/cache-content-type-1.0.1.tgz", "integrity": "sha512-IKufZ1o4Ut42YUrZSo8+qnMTrFuKkvyoLXUywKz9GJ5BrhOFGhLdkx9sG4KAnVvbY6kEcSFjLQul+DVmBm2bgA==", - "optional": true, "dependencies": { "mime-types": "^2.1.18", "ylru": "^1.2.0" @@ -4005,9 +4095,10 @@ } }, "node_modules/civkit": { - "version": 
"0.8.3-3e69606", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.3-3e69606.tgz", - "integrity": "sha512-niV5U11ySIiVNSnGpW49KJlExmIiuQQfnyQEXeYuKCE+B+wkqYCBG+3tlY3E882tmPkaQQKpDlF/yTeqEU2q2Q==", + "version": "0.8.4-32482a3", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-32482a3.tgz", + "integrity": "sha512-VQwRreeVKYEoSMlhwYrPGpAA5na6lrIavGKmYNrhsHVJEvSfgkWKEete/btZzer4+WBxnNRw+PpRPrq6xjt13Q==", + "license": "AGPL", "dependencies": { "lodash": "^4.17.21", "tslib": "^2.5.0" @@ -4138,7 +4229,6 @@ "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", "integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==", - "devOptional": true, "engines": { "iojs": ">= 1.0.0", "node": ">= 0.12.0" @@ -4148,7 +4238,6 @@ "version": "6.1.0", "resolved": "https://registry.npmjs.org/co-body/-/co-body-6.1.0.tgz", "integrity": "sha512-m7pOT6CdLN7FuXUcpuz/8lfQ/L77x8SchHCF4G0RBTJO20Wzmhn5Sp4/5WsKy8OSpifBSUrmg83qEqaDHdyFuQ==", - "optional": true, "dependencies": { "inflation": "^2.0.0", "qs": "^6.5.2", @@ -4273,7 +4362,6 @@ "version": "0.9.1", "resolved": "https://registry.npmjs.org/cookies/-/cookies-0.9.1.tgz", "integrity": "sha512-TG2hpqe4ELx54QER/S3HQ9SRVnQnGBtKUz5bLQWtYAQ+o6GpgMs6sYUvaiJjVxb+UXwhRhAEP3m7LbsIZ77Hmw==", - "optional": true, "dependencies": { "depd": "~2.0.0", "keygrip": "~1.1.0" @@ -4582,8 +4670,7 @@ "node_modules/deep-equal": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz", - "integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw==", - "optional": true + "integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw==" }, "node_modules/deep-extend": { "version": "0.6.0", @@ -6701,7 +6788,6 @@ "version": "1.5.0", "resolved": "https://registry.npmjs.org/http-assert/-/http-assert-1.5.0.tgz", "integrity": 
"sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==", - "optional": true, "dependencies": { "deep-equal": "~1.0.1", "http-errors": "~1.8.0" @@ -6714,7 +6800,6 @@ "version": "1.1.2", "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", "integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==", - "optional": true, "engines": { "node": ">= 0.6" } @@ -6723,7 +6808,6 @@ "version": "1.8.1", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz", "integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==", - "optional": true, "dependencies": { "depd": "~1.1.2", "inherits": "2.0.4", @@ -6739,7 +6823,6 @@ "version": "1.5.0", "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==", - "optional": true, "engines": { "node": ">= 0.6" } @@ -6940,7 +7023,6 @@ "version": "2.1.0", "resolved": "https://registry.npmjs.org/inflation/-/inflation-2.1.0.tgz", "integrity": "sha512-t54PPJHG1Pp7VQvxyVCJ9mBbjG3Hqryges9bXoOO6GExCPa+//i/d5GSuFtpx3ALLd7lgIAur6zrIlBQyJuMlQ==", - "optional": true, "engines": { "node": ">= 0.8.0" } @@ -8316,7 +8398,6 @@ "version": "1.1.0", "resolved": "https://registry.npmjs.org/keygrip/-/keygrip-1.1.0.tgz", "integrity": "sha512-iYSchDJ+liQ8iwbSI2QqsQOvqv58eJCEanyJPJi+Khyu8smkcKSFUCbPwzFcL7YVtZ6eONjqRX/38caJ7QjRAQ==", - "optional": true, "dependencies": { "tsscmp": "1.0.6" }, @@ -8354,10 +8435,10 @@ } }, "node_modules/koa": { - "version": "2.15.3", - "resolved": "https://registry.npmjs.org/koa/-/koa-2.15.3.tgz", - "integrity": "sha512-j/8tY9j5t+GVMLeioLaxweJiKUayFhlGqNTzf2ZGwL0ZCQijd2RLHK0SLW5Tsko8YyyqCZC2cojIb0/s62qTAg==", - "optional": true, + "version": "2.16.0", + "resolved": "https://registry.npmjs.org/koa/-/koa-2.16.0.tgz", + "integrity": 
"sha512-Afhqq0Vq3W7C+/rW6IqHVBDLzqObwZ07JaUNUEF8yCQ6afiyFE3RAy+i7V0E46XOWlH7vPWn/x0vsZwNy6PWxw==", + "license": "MIT", "dependencies": { "accepts": "^1.3.5", "cache-content-type": "^1.0.0", @@ -8404,14 +8485,12 @@ "node_modules/koa-compose": { "version": "4.1.0", "resolved": "https://registry.npmjs.org/koa-compose/-/koa-compose-4.1.0.tgz", - "integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw==", - "optional": true + "integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw==" }, "node_modules/koa-convert": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/koa-convert/-/koa-convert-2.0.0.tgz", "integrity": "sha512-asOvN6bFlSnxewce2e/DK3p4tltyfC4VM7ZwuTuepI7dEQVcvpyFuBcEARu1+Hxg8DIwytce2n7jrZtRlPrARA==", - "optional": true, "dependencies": { "co": "^4.6.0", "koa-compose": "^4.1.0" @@ -8424,7 +8503,6 @@ "version": "1.8.1", "resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz", "integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==", - "optional": true, "dependencies": { "depd": "~1.1.2", "inherits": "2.0.4", @@ -8440,7 +8518,6 @@ "version": "1.1.2", "resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz", "integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==", - "optional": true, "engines": { "node": ">= 0.6" } @@ -8449,7 +8526,6 @@ "version": "1.5.0", "resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz", "integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==", - "optional": true, "engines": { "node": ">= 0.6" } @@ -8644,8 +8720,7 @@ "node_modules/lodash.merge": { "version": "4.6.2", "resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz", - "integrity": 
"sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==", - "dev": true + "integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==" }, "node_modules/lodash.once": { "version": "4.1.1", @@ -9853,8 +9928,7 @@ "node_modules/only": { "version": "0.0.2", "resolved": "https://registry.npmjs.org/only/-/only-0.0.2.tgz", - "integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ==", - "optional": true + "integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ==" }, "node_modules/openai": { "version": "4.33.0", @@ -10118,15 +10192,15 @@ } }, "node_modules/pdfjs-dist": { - "version": "4.2.67", - "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz", - "integrity": "sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==", + "version": "4.10.38", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.10.38.tgz", + "integrity": "sha512-/Y3fcFrXEAsMjJXeL9J8+ZG9U01LbuWaYypvDW2ycW1jL269L3js3DVBjDJ0Up9Np1uqDXsDrRihHANhZOlwdQ==", + "license": "Apache-2.0", "engines": { - "node": ">=18" + "node": ">=20" }, "optionalDependencies": { - "canvas": "^2.11.2", - "path2d": "^0.2.0" + "@napi-rs/canvas": "^0.1.65" } }, "node_modules/peek-stream": { @@ -12443,7 +12517,6 @@ "version": "1.0.6", "resolved": "https://registry.npmjs.org/tsscmp/-/tsscmp-1.0.6.tgz", "integrity": "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA==", - "optional": true, "engines": { "node": ">=0.6.x" } @@ -13136,7 +13209,6 @@ "version": "1.4.0", "resolved": "https://registry.npmjs.org/ylru/-/ylru-1.4.0.tgz", "integrity": "sha512-2OQsPNEmBCvXuFlIni/a+Rn+R2pHW9INm0BxXJ4hVDA8TirqMj+J/Rp9ItLatT/5pZqWwefVrTQcHpixsxnVlA==", - "optional": true, "engines": { "node": ">= 4.0.0" } diff --git a/package.json 
b/package.json index 402d61a..c0efd11 100644 --- a/package.json +++ b/package.json @@ -1,15 +1,84 @@ { - "name": "reader", - "version": "1.0.0", - "description": "### Prerequisite - Node v18 (The build fails for Node version >18) - Yarn - Firebase CLI (`npm install -g firebase-tools`)", - "main": "index.js", - "scripts": { - "test": "echo \"Error: no test specified\" && exit 1" - }, - "author": "", - "license": "ISC", - "devDependencies": { - "firebase-tools": "^13.6.2", - "typescript": "^5.1.6" - } -} \ No newline at end of file + "name": "reader", + "scripts": { + "lint": "eslint --ext .js,.ts .", + "build": "node ./integrity-check.cjs && tsc -p .", + "build:watch": "tsc --watch", + "build:clean": "rm -rf ./build", + "serve": "npm run build && npm run start", + "debug": "npm run build && npm run dev", + "start": "npm run shell" + }, + "engines": { + "node": "20" + }, + "main": "build/index.js", + "dependencies": { + "@esm2cjs/normalize-url": "^8.0.0", + "@google-cloud/translate": "^8.2.0", + "@koa/bodyparser": "^5.1.1", + "@mozilla/readability": "^0.5.0", + "@napi-rs/canvas": "^0.1.68", + "@types/turndown": "^5.0.4", + "@xmldom/xmldom": "^0.9.3", + "archiver": "^6.0.1", + "axios": "^1.3.3", + "bcrypt": "^5.1.0", + "busboy": "^1.6.0", + "civkit": "^0.8.4-32482a3", + "core-js": "^3.37.1", + "cors": "^2.8.5", + "dayjs": "^1.11.9", + "express": "^4.19.2", + "firebase-admin": "^12.1.0", + "firebase-functions": "^6.1.1", + "htmlparser2": "^9.0.0", + "jose": "^5.1.0", + "langdetect": "^0.2.1", + "linkedom": "^0.18.4", + "maxmind": "^4.3.18", + "minio": "^7.1.3", + "node-libcurl": "^4.1.0", + "openai": "^4.20.0", + "pdfjs-dist": "^4.10.38", + "puppeteer": "^23.3.0", + "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-block-resources": "^2.4.3", + "puppeteer-extra-plugin-page-proxy": "^1.3.1", + "puppeteer-page-proxy": "^1.3.0", + "robots-parser": "^3.0.1", + "set-cookie-parser": "^2.6.0", + "simple-zstd": "^1.4.2", + "stripe": "^11.11.0", + "tiktoken": "^1.0.16", + 
"tld-extract": "^2.1.0", + "turndown": "^7.1.3", + "turndown-plugin-gfm": "^1.0.2", + "undici": "^5.24.0" + }, + "devDependencies": { + "@types/archiver": "^5.3.4", + "@types/bcrypt": "^5.0.0", + "@types/busboy": "^1.5.4", + "@types/cors": "^2.8.17", + "@types/generic-pool": "^3.8.1", + "@types/koa": "^2.15.0", + "@types/node": "^20.14.13", + "@types/set-cookie-parser": "^2.4.7", + "@types/xmldom": "^0.1.34", + "@typescript-eslint/eslint-plugin": "^5.12.0", + "@typescript-eslint/parser": "^5.12.0", + "eslint": "^8.9.0", + "eslint-config-google": "^0.14.0", + "eslint-plugin-import": "^2.25.4", + "firebase-functions-test": "^3.0.0", + "koa": "^2.16.0", + "pino-pretty": "^13.0.0", + "replicate": "^0.16.1", + "typescript": "^5.5.4" + }, + "private": true, + "exports": { + ".": "./build/index.js" + } +} diff --git a/backend/functions/public/favicon.ico b/public/favicon.ico similarity index 100% rename from backend/functions/public/favicon.ico rename to public/favicon.ico diff --git a/backend/functions/src/cloud-functions/crawler.ts b/src/api/crawler.ts similarity index 72% rename from backend/functions/src/cloud-functions/crawler.ts rename to src/api/crawler.ts index 86abf9d..2928c82 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/src/api/crawler.ts @@ -1,30 +1,45 @@ -import { - assignTransferProtocolMeta, marshalErrorLike, - RPCHost, RPCReflection, - AssertionFailureError, ParamValidationError, Defer, -} from 'civkit'; import { singleton } from 'tsyringe'; -import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared'; -import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; -import _ from 'lodash'; -import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; -import { Request, Response } from 'express'; -const pNormalizeUrl = 
import("@esm2cjs/normalize-url"); -import { Crawled } from '../db/crawled'; +import { pathToFileURL } from 'url'; import { randomUUID } from 'crypto'; -import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; +import _ from 'lodash'; -import { countGPTToken as estimateToken } from '../shared/utils/openai'; -import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options'; -import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; +import { + assignTransferProtocolMeta, RPCHost, RPCReflection, + AssertionFailureError, ParamValidationError, + RawString, + ApplicationError, +} from 'civkit/civ-rpc'; +import { marshalErrorLike } from 'civkit/lang'; +import { Defer } from 'civkit/defer'; +import { retryWith } from 'civkit/decorators'; + +import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options'; + +import { Crawled } from '../db/crawled'; import { DomainBlockade } from '../db/domain-blockade'; import { DomainProfile } from '../db/domain-profile'; -import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker'; +import { OutputServerEventStream } from '../lib/transform-server-event-stream'; + +import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer'; import { JSDomControl } from '../services/jsdom'; import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter'; import { CurlControl } from '../services/curl'; import { LmControl } from '../services/lm'; import { tryDecodeURIComponent } from '../utils/misc'; +import { CFBrowserRendering } from '../services/cf-browser-rendering'; + +import { GlobalLogger } from '../services/logger'; +import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; +import { AsyncLocalContext } from '../services/async-context'; +import { Context, Ctx, Method, Param, RPCReflect } from 
'../services/registry'; +import { BudgetExceededError, InsufficientBalanceError, SecurityCompromiseError } from '../services/errors'; + +import { countGPTToken as estimateToken } from '../shared/utils/openai'; +import { ProxyProvider } from '../shared/services/proxy-provider'; +import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; +import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; +import { RobotsTxtService } from '../services/robots-text'; +import { ServiceBadAttemptError } from '../shared/lib/errors'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -33,6 +48,8 @@ export interface ExtraScrappingOptions extends ScrappingOptions { removeSelector?: string | string[]; keepImgDataUrl?: boolean; engine?: string; + allocProxy?: string; + private?: boolean; } const indexProto = { @@ -56,16 +73,18 @@ export class CrawlerHost extends RPCHost { domainProfileRetentionMs = 1000 * 3600 * 24 * 30; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected puppeteerControl: PuppeteerControl, protected curlControl: CurlControl, + protected cfBrowserRendering: CFBrowserRendering, + protected proxyProvider: ProxyProvider, protected lmControl: LmControl, protected jsdomControl: JSDomControl, protected snapshotFormatter: SnapshotFormatter, protected firebaseObjectStorage: FirebaseStorageBucketControl, protected rateLimitControl: RateLimitControl, - protected threadLocal: AsyncContext, - protected fbHealthCheck: FirebaseRoundTripChecker, + protected threadLocal: AsyncLocalContext, + protected robotsTxtService: RobotsTxtService, ) { super(...arguments); @@ -73,7 +92,7 @@ export class CrawlerHost extends RPCHost { if (!snapshot.title?.trim() && !snapshot.pdfs?.length) { return; } - if (options.cookies?.length) { + if (options.cookies?.length || options.private) { // Potential privacy issue, dont cache if cookies are used return; } @@ -84,9 
+103,14 @@ export class CrawlerHost extends RPCHost { if (options.locale) { Reflect.set(snapshot, 'locale', options.locale); } - await this.setToCache(options.url, snapshot); - await this.exploreDirectEngine(snapshot).catch(() => undefined); + const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html); + if (analyzed.tokens < 200) { + // Does not contain enough content + return; + } + + await this.setToCache(options.url, snapshot); }); puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => { @@ -108,12 +132,19 @@ export class CrawlerHost extends RPCHost { override async init() { await this.dependencyReady(); + this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, '')); + this.emit('ready'); } - getIndex(user?: JinaEmbeddingsTokenAccount) { + async getIndex(auth?: JinaEmbeddingsAuthDTO) { const indexObject: Record = Object.create(indexProto); - + // Object.assign(indexObject, { + // usage1: `${ctx.origin}/YOUR_URL`, + // usage2: `${ctx.origin}/search/YOUR_SEARCH_QUERY`, + // homepage: 'https://jina.ai/reader', + // sourceCode: 'https://github.com/jina-ai/reader', + // }); Object.assign(indexObject, { usage1: 'https://r.jina.ai/YOUR_URL', usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', @@ -121,71 +152,83 @@ export class CrawlerHost extends RPCHost { sourceCode: 'https://github.com/jina-ai/reader', }); - if (user) { + await auth?.solveUID(); + if (auth && auth.user) { indexObject[''] = undefined; - indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`; - indexObject.balanceLeft = user.wallet.total_balance; + indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`; + indexObject.balanceLeft = auth.user.wallet.total_balance; } return indexObject; } - @CloudHTTPv2({ - name: 'crawl2', - runtime: { - memory: '4GiB', - timeoutSeconds: 300, - concurrency: 22, + @Method({ + name: 'getIndex', + description: 'Index of the service', + proto: { + http: { + action: 
'get', + path: '/', + } }, - tags: ['Crawler'], - httpMethod: ['get', 'post'], - returnType: [String, OutputServerEventStream], - exposeRoot: true, + tags: ['misc', 'crawl'], + returnType: [String, Object], }) - @CloudHTTPv2({ - runtime: { - memory: '4GiB', - cpu: 2, - timeoutSeconds: 300, - concurrency: 10, - maxInstances: 1000, - minInstances: 1, + async getIndexCtrl(@Ctx() ctx: Context, @Param({ required: false }) auth?: JinaEmbeddingsAuthDTO) { + const indexObject = await this.getIndex(auth); + + if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { + return indexObject; + } + + return assignTransferProtocolMeta(`${indexObject}`, + { contentType: 'text/plain; charset=utf-8', envelope: null } + ); + } + + + @Method({ + name: 'crawlByPostingToIndex', + description: 'Crawl any url into markdown', + proto: { + http: { + action: 'POST', + path: '/', + } }, - tags: ['Crawler'], - httpMethod: ['get', 'post'], + tags: ['crawl'], returnType: [String, OutputServerEventStream], - exposeRoot: true, + }) + @Method({ + description: 'Crawl any url into markdown', + proto: { + http: { + action: ['GET', 'POST'], + path: '::url', + } + }, + tags: ['crawl'], + returnType: [String, OutputServerEventStream, RawString], }) async crawl( @RPCReflect() rpcReflect: RPCReflection, - @Ctx() ctx: { - req: Request, - res: Response, - }, + @Ctx() ctx: Context, auth: JinaEmbeddingsAuthDTO, crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly, crawlerOptionsParamsAllowed: CrawlerOptions, ) { const uid = await auth.solveUID(); let chargeAmount = 0; - const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed; + const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed; - // Note req.url in express is actually unparsed `path`, e.g. `/some-path?abc`. Instead of a real url. 
- const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.req.url), crawlerOptions); + const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.path), crawlerOptions); if (!targetUrl) { - const latestUser = uid ? await auth.assertUser() : undefined; - if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { - return this.getIndex(latestUser); - } - - return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`, - { contentType: 'text/plain', envelope: null } - ); + return await this.getIndex(auth); } // Prevent circular crawling this.puppeteerControl.circuitBreakerHosts.add( - ctx.req.hostname.toLowerCase() + ctx.hostname.toLowerCase() ); if (uid) { @@ -222,8 +265,8 @@ export class CrawlerHost extends RPCHost { apiRoll.chargeAmount = chargeAmount; } }); - } else if (ctx.req.ip) { - const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, [rpcReflect.name.toUpperCase()], + } else if (ctx.ip) { + const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, [rpcReflect.name.toUpperCase()], [ // 20 requests per minute new Date(Date.now() - 60 * 1000), 20 @@ -254,9 +297,12 @@ export class CrawlerHost extends RPCHost { } } + if (crawlerOptions.robotsTxt) { + await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt); + } const crawlOpts = await this.configure(crawlerOptions); - if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { + if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); rpcReflect.return(sseStream); @@ -265,8 +311,11 @@ export class CrawlerHost extends RPCHost { if (!scrapped) { continue; } + if (rpcReflect.signal.aborted) { + break; + } - const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, 
scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -293,17 +342,20 @@ export class CrawlerHost extends RPCHost { } let lastScrapped; - if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { + if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; + if (rpcReflect.signal.aborted) { + break; + } if (!crawlerOptions.isEarlyReturnApplicable()) { continue; } - if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) { + if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) { continue; } - const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { @@ -324,7 +376,7 @@ export class CrawlerHost extends RPCHost { throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget 
(${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -342,16 +394,18 @@ export class CrawlerHost extends RPCHost { for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; - + if (rpcReflect.signal.aborted) { + break; + } if (!crawlerOptions.isEarlyReturnApplicable()) { continue; } - if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) { + if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) { continue; } - const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -370,7 +424,7 @@ export class CrawlerHost extends RPCHost { ); } - return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null }); + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } if (!lastScrapped) { @@ -380,7 +434,7 @@ export class CrawlerHost extends RPCHost { throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts); chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget 
(${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -399,7 +453,7 @@ export class CrawlerHost extends RPCHost { ); } - return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null }); + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null }); } @@ -419,7 +473,7 @@ export class CrawlerHost extends RPCHost { } let result: URL; - const normalizeUrl = (await pNormalizeUrl).default; + const normalizeUrl = require('@esm2cjs/normalize-url').default; try { result = new URL( normalizeUrl( @@ -638,7 +692,25 @@ export class CrawlerHost extends RPCHost { } if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) { - yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); + const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ? + await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) : + await this.curlControl.sideLoad(urlToCrawl, crawlOpts); + if (!sideLoaded.file) { + throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`); + } + const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName); + yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts); + return; + } + if (crawlOpts?.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) { + const html = await this.cfBrowserRendering.fetchContent(urlToCrawl.href); + const snapshot = { + href: urlToCrawl.toString(), + html, + title: '', + text: '', + } as PageSnapshot; + yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts); return; } @@ -653,26 +725,68 @@ export class CrawlerHost extends RPCHost { (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) && (_.get(cache.snapshot, 'locale') === crawlOpts?.locale) ) { + if (cache.snapshot) { + cache.snapshot.isFromCache = true; 
+ } yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts); return; } - if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) { - const { digest } = this.getDomainProfileUrlDigest(urlToCrawl); - const domainProfile = await DomainProfile.fromFirestore(digest); - if (domainProfile?.engine === ENGINE_TYPE.DIRECT) { - try { - const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts); + try { + const altOpts = { ...crawlOpts }; + let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ? + await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) : + await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => { + this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href }); - // Expect downstream code to "break" here if it's satisfied with the direct engine - yield snapshot; - if (crawlOpts?.engine === ENGINE_TYPE.AUTO) { - return; + if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) { + return Promise.reject(err); } - } catch (err: any) { - this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) }); + + return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts); + }); + if (!sideLoaded.file) { + throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`); + } + let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName); + if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) { + yield draftSnapshot; + return; + } + + let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html); + draftSnapshot.title ??= analyzed.title; + let fallbackProxyIsUsed = false; + if ((!crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200)) { + const proxyLoaded = await 
this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts); + if (!proxyLoaded.file) { + throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`); } + const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName); + analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html); + if (proxyLoaded.status === 200 || analyzed.tokens >= 200) { + draftSnapshot = proxySnapshot; + sideLoaded = proxyLoaded; + fallbackProxyIsUsed = true; + } + } + + if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) { + yield draftSnapshot; + } + + if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) { + this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href }); + crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts; + if (fallbackProxyIsUsed) { + this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href }); + } + } + } catch (err: any) { + this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href }); + if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) { + throw err; } } @@ -782,6 +896,8 @@ export class CrawlerHost extends RPCHost { this.threadLocal.set('withImagesSummary', opts.withImagesSummary); this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl); this.threadLocal.set('cacheTolerance', opts.cacheTolerance); + this.threadLocal.set('withIframe', opts.withIframe); + this.threadLocal.set('withShadowDom', opts.withShadowDom); this.threadLocal.set('userAgent', opts.userAgent); if (opts.timeout) { this.threadLocal.set('timeout', opts.timeout * 1000); @@ -804,6 +920,9 @@ export class CrawlerHost extends RPCHost { referer: opts.referer, viewport: opts.viewport, engine: opts.engine, + allocProxy: opts.proxy?.endsWith('+') ? 
opts.proxy.slice(0, -1) : opts.proxy, + proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false, + private: Boolean(opts.doNotTrack), }; if (opts.locale) { @@ -842,14 +961,15 @@ export class CrawlerHost extends RPCHost { return crawlOpts; } - formatSnapshot( + protected async formatSnapshot( crawlerOptions: CrawlerOptions, snapshot: PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }, nominalUrl?: URL, - urlValidMs?: number + urlValidMs?: number, + scrappingOptions?: ScrappingOptions ) { const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl; @@ -870,7 +990,29 @@ export class CrawlerHost extends RPCHost { return output; } - return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs); + return this.formatSnapshotWithPDFSideLoad(respondWith, snapshot, presumedURL, urlValidMs, scrappingOptions); + } + + async formatSnapshotWithPDFSideLoad(mode: string, snapshot: PageSnapshot, nominalUrl?: URL, urlValidMs?: number, scrappingOptions?: ScrappingOptions) { + const snapshotCopy = _.cloneDeep(snapshot); + + if (snapshotCopy.pdfs?.length) { + const pdfUrl = snapshotCopy.pdfs[0]; + if (pdfUrl.startsWith('http')) { + const sideLoaded = scrappingOptions?.sideLoad?.impersonate[pdfUrl]; + if (sideLoaded?.body) { + snapshotCopy.pdfs[0] = pathToFileURL(await sideLoaded?.body.filePath).href; + return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs); + } + + const r = await this.curlControl.sideLoad(new URL(pdfUrl), scrappingOptions); + if (r.file) { + snapshotCopy.pdfs[0] = pathToFileURL(await r.file.filePath).href; + } + } + } + + return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs); } async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise { @@ -967,6 +1109,26 @@ export class CrawlerHost extends RPCHost { return; } + async snapshotNotGoodEnough(snapshot: 
PageSnapshot) { + if (snapshot.pdfs?.length) { + return false; + } + if (!snapshot.title) { + return true; + } + if (snapshot.parsed?.content) { + return false; + } + if (snapshot.html) { + const r = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html); + const tokens = r.tokens; + if (tokens < 200) { + return true; + } + } + return false; + } + getDomainProfileUrlDigest(url: URL) { const pathname = url.pathname; const pathVec = pathname.split('/'); @@ -981,4 +1143,29 @@ export class CrawlerHost extends RPCHost { path: finalPath, }; } + + @retryWith((err) => { + if (err instanceof ServiceBadAttemptError) { + // Keep trying + return true; + } + if (err instanceof ApplicationError) { + // Quit with this error + return false; + } + return undefined; + }, 3) + async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) { + const proxy = await this.proxyProvider.alloc(opts?.allocProxy); + const r = await this.curlControl.sideLoad(url, { + ...opts, + proxyUrl: proxy.href, + }); + + if (opts && opts.allocProxy) { + opts.proxyUrl ??= proxy.href; + } + + return { ...r, proxy }; + } } diff --git a/backend/functions/src/cloud-functions/searcher-serper.ts b/src/api/searcher-serper.ts similarity index 89% rename from backend/functions/src/cloud-functions/searcher-serper.ts rename to src/api/searcher-serper.ts index 0d145d4..911f8cd 100644 --- a/backend/functions/src/cloud-functions/searcher-serper.ts +++ b/src/api/searcher-serper.ts @@ -1,21 +1,25 @@ -import { - assignTransferProtocolMeta, marshalErrorLike, - RPCHost, RPCReflection, - AssertionFailureError, - objHashMd5B64Of, - assignMeta, -} from 'civkit'; import { singleton } from 'tsyringe'; -import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared'; -import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; +import { + assignTransferProtocolMeta, RPCHost, RPCReflection, AssertionFailureError, assignMeta, 
RawString, +} from 'civkit/civ-rpc'; +import { marshalErrorLike } from 'civkit/lang'; +import { objHashMd5B64Of } from 'civkit/hash'; import _ from 'lodash'; -import { Request, Response } from 'express'; -import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; + +import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; + import { CrawlerHost, ExtraScrappingOptions } from './crawler'; import { SerperSearchResult } from '../db/searched'; -import { CrawlerOptions } from '../dto/scrapping-options'; +import { CrawlerOptions } from '../dto/crawler-options'; import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter'; import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search'; + +import { GlobalLogger } from '../services/logger'; +import { AsyncLocalContext } from '../services/async-context'; +import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry'; +import { OutputServerEventStream } from '../lib/transform-server-event-stream'; +import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; +import { InsufficientBalanceError } from '../services/errors'; import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES); @@ -33,9 +37,9 @@ export class SearcherHost extends RPCHost { targetResultCount = 5; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected rateLimitControl: RateLimitControl, - protected threadLocal: AsyncContext, + protected threadLocal: AsyncLocalContext, protected serperSearchService: SerperSearchService, protected crawler: CrawlerHost, protected snapshotFormatter: SnapshotFormatter, @@ -49,39 +53,30 @@ export class SearcherHost extends RPCHost { this.emit('ready'); } - @CloudHTTPv2({ - name: 'search2', - runtime: { - cpu: 4, - memory: '4GiB', - 
timeoutSeconds: 300, - concurrency: 4, + @Method({ + name: 'searchIndex', + ext: { + http: { + action: ['get', 'post'], + path: '/search' + } }, - tags: ['Searcher'], - httpMethod: ['get', 'post'], + tags: ['search'], returnType: [String, OutputServerEventStream], - exposeRoot: true, }) - @CloudHTTPv2({ - runtime: { - cpu: 4, - memory: '16GiB', - timeoutSeconds: 300, - concurrency: 4, - maxInstances: 200, - minInstances: 1, + @Method({ + ext: { + http: { + action: ['get', 'post'], + path: '::q' + } }, - tags: ['Searcher'], - httpMethod: ['get', 'post'], - returnType: [String, OutputServerEventStream], - exposeRoot: true, + tags: ['search'], + returnType: [String, OutputServerEventStream, RawString], }) async search( @RPCReflect() rpcReflect: RPCReflection, - @Ctx() ctx: { - req: Request, - res: Response, - }, + @Ctx() ctx: Context, auth: JinaEmbeddingsAuthDTO, crawlerOptions: CrawlerOptions, searchExplicitOperators: GoogleSearchExplicitOperatorsDto, @@ -102,19 +97,17 @@ export class SearcherHost extends RPCHost { const uid = await auth.solveUID(); // Return content by default - const respondWith = ctx.req.get('X-Respond-With') ?? 'content'; - const crawlWithoutContent = respondWith.includes('no-content'); - const withFavicon = ctx.req.get('X-With-Favicons') === 'true'; + const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content'); + const withFavicon = Boolean(ctx.get('X-With-Favicons')); let chargeAmount = 0; - const noSlashPath = decodeURIComponent(ctx.req.path).slice(1); + const noSlashPath = decodeURIComponent(ctx.path).slice(1); if (!noSlashPath && !q) { - const latestUser = uid ? await auth.assertUser() : undefined; - const index = this.crawler.getIndex(latestUser); + const index = await this.crawler.getIndex(auth); if (!uid) { index.note = 'Authentication is required to use this endpoint. 
Please provide a valid API key via Authorization header.'; } - if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { + if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { return index; } @@ -189,7 +182,7 @@ export class SearcherHost extends RPCHost { chargeAmount = 10000; } this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent); - if ((!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) || count === 0) { + if ((!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) || count === 0) { return lastScrapped; } return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null }); @@ -201,7 +194,7 @@ export class SearcherHost extends RPCHost { withFavicon ); - if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { + if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); rpcReflect.return(sseStream); @@ -210,6 +203,9 @@ export class SearcherHost extends RPCHost { if (!scrapped) { continue; } + if (rpcReflect.signal.aborted) { + break; + } chargeAmount = this.assignChargeAmount(scrapped); sseStream.write({ @@ -233,7 +229,7 @@ export class SearcherHost extends RPCHost { } let earlyReturn = false; - if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { + if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { let earlyReturnTimer: ReturnType | undefined; const setEarlyReturnTimer = () => { if (earlyReturnTimer) { @@ -251,6 +247,9 @@ export class SearcherHost extends RPCHost { for await (const scrapped of it) { lastScrapped = scrapped; + if (rpcReflect.signal.aborted) { + break; + } if (_.some(scrapped, (x) => this.pageQualified(x))) { 
setEarlyReturnTimer(); } @@ -299,7 +298,9 @@ export class SearcherHost extends RPCHost { for await (const scrapped of it) { lastScrapped = scrapped; - + if (rpcReflect.signal.aborted) { + break; + } if (_.some(scrapped, (x) => this.pageQualified(x))) { setEarlyReturnTimer(); } @@ -367,8 +368,8 @@ export class SearcherHost extends RPCHost { const dataItems = [ { key: 'title', label: 'Title' }, { key: 'url', label: 'URL Source' }, - { key: 'description', label: 'Description'}, - ] + { key: 'description', label: 'Description' }, + ]; if (withContent) { result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : ''; @@ -386,7 +387,7 @@ export class SearcherHost extends RPCHost { result.toString = function () { const self = this as any; return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n'; - } + }; return result; })); @@ -408,7 +409,6 @@ export class SearcherHost extends RPCHost { if (!searchResults) { return; } - const urls = searchResults.map((x) => new URL(x.link)); const snapshotMap = new WeakMap(); for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) { @@ -427,7 +427,7 @@ export class SearcherHost extends RPCHost { if (snapshotMap.has(x)) { return snapshotMap.get(x); } - return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => { + return this.crawler.formatSnapshotWithPDFSideLoad(mode, x, urls[i], undefined, options).then((r) => { r.title ??= upstreamSearchResult.title; r.description = upstreamSearchResult.snippet; snapshotMap.set(x, r); diff --git a/backend/functions/src/cloud-functions/searcher.ts b/src/api/searcher.ts similarity index 90% rename from backend/functions/src/cloud-functions/searcher.ts rename to src/api/searcher.ts index 06b7710..ad765e0 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/src/api/searcher.ts @@ -1,22 +1,30 @@ -import { - assignTransferProtocolMeta, marshalErrorLike, - RPCHost, RPCReflection, - 
AssertionFailureError, - objHashMd5B64Of, -} from 'civkit'; import { singleton } from 'tsyringe'; -import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared'; -import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; import _ from 'lodash'; -import { Request, Response } from 'express'; -import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth'; -import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search'; -import { CrawlerHost, ExtraScrappingOptions } from './crawler'; -import { WebSearchQueryParams } from '../shared/3rd-party/brave-search'; -import { SearchResult } from '../db/searched'; + +import { + assignTransferProtocolMeta, RPCHost, RPCReflection, + AssertionFailureError, + RawString, +} from 'civkit/civ-rpc'; +import { marshalErrorLike } from 'civkit/lang'; +import { objHashMd5B64Of } from 'civkit/hash'; + +import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types'; -import { CrawlerOptions } from '../dto/scrapping-options'; +import { WebSearchQueryParams } from '../shared/3rd-party/brave-search'; + +import { CrawlerHost, ExtraScrappingOptions } from './crawler'; +import { SearchResult } from '../db/searched'; +import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; +import { CrawlerOptions } from '../dto/crawler-options'; +import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search'; + import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter'; +import { GlobalLogger } from '../services/logger'; +import { AsyncLocalContext } from '../services/async-context'; +import { OutputServerEventStream } from '../lib/transform-server-event-stream'; +import { Context, Ctx, Method, Param, RPCReflect } from 
'../services/registry'; +import { InsufficientBalanceError } from '../services/errors'; @singleton() @@ -32,9 +40,9 @@ export class SearcherHost extends RPCHost { targetResultCount = 5; constructor( - protected globalLogger: Logger, + protected globalLogger: GlobalLogger, protected rateLimitControl: RateLimitControl, - protected threadLocal: AsyncContext, + protected threadLocal: AsyncLocalContext, protected braveSearchService: BraveSearchService, protected crawler: CrawlerHost, protected snapshotFormatter: SnapshotFormatter, @@ -48,39 +56,30 @@ export class SearcherHost extends RPCHost { this.emit('ready'); } - @CloudHTTPv2({ - name: 'search2', - runtime: { - cpu: 4, - memory: '4GiB', - timeoutSeconds: 300, - concurrency: 4, + @Method({ + name: 'searchIndex', + ext: { + http: { + action: ['get', 'post'], + path: '/search' + } }, - tags: ['Searcher'], - httpMethod: ['get', 'post'], + tags: ['search'], returnType: [String, OutputServerEventStream], - exposeRoot: true, }) - @CloudHTTPv2({ - runtime: { - cpu: 4, - memory: '16GiB', - timeoutSeconds: 300, - concurrency: 4, - maxInstances: 200, - minInstances: 1, + @Method({ + ext: { + http: { + action: ['get', 'post'], + path: '::q' + } }, - tags: ['Searcher'], - httpMethod: ['get', 'post'], - returnType: [String, OutputServerEventStream], - exposeRoot: true, + tags: ['search'], + returnType: [String, OutputServerEventStream, RawString], }) async search( @RPCReflect() rpcReflect: RPCReflection, - @Ctx() ctx: { - req: Request, - res: Response, - }, + @Ctx() ctx: Context, auth: JinaEmbeddingsAuthDTO, @Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 }) count: number, @@ -90,14 +89,13 @@ export class SearcherHost extends RPCHost { ) { const uid = await auth.solveUID(); let chargeAmount = 0; - const noSlashPath = decodeURIComponent(ctx.req.path).slice(1); + const noSlashPath = decodeURIComponent(ctx.path).slice(1); if (!noSlashPath && !q) { - const latestUser = uid ? 
await auth.assertUser() : undefined; - const index = this.crawler.getIndex(latestUser); + const index = await this.crawler.getIndex(auth); if (!uid) { index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.'; } - if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { + if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { return index; } @@ -160,7 +158,7 @@ export class SearcherHost extends RPCHost { count, ); - if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { + if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); rpcReflect.return(sseStream); @@ -193,7 +191,7 @@ export class SearcherHost extends RPCHost { let lastScrapped: any[] | undefined; let earlyReturn = false; - if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { + if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) { let earlyReturnTimer: ReturnType | undefined; const setEarlyReturnTimer = () => { if (earlyReturnTimer) { diff --git a/backend/functions/src/cloud-functions/adaptive-crawler.ts b/src/cloud-functions/adaptive-crawler.ts similarity index 99% rename from backend/functions/src/cloud-functions/adaptive-crawler.ts rename to src/cloud-functions/adaptive-crawler.ts index 8b740eb..971a0fc 100644 --- a/backend/functions/src/cloud-functions/adaptive-crawler.ts +++ b/src/cloud-functions/adaptive-crawler.ts @@ -14,7 +14,7 @@ import robotsParser from 'robots-parser'; import { DOMParser } from '@xmldom/xmldom'; import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options'; -import { CrawlerOptions } from '../dto/scrapping-options'; +import { CrawlerOptions } from '../dto/crawler-options'; import { JinaEmbeddingsTokenAccount } from 
'../shared/db/jina-embeddings-token-account'; import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task'; import { getFunctions } from 'firebase-admin/functions'; diff --git a/backend/functions/src/cloud-functions/data-crunching.ts b/src/cloud-functions/data-crunching.ts similarity index 99% rename from backend/functions/src/cloud-functions/data-crunching.ts rename to src/cloud-functions/data-crunching.ts index 40fee60..fd7488e 100644 --- a/backend/functions/src/cloud-functions/data-crunching.ts +++ b/src/cloud-functions/data-crunching.ts @@ -9,7 +9,7 @@ import { FirebaseStorageBucketControl, Logger, Param, TempFileManager } from '../shared'; import _ from 'lodash'; -import { CrawlerHost } from './crawler'; +import { CrawlerHost } from '../api/crawler'; import { Crawled } from '../db/crawled'; import dayjs from 'dayjs'; diff --git a/backend/functions/src/db/adaptive-crawl-task.ts b/src/db/adaptive-crawl-task.ts similarity index 100% rename from backend/functions/src/db/adaptive-crawl-task.ts rename to src/db/adaptive-crawl-task.ts diff --git a/backend/functions/src/db/crawled.ts b/src/db/crawled.ts similarity index 100% rename from backend/functions/src/db/crawled.ts rename to src/db/crawled.ts diff --git a/backend/functions/src/db/domain-blockade.ts b/src/db/domain-blockade.ts similarity index 100% rename from backend/functions/src/db/domain-blockade.ts rename to src/db/domain-blockade.ts diff --git a/backend/functions/src/db/domain-profile.ts b/src/db/domain-profile.ts similarity index 90% rename from backend/functions/src/db/domain-profile.ts rename to src/db/domain-profile.ts index 6e552c1..3ee9476 100644 --- a/backend/functions/src/db/domain-profile.ts +++ b/src/db/domain-profile.ts @@ -1,6 +1,6 @@ import { Also, Prop } from 'civkit'; import { FirestoreRecord } from '../shared/lib/firestore'; -import { ENGINE_TYPE } from '../dto/scrapping-options'; +import { ENGINE_TYPE } from '../dto/crawler-options'; @Also({ dictOf: Object diff 
--git a/backend/functions/src/db/img-alt.ts b/src/db/img-alt.ts similarity index 100% rename from backend/functions/src/db/img-alt.ts rename to src/db/img-alt.ts diff --git a/backend/functions/src/db/pdf.ts b/src/db/pdf.ts similarity index 100% rename from backend/functions/src/db/pdf.ts rename to src/db/pdf.ts diff --git a/backend/functions/src/db/searched.ts b/src/db/searched.ts similarity index 100% rename from backend/functions/src/db/searched.ts rename to src/db/searched.ts diff --git a/backend/functions/src/dto/adaptive-crawler-options.ts b/src/dto/adaptive-crawler-options.ts similarity index 100% rename from backend/functions/src/dto/adaptive-crawler-options.ts rename to src/dto/adaptive-crawler-options.ts diff --git a/backend/functions/src/dto/scrapping-options.ts b/src/dto/crawler-options.ts similarity index 85% rename from backend/functions/src/dto/scrapping-options.ts rename to src/dto/crawler-options.ts index e2cfd41..391aed6 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/src/dto/crawler-options.ts @@ -1,6 +1,6 @@ import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined -import type { Request, Response } from 'express'; import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser'; +import { Context } from '../services/registry'; export enum CONTENT_FORMAT { CONTENT = 'content', @@ -19,6 +19,7 @@ export enum ENGINE_TYPE { DIRECT = 'direct', VLM = 'vlm', READER_LM = 'readerlm-v2', + CF_BROWSER_RENDERING = 'cf-browser-rendering', } const CONTENT_FORMAT_VALUES = new Set(Object.values(CONTENT_FORMAT)); @@ -125,6 +126,11 @@ class Viewport extends AutoCastable { in: 'header', schema: { type: 'string' } }, + 'X-Proxy': { + description: `Use a proxy server provided by Jina AI.\n\nOptionally specify two-letter country code.`, + in: 'header', + schema: { type: 'string' } + }, 'X-Set-Cookie': { description: `Sets cookie(s) to 
the headless browser for your request. \n\n` + `Syntax is the same with standard Set-Cookie`, @@ -297,6 +303,9 @@ export class CrawlerOptions extends AutoCastable { @Prop() proxyUrl?: string; + @Prop() + proxy?: string; + @Prop() userAgent?: string; @@ -338,15 +347,18 @@ export class CrawlerOptions extends AutoCastable { @Prop() jsonSchema?: object; + @Prop() + robotsTxt?: string; + + @Prop() + doNotTrack?: number | null; + static override from(input: any) { const instance = super.from(input) as CrawlerOptions; - const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { - req: Request, - res: Response, - } | undefined; + const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; - const customMode = ctx?.req.get('x-respond-with') || ctx?.req.get('x-return-format'); - if (customMode !== undefined) { + const customMode = ctx?.get('x-respond-with') || ctx?.get('x-return-format'); + if (customMode) { instance.respondWith = customMode; } if (instance.respondWith) { @@ -361,74 +373,74 @@ export class CrawlerOptions extends AutoCastable { } } - const locale = ctx?.req.get('x-locale'); - if (locale !== undefined) { + const locale = ctx?.get('x-locale'); + if (locale) { instance.locale = locale; } - const referer = ctx?.req.get('x-referer'); - if (referer !== undefined) { + const referer = ctx?.get('x-referer'); + if (referer) { instance.referer = referer; } - const withGeneratedAlt = ctx?.req.get('x-with-generated-alt'); - if (withGeneratedAlt !== undefined) { + const withGeneratedAlt = ctx?.get('x-with-generated-alt'); + if (withGeneratedAlt) { instance.withGeneratedAlt = Boolean(withGeneratedAlt); } - const withLinksSummary = ctx?.req.get('x-with-links-summary'); - if (withLinksSummary !== undefined) { + const withLinksSummary = ctx?.get('x-with-links-summary'); + if (withLinksSummary) { if (withLinksSummary === 'all') { instance.withLinksSummary = withLinksSummary; } else { instance.withLinksSummary = Boolean(withLinksSummary); } } - const 
withImagesSummary = ctx?.req.get('x-with-images-summary'); - if (withImagesSummary !== undefined) { + const withImagesSummary = ctx?.get('x-with-images-summary'); + if (withImagesSummary) { instance.withImagesSummary = Boolean(withImagesSummary); } - const retainImages = ctx?.req.get('x-retain-images'); + const retainImages = ctx?.get('x-retain-images'); if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) { instance.retainImages = retainImages as any; } if (instance.withGeneratedAlt) { instance.retainImages = 'all_p'; } - const noCache = ctx?.req.get('x-no-cache'); - if (noCache !== undefined) { + const noCache = ctx?.get('x-no-cache'); + if (noCache) { instance.noCache = Boolean(noCache); } if (instance.noCache && instance.cacheTolerance === undefined) { instance.cacheTolerance = 0; } - let cacheTolerance = parseInt(ctx?.req.get('x-cache-tolerance') || ''); + let cacheTolerance = parseInt(ctx?.get('x-cache-tolerance') || ''); if (!isNaN(cacheTolerance)) { instance.cacheTolerance = cacheTolerance; } - const noGfm = ctx?.req.get('x-no-gfm'); + const noGfm = ctx?.get('x-no-gfm'); if (noGfm) { instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm); } - let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || ''); + let timeoutSeconds = parseInt(ctx?.get('x-timeout') || ''); if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) { instance.timeout = timeoutSeconds <= 180 ? 
timeoutSeconds : 180; - } else if (ctx?.req.get('x-timeout')) { + } else if (ctx?.get('x-timeout')) { instance.timeout = null; } - const removeSelector = ctx?.req.get('x-remove-selector')?.split(', '); - instance.removeSelector ??= removeSelector; - const targetSelector = ctx?.req.get('x-target-selector')?.split(', '); - instance.targetSelector ??= targetSelector; - const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', '); - instance.waitForSelector ??= waitForSelector || instance.targetSelector; + const removeSelector = ctx?.get('x-remove-selector')?.split(', ').filter(Boolean); + instance.removeSelector ??= removeSelector?.length ? removeSelector : undefined; + const targetSelector = ctx?.get('x-target-selector')?.split(', ').filter(Boolean); + instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined; + const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean); + instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector; instance.targetSelector = filterSelector(instance.targetSelector); - const overrideUserAgent = ctx?.req.get('x-user-agent'); + const overrideUserAgent = ctx?.get('x-user-agent') || undefined; instance.userAgent ??= overrideUserAgent; - const engine = ctx?.req.get('x-engine'); + const engine = ctx?.get('x-engine'); if (engine) { instance.engine = engine; } @@ -443,18 +455,18 @@ export class CrawlerOptions extends AutoCastable { instance.respondWith = CONTENT_FORMAT.READER_LM; } - const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url'); - if (keepImgDataUrl !== undefined) { + const keepImgDataUrl = ctx?.get('x-keep-img-data-url'); + if (keepImgDataUrl) { instance.keepImgDataUrl = Boolean(keepImgDataUrl); } - const withIframe = ctx?.req.get('x-with-iframe'); - if (withIframe !== undefined) { + const withIframe = ctx?.get('x-with-iframe'); + if (withIframe) { instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 
'quoted' : Boolean(withIframe); } if (instance.withIframe) { instance.timeout ??= null; } - const withShadowDom = ctx?.req.get('x-with-shadow-dom'); + const withShadowDom = ctx?.get('x-with-shadow-dom'); if (withShadowDom) { instance.withShadowDom = Boolean(withShadowDom); } @@ -463,7 +475,7 @@ export class CrawlerOptions extends AutoCastable { } const cookies: Cookie[] = []; - const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]); + const setCookieHeaders = (ctx?.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[])).filter(Boolean); if (Array.isArray(setCookieHeaders)) { for (const setCookie of setCookieHeaders) { cookies.push({ @@ -477,21 +489,24 @@ export class CrawlerOptions extends AutoCastable { } instance.setCookies = cookies; - const proxyUrl = ctx?.req.get('x-proxy-url'); - instance.proxyUrl ??= proxyUrl; + const proxyUrl = ctx?.get('x-proxy-url'); + instance.proxyUrl ??= proxyUrl || undefined; + const proxy = ctx?.get('x-proxy'); + instance.proxy ??= proxy || undefined; + const robotsTxt = ctx?.get('x-robots-txt'); + instance.robotsTxt ??= robotsTxt || undefined; - if (instance.cacheTolerance) { - instance.cacheTolerance = instance.cacheTolerance * 1000; - } - - const tokenBudget = ctx?.req.get('x-token-budget') || undefined; + const tokenBudget = ctx?.get('x-token-budget'); instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined; - const baseMode = ctx?.req.get('x-base') || undefined; + const baseMode = ctx?.get('x-base'); if (baseMode) { instance.base = baseMode as any; } + const dnt = ctx?.get('dnt'); + instance.doNotTrack ??= (parseInt(dnt || '') || null); + if (instance.cacheTolerance) { instance.cacheTolerance = instance.cacheTolerance * 1000; } diff --git a/src/dto/jina-embeddings-auth.ts b/src/dto/jina-embeddings-auth.ts new file mode 100644 index 0000000..5a55834 --- /dev/null +++ b/src/dto/jina-embeddings-auth.ts @@ -0,0 +1,216 @@ +import _ from 'lodash'; 
+import { + Also, AuthenticationFailedError, AuthenticationRequiredError, + DownstreamServiceFailureError, RPC_CALL_ENVIRONMENT, + AutoCastable, +} from 'civkit/civ-rpc'; +import { htmlEscape } from 'civkit/escape'; +import { marshalErrorLike } from 'civkit/lang'; + +import type { Context } from 'koa'; + +import logger from '../services/logger'; +import { InjectProperty } from '../services/registry'; +import { AsyncLocalContext } from '../services/async-context'; + +import envConfig from '../shared/services/secrets'; +import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings'; +import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; + + +const authDtoLogger = logger.child({ service: 'JinaAuthDTO' }); + +const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY); + +@Also({ + openapi: { + operation: { + parameters: { + 'Authorization': { + description: htmlEscape`Jina Token for authentication.\n\n` + + htmlEscape`- Member of \n\n` + + `- Authorization: Bearer {YOUR_JINA_TOKEN}` + , + in: 'header', + schema: { + anyOf: [ + { type: 'string', format: 'token' } + ] + } + } + } + } + } +}) +export class JinaEmbeddingsAuthDTO extends AutoCastable { + uid?: string; + bearerToken?: string; + user?: JinaEmbeddingsTokenAccount; + + @InjectProperty(AsyncLocalContext) + ctxMgr!: AsyncLocalContext; + + jinaEmbeddingsDashboard = THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT; + + static override from(input: any) { + const instance = super.from(input) as JinaEmbeddingsAuthDTO; + + const ctx = input[RPC_CALL_ENVIRONMENT] as Context; + + if (ctx) { + const authorization = ctx.get('authorization'); + + if (authorization) { + const authToken = authorization.split(' ')[1] || authorization; + instance.bearerToken = authToken; + } + + } + + if (!instance.bearerToken && input._token) { + instance.bearerToken = input._token; + } + + return instance; + } + + async 
getBrief(ignoreCache?: boolean | string) { + if (!this.bearerToken) { + throw new AuthenticationRequiredError({ + message: 'Jina API key is required to authenticate. Please get one from https://jina.ai' + }); + } + + let account; + try { + account = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken); + } catch (err) { + // FireStore would not accept any string as input and may throw if not happy with it + void 0; + } + + + const age = account?.lastSyncedAt ? Date.now() - account.lastSyncedAt.getTime() : Infinity; + + if (account && !ignoreCache) { + if (account && age < 180_000) { + this.user = account; + this.uid = this.user?.user_id; + + return account; + } + } + + try { + const r = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken); + const brief = r.data; + const draftAccount = JinaEmbeddingsTokenAccount.from({ + ...account, ...brief, _id: this.bearerToken, + lastSyncedAt: new Date() + }); + await JinaEmbeddingsTokenAccount.save(draftAccount.degradeForFireStore(), undefined, { merge: true }); + + this.user = draftAccount; + this.uid = this.user?.user_id; + + return draftAccount; + } catch (err: any) { + authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) }); + + if (err?.status === 401) { + throw new AuthenticationFailedError({ + message: 'Invalid API key, please get a new one from https://jina.ai' + }); + } + + if (account) { + this.user = account; + this.uid = this.user?.user_id; + + return account; + } + + + throw new DownstreamServiceFailureError(`Failed to authenticate: ${err}`); + } + } + + async reportUsage(tokenCount: number, mdl: string, endpoint: string = '/encode') { + const user = await this.assertUser(); + const uid = user.user_id; + user.wallet.total_balance -= tokenCount; + + return this.jinaEmbeddingsDashboard.reportUsage(this.bearerToken!, { + model_name: mdl, + api_endpoint: endpoint, + consumer: { + id: uid, + user_id: uid, + }, + usage: { + total_tokens: tokenCount + }, + labels: 
{ + model_name: mdl + } + }).then((r) => { + JinaEmbeddingsTokenAccount.COLLECTION.doc(this.bearerToken!) + .update({ 'wallet.total_balance': JinaEmbeddingsTokenAccount.OPS.increment(-tokenCount) }) + .catch((err) => { + authDtoLogger.warn(`Failed to update cache for ${uid}: ${err}`, { err: marshalErrorLike(err) }); + }); + + return r; + }).catch((err) => { + user.wallet.total_balance += tokenCount; + authDtoLogger.warn(`Failed to report usage for ${uid}: ${err}`, { err: marshalErrorLike(err) }); + }); + } + + async solveUID() { + if (this.uid) { + this.ctxMgr.set('uid', this.uid); + + return this.uid; + } + + if (this.bearerToken) { + await this.getBrief(); + this.ctxMgr.set('uid', this.uid); + + return this.uid; + } + + return undefined; + } + + async assertUID() { + const uid = await this.solveUID(); + + if (!uid) { + throw new AuthenticationRequiredError('Authentication failed'); + } + + return uid; + } + + async assertUser() { + if (this.user) { + return this.user; + } + + await this.getBrief(); + + return this.user!; + } + + getRateLimits(...tags: string[]) { + const descs = tags.map((x) => this.user?.customRateLimits?.[x] || []).flat().filter((x) => x.isEffective()); + + if (descs.length) { + return descs; + } + + return undefined; + } +} diff --git a/backend/functions/src/fetch.d.ts b/src/fetch.d.ts similarity index 100% rename from backend/functions/src/fetch.d.ts rename to src/fetch.d.ts diff --git a/backend/functions/src/index.ts b/src/index.ts similarity index 100% rename from backend/functions/src/index.ts rename to src/index.ts diff --git a/src/lib/transform-server-event-stream.ts b/src/lib/transform-server-event-stream.ts new file mode 100644 index 0000000..1f833a7 --- /dev/null +++ b/src/lib/transform-server-event-stream.ts @@ -0,0 +1,169 @@ +import { TPM, parseJSONText } from 'civkit'; +import { Transform, TransformCallback, TransformOptions } from 'stream'; + +export class InputServerEventStream extends Transform { + cache: string[] = []; + + 
constructor(options?: TransformOptions) { + super({ + ...options, + readableObjectMode: true + }); + } + + decodeRoutine() { + if (!this.cache.length) { + return; + } + + const vecs = this.cache.join('').split(/\r?\n\r?\n/); + this.cache.length = 0; + const lastVec = vecs.pop(); + if (lastVec) { + this.cache.push(lastVec); + } + + for (const x of vecs) { + const lines: string[] = x.split(/\r?\n/); + + const event: { + event?: string; + data?: string; + id?: string; + retry?: number; + } = {}; + + for (const l of lines) { + const columnPos = l.indexOf(':'); + if (columnPos <= 0) { + continue; + } + const key = l.substring(0, columnPos); + const rawValue = l.substring(columnPos + 1); + const value = rawValue.startsWith(' ') ? rawValue.slice(1) : rawValue; + if (key === 'data') { + if (event.data) { + event.data += value || '\n'; + } else if (event.data === '') { + event.data += '\n'; + event.data += value || '\n'; + } else { + event.data = value; + } + } else if (key === 'retry') { + event.retry = parseInt(value, 10); + } else { + Reflect.set(event, key, value); + } + } + + if (event.data) { + const parsed = parseJSONText(event.data); + if (parsed && typeof parsed === 'object') { + event.data = parsed; + } + } + + if (Object.keys(event).length) { + this.push(event); + } + } + } + + override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void { + if (chunk === null) { + this.push(null); + } + + this.cache.push(chunk.toString()); + this.decodeRoutine(); + + callback(); + } + + override _final(callback: (error?: Error | null | undefined) => void): void { + this.decodeRoutine(); + callback(); + } +} + +@TPM({ + contentType: 'text/event-stream', +}) +export class OutputServerEventStream extends Transform { + n: number = 0; + + constructor(options?: TransformOptions) { + super({ + ...options, writableObjectMode: true, encoding: 'utf-8' + }); + } + + encodeRoutine(chunk: { + event?: string; + data?: any; + id?: string; + retry?: number; + } | 
string) { + if (typeof chunk === 'object') { + const lines: string[] = []; + + if (chunk.event) { + lines.push(`event: ${chunk.event}`); + } + if (chunk.data) { + if (typeof chunk.data === 'string') { + for (const x of chunk.data.split(/\r?\n/)) { + lines.push(`data: ${x}`); + } + } else { + lines.push(`data: ${JSON.stringify(chunk.data)}`); + } + } + if (chunk.id) { + lines.push(`id: ${chunk.id}`); + } + if (chunk.retry) { + lines.push(`retry: ${chunk.retry}`); + } + if (!lines.length) { + lines.push(`data: ${JSON.stringify(chunk)}`); + } + this.push(lines.join('\n')); + this.push('\n\n'); + this.n++; + + return; + } else if (typeof chunk === 'string') { + const lines: string[] = []; + for (const x of chunk.split(/\r?\n/)) { + lines.push(`data: ${x}`); + } + + this.push(lines.join('\n')); + this.push('\n\n'); + this.n++; + } + } + + override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void { + if (chunk === null) { + this.push(null); + } + + this.encodeRoutine(chunk); + + callback(); + } +} + +export interface OutputServerEventStream extends Transform { + write(chunk: string | { + event?: string; + data?: any; + id?: string; + retry?: number; + }, callback?: (error: Error | null | undefined) => void): boolean; + write(chunk: any, callback?: (error: Error | null | undefined) => void): boolean; + write(chunk: any, encoding: BufferEncoding, callback?: (error: Error | null | undefined) => void): boolean; +} diff --git a/backend/functions/src/services/alt-text.ts b/src/services/alt-text.ts similarity index 100% rename from backend/functions/src/services/alt-text.ts rename to src/services/alt-text.ts diff --git a/src/services/async-context.ts b/src/services/async-context.ts new file mode 100644 index 0000000..f99e506 --- /dev/null +++ b/src/services/async-context.ts @@ -0,0 +1,10 @@ +import { GlobalAsyncContext } from 'civkit/async-context'; +import { container, singleton } from 'tsyringe'; + +@singleton() +export class 
AsyncLocalContext extends GlobalAsyncContext { } + +const instance = container.resolve(AsyncLocalContext); +Reflect.set(process, 'asyncLocalContext', instance); + +export default instance; diff --git a/src/services/blackhole-detector.ts b/src/services/blackhole-detector.ts new file mode 100644 index 0000000..fded4f4 --- /dev/null +++ b/src/services/blackhole-detector.ts @@ -0,0 +1,72 @@ +import { singleton } from 'tsyringe'; +import { AsyncService } from 'civkit/async-service'; +import { GlobalLogger } from './logger'; + + +@singleton() +export class BlackHoleDetector extends AsyncService { + + logger = this.globalLogger.child({ service: this.constructor.name }); + lastWorkedTs?: number; + lastDoneRequestTs?: number; + lastIncomingRequestTs?: number; + + maxDelay = 1000 * 30; + concurrentRequests = 0; + + strikes = 0; + + constructor(protected globalLogger: GlobalLogger) { + super(...arguments); + + if (process.env.NODE_ENV?.startsWith('prod')) { + setInterval(() => { + this.routine(); + }, 1000 * 15).unref(); + } + } + + override async init() { + await this.dependencyReady(); + this.logger.debug('BlackHoleDetector started'); + this.emit('ready'); + } + + routine() { + const now = Date.now(); + const lastWorked = this.lastWorkedTs; + if (!lastWorked) { + return; + } + const dt = (now - lastWorked); + if (this.concurrentRequests > 0 && + this.lastIncomingRequestTs && lastWorked && + this.lastIncomingRequestTs >= lastWorked && + (dt > (this.maxDelay * (this.strikes + 1))) + ) { + this.logger.warn(`BlackHole detected, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`); + this.strikes += 1; + } + + if (this.strikes >= 3) { + this.logger.error(`BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`); + this.emit('error', new Error(`BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: 
${this.concurrentRequests}`)); + } + } + + incomingRequest() { + this.lastIncomingRequestTs = Date.now(); + this.lastWorkedTs ??= Date.now(); + this.concurrentRequests++; + } + doneWithRequest() { + this.concurrentRequests--; + this.lastDoneRequestTs = Date.now(); + } + + itWorked() { + this.lastWorkedTs = Date.now(); + this.strikes = 0; + } + +}; diff --git a/backend/functions/src/services/brave-search.ts b/src/services/brave-search.ts similarity index 97% rename from backend/functions/src/services/brave-search.ts rename to src/services/brave-search.ts index 2fe7ffc..ec83c45 100644 --- a/backend/functions/src/services/brave-search.ts +++ b/src/services/brave-search.ts @@ -7,6 +7,7 @@ import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip'; import { AsyncContext } from '../shared'; import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types'; import type { Request, Response } from 'express'; +import { BlackHoleDetector } from './blackhole-detector'; @singleton() export class BraveSearchService extends AsyncService { @@ -20,6 +21,7 @@ export class BraveSearchService extends AsyncService { protected secretExposer: SecretExposer, protected geoipControl: GeoIPService, protected threadLocal: AsyncContext, + protected blackHoleDetector: BlackHoleDetector, ) { super(...arguments); } @@ -69,6 +71,7 @@ export class BraveSearchService extends AsyncService { while (maxTries--) { try { const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record }); + this.blackHoleDetector.itWorked(); return r.parsed; } catch (err: any) { diff --git a/src/services/cf-browser-rendering.ts b/src/services/cf-browser-rendering.ts new file mode 100644 index 0000000..1bb724c --- /dev/null +++ b/src/services/cf-browser-rendering.ts @@ -0,0 +1,38 @@ +import { container, singleton } from 'tsyringe'; +import { AsyncService } from 'civkit/async-service'; +import { Logger, SecretExposer } from '../shared'; +import { CloudFlareHTTP } from 
/**
 * Client wrapper around the Cloudflare HTTP client used to fetch
 * browser-rendered HTML.
 *
 * The credential is read from `SecretExposer.CLOUD_FLARE_API_KEY` and must
 * have the form `<account>:<key>`.
 */
@singleton()
export class CFBrowserRendering extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    /** Assigned in `init()`; not usable before the service is ready. */
    client!: CloudFlareHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
    ) {
        super(...arguments);
    }

    /**
     * Build the Cloudflare client from the configured credential.
     *
     * @throws Error when the credential is absent or is not `<account>:<key>`.
     *   Previously a missing secret surfaced as an opaque
     *   "undefined is not iterable" TypeError from array destructuring.
     */
    override async init() {
        await this.dependencyReady();

        const credential = this.secretExposer.CLOUD_FLARE_API_KEY;
        // Split on the FIRST colon only, so a key that itself contains ':' survives intact.
        const separatorAt = typeof credential === 'string' ? credential.indexOf(':') : -1;
        if (typeof credential !== 'string' || separatorAt <= 0 || separatorAt === credential.length - 1) {
            throw new Error('CLOUD_FLARE_API_KEY is missing or malformed, expected "<account>:<key>"');
        }
        const account = credential.slice(0, separatorAt);
        const key = credential.slice(separatorAt + 1);
        this.client = new CloudFlareHTTP(account, key);

        this.emit('ready');
    }

    /**
     * Fetch the browser-rendered HTML for `url`.
     *
     * @param url - address of the page to render.
     * @returns the `result` field of the parsed Cloudflare response.
     */
    async fetchContent(url: string) {
        const r = await this.client.fetchBrowserRenderedHTML({ url });

        return r.parsed.result;
    }

}
/**
 * Install a Chrome-like header set on `curl`, letting caller headers win.
 *
 * A caller header whose name matches a default (case-insensitively) replaces
 * that default in place, keeping the default's spelling of the name. Every
 * other caller header is appended as given. Caller entries that are
 * `null`/`undefined` are ignored, and an empty override leaves the default
 * untouched instead of producing a duplicate, blank header line.
 *
 * @param curl - handle to configure; its `HTTPHEADER` option is overwritten.
 * @param headers - optional extra headers; values are strings or string arrays.
 * @returns the same `curl` handle, for chaining.
 */
curlImpersonateHeader(curl: Curl, headers?: object) {
    const mixinHeaders: Record<string, string | string[]> = {
        // Client-hint values are structured-field strings, hence the explicit double quotes.
        'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': `"${this.platform}"`,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': this.ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.9',
    };

    // Lower-cased default name -> the spelling used in mixinHeaders.
    const defaultNameByLower = new Map<string, string>();
    for (const name of Object.keys(mixinHeaders)) {
        defaultNameByLower.set(name.toLowerCase(), name);
    }

    for (const [name, value] of Object.entries(headers ?? {})) {
        if (value === undefined || value === null) {
            // Would otherwise be serialized as the literal text "undefined"/"null".
            continue;
        }
        const defaultName = defaultNameByLower.get(name.toLowerCase());
        if (defaultName === undefined) {
            mixinHeaders[name] = value;
            continue;
        }
        if (value) {
            mixinHeaders[defaultName] = value;
        }
    }

    const headerLines = Object.entries(mixinHeaders).flatMap(([name, value]) => {
        if (Array.isArray(value) && value.length) {
            // Multi-valued headers become one line per value.
            return value.map((item) => `${name}: ${item}`);
        }
        return [`${name}: ${value}`];
    });
    curl.setOpt(Curl.option.HTTPHEADER, headerLines);

    return curl;
}
+ curlStream.emit('error', err); + curlStream.destroy(err); + } + const err2 = this.digestCurlCode(errCode, err.message); + if (err2) { + reject(err2); + return; + } + reject(new AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`)); + }); + curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB + let status = -1; + let contentEncoding = ''; + curl.once('end', () => { + if (curlStream) { + curlStream.once('end', () => curl.close()); + return; + } + curl.close(); + }); + curl.on('stream', (stream, statusCode, headers) => { + this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode }); + status = statusCode; + curlStream = stream; + for (const headerSet of (headers as HeaderInfo[])) { + for (const [k, v] of Object.entries(headerSet)) { + if (k.trim().endsWith(':')) { + Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || ''); + Reflect.deleteProperty(headerSet, k); + continue; + } + if (v === undefined) { + Reflect.set(headerSet, k, ''); + continue; + } + if (k.toLowerCase() === 'content-type' && typeof v === 'string') { + contentType = v.toLowerCase(); + } + } + } + const lastResHeaders = headers[headers.length - 1]; + for (const [k, v] of Object.entries(lastResHeaders)) { + const kl = k.toLowerCase(); + if (kl === 'content-type') { + contentType = v.toLowerCase(); + } + if (kl === 'content-encoding') { + contentEncoding = v.toLowerCase(); + } + if (contentType && contentEncoding) { + break; + } + } + + if ([301, 302, 307, 308].includes(statusCode)) { + if (stream) { + stream.resume(); + } + resolve({ + statusCode: status, + data: undefined, + headers: headers as HeaderInfo[], + }); + return; + } + + if (!stream) { + resolve({ + statusCode: status, + data: undefined, + headers: headers as HeaderInfo[], + }); + return; + } + + switch (contentEncoding) { + case 'gzip': { + const decompressed = createGunzip(); + stream.pipe(decompressed); + stream.once('error', (err) => { + decompressed.destroy(err); + }); 
+ stream = decompressed; + break; + } + case 'deflate': { + const decompressed = createInflate(); + stream.pipe(decompressed); + stream.once('error', (err) => { + decompressed.destroy(err); + }); + stream = decompressed; + break; + } + case 'br': { + const decompressed = createBrotliDecompress(); + stream.pipe(decompressed); + stream.once('error', (err) => { + decompressed.destroy(err); + }); + stream = decompressed; + break; + } + case 'zstd': { + const decompressed = ZSTDDecompress(); + stream.pipe(decompressed); + stream.once('error', (err) => { + decompressed.destroy(err); + }); + stream = decompressed; + break; + } + default: { + break; + } + } + + const fpath = this.tempFileManager.alloc(); + const fancyFile = FancyFile.auto(stream, fpath); + this.tempFileManager.bindPathTo(fancyFile, fpath); + resolve({ + statusCode: status, + data: fancyFile, + headers: headers as HeaderInfo[], + }); + }); + + curl.perform(); + }); + } + + async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) { + let leftRedirection = 10; + let opts = { ...crawlOpts }; + let nextHopUrl = urlToCrawl; + const fakeHeaderInfos: HeaderInfo[] = []; + do { + const r = await this.urlToFile1Shot(nextHopUrl, opts); + + if ([301, 302, 307, 308].includes(r.statusCode)) { + const headers = r.headers[r.headers.length - 1]; + const location = headers.Location || headers.location; + if (!location) { + throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Bad redirection from ${nextHopUrl}`); + } + + const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie']; + if (setCookieHeader) { + const cookieAssignments = Array.isArray(setCookieHeader) ? 
/**
 * Fetch `targetUrl` with curl (following redirects via `urlToFile`) and
 * package the result so that the same responses can be replayed elsewhere.
 *
 * For every hop in the header chain an `impersonate` entry is recorded under
 * the URL that produced it; when `crawlOpts.proxyUrl` is set, the hop's origin
 * is also mapped to that proxy. The body file, if non-empty, is attached to
 * the entry of the final URL.
 *
 * @param targetUrl - URL to fetch.
 * @param crawlOpts - scraping options forwarded to `urlToFile`.
 * @returns final URL, replay options, the header chain, and the final
 *   response's status, headers, content type, disposition, file name and file.
 * @throws Error when a redirect response in the chain carries no Location header.
 */
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
    const curlResult = await this.urlToFile(targetUrl, crawlOpts);

    let finalURL = targetUrl;
    const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
        impersonate: {},
        proxyOrigin: {},
    };
    for (const headers of curlResult.headers) {
        sideLoadOpts.impersonate[finalURL.href] = {
            status: headers.result?.code || -1,
            headers: _.omit(headers, 'result'),
            contentType: headers['Content-Type'] || headers['content-type'],
        };
        if (crawlOpts?.proxyUrl) {
            sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
        }
        if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
            const location = headers.Location || headers.location;
            if (!location) {
                throw new Error(`Bad redirection: ${curlResult.headers.length} times`);
            }
            // Location may be relative; resolve it against the hop that issued it.
            finalURL = new URL(location, finalURL);
        }
    }
    const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
    // A response may carry no Content-Type at all; only then fall back to the
    // sniffed mime type and finally to a generic binary type.
    const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (await curlResult.data?.mimeType) || 'application/octet-stream';
    const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
    const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();

    if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) {
        sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
    }

    // This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
    this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);

    return {
        finalURL,
        sideLoadOpts,
        chain: curlResult.headers,
        status: curlResult.statusCode,
        headers: lastHeaders,
        contentType,
        contentDisposition,
        fileName,
        file: curlResult.data
    };
}
/**
 * Raised when a caller exceeds a rate limit (status code 42903).
 *
 * When either retry hint is set, the transfer metadata gains a `Retry-After`
 * header: the numeric hint is sent as-is, the date hint as an HTTP date in GMT.
 * The numeric hint takes precedence when both are set.
 */
@StatusCode(42903)
export class RateLimitTriggeredError extends ApplicationError {

    @Prop({
        desc: 'Retry after seconds',
    })
    retryAfter?: number;

    @Prop({
        desc: 'Retry after date',
    })
    retryAfterDate?: Date;

    protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() {
        const hint = this.retryAfter || this.retryAfterDate;
        const inheritedMeta = super[RPC_TRANSFER_PROTOCOL_META_SYMBOL];
        if (!hint) {
            return inheritedMeta;
        }

        let headerValue: string;
        if (hint instanceof Date) {
            headerValue = `${dayjs(hint).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]')}`;
        } else {
            headerValue = `${hint}`;
        }

        // Clone first so the metadata object owned by the parent is never mutated.
        const ownMeta = _.cloneDeep(inheritedMeta);

        return _.merge(ownMeta, {
            headers: {
                'Retry-After': headerValue,
            }
        });
    }
}
/**
 * Process-wide pino logger.
 *
 * In production (`NODE_ENV` starting with `prod`) it writes to stdout;
 * otherwise output goes through `pino-pretty` in a single-line, colorized
 * format. Every record is tagged with a `severity` derived from its level
 * and, when a trace id and `GCLOUD_PROJECT` are both available, with a
 * `logging.googleapis.com/trace` field.
 */
@singleton()
export class GlobalLogger extends AbstractPinoLogger {
    loggerOptions = {
        level: 'debug',
        base: {
            tid: threadId,
        }
    };

    override init(): void {
        const isProduction = process.env['NODE_ENV']?.startsWith('prod');
        if (isProduction) {
            super.init(process.stdout);
            this.emit('ready');

            return;
        }

        // Loaded lazily so the pretty printer is only required outside production.
        const { PinoPretty } = require('pino-pretty');
        const prettyStream = PinoPretty({
            singleLine: true,
            colorize: true,
            messageFormat(log: any, messageKey: any) {
                const threadTag = log['tid'] ? `[${log['tid']}]` : '';
                const serviceTag = log['service'] || 'ROOT';

                return `${threadTag}[${serviceTag}] ${log[messageKey]}`;
            },
        });
        super.init(prettyStream);

        this.emit('ready');
    }

    /**
     * Forward a log call to the base logger with `severity` (and, when
     * available, the trace reference) merged into its first argument.
     */
    override log(...args: any[]) {
        const [levelObj, ...rest] = args;
        const severity = levelToSeverityMap[levelObj?.level];
        const traceCtx = getTraceCtx();
        const patched: any = { ...levelObj, severity };

        const traceId = traceCtx?.googleTraceId || traceCtx?.traceId;
        const project = process.env['GCLOUD_PROJECT'];
        if (traceId && project) {
            patched['logging.googleapis.com/trace'] = `projects/${project}/traces/${traceId}`;
        }

        return super.log(patched, ...rest);
    }
}
/**
 * Concrete pseudo-transfer service. It adds nothing to the civkit base class
 * beyond signalling readiness once its dependencies are ready.
 */
@singleton()
export class PseudoTransfer extends AbstractPseudoTransfer {

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

}

const instance = container.resolve(PseudoTransfer);

/**
 * Attach a transfer profile factory to `ctor.prototype` under
 * SYM_PSEUDO_TRANSFERABLE (non-enumerable) and register `ctor` with the service.
 * NOTE(review): the meaning of the profile fields (`copyOwnProperty`,
 * `marshall`, `unMarshall`) is defined by civkit - confirm there.
 */
function installTransferProfile(ctor: { prototype: object; }, profileFactory: (this: any) => object) {
    Object.defineProperty(ctor.prototype, SYM_PSEUDO_TRANSFERABLE, {
        value: profileFactory,
        enumerable: false,
    });
    instance.expectPseudoTransferableType(ctor as any);
}

// Errors: marshalled through marshalErrorLike, and on the way back the
// received object is re-attached to the prototype the factory was invoked on.
installTransferProfile(Error, function () {
    const prototype = this;

    return {
        copyOwnProperty: 'all',
        marshall: (input: Error) => marshalErrorLike(input),
        unMarshall: (input: object) => {
            Object.setPrototypeOf(input, prototype);

            return input;
        },
    };
});

// Every Error subclass exported by the local errors module or by civ-rpc is registered as well.
const exportedValues = [...Object.values(require('./errors')), ...Object.values(require('civkit/civ-rpc'))];
for (const candidate of exportedValues) {
    if (typeof candidate !== 'function') {
        continue;
    }
    if (!(candidate.prototype instanceof Error)) {
        continue;
    }
    instance.expectPseudoTransferableType(candidate as any);
}

// URLs travel as their href and are rebuilt on arrival.
installTransferProfile(URL, function () {
    return {
        copyOwnProperty: 'none',
        marshall: (input: URL) => ({ href: input.href }),
        unMarshall: (input: { href: string; }) => new URL(input.href),
    };
});

// Buffers are passed through as-is; anything arriving as a non-Buffer is wrapped with Buffer.from.
installTransferProfile(Buffer, function () {
    return {
        copyOwnProperty: 'none',
        unMarshall: (input: Uint8Array | Buffer) => Buffer.isBuffer(input) ? input : Buffer.from(input),
        marshall: (input: Uint8Array | Buffer) => input,
    };
});


export default instance;
{ [k: string]: string | string[]; }; + contentType?: string; + body?: FancyFile; + }; + }; + proxyOrigin: { [origin: string]: string; }; + }; + } - -// const puppeteerStealth = require('puppeteer-extra-plugin-stealth'); -// puppeteer.use(puppeteerStealth()); -// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override'); -// puppeteer.use(puppeteerUAOverride({ -// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`, -// platform: `Linux`, -// })) - puppeteer.use(puppeteerBlockResources({ blockedTypes: new Set(['media']), interceptResolutionPriority: 1, @@ -460,6 +470,8 @@ export class PuppeteerControl extends AsyncService { constructor( protected globalLogger: Logger, + protected curlControl: CurlControl, + protected blackHoleDetector: BlackHoleDetector, ) { super(...arguments); this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95; @@ -514,10 +526,11 @@ export class PuppeteerControl extends AsyncService { }); this.ua = await this.browser.userAgent(); this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); + this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, '')); + + await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); this.emit('ready'); - - this.newPage().then((r) => this.__loadedPage.push(r)); } @perNextTick() @@ -538,8 +551,10 @@ export class PuppeteerControl extends AsyncService { } } - async newPage() { - await this.serviceReady(); + async newPage(bewareDeadLock: any = false) { + if (!bewareDeadLock) { + await this.serviceReady(); + } const sn = this._sn++; let page; try { @@ -687,7 +702,7 @@ export class PuppeteerControl extends AsyncService { `); this.snMap.set(page, sn); - this.logger.info(`Page ${sn} created.`); + this.logger.debug(`Page ${sn} created.`); this.lastPageCratedAt = Date.now(); this.livePages.add(page); this.pagePhase.set(page, 'idle'); @@ 
-731,7 +746,7 @@ export class PuppeteerControl extends AsyncService { return; } const sn = this.snMap.get(page); - this.logger.info(`Closing page ${sn}`); + this.logger.debug(`Closing page ${sn}`); await Promise.race([ (async () => { const ctx = page.browserContext(); @@ -749,7 +764,7 @@ export class PuppeteerControl extends AsyncService { this.pagePhase.delete(page); } - async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator { + async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator { // parsedUrl.search = ''; const url = parsedUrl.toString(); @@ -761,7 +776,9 @@ export class PuppeteerControl extends AsyncService { const page = await this.getNextPage(); this.pagePhase.set(page, 'active'); page.on('response', (resp) => { - if (resp.request().isNavigationRequest()) { + this.blackHoleDetector.itWorked(); + const req = resp.request(); + if (req.frame() === page.mainFrame() && req.isNavigationRequest()) { navigationResponse = resp; } if (!resp.ok()) { @@ -774,7 +791,111 @@ export class PuppeteerControl extends AsyncService { pdfUrls.push(url); } }); - if (options?.extraHeaders) { + page.on('request', async (req) => { + if (req.isInterceptResolutionHandled()) { + return; + }; + const reqUrlParsed = new URL(req.url()); + if (!reqUrlParsed.protocol.startsWith('http')) { + const overrides = req.continueRequestOverrides(); + + return req.continue(overrides, 0); + } + const typ = req.resourceType(); + if (!options.proxyResources) { + const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ); + if (!isDocRequest) { + const overrides = req.continueRequestOverrides(); + + return req.continue(overrides, 0); + } + } + const sideload = options.sideLoad; + + const impersonate = sideload?.impersonate[reqUrlParsed.href]; + if (impersonate) { + let body; + if (impersonate.body) { + body = await readFile(await impersonate.body.filePath); + if (req.isInterceptResolutionHandled()) { + return; + 
} + } + return req.respond({ + status: impersonate.status, + headers: impersonate.headers, + contentType: impersonate.contentType, + body: body ? Uint8Array.from(body) : undefined, + }, 999); + } + + const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin]; + + if (proxy) { + try { + const curled = await this.curlControl.sideLoad(reqUrlParsed, { + ...options, + method: req.method(), + body: req.postData(), + extraHeaders: { + ...req.headers(), + ...options.extraHeaders, + }, + proxyUrl: proxy + }); + if (req.isInterceptResolutionHandled()) { + return; + }; + + if (curled.chain.length === 1) { + if (!curled.file) { + return req.respond({ + status: curled.status, + headers: _.omit(curled.headers, 'result'), + contentType: curled.contentType, + }, 999); + } + const body = await readFile(await curled.file.filePath); + if (req.isInterceptResolutionHandled()) { + return; + }; + return req.respond({ + status: curled.status, + headers: _.omit(curled.headers, 'result'), + contentType: curled.contentType, + body: Uint8Array.from(body), + }, 999); + } + options.sideLoad ??= curled.sideLoadOpts; + _.merge(options.sideLoad, curled.sideLoadOpts); + const firstReq = curled.chain[0]; + + return req.respond({ + status: firstReq.result!.code, + headers: _.omit(firstReq, 'result'), + }, 999); + } catch (err: any) { + this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) }); + + } + } + + if (req.isInterceptResolutionHandled()) { + return; + }; + const overrides = req.continueRequestOverrides(); + const continueArgs = [{ + ...overrides, + headers: { + ...req.headers(), + ...overrides?.headers, + ...options.extraHeaders, + } + }, 1] as const; + + return req.continue(continueArgs[0], continueArgs[1]); + }); + if (options.extraHeaders) { page.on('request', async (req) => { if (req.isInterceptResolutionHandled()) { return; @@ -795,7 +916,7 @@ export class PuppeteerControl extends AsyncService { } let 
pageScriptEvaluations: Promise[] = []; let frameScriptEvaluations: Promise[] = []; - if (options?.injectPageScripts?.length) { + if (options.injectPageScripts?.length) { page.on('framenavigated', (frame) => { if (frame !== page.mainFrame()) { return; @@ -808,7 +929,7 @@ export class PuppeteerControl extends AsyncService { ); }); } - if (options?.injectFrameScripts?.length) { + if (options.injectFrameScripts?.length) { page.on('framenavigated', (frame) => { frameScriptEvaluations.push( Promise.allSettled(options.injectFrameScripts!.map((x) => frame.evaluate(x).catch((err) => { @@ -819,34 +940,28 @@ export class PuppeteerControl extends AsyncService { } const sn = this.snMap.get(page); this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); - if (options?.locale) { + if (options.locale) { // Add headers via request interception to walk around this bug // https://github.com/puppeteer/puppeteer/issues/10235 // await page.setExtraHTTPHeaders({ - // 'Accept-Language': options?.locale + // 'Accept-Language': options.locale // }); await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, "language", { get: function () { - return options?.locale; + return options.locale; } }); Object.defineProperty(navigator, "languages", { get: function () { - return [options?.locale]; + return [options.locale]; } }); }); } - if (options?.proxyUrl) { - await page.useProxy(options.proxyUrl, { - headers: options.extraHeaders, - interceptResolutionPriority: 2, - }); - } - if (options?.cookies) { + if (options.cookies) { const mapped = options.cookies.map((x) => { const draft: CookieParam = { name: x.name, @@ -876,10 +991,10 @@ export class PuppeteerControl extends AsyncService { }); } } - if (options?.overrideUserAgent) { + if (options.overrideUserAgent) { await page.setUserAgent(options.overrideUserAgent); } - if (options?.viewport) { + if (options.viewport) { await page.setViewport(options.viewport); } @@ -921,13 +1036,13 @@ export class PuppeteerControl extends 
AsyncService { ); }); - const timeout = options?.timeoutMs || 30_000; + const timeout = options.timeoutMs || 30_000; const goToOptions: GoToOptions = { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout, }; - if (options?.referer) { + if (options.referer) { goToOptions.referer = options.referer; } @@ -1019,7 +1134,7 @@ export class PuppeteerControl extends AsyncService { }); gotoPromise.catch(() => 'just dont crash anything'); let waitForPromise: Promise | undefined; - if (options?.waitForSelector) { + if (options.waitForSelector) { const t0 = Date.now(); waitForPromise = nextSnapshotDeferred.promise.then(() => { const t1 = Date.now(); @@ -1054,7 +1169,7 @@ export class PuppeteerControl extends AsyncService { if (waitForPromise) { ckpt.push(waitForPromise); } - if (options?.minIntervalMs) { + if (options.minIntervalMs) { ckpt.push(delay(options.minIntervalMs)); } let error; @@ -1074,7 +1189,7 @@ export class PuppeteerControl extends AsyncService { } as PageSnapshot; break; } - if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { + if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { screenshot = Buffer.from(await page.screenshot()); pageshot = Buffer.from(await page.screenshot({ fullPage: true })); lastHTML = snapshot.html; @@ -1084,7 +1199,8 @@ export class PuppeteerControl extends AsyncService { ...snapshot, status: navigationResponse?.status(), statusText: navigationResponse?.statusText(), - pdfs: _.uniq(pdfUrls), screenshot, pageshot + pdfs: _.uniq(pdfUrls), screenshot, pageshot, + isIntermediate: true, } as PageSnapshot; } if (error) { diff --git a/src/services/registry.ts b/src/services/registry.ts new file mode 100644 index 0000000..e4257a1 --- /dev/null +++ b/src/services/registry.ts @@ -0,0 +1,60 @@ +import { propertyInjectorFactory } from 'civkit/property-injector'; +import { KoaRPCRegistry } from 'civkit/civ-rpc/koa'; +import { container, singleton } from 'tsyringe'; +import { 
IntegrityEnvelope } from 'civkit/civ-rpc'; +import bodyParser from '@koa/bodyparser'; + +import { GlobalLogger } from './logger'; +import { TempFileManager } from './temp-file'; +import { AsyncLocalContext } from './async-context'; +import { BlackHoleDetector } from './blackhole-detector'; +export { Context } from 'koa'; + +export const InjectProperty = propertyInjectorFactory(container); + +@singleton() +export class RPCRegistry extends KoaRPCRegistry { + + title = 'Jina Reader API'; + container = container; + logger = this.globalLogger.child({ service: this.constructor.name }); + static override envelope = IntegrityEnvelope; + override _BODY_PARSER_LIMIT = '102mb'; + override _RESPONSE_STREAM_MODE = 'koa' as const; + + override koaMiddlewares = [ + this.__CORSAllowAllMiddleware.bind(this), + bodyParser({ + encoding: 'utf-8', + enableTypes: ['json', 'form'], + jsonLimit: this._BODY_PARSER_LIMIT, + xmlLimit: this._BODY_PARSER_LIMIT, + formLimit: this._BODY_PARSER_LIMIT, + }), + this.__multiParse.bind(this), + this.__binaryParse.bind(this), + ]; + + constructor( + protected globalLogger: GlobalLogger, + protected ctxMgr: AsyncLocalContext, + protected tempFileManager: TempFileManager, + protected blackHoleDetector: BlackHoleDetector, + ) { + super(...arguments); + + this.on('run', () => this.blackHoleDetector.incomingRequest()); + this.on('ran', () => this.blackHoleDetector.doneWithRequest()); + this.on('fail', () => this.blackHoleDetector.doneWithRequest()); + } + + override async init() { + await this.dependencyReady(); + this.emit('ready'); + } + +} + +const instance = container.resolve(RPCRegistry); +export default instance; +export const { Method, RPCMethod, RPCReflect, Param, Ctx, } = instance.decorators(); diff --git a/src/services/robots-text.ts b/src/services/robots-text.ts new file mode 100644 index 0000000..4e786fb --- /dev/null +++ b/src/services/robots-text.ts @@ -0,0 +1,129 @@ +import { singleton } from 'tsyringe'; +import { 
DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc'; +import { AsyncService } from 'civkit/async-service'; +import { HashManager } from 'civkit/hash'; +import { marshalErrorLike } from 'civkit/lang'; + +import { Logger } from '../shared/services/logger'; +import { BraveSearchHTTP } from '../shared/3rd-party/brave-search'; +import { FirebaseStorageBucketControl } from '../shared'; +import { URL } from 'url'; +import { Threaded } from '../services/threaded'; + + +export const md5Hasher = new HashManager('md5', 'hex'); + +@singleton() +export class RobotsTxtService extends AsyncService { + + logger = this.globalLogger.child({ service: this.constructor.name }); + + braveSearchHTTP!: BraveSearchHTTP; + + constructor( + protected globalLogger: Logger, + protected firebaseStorageBucketControl: FirebaseStorageBucketControl, + ) { + super(...arguments); + } + + override async init() { + await this.dependencyReady(); + this.emit('ready'); + } + + async getCachedRobotTxt(origin: string) { + const digest = md5Hasher.hash(origin.toLowerCase()); + const cacheLoc = `/robot-txt/${digest}`; + let buff; + buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined); + if (buff) { + return buff.toString(); + } + + const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) }); + if (!r.ok) { + throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`); + } + buff = Buffer.from(await r.arrayBuffer()); + + this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, { + contentType: 'text/plain' + }).catch((err) => { + this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) }); + }); + + return buff.toString(); + } + + @Threaded() + async assertAccessAllowed(url: URL, inputMyUa = '*') { + let robotTxt: string = ''; + try { + robotTxt = await this.getCachedRobotTxt(url.origin); + } catch (err) { + if (err instanceof 
DownstreamServiceFailureError) { + return true; + } + throw err; + } + const myUa = inputMyUa.toLowerCase(); + const lines = robotTxt.split(/\r?\n/g); + + let currentUa = myUa || '*'; + let uaLine = 'User-Agent: *'; + const pathNormalized = `${url.pathname}?`; + + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.startsWith('#') || !trimmed) { + continue; + } + const [k, ...rest] = trimmed.split(':'); + const key = k.trim().toLowerCase(); + const value = rest.join(':').trim(); + + if (key === 'user-agent') { + currentUa = value.toLowerCase(); + if (value === '*') { + currentUa = myUa; + } + uaLine = line; + continue; + } + + if (currentUa !== myUa) { + continue; + } + + if (key === 'disallow') { + if (!value) { + return true; + } + if (value.includes('*')) { + const [head, tail] = value.split('*'); + if (url.pathname.startsWith(head) && url.pathname.endsWith(tail)) { + throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`); + } + } else if (pathNormalized.startsWith(value)) { + throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`); + } + + continue; + } + + if (key === 'allow') { + if (!value) { + return true; + } + if (pathNormalized.startsWith(value)) { + return true; + } + continue; + } + } + + return true; + } + +} diff --git a/backend/functions/src/services/serper-search.ts b/src/services/serper-search.ts similarity index 94% rename from backend/functions/src/services/serper-search.ts rename to src/services/serper-search.ts index b8f6507..3be7c11 100644 --- a/backend/functions/src/services/serper-search.ts +++ b/src/services/serper-search.ts @@ -1,11 +1,12 @@ import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit'; -import type { Request, Response } from 'express'; import { singleton } from 'tsyringe'; import { Logger } from 
'../shared/services/logger'; import { SecretExposer } from '../shared/services/secrets'; import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip'; import { AsyncContext } from '../shared'; import { SerperGoogleHTTP, SerperSearchQueryParams, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search'; +import { BlackHoleDetector } from './blackhole-detector'; +import { Context } from './registry'; @singleton() export class SerperSearchService extends AsyncService { @@ -19,6 +20,7 @@ export class SerperSearchService extends AsyncService { protected secretExposer: SecretExposer, protected geoipControl: GeoIPService, protected threadLocal: AsyncContext, + protected blackHoleDetector: BlackHoleDetector, ) { super(...arguments); } @@ -61,6 +63,7 @@ export class SerperSearchService extends AsyncService { try { this.logger.debug(`Doing external search`, query); const r = await this.serperSearchHTTP.webSearch(query); + this.blackHoleDetector.itWorked(); return r.parsed; } catch (err: any) { @@ -132,15 +135,12 @@ export class GoogleSearchExplicitOperatorsDto extends AutoCastable { static override from(input: any) { const instance = super.from(input) as GoogleSearchExplicitOperatorsDto; - const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { - req: Request, - res: Response, - } | undefined; + const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined; const params = ['ext', 'filetype', 'intitle', 'loc', 'site']; for (const p of params) { - const customValue = ctx?.req.get(`x-${p}`) || ctx?.req.get(`${p}`); + const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`); if (!customValue) { continue; } diff --git a/backend/functions/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts similarity index 84% rename from backend/functions/src/services/snapshot-formatter.ts rename to src/services/snapshot-formatter.ts index 85fff87..3fd6cb1 100644 --- a/backend/functions/src/services/snapshot-formatter.ts +++ 
b/src/services/snapshot-formatter.ts @@ -1,19 +1,22 @@ import { randomUUID } from 'crypto'; import { container, singleton } from 'tsyringe'; -import { AsyncService, HashManager, marshalErrorLike } from 'civkit'; +import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit'; import TurndownService, { Filter, Rule } from 'turndown'; import { Logger } from '../shared/services/logger'; import { PageSnapshot } from './puppeteer'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { AsyncContext } from '../shared/services/async-context'; -import { Threaded } from '../shared/services/threaded'; +import { Threaded } from '../services/threaded'; import { JSDomControl } from './jsdom'; import { AltTextService } from './alt-text'; import { PDFExtractor } from './pdf-extract'; import { cleanAttribute } from '../utils/misc'; import _ from 'lodash'; import { STATUS_CODES } from 'http'; -import type { CrawlerOptions } from '../dto/scrapping-options'; +import type { CrawlerOptions } from '../dto/crawler-options'; +import { readFile } from 'fs/promises'; +import { pathToFileURL } from 'url'; +import { countGPTToken } from '../shared'; export interface FormattedPage { @@ -189,7 +192,7 @@ export class SnapshotFormatter extends AsyncService { (!mode.includes('markdown') && !mode.includes('content'))) ) { const dt = Date.now() - t0; - this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); const formatted: FormattedPage = { title: (snapshot.parsed?.title || snapshot.title || '').trim(), @@ -401,7 +404,9 @@ export class SnapshotFormatter extends AsyncService { const n = code - 200; if (n < 0 || n >= 200) { const text = snapshot.statusText || STATUS_CODES[code]; - formatted.warning = `Target URL returned error ${code}${text ? 
`: ${text}` : ''}`; + formatted.warning ??= ''; + const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; } } @@ -428,7 +433,31 @@ export class SnapshotFormatter extends AsyncService { if (this.threadLocal.get('withLinksSummary') === 'all') { formatted.links = links; } else { - formatted.links = _.fromPairs(links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:'))); + formatted.links = _(links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value(); + } + } + + if (countGPTToken(formatted.content) < 200) { + formatted.warning ??= ''; + if (snapshot.isIntermediate) { + const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.'; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + } + if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) { + const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.'; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + } + if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) { + const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.'; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + } + if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) { + const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.'; + formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`; + } + if (snapshot.isFromCache) { + const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.'; + formatted.warning = `${formatted.warning}${formatted.warning ? 
'\n': ''}${msg}`; } } @@ -468,7 +497,7 @@ export class SnapshotFormatter extends AsyncService { } if (this.warning) { - mixins.push(`Warning: ${this.warning}`); + mixins.push(this.warning.split('\n').map((v) => `Warning: ${v}`).join('\n')); } if (mode.includes('markdown')) { @@ -488,7 +517,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; Object.defineProperty(f, 'textRepresentation', { value: textRepresentation, enumerable: false }); const dt = Date.now() - t0; - this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); + this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt }); return f as FormattedPage; } @@ -526,7 +555,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; if (this.threadLocal.get('withLinksSummary') === 'all') { mixin.links = inferred.links; } else { - mixin.links = _.fromPairs(inferred.links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:'))); + mixin.links = _(inferred.links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value(); } } if (snapshot.status) { @@ -534,7 +563,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; const n = code - 200; if (n < 0 || n >= 200) { const text = snapshot.statusText || STATUS_CODES[code]; - mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; + mixin.warning ??= ''; + const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`; + mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`; } } @@ -697,6 +728,52 @@ ${suffixMixins.length ? 
`\n${suffixMixins.join('\n\n')}\n` : ''}`; return false; } + + async createSnapshotFromFile(url: URL, file: FancyFile, overrideContentType?: string, overrideFileName?: string) { + if (overrideContentType === 'application/octet-stream') { + overrideContentType = undefined; + } + + const contentType = (overrideContentType || await file.mimeType).toLowerCase(); + const fileName = overrideFileName || `${url.origin}${url.pathname}`; + const snapshot: PageSnapshot = { + title: '', + href: url.href, + html: '', + text: '' + }; + + if (contentType.startsWith('image/')) { + snapshot.html = `${fileName}`; + snapshot.title = fileName; + + return snapshot; + } + if (contentType.startsWith('text/html')) { + if ((await file.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${url}: file too large`); + } + snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' }); + + return snapshot; + } + if (contentType.startsWith('text/') || contentType.startsWith('application/json')) { + if ((await file.size) > 1024 * 1024 * 32) { + throw new AssertionFailureError(`Failed to access ${url}: file too large`); + } + snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' }); + snapshot.html = `
${snapshot.text}
`; + + return snapshot; + } + if (contentType.startsWith('application/pdf')) { + snapshot.pdfs = [pathToFileURL(await file.filePath).href]; + + return snapshot; + } + + throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`); + } } const snapshotFormatter = container.resolve(SnapshotFormatter); diff --git a/src/services/temp-file.ts b/src/services/temp-file.ts new file mode 100644 index 0000000..857f06b --- /dev/null +++ b/src/services/temp-file.ts @@ -0,0 +1,22 @@ +import { AbstractTempFileManger } from 'civkit/temp'; +import { unlink } from 'fs/promises'; +import { singleton } from 'tsyringe'; + +@singleton() +export class TempFileManager extends AbstractTempFileManger { + + rootDir = ''; + + override async init() { + await this.dependencyReady(); + await super.init(); + this.emit('ready'); + } + + override async standDown() { + await super.standDown(); + + await unlink(this.rootDir); + + } +} diff --git a/src/services/threaded.ts b/src/services/threaded.ts new file mode 100644 index 0000000..9107ecd --- /dev/null +++ b/src/services/threaded.ts @@ -0,0 +1,66 @@ +import 'reflect-metadata'; + +import { singleton, container } from 'tsyringe'; +import { AbstractThreadedServiceRegistry } from 'civkit/threaded'; +import _ from 'lodash'; + +import { GlobalLogger } from './logger'; +import { AsyncLocalContext } from './async-context'; +import { PseudoTransfer } from './pseudo-transfer'; +import { cpus } from 'os'; +import { isMainThread } from 'worker_threads'; + +@singleton() +export class ThreadedServiceRegistry extends AbstractThreadedServiceRegistry { + container = container; + + logger = this.globalLogger.child({ service: this.constructor.name }); + + constructor( + protected globalLogger: GlobalLogger, + public asyncContext: AsyncLocalContext, + public pseudoTransfer: PseudoTransfer, + ) { + super(...arguments); + } + + setMaxWorkersByCpu() { + const cpuStat = cpus(); + + const evenCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 
0).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0); + const oddCpuCycles = cpuStat.filter((_cpu, i) => i % 2 === 1).reduce((acc, cpu) => acc + cpu.times.user + cpu.times.sys, 0); + + const isLikelyHyperThreaded = (oddCpuCycles / evenCpuCycles) < 0.5; + + this.maxWorkers = isLikelyHyperThreaded ? cpuStat.length / 2 : cpuStat.length; + } + + override async init() { + await this.dependencyReady(); + await super.init(); + + if (isMainThread) { + this.setMaxWorkersByCpu(); + await Promise.all( + _.range(0, 2).map( + (_n) => + new Promise( + (resolve, reject) => { + this.createWorker() + .once('message', resolve) + .once('error', reject); + } + ) + ) + ); + } + + this.emit('ready'); + } + +} + + +const instance = container.resolve(ThreadedServiceRegistry); +export default instance; +export const { Method, Param, Ctx, RPCReflect, Threaded } = instance.decorators(); diff --git a/src/shared b/src/shared new file mode 120000 index 0000000..2cfcd24 --- /dev/null +++ b/src/shared @@ -0,0 +1 @@ +../thinapps-shared/backend \ No newline at end of file diff --git a/src/stand-alone/crawl.ts b/src/stand-alone/crawl.ts new file mode 100644 index 0000000..59e8fa4 --- /dev/null +++ b/src/stand-alone/crawl.ts @@ -0,0 +1,139 @@ +import 'reflect-metadata'; +import { container, singleton } from 'tsyringe'; + +import { KoaServer } from 'civkit/civ-rpc/koa'; +import http2 from 'http2'; +import { CrawlerHost } from '../api/crawler'; +import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; +import path from 'path'; +import fs from 'fs'; +import { mimeOfExt } from 'civkit/mime'; +import { Context, Next } from 'koa'; +import { RPCRegistry } from '../services/registry'; +import { AsyncResource } from 'async_hooks'; +import { runOnce } from 'civkit/decorators'; +import { randomUUID } from 'crypto'; +import { ThreadedServiceRegistry } from '../services/threaded'; +import globalLogger, { GlobalLogger } from '../services/logger'; +import { AsyncLocalContext } from 
'../services/async-context'; + +process.on('unhandledRejection', (err) => { + globalLogger.warn('Unhandled rejection', err); +}); + +process.on('uncaughtException', (err) => { + globalLogger.error('Uncaught exception', err); + + // Looks like Firebase runtime does not handle error properly. + // Make sure to quit the process. + globalLogger.error('Uncaught exception, process quit.'); + process.nextTick(() => process.exit(1)); +}); + +@singleton() +export class CrawlStandAloneServer extends KoaServer { + logger = this.globalLogger.child({ service: this.constructor.name }); + + httpAlternativeServer?: typeof this['httpServer']; + assets = new Map(); + + constructor( + protected globalLogger: GlobalLogger, + protected registry: RPCRegistry, + protected crawlerHost: CrawlerHost, + protected threadLocal: AsyncLocalContext, + protected threads: ThreadedServiceRegistry, + ) { + super(...arguments); + } + + h2c() { + this.httpAlternativeServer = this.httpServer; + const fn = this.koaApp.callback(); + this.httpServer = http2.createServer((req, res) => { + const ar = new AsyncResource('HTTP2ServerRequest'); + ar.runInAsyncScope(fn, this.koaApp, req, res); + }); + // useResourceBasedDefaultTracker(); + + return this; + } + + override async init() { + await this.walkForAssets(); + await super.init(); + } + + async walkForAssets() { + const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); + + for (const file of files) { + if (file.type !== 'file') { + continue; + } + this.assets.set(file.relativePath.toString(), file); + } + } + + override listen(port: number) { + const r = super.listen(port); + if (this.httpAlternativeServer) { + const altPort = port + 1; + this.httpAlternativeServer.listen(altPort, () => { + this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); + }); + } + + return r; + } + + makeAssetsServingController() { + return (ctx: Context, next: Next) => { + const requestPath = 
ctx.path; + const file = requestPath.slice(1); + if (!file) { + return next(); + } + + const asset = this.assets.get(file); + if (asset?.type !== 'file') { + return next(); + } + + ctx.body = fs.createReadStream(asset.path); + ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'; + ctx.set('Content-Length', asset.stats.size.toString()); + + return; + }; + } + + registerRoutes(): void { + this.koaApp.use(this.makeAssetsServingController()); + this.koaApp.use(this.registry.makeShimController()); + } + + // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context + // TraceId is expected to be request-bound and unique. So these two has to be distinguished. + @runOnce() + override insertAsyncHookMiddleware() { + const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { + const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; + this.threadLocal.setup({ + traceId: randomUUID(), + traceT0: new Date(), + googleTraceId, + }); + + return next(); + }; + + this.koaApp.use(asyncHookMiddleware); + } + +} +const instance = container.resolve(CrawlStandAloneServer); + +export default instance; + +instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); diff --git a/src/stand-alone/search.ts b/src/stand-alone/search.ts new file mode 100644 index 0000000..c3c7cc8 --- /dev/null +++ b/src/stand-alone/search.ts @@ -0,0 +1,148 @@ +import 'reflect-metadata'; +import { container, singleton } from 'tsyringe'; + +import { KoaServer } from 'civkit/civ-rpc/koa'; +import http2 from 'http2'; +import { SearcherHost } from '../api/searcher-serper'; +import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; +import path from 'path'; +import fs from 'fs'; +import { mimeOfExt } from 'civkit/mime'; +import { Context, Next } from 'koa'; +import { RPCRegistry } from '../services/registry'; +import { AsyncResource } from 'async_hooks'; +import { 
runOnce } from 'civkit/decorators'; +import { randomUUID } from 'crypto'; +import { ThreadedServiceRegistry } from '../services/threaded'; +import globalLogger, { GlobalLogger } from '../services/logger'; +import { AsyncLocalContext } from '../services/async-context'; + +process.on('unhandledRejection', (err) => { + globalLogger.warn('Unhandled rejection', err); +}); + +process.on('uncaughtException', (err) => { + globalLogger.error('Uncaught exception', err); + + // Looks like Firebase runtime does not handle error properly. + // Make sure to quit the process. + globalLogger.error('Uncaught exception, process quit.'); + process.nextTick(() => process.exit(1)); +}); + +@singleton() +export class SearchStandAloneServer extends KoaServer { + logger = this.globalLogger.child({ service: this.constructor.name }); + + httpAlternativeServer?: typeof this['httpServer']; + assets = new Map(); + + constructor( + protected globalLogger: GlobalLogger, + protected registry: RPCRegistry, + protected searcherHost: SearcherHost, + protected threadLocal: AsyncLocalContext, + protected threads: ThreadedServiceRegistry, + ) { + super(...arguments); + } + + h2c() { + this.httpAlternativeServer = this.httpServer; + const fn = this.koaApp.callback(); + this.httpServer = http2.createServer((req, res) => { + const ar = new AsyncResource('HTTP2ServerRequest'); + ar.runInAsyncScope(fn, this.koaApp, req, res); + }); + // useResourceBasedDefaultTracker(); + + return this; + } + + override async init() { + await this.walkForAssets(); + await this.dependencyReady(); + + for (const [k,v] of this.registry.conf.entries()) { + if (v.tags?.includes('crawl')) { + this.registry.conf.delete(k); + } + } + + await super.init(); + } + + async walkForAssets() { + const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public')); + + for (const file of files) { + if (file.type !== 'file') { + continue; + } + this.assets.set(file.relativePath.toString(), file); + } + } + + override 
listen(port: number) { + const r = super.listen(port); + if (this.httpAlternativeServer) { + const altPort = port + 1; + this.httpAlternativeServer.listen(altPort, () => { + this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`); + }); + } + + return r; + } + + makeAssetsServingController() { + return (ctx: Context, next: Next) => { + const requestPath = ctx.path; + const file = requestPath.slice(1); + if (!file) { + return next(); + } + + const asset = this.assets.get(file); + if (asset?.type !== 'file') { + return next(); + } + + ctx.body = fs.createReadStream(asset.path); + ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream'; + ctx.set('Content-Length', asset.stats.size.toString()); + + return; + }; + } + + registerRoutes(): void { + this.koaApp.use(this.makeAssetsServingController()); + this.koaApp.use(this.registry.makeShimController()); + } + + + // Using h2c server has an implication that multiple requests may share the same connection and x-cloud-trace-context + // TraceId is expected to be request-bound and unique. So these two has to be distinguished. 
+ @runOnce() + override insertAsyncHookMiddleware() { + const asyncHookMiddleware = async (ctx: Context, next: () => Promise) => { + const googleTraceId = ctx.get('x-cloud-trace-context').split('/')?.[0]; + this.threadLocal.setup({ + traceId: randomUUID(), + traceT0: new Date(), + googleTraceId, + }); + + return next(); + }; + + this.koaApp.use(asyncHookMiddleware); + } + +} +const instance = container.resolve(SearchStandAloneServer); + +export default instance; + +instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); diff --git a/backend/functions/src/types.d.ts b/src/types.d.ts similarity index 100% rename from backend/functions/src/types.d.ts rename to src/types.d.ts diff --git a/backend/functions/src/utils/get-function-url.ts b/src/utils/get-function-url.ts similarity index 100% rename from backend/functions/src/utils/get-function-url.ts rename to src/utils/get-function-url.ts diff --git a/backend/functions/src/utils/markdown.ts b/src/utils/markdown.ts similarity index 100% rename from backend/functions/src/utils/markdown.ts rename to src/utils/markdown.ts diff --git a/backend/functions/src/utils/misc.ts b/src/utils/misc.ts similarity index 100% rename from backend/functions/src/utils/misc.ts rename to src/utils/misc.ts diff --git a/backend/functions/src/utils/tailwind-classes.ts b/src/utils/tailwind-classes.ts similarity index 100% rename from backend/functions/src/utils/tailwind-classes.ts rename to src/utils/tailwind-classes.ts diff --git a/thinapps-shared b/thinapps-shared index b80a917..0c62acf 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit b80a917835031da9ab7073b6b4005402eece0746 +Subproject commit 0c62acf45e4749ecf4bb7f4bfc7ed49533e239cb diff --git a/backend/functions/tsconfig.json b/tsconfig.json similarity index 100% rename from backend/functions/tsconfig.json rename to tsconfig.json