restructure: no longer a firebase application (#1160)

* fix: fine allow redefining Function.prototype.toString

* wip

* wip

* wip

* wip

* wip

* wip

* wip

* fix: contentType encoding

* wip

* fix: error throwing

* wip

* fix

* wip

* fix

* fix

* fix: jsdom

* wip

* wip

* fix: links summary uniqueness

* wip

* wip

* robots-txt catch no robots.txt

* deps: remove puppeteer-extra-plugin-stealth

* fix: don't change warning type

* fix: curl

* fix: replace firebase-roundtrip-check with blackhole-detector

* fix: black hole detection

* searcher: black hole detection

* fix: no h2c for searcher

* fix: bhd

* fix: search and crawl conflict

* fix: bhd

* fix

* fix: server script

* canvas: fixed avif issue

* logging: move some to debug

* fix

* fix: pptr declare ready only when page can be created without issues

* fix: bhd

* cd: cloud run deploy-health-check cannot complete pptr newPage

* cd: fix

* fix: curl body can be null

* fix

* fix

* fix: major fix regarding TC pdfs

* fix

* fix

* deps: fix civkit trie router issue

* fix

* boom: total restructure

* cd: fix docker ctx

* fix

* fix: switch to h2c

* cd: ensure http2
This commit is contained in:
Yanlong Wang 2025-03-08 00:46:52 +08:00 committed by GitHub
parent ed80c9a4a2
commit 23a3b807c9
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
74 changed files with 2765 additions and 1464 deletions

View File

@ -14,9 +14,6 @@ jobs:
concurrency:
group: ${{ github.ref_type == 'branch' && github.ref }}
cancel-in-progress: true
defaults:
run:
working-directory: backend/functions
permissions:
contents: read
steps:
@ -30,6 +27,8 @@ jobs:
credentials_json: '${{ secrets.GCLOUD_SERVICE_ACCOUNT_SECRET_JSON }}'
- name: 'Set up Cloud SDK'
uses: 'google-github-actions/setup-gcloud@v2'
with:
install_components: beta
- name: "Docker auth"
run: |-
gcloud auth configure-docker us-docker.pkg.dev --quiet
@ -40,7 +39,6 @@ jobs:
with:
node-version: 22.12.0
cache: npm
cache-dependency-path: backend/functions/package-lock.json
- name: npm install
run: npm ci
@ -65,13 +63,13 @@ jobs:
id: container
uses: docker/build-push-action@v6
with:
context: backend/functions
context: .
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
- name: Deploy CRAWL with Tag
run: |
gcloud run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0
gcloud beta run deploy crawl --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/crawl.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2
- name: Deploy SEARCH with Tag
run: |
gcloud run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0
gcloud beta run deploy search --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region us-central1 --async --min-instances 0 --deploy-health-check --use-http2

79
.gitignore vendored
View File

@ -1,4 +1,79 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
firebase-debug.log*
firebase-debug.*.log*
# Firebase cache
.firebase/
# Firebase config
# Uncomment this if you'd like others to create their own Firebase project.
# For a team working on the same Firebase project(s), it is recommended to leave
# it commented so all members can deploy to the same project(s) in .firebaserc.
# .firebaserc
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.secret.local
toy*.ts
.DS_Store
/package-lock.json
backend/functions/test.js
build/
.firebase-emu/
*.log
.DS_Store
*.local
.secret.*
licensed/

59
.vscode/launch.json vendored
View File

@ -1,26 +1,6 @@
{
"version": "0.2.0",
"configurations": [
{
"name": "Debug Fullstack: attach",
"request": "attach",
"cwd": "${workspaceFolder}/backend/functions",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:debug"
},
{
"name": "Debug Fullstack: attach: with proxy",
"request": "attach",
"cwd": "${workspaceFolder}/backend/functions",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:debug:with-proxy"
},
{
"name": "Attach",
"port": 9229,
@ -40,21 +20,44 @@
"type": "node"
},
{
"name": "Debug Fullstack",
"name": "Debug Stand Alone Crawl",
"request": "launch",
"runtimeArgs": [
"emulators:start",
"--import=../.firebase-emu",
"--export-on-exit=../.firebase-emu",
"--env-file=.secret.local",
],
"cwd": "${workspaceFolder}/backend/functions",
"runtimeExecutable": "${workspaceFolder}/node_modules/.bin/firebase",
"env": {
"GCLOUD_PROJECT": "reader-6b7dc",
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
},
"cwd": "${workspaceFolder}",
"program": "build/stand-alone/crawl.js",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"preLaunchTask": "Fullstack:prepare",
"killBehavior": "polite"
"outputCapture": "std",
"preLaunchTask": "Backend:build:watch",
"killBehavior": "forceful"
},
{
"name": "Debug Stand Alone Search",
"request": "launch",
"runtimeArgs": [
"--env-file=.secret.local",
],
"env": {
"GCLOUD_PROJECT": "reader-6b7dc",
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
},
"cwd": "${workspaceFolder}",
"program": "build/stand-alone/search.js",
"skipFiles": [
"<node_internals>/**"
],
"type": "node",
"outputCapture": "std",
"preLaunchTask": "Backend:build:watch",
"killBehavior": "forceful"
},
]
}

132
.vscode/tasks.json vendored
View File

@ -6,29 +6,18 @@
"script": "build",
"group": "build",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
"cwd": "${workspaceFolder}"
},
"problemMatcher": [],
"label": "Backend:rebuild",
"detail": "Backend:rebuild"
},
{
"type": "npm",
"script": "emu:reset",
"group": "build",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [],
"label": "Backend:reset-emulator",
"detail": "Backend:reset-emulator"
},
{
"type": "typescript",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
"cwd": "${workspaceFolder}"
},
"tsconfig": "backend/functions/tsconfig.json",
"tsconfig": "tsconfig.json",
"option": "watch",
"isBackground": true,
"problemMatcher": [
@ -36,121 +25,6 @@
],
"group": "build",
"label": "Backend:build:watch"
},
{
"type": "npm",
"script": "emu:debug",
"group": "none",
"options": {
"cwd": "${workspaceFolder}/backend/functions"
},
"problemMatcher": [
{
"base": "$tsc",
"background": {
"activeOnStart": false,
"beginsPattern": "shutdown requested|Starting emulators",
"endsPattern": "Debugger listening"
}
}
],
"label": "Backend:start-emulator-debug",
"detail": "Backend:start-emulator-debug",
"dependsOn": [
"Backend:build:watch"
],
"isBackground": true,
},
{
"type": "npm",
"script": "dev",
"options": {
"cwd": "${workspaceFolder}/webapp",
},
"group": "build",
"label": "Frontend:start:dev",
"detail": "Frontend:start:dev",
"isBackground": true,
"problemMatcher": {
"base": "$vite",
"background": {
"activeOnStart": true,
"endsPattern": "OK",
"beginsPattern": "vite"
}
},
},
{
"type": "npm",
"script": "dev",
"options": {
"cwd": "${workspaceFolder}/webapp",
"env": {
"FIREBASE_EMULATE": "true",
}
},
"group": "build",
"label": "Frontend:start:emu",
"detail": "Frontend:start:emu",
"isBackground": true,
"problemMatcher": {
"base": "$vite",
"background": {
"activeOnStart": true,
"endsPattern": "OK",
"beginsPattern": "vite"
}
},
},
{
"type": "npm",
"script": "emu:debug2",
"group": "none",
"options": {
"cwd": "${workspaceFolder}/backend/functions",
"env": {
"https_proxy": "http://127.0.0.1:7890",
"http_proxy": "http://127.0.0.1:7890",
"all_proxy": "socks5://127.0.0.1:7890"
}
},
"problemMatcher": [
{
"base": "$tsc",
"background": {
"activeOnStart": false,
"beginsPattern": "shutdown requested|Starting emulators",
"endsPattern": "Debugger listening"
}
}
],
"label": "Backend:start-emulator-debug:with-proxy",
"detail": "Backend:start-emulator-debug:with-proxy",
"dependsOn": [
"Backend:build:watch"
],
"isBackground": true,
},
{
"label": "Fullstack:prepare",
"dependsOn": [
"Frontend:start:emu",
"Backend:build:watch",
],
},
{
"label": "Fullstack:debug",
"dependsOn": [
// "Frontend:start:emu",
"Backend:start-emulator-debug",
],
},
{
"label": "Fullstack:debug:with-proxy",
"dependsOn": [
"Frontend:start:emu",
"Backend:start-emulator-debug:with-proxy",
],
}
]
}

View File

@ -158,13 +158,9 @@ curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.or
You will need the following tools to run the project:
- Node v18 (The build fails for Node version >18)
- Firebase CLI (`npm install -g firebase-tools`)
For backend, go to the `backend/functions` directory and install the npm dependencies.
```bash
git clone git@github.com:jina-ai/reader.git
cd backend/functions
npm install
```

View File

@ -1,5 +0,0 @@
{
"projects": {
"default": "reader-6b7dc"
}
}

79
backend/.gitignore vendored
View File

@ -1,79 +0,0 @@
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
firebase-debug.log*
firebase-debug.*.log*
# Firebase cache
.firebase/
# Firebase config
# Uncomment this if you'd like others to create their own Firebase project.
# For a team working on the same Firebase project(s), it is recommended to leave
# it commented so all members can deploy to the same project(s) in .firebaserc.
# .firebaserc
# Runtime data
pids
*.pid
*.seed
*.pid.lock
# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov
# Coverage directory used by tools like istanbul
coverage
# nyc test coverage
.nyc_output
# Grunt intermediate storage (http://gruntjs.com/creating-plugins#storing-task-files)
.grunt
# Bower dependency directory (https://bower.io/)
bower_components
# node-waf configuration
.lock-wscript
# Compiled binary addons (http://nodejs.org/api/addons.html)
build/Release
# Dependency directories
node_modules/
# Optional npm cache directory
.npm
# Optional eslint cache
.eslintcache
# Optional REPL history
.node_repl_history
# Output of 'npm pack'
*.tgz
# Yarn Integrity file
.yarn-integrity
# dotenv environment variables file
.env
.secret.local
toy*.ts
.DS_Store
build/
.firebase-emu/
*.log
.DS_Store
*.local
.secret.*
licensed/

View File

@ -1,43 +0,0 @@
{
"firestore": {
"rules": "firestore.rules",
"indexes": "firestore.indexes.json"
},
"functions": [
{
"source": "functions",
"codebase": "default",
"ignore": [
"node_modules",
"src",
".git",
"*.log",
"*.local",
".secret.*",
".firebase-emu"
],
"predeploy": [
"npm --prefix \"$RESOURCE_DIR\" run build:clean",
"npm --prefix \"$RESOURCE_DIR\" run build"
]
}
],
"storage": {
"rules": "storage.rules"
},
"emulators": {
"ui": {
"enabled": true
},
"singleProjectMode": true,
"functions": {
"port": 5001
},
"firestore": {
"port": 9098
},
"storage": {
"port": 9097
}
}
}

View File

@ -1,19 +0,0 @@
{
"indexes": [
{
"collectionGroup": "prompts",
"queryScope": "COLLECTION_GROUP",
"fields": [
{
"fieldPath": "id",
"order": "ASCENDING"
},
{
"fieldPath": "isPublic",
"order": "ASCENDING"
}
]
}
],
"fieldOverrides": []
}

View File

@ -1,32 +0,0 @@
rules_version = '2';
service cloud.firestore {
match /databases/{database}/documents {
// match /questions/{document=**} {
// allow read: if request.auth != null
// }
// match /answers/{userId}/profiles/default {
// allow read, write: if request.auth != null && request.auth.uid == userId
// }
match /credits/{userId}/{document=**} {
allow read: if request.auth != null && request.auth.uid == userId
}
match /users/{userId}/prompts/{document=**} {
allow read: if request.auth != null && request.auth.uid == userId
}
// match /users/{userId}/profiles/{document=**} {
// allow read: if request.auth != null && request.auth.uid == userId
// }
match /users/{userId}/creditHistory/{document=**} {
allow read: if request.auth != null && request.auth.uid == userId
}
match /{document=**} {
allow read, write: if false;
}
}
}

View File

@ -1 +0,0 @@
node_modules/

View File

@ -1,36 +0,0 @@
root = true
[*]
end_of_line = lf
charset = utf-8
indent_style = space
insert_final_newline = true
trim_trailing_whitespace = true
indent_size = 4
quote_type = single
max_line_length = 120
[*.py]
indent_size = 4
[*.ts]
indent_size = 4
[*.js]
indent_size = 2
[*.vue]
indent_size = 2
[*.*sx]
indent_size = 2
[*.*ml]
indent_size = 2
[*.json]
indent_size = 2
[*.md]
indent_size = 2
trim_trailing_whitespace = false

View File

@ -1,9 +0,0 @@
const { join } = require('path');
/**
 * Puppeteer configuration exported for the Puppeteer tooling to pick up.
 *
 * @type {import("puppeteer").Configuration}
 */
module.exports = {
// Changes the cache location for Puppeteer: the cache is placed in a
// `walk-around-lame-gcp-build` directory under this package's own
// `node_modules/puppeteer`, resolved relative to this file.
cacheDirectory: join(__dirname, 'node_modules', 'puppeteer', 'walk-around-lame-gcp-build'),
};
};

View File

@ -1,93 +0,0 @@
{
"name": "reader",
"scripts": {
"lint": "eslint --ext .js,.ts .",
"build": "node ./integrity-check.cjs && tsc -p .",
"build:watch": "tsc --watch",
"build:clean": "rm -rf ./build",
"shell": "npm run build && firebase functions:shell",
"emu:stage": "cd .. && tar -czvf firebase-emu-preset.tgz .firebase-emu",
"emu:reset": "rm -rf ../.firebase-emu && tar -xzf ../firebase-emu-preset.tgz --directory ../",
"emu:start": "firebase emulators:start --import ../.firebase-emu --export-on-exit",
"emu:debug": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
"emu:debug2": "firebase emulators:start --import ../.firebase-emu --export-on-exit --inspect-functions",
"emu:kill": "killall java",
"serve": "npm run build && npm run emu:start",
"debug": "npm run build && npm run emu:start -- --inspect-functions",
"from-scratch": "npm run build && rm -rf ../.firebase-emu && firebase emulators:start --export-on-exit",
"from-preset": "npm run build && npm run emu:reset && npm run emu:start",
"start": "npm run shell",
"deploy": "firebase deploy --only functions",
"logs": "firebase functions:log",
"gcp-build": "node node_modules/puppeteer/install.mjs"
},
"engines": {
"node": "20"
},
"main": "build/index.js",
"dependencies": {
"@esm2cjs/normalize-url": "^8.0.0",
"@google-cloud/translate": "^8.2.0",
"@mozilla/readability": "^0.5.0",
"@napi-rs/canvas": "^0.1.67",
"@types/turndown": "^5.0.4",
"@xmldom/xmldom": "^0.9.3",
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.3-3e69606",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.1",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"langdetect": "^0.2.1",
"linkedom": "^0.18.4",
"maxmind": "^4.3.18",
"minio": "^7.1.3",
"node-libcurl": "^4.1.0",
"openai": "^4.20.0",
"pdfjs-dist": "^4.2.67",
"puppeteer": "^23.3.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
"puppeteer-page-proxy": "^1.3.0",
"robots-parser": "^3.0.1",
"set-cookie-parser": "^2.6.0",
"simple-zstd": "^1.4.2",
"stripe": "^11.11.0",
"tiktoken": "^1.0.16",
"tld-extract": "^2.1.0",
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"undici": "^5.24.0"
},
"devDependencies": {
"@types/archiver": "^5.3.4",
"@types/bcrypt": "^5.0.0",
"@types/busboy": "^1.5.4",
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/node": "^20.14.13",
"@types/set-cookie-parser": "^2.4.7",
"@types/xmldom": "^0.1.34",
"@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0",
"eslint": "^8.9.0",
"eslint-config-google": "^0.14.0",
"eslint-plugin-import": "^2.25.4",
"firebase-functions-test": "^3.0.0",
"pino-pretty": "^13.0.0",
"replicate": "^0.16.1",
"typescript": "^5.5.4"
},
"private": true,
"exports": {
".": "./build/index.js"
}
}

View File

@ -1,218 +0,0 @@
import { marshalErrorLike } from 'civkit/lang';
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { Curl, CurlFeature, HeaderInfo } from 'node-libcurl';
import { PageSnapshot, ScrappingOptions } from './puppeteer';
import { Logger } from '../shared/services/logger';
import { JSDomControl } from './jsdom';
import { AssertionFailureError, FancyFile } from 'civkit';
import { TempFileManager } from '../shared';
import { readFile } from 'fs/promises';
import { pathToFileURL } from 'url';
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
import { ZSTDDecompress } from 'simple-zstd';
/**
 * Fetches a URL directly through libcurl (no browser involved) and turns the
 * response into a `PageSnapshot`.
 *
 * Requests are dressed with Chrome-like headers, the response body is streamed
 * to a temp file (transparently decompressed according to `Content-Encoding`),
 * and the snapshot is finally passed through `JSDomControl.narrowSnapshot`.
 */
@singleton()
export class CurlControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected jsdomControl: JSDomControl,
        protected tempFileManager: TempFileManager,
    ) {
        super(...arguments);
    }

    /** Waits for injected dependencies, then announces readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Installs a Chrome-on-Windows looking header set on the given curl handle.
     *
     * @param curl - Handle to configure; it is mutated and returned.
     * @param headers - Extra headers; entries here override the built-in ones
     *   that share the exact same key.
     * @param chromeVersion - Major Chrome version advertised in `sec-ch-ua`
     *   and `User-Agent`.
     * @returns The same `curl` handle, for chaining.
     */
    curlImpersonateHeader(curl: Curl, headers?: object, chromeVersion: number = 132) {
        const mixinHeaders = {
            // Client-hint values are structured-header strings and therefore
            // must be double-quoted, including the first brand and the platform.
            'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${chromeVersion}", "Google Chrome";v="${chromeVersion}"`,
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/${chromeVersion}.0.0.0 Safari/537.36`,
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
            'Sec-Fetch-Site': 'none',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-User': '?1',
            'Sec-Fetch-Dest': 'document',
            'Accept-Encoding': 'gzip, deflate, br, zstd',
            'Accept-Language': 'en-US,en;q=0.9',
        };

        curl.setOpt(Curl.option.HTTPHEADER, Object.entries({ ...mixinHeaders, ...headers }).map(([k, v]) => `${k}: ${v}`));

        return curl;
    }

    /**
     * Downloads `urlToCrawl` with curl and builds a snapshot from the response.
     *
     * - `image/*` responses are not downloaded; a minimal HTML page embedding
     *   the image URL is synthesized instead.
     * - `text/html` is read as-is into `snapshot.html`.
     * - Other `text/*` and `application/json` bodies are stored in
     *   `snapshot.text` and wrapped (HTML-escaped) in a `<pre>` page.
     * - `application/pdf` is exposed as a `file://` URL in `snapshot.pdfs`.
     *
     * @param urlToCrawl - Target URL.
     * @param crawlOpts - Optional timeout, user agent, extra headers, proxy,
     *   cookies and referer.
     * @param throwOnNon200 - When true, a status outside 2xx is an error.
     * @returns The snapshot as narrowed by `JSDomControl.narrowSnapshot`.
     * @throws AssertionFailureError on transport errors, missing
     *   content-type, non-2xx status (if requested), bodies over 32 MiB for
     *   textual types, or an unsupported MIME type.
     */
    async urlToSnapshot(urlToCrawl: URL, crawlOpts?: ScrappingOptions, throwOnNon200 = false): Promise<PageSnapshot> {
        const snapshot = {
            href: urlToCrawl.toString(),
            html: '',
            title: '',
            text: '',
        } as PageSnapshot;

        let contentType = '';
        const result = await new Promise<{
            statusCode: number,
            data?: FancyFile,
            headers: Buffer | HeaderInfo[],
        }>((resolve, reject) => {
            const curl = new Curl();
            curl.enable(CurlFeature.StreamResponse);
            curl.setOpt('URL', urlToCrawl.toString());
            curl.setOpt(Curl.option.FOLLOWLOCATION, true);

            // Never wait longer than 10 seconds, even if the caller allows more.
            curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(10_000, crawlOpts?.timeoutMs || 10_000));

            if (crawlOpts?.overrideUserAgent) {
                curl.setOpt(Curl.option.USERAGENT, crawlOpts.overrideUserAgent);
            }

            // Caller-supplied extra headers are merged over the impersonation set.
            this.curlImpersonateHeader(curl, crawlOpts?.extraHeaders);

            if (crawlOpts?.proxyUrl) {
                curl.setOpt(Curl.option.PROXY, crawlOpts.proxyUrl);
            }
            if (crawlOpts?.cookies?.length) {
                const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${cookie.value}`);
                curl.setOpt(Curl.option.COOKIE, cookieChunks.join('; '));
            }
            if (crawlOpts?.referer) {
                curl.setOpt(Curl.option.REFERER, crawlOpts.referer);
            }

            curl.on('end', (statusCode, _data, headers) => {
                this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl}`, { statusCode, headers });
                curl.close();
            });

            curl.on('error', (err) => {
                curl.close();
                this.logger.warn(`Curl ${urlToCrawl}: ${err} (Not necessarily an error)`, { err: marshalErrorLike(err) });
                reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: ${err.message}`));
            });

            curl.setOpt(Curl.option.MAXFILESIZE, 1024 * 1024 * 1024); // 1GB

            let status = -1;
            let contentEncoding = '';
            curl.on('stream', (stream, statusCode, headers) => {
                status = statusCode;

                // Only the last header block is inspected (the final response
                // after any redirects that were followed).
                const lastResHeaders = headers[headers.length - 1];
                for (const [k, v] of Object.entries(lastResHeaders)) {
                    const kl = k.toLowerCase();
                    if (kl === 'content-type') {
                        contentType = v.toLowerCase();
                    }
                    if (kl === 'content-encoding') {
                        contentEncoding = v.toLowerCase();
                    }
                    if (contentType && contentEncoding) {
                        break;
                    }
                }

                if (!contentType) {
                    reject(new AssertionFailureError(`Failed to directly access ${urlToCrawl}: no content-type`));
                    stream.destroy();
                    return;
                }

                if (contentType.startsWith('image/')) {
                    // No need to download the image: emit a page that embeds it by URL.
                    snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${urlToCrawl.origin}${urlToCrawl.pathname}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${urlToCrawl.href}"></body></html>`;
                    stream.destroy();
                    resolve({
                        statusCode: status,
                        headers,
                    });
                    return;
                }

                // Undo the transfer compression we advertised in Accept-Encoding.
                switch (contentEncoding) {
                    case 'gzip': {
                        const decompressed = createGunzip();
                        stream.pipe(decompressed);
                        stream = decompressed;
                        break;
                    }
                    case 'deflate': {
                        const decompressed = createInflate();
                        stream.pipe(decompressed);
                        stream = decompressed;
                        break;
                    }
                    case 'br': {
                        const decompressed = createBrotliDecompress();
                        stream.pipe(decompressed);
                        stream = decompressed;
                        break;
                    }
                    case 'zstd': {
                        const decompressed = ZSTDDecompress();
                        stream.pipe(decompressed);
                        stream = decompressed;
                        break;
                    }
                    default: {
                        break;
                    }
                }

                // Spool the (decompressed) body into a managed temp file.
                const fpath = this.tempFileManager.alloc();
                const fancyFile = FancyFile.auto(stream, fpath);
                this.tempFileManager.bindPathTo(fancyFile, fpath);
                resolve({
                    statusCode: status,
                    data: fancyFile,
                    headers,
                });
            });

            curl.perform();
        });

        if (throwOnNon200 && result.statusCode && (result.statusCode < 200 || result.statusCode >= 300)) {
            throw new AssertionFailureError(`Failed to access ${urlToCrawl}: HTTP ${result.statusCode}`);
        }

        if (contentType === 'application/octet-stream') {
            // Content declared as binary is same as unknown.
            contentType = '';
        }

        if (result.data) {
            // Fall back to sniffing the file when the server gave no usable type.
            const mimeType: string = contentType || await result.data.mimeType;
            if (mimeType.startsWith('text/html')) {
                if ((await result.data.size) > 1024 * 1024 * 32) {
                    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
                }
                snapshot.html = await readFile(await result.data.filePath, { encoding: 'utf-8' });
            } else if (mimeType.startsWith('text/') || mimeType.startsWith('application/json')) {
                if ((await result.data.size) > 1024 * 1024 * 32) {
                    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: file too large`);
                }
                snapshot.text = await readFile(await result.data.filePath, { encoding: 'utf-8' });
                // Escape markup-significant characters so the raw text is shown
                // verbatim inside <pre> instead of being parsed as HTML.
                const escapedText = snapshot.text
                    .replace(/&/g, '&amp;')
                    .replace(/</g, '&lt;')
                    .replace(/>/g, '&gt;');
                snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${escapedText}</pre></body></html>`;
            } else if (mimeType.startsWith('application/pdf')) {
                snapshot.pdfs = [pathToFileURL(await result.data.filePath).href];
            } else {
                throw new AssertionFailureError(`Failed to access ${urlToCrawl}: unexpected type ${mimeType}`);
            }
        }

        const curlSnapshot = await this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);

        return curlSnapshot!;
    }
}

View File

@ -1 +0,0 @@
../../../thinapps-shared/backend

View File

@ -1,168 +0,0 @@
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { initializeApp, applicationDefault } from 'firebase-admin/app';
// Provide a default FIREBASE_CONFIG (only when the variable is not already set)
// and initialize the Firebase Admin app. This sits between import statements on
// purpose: it runs before the modules imported below are loaded.
// The project id comes from GCLOUD_PROJECT, falling back to 'reader-6b7dc'.
// NOTE(review): `applicationDefault()` returns a credential object that is fed
// through JSON.stringify here — confirm it serializes to anything meaningful.
process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
credential: applicationDefault(),
});
initializeApp();
import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
import { ExpressServer } from 'civkit/civ-rpc/express';
import http2 from 'http2';
import { CrawlerHost } from '../cloud-functions/crawler';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { NextFunction, Request, Response } from 'express';
// Rejections nobody handled are only reported; the process keeps running.
process.on('unhandledRejection', (reason) => {
    console.error('Unhandled rejection', reason);
});

// An uncaught exception is reported and then the process is terminated.
// (The Firebase runtime did not appear to handle these properly, so the
// process is made to quit explicitly, with exit code 1, on the next tick.)
process.on('uncaughtException', (error) => {
    console.log('Uncaught exception', error);

    console.error('Uncaught exception, process quit.');
    process.nextTick(() => {
        process.exit(1);
    });
});
/**
 * Stand-alone HTTP server for the `crawl` endpoint.
 *
 * Built on civkit's `ExpressServer`. It serves static files found under the
 * `public` directory, publishes an OpenAPI description at `/openapi.json`,
 * and forwards everything else to the registry's `crawl` shim controller.
 */
@singleton()
export class CrawlStandAloneServer extends ExpressServer {
logger = this.globalLogger.child({ service: this.constructor.name });
// Set by h2c(): the server object `httpServer` referred to before it was replaced.
httpAlternativeServer?: typeof this['httpServer'];
// Static files discovered by walkForAssets(), keyed by path relative to `public`.
assets = new Map<string, WalkOutEntity>();
constructor(
protected globalLogger: Logger,
protected registry: CloudFunctionRegistry,
protected crawlerHost: CrawlerHost,
protected threadLocal: AsyncContext,
) {
super(...arguments);
// Fire-and-forget: a rejection from allHandsOnDeck() is deliberately ignored here.
registry.allHandsOnDeck().catch(() => void 0);
registry.title = 'reader';
registry.version = '0.1.0';
}
/**
 * Switches the primary server to one created by `http2.createServer`
 * (no TLS options are passed), keeping the previous server as
 * `httpAlternativeServer`.
 *
 * @returns this, for chaining.
 */
h2c() {
this.httpAlternativeServer = this.httpServer;
this.httpServer = http2.createServer(this.expressApp);
// useResourceBasedDefaultTracker();
return this;
}
/** Indexes the static assets first, then runs the base class initialization. */
override async init() {
await this.walkForAssets();
await super.init();
}
/**
 * Walks `<__dirname>/../../public` and records every entry of type 'file'
 * in `assets` under its relative path.
 */
async walkForAssets() {
const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
for (const file of files) {
if (file.type !== 'file') {
continue;
}
this.assets.set(file.relativePath.toString(), file);
}
}
/**
 * Builds an Express middleware that streams a known static asset when the
 * request URL (minus its first character) matches an indexed file, and
 * otherwise defers to the next middleware.
 */
makeAssetsServingController() {
return (req: Request, res: Response, next: NextFunction) => {
const requestPath = req.url;
// Drop the leading character so the value lines up with the relative-path keys.
const file = requestPath.slice(1);
if (!file) {
return next();
}
const asset = this.assets.get(file);
if (asset?.type !== 'file') {
return next();
}
// Content type is derived from the file extension, defaulting to binary.
res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
res.set('Content-Length', asset.stats.size.toString());
fs.createReadStream(asset.path).pipe(res);
return;
};
}
/**
 * Builds an Express middleware that answers every OPTIONS request with an
 * empty 200 and, for other methods, stores the client IP in the async
 * context under the key 'ip' before continuing.
 */
makeMiscMiddleware() {
return (req: Request, res: Response, next: NextFunction) => {
if (req.method === 'OPTIONS') {
return res.status(200).end();
}
this.threadLocal.set('ip', req.ip);
return next();
};
}
/**
 * Starts listening on `port`. If h2c() was called, the alternative server
 * is additionally started on `port + 1`.
 *
 * @param port - Port for the primary server.
 * @returns Whatever the base class `listen` returns.
 */
override listen(port: number) {
const r = super.listen(port);
if (this.httpAlternativeServer) {
const altPort = port + 1;
this.httpAlternativeServer.listen(altPort, () => {
this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
});
}
return r;
}
/**
 * Registers the `/openapi.json` route and the root middleware chain.
 */
override registerRoutes(): void {
const openAPIManager = new OpenAPIManager();
// Document the 'crawl' registry entry as GET/POST on '/{url}'.
openAPIManager.document('/{url}', ['get', 'post'], this.registry.conf.get('crawl')!);
const openapiJsonPath = '/openapi.json';
this.expressRootRouter.get(openapiJsonPath, (req, res) => {
// Base URL of the API = request URL with the trailing '/openapi.json',
// any trailing slashes and the query string removed.
const baseURL = new URL(req.url, `${req.protocol}://${req.headers.host}`);
baseURL.pathname = baseURL.pathname.replace(new RegExp(`${openapiJsonPath}$`, 'i'), '').replace(/\/+$/g, '');
baseURL.search = '';
const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
info: {
title: this.registry.title,
description: `${this.registry.title} openAPI documentations`,
'x-logo': {
url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
}
}
}, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);
res.statusCode = 200;
res.end(JSON.stringify(content));
});
// Chain order: registry middlewares, static assets, OPTIONS/IP handling,
// and finally the registry's shim controller for 'crawl'.
this.expressRootRouter.use('/',
...this.registry.expressMiddlewares,
this.makeAssetsServingController(),
this.makeMiscMiddleware(),
this.registry.makeShimController('crawl')
);
}
/**
 * Selects which base-class features are installed, in this order:
 * async-hook middleware, health check, request logging, OpenAPI docs under
 * '/docs', then this server's own routes.
 */
protected override featureSelect(): void {
this.insertAsyncHookMiddleware();
this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
this.insertLogRequestsMiddleware();
this.registerOpenAPIDocsRoutes('/docs');
this.registerRoutes();
}
}
// Resolve the singleton server from the DI container and export it.
const instance = container.resolve(CrawlStandAloneServer);

export default instance;

// Once the service reports ready, listen on $PORT, or 3000 when PORT is
// unset or does not parse to a non-zero number.
instance.serviceReady().then((server) => {
    const port = parseInt(process.env.PORT || '') || 3000;

    return server.listen(port);
});

View File

@ -1,168 +0,0 @@
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { initializeApp, applicationDefault } from 'firebase-admin/app';
// Provide a default FIREBASE_CONFIG (only when the variable is not already set)
// and initialize the Firebase Admin app. This sits between import statements on
// purpose: it runs before the modules imported below are loaded.
// The project id comes from GCLOUD_PROJECT, falling back to 'reader-6b7dc'.
// NOTE(review): `applicationDefault()` returns a credential object that is fed
// through JSON.stringify here — confirm it serializes to anything meaningful.
process.env['FIREBASE_CONFIG'] ??= JSON.stringify({
projectId: process.env['GCLOUD_PROJECT'] || 'reader-6b7dc',
storageBucket: `${process.env['GCLOUD_PROJECT'] || 'reader-6b7dc'}.appspot.com`,
credential: applicationDefault(),
});
initializeApp();
import { Logger, CloudFunctionRegistry, AsyncContext } from '../shared';
import { AbstractRPCRegistry, OpenAPIManager } from 'civkit/civ-rpc';
import { ExpressServer } from 'civkit/civ-rpc/express';
import http2 from 'http2';
import { SearcherHost } from '../cloud-functions/searcher-serper';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { NextFunction, Request, Response } from 'express';
// Rejections nobody handled are only reported; the process keeps running.
process.on('unhandledRejection', (reason) => {
    console.error('Unhandled rejection', reason);
});

// An uncaught exception is reported and then the process is terminated.
// (The Firebase runtime did not appear to handle these properly, so the
// process is made to quit explicitly, with exit code 1, on the next tick.)
process.on('uncaughtException', (error) => {
    console.log('Uncaught exception', error);

    console.error('Uncaught exception, process quit.');
    process.nextTick(() => {
        process.exit(1);
    });
});
@singleton()
export class SearchStandAloneServer extends ExpressServer {
logger = this.globalLogger.child({ service: this.constructor.name });
httpAlternativeServer?: typeof this['httpServer'];
assets = new Map<string, WalkOutEntity>();
constructor(
protected globalLogger: Logger,
protected registry: CloudFunctionRegistry,
protected searcherHost: SearcherHost,
protected threadLocal: AsyncContext,
) {
super(...arguments);
registry.allHandsOnDeck().catch(() => void 0);
registry.title = 'reader';
registry.version = '0.1.0';
}
h2c() {
this.httpAlternativeServer = this.httpServer;
this.httpServer = http2.createServer(this.expressApp);
// useResourceBasedDefaultTracker();
return this;
}
override async init() {
await this.walkForAssets();
await super.init();
}
async walkForAssets() {
const files = await FsWalk.walkOut(path.resolve(__dirname, '..', '..', 'public'));
for (const file of files) {
if (file.type !== 'file') {
continue;
}
this.assets.set(file.relativePath.toString(), file);
}
}
makeAssetsServingController() {
return (req: Request, res: Response, next: NextFunction) => {
const requestPath = req.url;
const file = requestPath.slice(1);
if (!file) {
return next();
}
const asset = this.assets.get(file);
if (asset?.type !== 'file') {
return next();
}
res.type(mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream');
res.set('Content-Length', asset.stats.size.toString());
fs.createReadStream(asset.path).pipe(res);
return;
};
}
makeMiscMiddleware() {
return (req: Request, res: Response, next: NextFunction) => {
if (req.method === 'OPTIONS') {
return res.status(200).end();
}
this.threadLocal.set('ip', req.ip);
return next();
};
}
override listen(port: number) {
const r = super.listen(port);
if (this.httpAlternativeServer) {
const altPort = port + 1;
this.httpAlternativeServer.listen(altPort, () => {
this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
});
}
return r;
}
/**
 * Registers the HTTP routes of the stand-alone search server:
 * - `GET /openapi.json` returns an OpenAPI description of the `search`
 *   function, documented under `/{q}` for GET and POST;
 * - every other request runs through the registry middlewares, the misc
 *   middleware, the static asset controller and finally the `search` shim controller.
 */
override registerRoutes(): void {
    const openAPIManager = new OpenAPIManager();
    openAPIManager.document('/{q}', ['get', 'post'], this.registry.conf.get('search')!);

    const openapiJsonPath = '/openapi.json';
    // Escape regex metacharacters so the `.` in the path matches only a literal dot.
    const escapedOpenapiJsonPath = openapiJsonPath.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const openapiJsonSuffix = new RegExp(`${escapedOpenapiJsonPath}$`, 'i');

    this.expressRootRouter.get(openapiJsonPath, (req, res) => {
        // HTTP/2 requests may carry the host in the `:authority` pseudo-header
        // instead of `Host`; fall back to it so the base URL is not built from `undefined`.
        const host = req.headers.host || req.headers[':authority'];
        const baseURL = new URL(req.url, `${req.protocol}://${host}`);
        // Strip the document name and trailing slashes to obtain the API root.
        baseURL.pathname = baseURL.pathname.replace(openapiJsonSuffix, '').replace(/\/+$/g, '');
        baseURL.search = '';

        const content = openAPIManager.createOpenAPIObject(baseURL.toString(), {
            info: {
                title: this.registry.title,
                description: `${this.registry.title} openAPI documentations`,
                'x-logo': {
                    url: this.registry.logoUrl || `https://www.openapis.org/wp-content/uploads/sites/3/2018/02/OpenAPI_Logo_Pantone-1.png`
                }
            }
        }, (this.registry.constructor as typeof AbstractRPCRegistry).envelope, req.query as any);

        res.statusCode = 200;
        // Declare the payload type; without it clients have to guess that the body is JSON.
        res.setHeader('Content-Type', 'application/json; charset=utf-8');
        res.end(JSON.stringify(content));
    });

    this.expressRootRouter.use('/',
        ...this.registry.expressMiddlewares,
        this.makeMiscMiddleware(),
        this.makeAssetsServingController(),
        this.registry.makeShimController('search')
    );
}
/**
 * Selects which server features are installed, by calling the setup helpers
 * below in this exact order. Apart from `registerRoutes()`, the helpers are not
 * defined in this part of the file; judging by their names they install the
 * async-hook, health-check and request-logging middleware and mount the
 * OpenAPI docs UI — confirm against the base class.
 */
protected override featureSelect(): void {
this.insertAsyncHookMiddleware();
this.insertHealthCheckMiddleware(this.healthCheckEndpoint);
this.insertLogRequestsMiddleware();
// OpenAPI docs are registered under the `/docs` path.
this.registerOpenAPIDocsRoutes('/docs');
// Application routes are registered last.
this.registerRoutes();
}
}
// Resolve the server from the DI container and expose it as the module's default export.
const instance = container.resolve(SearchStandAloneServer);
export default instance;

// Once the service reports ready, listen on $PORT; an unset or non-numeric value falls back to 3000.
instance.serviceReady().then((server) => {
    const fallbackPort = 3000;
    const requestedPort = parseInt(process.env.PORT || '');

    return server.listen(requestedPort || fallbackPort);
});

View File

@ -1,8 +0,0 @@
rules_version = '2';
service firebase.storage {
match /b/{bucket}/o {
match /{allPaths=**} {
allow read, write: if false;
}
}
}

View File

@ -8,15 +8,16 @@
"dependencies": {
"@esm2cjs/normalize-url": "^8.0.0",
"@google-cloud/translate": "^8.2.0",
"@koa/bodyparser": "^5.1.1",
"@mozilla/readability": "^0.5.0",
"@napi-rs/canvas": "^0.1.67",
"@napi-rs/canvas": "^0.1.68",
"@types/turndown": "^5.0.4",
"@xmldom/xmldom": "^0.9.3",
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.3-3e69606",
"civkit": "^0.8.4-32482a3",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -31,7 +32,7 @@
"minio": "^7.1.3",
"node-libcurl": "^4.1.0",
"openai": "^4.20.0",
"pdfjs-dist": "^4.2.67",
"pdfjs-dist": "^4.10.38",
"puppeteer": "^23.3.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3",
@ -53,6 +54,7 @@
"@types/busboy": "^1.5.4",
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/koa": "^2.15.0",
"@types/node": "^20.14.13",
"@types/set-cookie-parser": "^2.4.7",
"@types/xmldom": "^0.1.34",
@ -62,6 +64,7 @@
"eslint-config-google": "^0.14.0",
"eslint-plugin-import": "^2.25.4",
"firebase-functions-test": "^3.0.0",
"koa": "^2.16.0",
"pino-pretty": "^13.0.0",
"replicate": "^0.16.1",
"typescript": "^5.5.4"
@ -1626,6 +1629,23 @@
"url": "https://opencollective.com/js-sdsl"
}
},
"node_modules/@koa/bodyparser": {
"version": "5.1.1",
"resolved": "https://registry.npmjs.org/@koa/bodyparser/-/bodyparser-5.1.1.tgz",
"integrity": "sha512-ZBF49xqNVxnmJ+8iXegq+fXPQm9RSX8giNl/aXS5rW1VpNct92wnFbGR/47vfoRJVLARGQ4HVL4WaQ0u8IJVoA==",
"license": "MIT",
"dependencies": {
"co-body": "^6.1.0",
"lodash.merge": "^4.6.2",
"type-is": "^1.6.18"
},
"engines": {
"node": ">= 16"
},
"peerDependencies": {
"koa": "^2.14.1"
}
},
"node_modules/@koa/router": {
"version": "12.0.1",
"resolved": "https://registry.npmjs.org/@koa/router/-/router-12.0.1.tgz",
@ -1679,30 +1699,30 @@
}
},
"node_modules/@napi-rs/canvas": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.67.tgz",
"integrity": "sha512-VA4Khm/5Kg2bQGx3jXotTC4MloOG8b1Ung80exafUK0k5u6yJmIz3Q2iXeeWZs5weV+LQOEB+CPKsYwEYaGAjw==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.68.tgz",
"integrity": "sha512-LQESrePLEBLvhuFkXx9jjBXRC2ClYsO5mqQ1m/puth5z9SOuM3N/B3vDuqnC3RJFktDktyK9khGvo7dTkqO9uQ==",
"license": "MIT",
"engines": {
"node": ">= 10"
},
"optionalDependencies": {
"@napi-rs/canvas-android-arm64": "0.1.67",
"@napi-rs/canvas-darwin-arm64": "0.1.67",
"@napi-rs/canvas-darwin-x64": "0.1.67",
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.67",
"@napi-rs/canvas-linux-arm64-gnu": "0.1.67",
"@napi-rs/canvas-linux-arm64-musl": "0.1.67",
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.67",
"@napi-rs/canvas-linux-x64-gnu": "0.1.67",
"@napi-rs/canvas-linux-x64-musl": "0.1.67",
"@napi-rs/canvas-win32-x64-msvc": "0.1.67"
"@napi-rs/canvas-android-arm64": "0.1.68",
"@napi-rs/canvas-darwin-arm64": "0.1.68",
"@napi-rs/canvas-darwin-x64": "0.1.68",
"@napi-rs/canvas-linux-arm-gnueabihf": "0.1.68",
"@napi-rs/canvas-linux-arm64-gnu": "0.1.68",
"@napi-rs/canvas-linux-arm64-musl": "0.1.68",
"@napi-rs/canvas-linux-riscv64-gnu": "0.1.68",
"@napi-rs/canvas-linux-x64-gnu": "0.1.68",
"@napi-rs/canvas-linux-x64-musl": "0.1.68",
"@napi-rs/canvas-win32-x64-msvc": "0.1.68"
}
},
"node_modules/@napi-rs/canvas-android-arm64": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.67.tgz",
"integrity": "sha512-W+3DFG5h0WU8Vqqb3W5fNmm5/TPH5ECZRinQDK4CAKFSUkc4iZcDwrmyFG9sB4KdHazf1mFVHCpEeVMO6Mk6Zg==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.68.tgz",
"integrity": "sha512-h1KcSR4LKLfRfzeBH65xMxbWOGa1OtMFQbCMVlxPCkN1Zr+2gK+70pXO5ktojIYcUrP6KDcOwoc8clho5ccM/w==",
"cpu": [
"arm64"
],
@ -1716,9 +1736,9 @@
}
},
"node_modules/@napi-rs/canvas-darwin-arm64": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.67.tgz",
"integrity": "sha512-xzrv7QboI47yhIHR5P5u/9KGswokuOKLiKSukr1Ku03RRJxP6lGuVtrAZAgdRg7F9FsuF2REf2yK53YVb6pMlA==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.68.tgz",
"integrity": "sha512-/VURlrAD4gDoxW1GT/b0nP3fRz/fhxmHI/xznTq2FTwkQLPOlLkDLCvTmQ7v6LtGKdc2Ed6rvYpRan+JXThInQ==",
"cpu": [
"arm64"
],
@ -1732,9 +1752,9 @@
}
},
"node_modules/@napi-rs/canvas-darwin-x64": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.67.tgz",
"integrity": "sha512-SNk9lYBr84N0gW8MZ2IrjygFtbFBILr3SEqMdHzHHuph20SQmssFvJGPZwSSCMEyKAvyqhogbmlew0te5Z4w9Q==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.68.tgz",
"integrity": "sha512-tEpvGR6vCLTo1Tx9wmDnoOKROpw57wiCWwCpDOuVlj/7rqEJOUYr9ixW4aRJgmeGBrZHgevI0EURys2ER6whmg==",
"cpu": [
"x64"
],
@ -1748,9 +1768,9 @@
}
},
"node_modules/@napi-rs/canvas-linux-arm-gnueabihf": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.67.tgz",
"integrity": "sha512-qmBlSvUpl567bzH8tNXi82u5FrL4d0qINqd6K9O7GWGGGFmKMJdrgi2/SW3wwCTxqHBasIDdVWc4KSJfwyaoDQ==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.68.tgz",
"integrity": "sha512-U9xbJsumPOiAYeAFZMlHf62b9dGs2HJ6Q5xt7xTB0uEyPeurwhgYBWGgabdsEidyj38YuzI/c3LGBbSQB3vagw==",
"cpu": [
"arm"
],
@ -1764,9 +1784,9 @@
}
},
"node_modules/@napi-rs/canvas-linux-arm64-gnu": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.67.tgz",
"integrity": "sha512-k3nAPQefkMeFuJ65Rqdnx92KX1JXQhEKjjWeKsCJB+7sIBgQUWtHo9c3etfVLv5pkWJJDFi/Zc2soNkH3E8dRA==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.68.tgz",
"integrity": "sha512-KFkn8wEm3mPnWD4l8+OUUkxylSJuN5q9PnJRZJgv15RtCA1bgxIwTkBhI/+xuyVMcHqON9sXq7cDkEJtHm35dg==",
"cpu": [
"arm64"
],
@ -1780,9 +1800,9 @@
}
},
"node_modules/@napi-rs/canvas-linux-arm64-musl": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.67.tgz",
"integrity": "sha512-lZwHWR1cCP408l86n3Qbs3X1oFeAYMjJIQvQl1VMZh6wo5PfI+jaZSKBUOd8x44TnVllX9yhLY9unNRztk/sUQ==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.68.tgz",
"integrity": "sha512-IQzts91rCdOALXBWQxLZRCEDrfFTGDtNRJMNu+2SKZ1uT8cmPQkPwVk5rycvFpvgAcmiFiOSCp1aRrlfU8KPpQ==",
"cpu": [
"arm64"
],
@ -1796,9 +1816,9 @@
}
},
"node_modules/@napi-rs/canvas-linux-riscv64-gnu": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.67.tgz",
"integrity": "sha512-PdBC9p6bLHA1W3OdA0vTHj701SB/kioGQ1uCFBRMs5KBCaMLb/H4aNi8uaIUIEvBWnxeAjoNcLU7//q0FxEosw==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.68.tgz",
"integrity": "sha512-e9AS5UttoIKqXSmBzKZdd3NErSVyOEYzJfNOCGtafGk1//gibTwQXGlSXmAKuErqMp09pyk9aqQRSYzm1AQfBw==",
"cpu": [
"riscv64"
],
@ -1812,9 +1832,9 @@
}
},
"node_modules/@napi-rs/canvas-linux-x64-gnu": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.67.tgz",
"integrity": "sha512-kJJX6eWzjipL/LdKOWCJctc88e5yzuXri8+s0V/lN06OwuLGW62TWS3lvi8qlUrGMOfRGabSWWlB4omhASSB8w==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.68.tgz",
"integrity": "sha512-Pa/I36VE3j57I3Obhrr+J48KGFfkZk2cJN/2NmW/vCgmoF7kCP6aTVq5n+cGdGWLd/cN9CJ9JvNwEoMRDghu0g==",
"cpu": [
"x64"
],
@ -1828,9 +1848,9 @@
}
},
"node_modules/@napi-rs/canvas-linux-x64-musl": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.67.tgz",
"integrity": "sha512-jLKiPWGeN6ZzhnaLG7ex7eexsiHJ1mdtPK1qKvETIcu45dApMXyUIHvdL6XWB5gFFtj5ScHzLUxv1vkfPZsoxA==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.68.tgz",
"integrity": "sha512-9c6rkc5195wNxuUHJdf4/mmnq433OQey9TNvQ9LspJazvHbfSkTij8wtKjASVQsJyPDva4fkWOeV/OQ7cLw0GQ==",
"cpu": [
"x64"
],
@ -1844,9 +1864,9 @@
}
},
"node_modules/@napi-rs/canvas-win32-x64-msvc": {
"version": "0.1.67",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.67.tgz",
"integrity": "sha512-K/JmkOFbc4iRZYUqJhj0jwqfHA/wNQEmTiGNsgZ6d59yF/IBNp5T0D5eg3B8ghjI8GxDYCiSJ6DNX8mC3Oh2EQ==",
"version": "0.1.68",
"resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.68.tgz",
"integrity": "sha512-Fc5Dez23u0FoSATurT6/w1oMytiRnKWEinHivdMvXpge6nG4YvhrASrtqMk8dGJMVQpHr8QJYF45rOrx2YU2Aw==",
"cpu": [
"x64"
],
@ -2238,6 +2258,16 @@
"resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz",
"integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="
},
"node_modules/@types/accepts": {
"version": "1.3.7",
"resolved": "https://registry.npmjs.org/@types/accepts/-/accepts-1.3.7.tgz",
"integrity": "sha512-Pay9fq2lM2wXPWbteBsRAGiWH2hig4ZE2asK+mm7kUzlxRTfL961rj89I6zV/E3PcIkDqyuBEcMxFT7rccugeQ==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/node": "*"
}
},
"node_modules/@types/archiver": {
"version": "5.3.4",
"resolved": "https://registry.npmjs.org/@types/archiver/-/archiver-5.3.4.tgz",
@ -2344,6 +2374,26 @@
"@types/node": "*"
}
},
"node_modules/@types/content-disposition": {
"version": "0.5.8",
"resolved": "https://registry.npmjs.org/@types/content-disposition/-/content-disposition-0.5.8.tgz",
"integrity": "sha512-QVSSvno3dE0MgO76pJhmv4Qyi/j0Yk9pBp0Y7TJ2Tlj+KCgJWY6qX7nnxCOLkZ3VYRSIk1WTxCvwUSdx6CCLdg==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/cookies": {
"version": "0.9.0",
"resolved": "https://registry.npmjs.org/@types/cookies/-/cookies-0.9.0.tgz",
"integrity": "sha512-40Zk8qR147RABiQ7NQnBzWzDcjKzNrntB5BAmeGCb2p/MIyOE+4BVvc17wumsUqUw00bJYqoXFHYygQnEFh4/Q==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/connect": "*",
"@types/express": "*",
"@types/keygrip": "*",
"@types/node": "*"
}
},
"node_modules/@types/cors": {
"version": "2.8.17",
"resolved": "https://registry.npmjs.org/@types/cors/-/cors-2.8.17.tgz",
@ -2403,6 +2453,13 @@
"@types/node": "*"
}
},
"node_modules/@types/http-assert": {
"version": "1.5.6",
"resolved": "https://registry.npmjs.org/@types/http-assert/-/http-assert-1.5.6.tgz",
"integrity": "sha512-TTEwmtjgVbYAzZYWyeHPrrtWnfVkm8tQkP8P21uQifPgMRgjrow3XDEYqucuC8SKZJT7pUnhU/JymvjggxO9vw==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/http-cache-semantics": {
"version": "4.0.4",
"resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz",
@ -2460,6 +2517,13 @@
"@types/node": "*"
}
},
"node_modules/@types/keygrip": {
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/@types/keygrip/-/keygrip-1.0.6.tgz",
"integrity": "sha512-lZuNAY9xeJt7Bx4t4dx0rYCDqGPW8RXhQZK1td7d4H6E9zYbLoOtjBvfwdTKpsyxQI/2jv+armjX/RW+ZNpXOQ==",
"dev": true,
"license": "MIT"
},
"node_modules/@types/keyv": {
"version": "3.1.4",
"resolved": "https://registry.npmjs.org/@types/keyv/-/keyv-3.1.4.tgz",
@ -2468,6 +2532,33 @@
"@types/node": "*"
}
},
"node_modules/@types/koa": {
"version": "2.15.0",
"resolved": "https://registry.npmjs.org/@types/koa/-/koa-2.15.0.tgz",
"integrity": "sha512-7QFsywoE5URbuVnG3loe03QXuGajrnotr3gQkXcEBShORai23MePfFYdhz90FEtBBpkyIYQbVD+evKtloCgX3g==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/accepts": "*",
"@types/content-disposition": "*",
"@types/cookies": "*",
"@types/http-assert": "*",
"@types/http-errors": "*",
"@types/keygrip": "*",
"@types/koa-compose": "*",
"@types/node": "*"
}
},
"node_modules/@types/koa-compose": {
"version": "3.2.8",
"resolved": "https://registry.npmjs.org/@types/koa-compose/-/koa-compose-3.2.8.tgz",
"integrity": "sha512-4Olc63RY+MKvxMwVknCUDhRQX1pFQoBZ/lXcRLP69PQkEpze/0cr8LNqJQe5NFb/b19DWi2a5bTi2VAlQzhJuA==",
"dev": true,
"license": "MIT",
"dependencies": {
"@types/koa": "*"
}
},
"node_modules/@types/lodash": {
"version": "4.17.0",
"resolved": "https://registry.npmjs.org/@types/lodash/-/lodash-4.17.0.tgz",
@ -3836,7 +3927,6 @@
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/cache-content-type/-/cache-content-type-1.0.1.tgz",
"integrity": "sha512-IKufZ1o4Ut42YUrZSo8+qnMTrFuKkvyoLXUywKz9GJ5BrhOFGhLdkx9sG4KAnVvbY6kEcSFjLQul+DVmBm2bgA==",
"optional": true,
"dependencies": {
"mime-types": "^2.1.18",
"ylru": "^1.2.0"
@ -4005,9 +4095,10 @@
}
},
"node_modules/civkit": {
"version": "0.8.3-3e69606",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.3-3e69606.tgz",
"integrity": "sha512-niV5U11ySIiVNSnGpW49KJlExmIiuQQfnyQEXeYuKCE+B+wkqYCBG+3tlY3E882tmPkaQQKpDlF/yTeqEU2q2Q==",
"version": "0.8.4-32482a3",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-32482a3.tgz",
"integrity": "sha512-VQwRreeVKYEoSMlhwYrPGpAA5na6lrIavGKmYNrhsHVJEvSfgkWKEete/btZzer4+WBxnNRw+PpRPrq6xjt13Q==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",
"tslib": "^2.5.0"
@ -4138,7 +4229,6 @@
"version": "4.6.0",
"resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz",
"integrity": "sha512-QVb0dM5HvG+uaxitm8wONl7jltx8dqhfU33DcqtOZcLSVIKSDDLDi7+0LbAKiyI8hD9u42m2YxXSkMGWThaecQ==",
"devOptional": true,
"engines": {
"iojs": ">= 1.0.0",
"node": ">= 0.12.0"
@ -4148,7 +4238,6 @@
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/co-body/-/co-body-6.1.0.tgz",
"integrity": "sha512-m7pOT6CdLN7FuXUcpuz/8lfQ/L77x8SchHCF4G0RBTJO20Wzmhn5Sp4/5WsKy8OSpifBSUrmg83qEqaDHdyFuQ==",
"optional": true,
"dependencies": {
"inflation": "^2.0.0",
"qs": "^6.5.2",
@ -4273,7 +4362,6 @@
"version": "0.9.1",
"resolved": "https://registry.npmjs.org/cookies/-/cookies-0.9.1.tgz",
"integrity": "sha512-TG2hpqe4ELx54QER/S3HQ9SRVnQnGBtKUz5bLQWtYAQ+o6GpgMs6sYUvaiJjVxb+UXwhRhAEP3m7LbsIZ77Hmw==",
"optional": true,
"dependencies": {
"depd": "~2.0.0",
"keygrip": "~1.1.0"
@ -4582,8 +4670,7 @@
"node_modules/deep-equal": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/deep-equal/-/deep-equal-1.0.1.tgz",
"integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw==",
"optional": true
"integrity": "sha512-bHtC0iYvWhyaTzvV3CZgPeZQqCOBGyGsVV7v4eevpdkLHfiSrXUdBG+qAuSz4RI70sszvjQ1QSZ98An1yNwpSw=="
},
"node_modules/deep-extend": {
"version": "0.6.0",
@ -6701,7 +6788,6 @@
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/http-assert/-/http-assert-1.5.0.tgz",
"integrity": "sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==",
"optional": true,
"dependencies": {
"deep-equal": "~1.0.1",
"http-errors": "~1.8.0"
@ -6714,7 +6800,6 @@
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
"optional": true,
"engines": {
"node": ">= 0.6"
}
@ -6723,7 +6808,6 @@
"version": "1.8.1",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
"optional": true,
"dependencies": {
"depd": "~1.1.2",
"inherits": "2.0.4",
@ -6739,7 +6823,6 @@
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
"optional": true,
"engines": {
"node": ">= 0.6"
}
@ -6940,7 +7023,6 @@
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/inflation/-/inflation-2.1.0.tgz",
"integrity": "sha512-t54PPJHG1Pp7VQvxyVCJ9mBbjG3Hqryges9bXoOO6GExCPa+//i/d5GSuFtpx3ALLd7lgIAur6zrIlBQyJuMlQ==",
"optional": true,
"engines": {
"node": ">= 0.8.0"
}
@ -8316,7 +8398,6 @@
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/keygrip/-/keygrip-1.1.0.tgz",
"integrity": "sha512-iYSchDJ+liQ8iwbSI2QqsQOvqv58eJCEanyJPJi+Khyu8smkcKSFUCbPwzFcL7YVtZ6eONjqRX/38caJ7QjRAQ==",
"optional": true,
"dependencies": {
"tsscmp": "1.0.6"
},
@ -8354,10 +8435,10 @@
}
},
"node_modules/koa": {
"version": "2.15.3",
"resolved": "https://registry.npmjs.org/koa/-/koa-2.15.3.tgz",
"integrity": "sha512-j/8tY9j5t+GVMLeioLaxweJiKUayFhlGqNTzf2ZGwL0ZCQijd2RLHK0SLW5Tsko8YyyqCZC2cojIb0/s62qTAg==",
"optional": true,
"version": "2.16.0",
"resolved": "https://registry.npmjs.org/koa/-/koa-2.16.0.tgz",
"integrity": "sha512-Afhqq0Vq3W7C+/rW6IqHVBDLzqObwZ07JaUNUEF8yCQ6afiyFE3RAy+i7V0E46XOWlH7vPWn/x0vsZwNy6PWxw==",
"license": "MIT",
"dependencies": {
"accepts": "^1.3.5",
"cache-content-type": "^1.0.0",
@ -8404,14 +8485,12 @@
"node_modules/koa-compose": {
"version": "4.1.0",
"resolved": "https://registry.npmjs.org/koa-compose/-/koa-compose-4.1.0.tgz",
"integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw==",
"optional": true
"integrity": "sha512-8ODW8TrDuMYvXRwra/Kh7/rJo9BtOfPc6qO8eAfC80CnCvSjSl0bkRM24X6/XBBEyj0v1nRUQ1LyOy3dbqOWXw=="
},
"node_modules/koa-convert": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/koa-convert/-/koa-convert-2.0.0.tgz",
"integrity": "sha512-asOvN6bFlSnxewce2e/DK3p4tltyfC4VM7ZwuTuepI7dEQVcvpyFuBcEARu1+Hxg8DIwytce2n7jrZtRlPrARA==",
"optional": true,
"dependencies": {
"co": "^4.6.0",
"koa-compose": "^4.1.0"
@ -8424,7 +8503,6 @@
"version": "1.8.1",
"resolved": "https://registry.npmjs.org/http-errors/-/http-errors-1.8.1.tgz",
"integrity": "sha512-Kpk9Sm7NmI+RHhnj6OIWDI1d6fIoFAtFt9RLaTMRlg/8w49juAStsrBgp0Dp4OdxdVbRIeKhtCUvoi/RuAhO4g==",
"optional": true,
"dependencies": {
"depd": "~1.1.2",
"inherits": "2.0.4",
@ -8440,7 +8518,6 @@
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/depd/-/depd-1.1.2.tgz",
"integrity": "sha512-7emPTl6Dpo6JRXOXjLRxck+FlLRX5847cLKEn00PLAgc3g2hTZZgr+e4c2v6QpSmLeFP3n5yUo7ft6avBK/5jQ==",
"optional": true,
"engines": {
"node": ">= 0.6"
}
@ -8449,7 +8526,6 @@
"version": "1.5.0",
"resolved": "https://registry.npmjs.org/statuses/-/statuses-1.5.0.tgz",
"integrity": "sha512-OpZ3zP+jT1PI7I8nemJX4AKmAX070ZkYPVWV/AaKTJl+tXCTGyVdC1a4SL8RUQYEwk/f34ZX8UTykN68FwrqAA==",
"optional": true,
"engines": {
"node": ">= 0.6"
}
@ -8644,8 +8720,7 @@
"node_modules/lodash.merge": {
"version": "4.6.2",
"resolved": "https://registry.npmjs.org/lodash.merge/-/lodash.merge-4.6.2.tgz",
"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ==",
"dev": true
"integrity": "sha512-0KpjqXRVvrYyCsX1swR/XTK0va6VQkQM6MNo7PqW77ByjAhoARA8EfrP1N4+KlKj8YS0ZUCtRT/YUuhyYDujIQ=="
},
"node_modules/lodash.once": {
"version": "4.1.1",
@ -9853,8 +9928,7 @@
"node_modules/only": {
"version": "0.0.2",
"resolved": "https://registry.npmjs.org/only/-/only-0.0.2.tgz",
"integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ==",
"optional": true
"integrity": "sha512-Fvw+Jemq5fjjyWz6CpKx6w9s7xxqo3+JCyM0WXWeCSOboZ8ABkyvP8ID4CZuChA/wxSx+XSJmdOm8rGVyJ1hdQ=="
},
"node_modules/openai": {
"version": "4.33.0",
@ -10118,15 +10192,15 @@
}
},
"node_modules/pdfjs-dist": {
"version": "4.2.67",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.2.67.tgz",
"integrity": "sha512-rJmuBDFpD7cqC8WIkQUEClyB4UAH05K4AsyewToMTp2gSy3Rrx8c1ydAVqlJlGv3yZSOrhEERQU/4ScQQFlLHA==",
"version": "4.10.38",
"resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-4.10.38.tgz",
"integrity": "sha512-/Y3fcFrXEAsMjJXeL9J8+ZG9U01LbuWaYypvDW2ycW1jL269L3js3DVBjDJ0Up9Np1uqDXsDrRihHANhZOlwdQ==",
"license": "Apache-2.0",
"engines": {
"node": ">=18"
"node": ">=20"
},
"optionalDependencies": {
"canvas": "^2.11.2",
"path2d": "^0.2.0"
"@napi-rs/canvas": "^0.1.65"
}
},
"node_modules/peek-stream": {
@ -12443,7 +12517,6 @@
"version": "1.0.6",
"resolved": "https://registry.npmjs.org/tsscmp/-/tsscmp-1.0.6.tgz",
"integrity": "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA==",
"optional": true,
"engines": {
"node": ">=0.6.x"
}
@ -13136,7 +13209,6 @@
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/ylru/-/ylru-1.4.0.tgz",
"integrity": "sha512-2OQsPNEmBCvXuFlIni/a+Rn+R2pHW9INm0BxXJ4hVDA8TirqMj+J/Rp9ItLatT/5pZqWwefVrTQcHpixsxnVlA==",
"optional": true,
"engines": {
"node": ">= 4.0.0"
}

View File

@ -1,15 +1,84 @@
{
"name": "reader",
"version": "1.0.0",
"description": "### Prerequisite - Node v18 (The build fails for Node version >18) - Yarn - Firebase CLI (`npm install -g firebase-tools`)",
"main": "index.js",
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"author": "",
"license": "ISC",
"devDependencies": {
"firebase-tools": "^13.6.2",
"typescript": "^5.1.6"
}
}
"name": "reader",
"scripts": {
"lint": "eslint --ext .js,.ts .",
"build": "node ./integrity-check.cjs && tsc -p .",
"build:watch": "tsc --watch",
"build:clean": "rm -rf ./build",
"serve": "npm run build && npm run start",
"debug": "npm run build && npm run dev",
"start": "npm run shell"
},
"engines": {
"node": "20"
},
"main": "build/index.js",
"dependencies": {
"@esm2cjs/normalize-url": "^8.0.0",
"@google-cloud/translate": "^8.2.0",
"@koa/bodyparser": "^5.1.1",
"@mozilla/readability": "^0.5.0",
"@napi-rs/canvas": "^0.1.68",
"@types/turndown": "^5.0.4",
"@xmldom/xmldom": "^0.9.3",
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"busboy": "^1.6.0",
"civkit": "^0.8.4-32482a3",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.1",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"langdetect": "^0.2.1",
"linkedom": "^0.18.4",
"maxmind": "^4.3.18",
"minio": "^7.1.3",
"node-libcurl": "^4.1.0",
"openai": "^4.20.0",
"pdfjs-dist": "^4.10.38",
"puppeteer": "^23.3.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^1.3.1",
"puppeteer-page-proxy": "^1.3.0",
"robots-parser": "^3.0.1",
"set-cookie-parser": "^2.6.0",
"simple-zstd": "^1.4.2",
"stripe": "^11.11.0",
"tiktoken": "^1.0.16",
"tld-extract": "^2.1.0",
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"undici": "^5.24.0"
},
"devDependencies": {
"@types/archiver": "^5.3.4",
"@types/bcrypt": "^5.0.0",
"@types/busboy": "^1.5.4",
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/koa": "^2.15.0",
"@types/node": "^20.14.13",
"@types/set-cookie-parser": "^2.4.7",
"@types/xmldom": "^0.1.34",
"@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0",
"eslint": "^8.9.0",
"eslint-config-google": "^0.14.0",
"eslint-plugin-import": "^2.25.4",
"firebase-functions-test": "^3.0.0",
"koa": "^2.16.0",
"pino-pretty": "^13.0.0",
"replicate": "^0.16.1",
"typescript": "^5.5.4"
},
"private": true,
"exports": {
".": "./build/index.js"
}
}

View File

Before

Width:  |  Height:  |  Size: 14 KiB

After

Width:  |  Height:  |  Size: 14 KiB

View File

@ -1,30 +1,45 @@
import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
AssertionFailureError, ParamValidationError, Defer,
} from 'civkit';
import { singleton } from 'tsyringe';
import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
const pNormalizeUrl = import("@esm2cjs/normalize-url");
import { Crawled } from '../db/crawled';
import { pathToFileURL } from 'url';
import { randomUUID } from 'crypto';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import _ from 'lodash';
import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/scrapping-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import {
assignTransferProtocolMeta, RPCHost, RPCReflection,
AssertionFailureError, ParamValidationError,
RawString,
ApplicationError,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import { Defer } from 'civkit/defer';
import { retryWith } from 'civkit/decorators';
import { CONTENT_FORMAT, CrawlerOptions, CrawlerOptionsHeaderOnly, ENGINE_TYPE } from '../dto/crawler-options';
import { Crawled } from '../db/crawled';
import { DomainBlockade } from '../db/domain-blockade';
import { DomainProfile } from '../db/domain-profile';
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { JSDomControl } from '../services/jsdom';
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
import { CurlControl } from '../services/curl';
import { LmControl } from '../services/lm';
import { tryDecodeURIComponent } from '../utils/misc';
import { CFBrowserRendering } from '../services/cf-browser-rendering';
import { GlobalLogger } from '../services/logger';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import { AsyncLocalContext } from '../services/async-context';
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
import { BudgetExceededError, InsufficientBalanceError, SecurityCompromiseError } from '../services/errors';
import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { ProxyProvider } from '../shared/services/proxy-provider';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { RobotsTxtService } from '../services/robots-text';
import { ServiceBadAttemptError } from '../shared/lib/errors';
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted';
@ -33,6 +48,8 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
removeSelector?: string | string[];
keepImgDataUrl?: boolean;
engine?: string;
allocProxy?: string;
private?: boolean;
}
const indexProto = {
@ -56,16 +73,18 @@ export class CrawlerHost extends RPCHost {
domainProfileRetentionMs = 1000 * 3600 * 24 * 30;
constructor(
protected globalLogger: Logger,
protected globalLogger: GlobalLogger,
protected puppeteerControl: PuppeteerControl,
protected curlControl: CurlControl,
protected cfBrowserRendering: CFBrowserRendering,
protected proxyProvider: ProxyProvider,
protected lmControl: LmControl,
protected jsdomControl: JSDomControl,
protected snapshotFormatter: SnapshotFormatter,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncContext,
protected fbHealthCheck: FirebaseRoundTripChecker,
protected threadLocal: AsyncLocalContext,
protected robotsTxtService: RobotsTxtService,
) {
super(...arguments);
@ -73,7 +92,7 @@ export class CrawlerHost extends RPCHost {
if (!snapshot.title?.trim() && !snapshot.pdfs?.length) {
return;
}
if (options.cookies?.length) {
if (options.cookies?.length || options.private) {
// Potential privacy issue, dont cache if cookies are used
return;
}
@ -84,9 +103,14 @@ export class CrawlerHost extends RPCHost {
if (options.locale) {
Reflect.set(snapshot, 'locale', options.locale);
}
await this.setToCache(options.url, snapshot);
await this.exploreDirectEngine(snapshot).catch(() => undefined);
const analyzed = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);
if (analyzed.tokens < 200) {
// Does not contain enough content
return;
}
await this.setToCache(options.url, snapshot);
});
puppeteerControl.on('abuse', async (abuseEvent: { url: URL; reason: string, sn: number; }) => {
@ -108,12 +132,19 @@ export class CrawlerHost extends RPCHost {
override async init() {
await this.dependencyReady();
this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, ''));
this.emit('ready');
}
getIndex(user?: JinaEmbeddingsTokenAccount) {
async getIndex(auth?: JinaEmbeddingsAuthDTO) {
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
// Object.assign(indexObject, {
// usage1: `${ctx.origin}/YOUR_URL`,
// usage2: `${ctx.origin}/search/YOUR_SEARCH_QUERY`,
// homepage: 'https://jina.ai/reader',
// sourceCode: 'https://github.com/jina-ai/reader',
// });
Object.assign(indexObject, {
usage1: 'https://r.jina.ai/YOUR_URL',
usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
@ -121,71 +152,83 @@ export class CrawlerHost extends RPCHost {
sourceCode: 'https://github.com/jina-ai/reader',
});
if (user) {
await auth?.solveUID();
if (auth && auth.user) {
indexObject[''] = undefined;
indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
indexObject.balanceLeft = user.wallet.total_balance;
indexObject.authenticatedAs = `${auth.user.user_id} (${auth.user.full_name})`;
indexObject.balanceLeft = auth.user.wallet.total_balance;
}
return indexObject;
}
@CloudHTTPv2({
name: 'crawl2',
runtime: {
memory: '4GiB',
timeoutSeconds: 300,
concurrency: 22,
@Method({
name: 'getIndex',
description: 'Index of the service',
proto: {
http: {
action: 'get',
path: '/',
}
},
tags: ['Crawler'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
tags: ['misc', 'crawl'],
returnType: [String, Object],
})
@CloudHTTPv2({
runtime: {
memory: '4GiB',
cpu: 2,
timeoutSeconds: 300,
concurrency: 10,
maxInstances: 1000,
minInstances: 1,
async getIndexCtrl(@Ctx() ctx: Context, @Param({ required: false }) auth?: JinaEmbeddingsAuthDTO) {
const indexObject = await this.getIndex(auth);
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
return indexObject;
}
return assignTransferProtocolMeta(`${indexObject}`,
{ contentType: 'text/plain; charset=utf-8', envelope: null }
);
}
@Method({
name: 'crawlByPostingToIndex',
description: 'Crawl any url into markdown',
proto: {
http: {
action: 'POST',
path: '/',
}
},
tags: ['Crawler'],
httpMethod: ['get', 'post'],
tags: ['crawl'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
})
@Method({
description: 'Crawl any url into markdown',
proto: {
http: {
action: ['GET', 'POST'],
path: '::url',
}
},
tags: ['crawl'],
returnType: [String, OutputServerEventStream, RawString],
})
async crawl(
@RPCReflect() rpcReflect: RPCReflection,
@Ctx() ctx: {
req: Request,
res: Response,
},
@Ctx() ctx: Context,
auth: JinaEmbeddingsAuthDTO,
crawlerOptionsHeaderOnly: CrawlerOptionsHeaderOnly,
crawlerOptionsParamsAllowed: CrawlerOptions,
) {
const uid = await auth.solveUID();
let chargeAmount = 0;
const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
const crawlerOptions = ctx.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed;
// Note req.url in express is actually unparsed `path`, e.g. `/some-path?abc`. Instead of a real url.
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.req.url), crawlerOptions);
const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.path), crawlerOptions);
if (!targetUrl) {
const latestUser = uid ? await auth.assertUser() : undefined;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
return this.getIndex(latestUser);
}
return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
{ contentType: 'text/plain', envelope: null }
);
return await this.getIndex(auth);
}
// Prevent circular crawling
this.puppeteerControl.circuitBreakerHosts.add(
ctx.req.hostname.toLowerCase()
ctx.hostname.toLowerCase()
);
if (uid) {
@ -222,8 +265,8 @@ export class CrawlerHost extends RPCHost {
apiRoll.chargeAmount = chargeAmount;
}
});
} else if (ctx.req.ip) {
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, [rpcReflect.name.toUpperCase()],
} else if (ctx.ip) {
const apiRoll = await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.ip, [rpcReflect.name.toUpperCase()],
[
// 20 requests per minute
new Date(Date.now() - 60 * 1000), 20
@ -254,9 +297,12 @@ export class CrawlerHost extends RPCHost {
}
}
if (crawlerOptions.robotsTxt) {
await this.robotsTxtService.assertAccessAllowed(targetUrl, crawlerOptions.robotsTxt);
}
const crawlOpts = await this.configure(crawlerOptions);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream);
@ -265,8 +311,11 @@ export class CrawlerHost extends RPCHost {
if (!scrapped) {
continue;
}
if (rpcReflect.signal.aborted) {
break;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -293,17 +342,20 @@ export class CrawlerHost extends RPCHost {
}
let lastScrapped;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (rpcReflect.signal.aborted) {
break;
}
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
@ -324,7 +376,7 @@ export class CrawlerHost extends RPCHost {
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -342,16 +394,18 @@ export class CrawlerHost extends RPCHost {
for await (const scrapped of this.iterSnapshots(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (rpcReflect.signal.aborted) {
break;
}
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
if (crawlerOptions.waitForSelector || !scrapped || await this.snapshotNotGoodEnough(scrapped)) {
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -370,7 +424,7 @@ export class CrawlerHost extends RPCHost {
);
}
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
}
if (!lastScrapped) {
@ -380,7 +434,7 @@ export class CrawlerHost extends RPCHost {
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -399,7 +453,7 @@ export class CrawlerHost extends RPCHost {
);
}
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain; charset=utf-8', envelope: null });
}
@ -419,7 +473,7 @@ export class CrawlerHost extends RPCHost {
}
let result: URL;
const normalizeUrl = (await pNormalizeUrl).default;
const normalizeUrl = require('@esm2cjs/normalize-url').default;
try {
result = new URL(
normalizeUrl(
@ -638,7 +692,25 @@ export class CrawlerHost extends RPCHost {
}
if (crawlOpts?.engine === ENGINE_TYPE.DIRECT) {
yield this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
const sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
await this.sideLoadWithAllocatedProxy(urlToCrawl, crawlOpts) :
await this.curlControl.sideLoad(urlToCrawl, crawlOpts);
if (!sideLoaded.file) {
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
}
const draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
yield this.jsdomControl.narrowSnapshot(draftSnapshot, crawlOpts);
return;
}
if (crawlOpts?.engine === ENGINE_TYPE.CF_BROWSER_RENDERING) {
const html = await this.cfBrowserRendering.fetchContent(urlToCrawl.href);
const snapshot = {
href: urlToCrawl.toString(),
html,
title: '',
text: '',
} as PageSnapshot;
yield this.jsdomControl.narrowSnapshot(snapshot, crawlOpts);
return;
}
@ -653,26 +725,68 @@ export class CrawlerHost extends RPCHost {
(!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable))) &&
(_.get(cache.snapshot, 'locale') === crawlOpts?.locale)
) {
if (cache.snapshot) {
cache.snapshot.isFromCache = true;
}
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
return;
}
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
const { digest } = this.getDomainProfileUrlDigest(urlToCrawl);
const domainProfile = await DomainProfile.fromFirestore(digest);
if (domainProfile?.engine === ENGINE_TYPE.DIRECT) {
try {
const snapshot = await this.curlControl.urlToSnapshot(urlToCrawl, crawlOpts);
try {
const altOpts = { ...crawlOpts };
let sideLoaded = (crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) ?
await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts) :
await this.curlControl.sideLoad(urlToCrawl, altOpts).catch((err) => {
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
// Expect downstream code to "break" here if it's satisfied with the direct engine
yield snapshot;
if (crawlOpts?.engine === ENGINE_TYPE.AUTO) {
return;
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
return Promise.reject(err);
}
} catch (err: any) {
this.logger.warn(`Failed to scrap ${urlToCrawl} with direct engine`, { err: marshalErrorLike(err) });
return this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
});
if (!sideLoaded.file) {
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
}
let draftSnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, sideLoaded.file, sideLoaded.contentType, sideLoaded.fileName);
if (sideLoaded.status == 200 && !sideLoaded.contentType.startsWith('text/html')) {
yield draftSnapshot;
return;
}
let analyzed = await this.jsdomControl.analyzeHTMLTextLite(draftSnapshot.html);
draftSnapshot.title ??= analyzed.title;
let fallbackProxyIsUsed = false;
if ((!crawlOpts?.allocProxy && !crawlOpts?.proxyUrl) && (analyzed.tokens < 42 || sideLoaded.status !== 200)) {
const proxyLoaded = await this.sideLoadWithAllocatedProxy(urlToCrawl, altOpts);
if (!proxyLoaded.file) {
throw new ServiceBadAttemptError(`Remote server did not return a body: ${urlToCrawl}`);
}
const proxySnapshot = await this.snapshotFormatter.createSnapshotFromFile(urlToCrawl, proxyLoaded.file, proxyLoaded.contentType, proxyLoaded.fileName);
analyzed = await this.jsdomControl.analyzeHTMLTextLite(proxySnapshot.html);
if (proxyLoaded.status === 200 || analyzed.tokens >= 200) {
draftSnapshot = proxySnapshot;
sideLoaded = proxyLoaded;
fallbackProxyIsUsed = true;
}
}
if (crawlOpts?.engine !== ENGINE_TYPE.BROWSER && crawlerOpts?.browserIsNotRequired()) {
yield draftSnapshot;
}
if (crawlOpts && (sideLoaded.status === 200 || analyzed.tokens >= 200 || crawlOpts.allocProxy)) {
this.logger.info(`Side load seems to work, applying to crawler.`, { url: urlToCrawl.href });
crawlOpts.sideLoad ??= sideLoaded.sideLoadOpts;
if (fallbackProxyIsUsed) {
this.logger.info(`Proxy seems to salvage the page`, { url: urlToCrawl.href });
}
}
} catch (err: any) {
this.logger.warn(`Failed to side load ${urlToCrawl.origin}`, { err: marshalErrorLike(err), href: urlToCrawl.href });
if (err instanceof ApplicationError && !(err instanceof ServiceBadAttemptError)) {
throw err;
}
}
@ -782,6 +896,8 @@ export class CrawlerHost extends RPCHost {
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
this.threadLocal.set('keepImgDataUrl', opts.keepImgDataUrl);
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
this.threadLocal.set('withIframe', opts.withIframe);
this.threadLocal.set('withShadowDom', opts.withShadowDom);
this.threadLocal.set('userAgent', opts.userAgent);
if (opts.timeout) {
this.threadLocal.set('timeout', opts.timeout * 1000);
@ -804,6 +920,9 @@ export class CrawlerHost extends RPCHost {
referer: opts.referer,
viewport: opts.viewport,
engine: opts.engine,
allocProxy: opts.proxy?.endsWith('+') ? opts.proxy.slice(0, -1) : opts.proxy,
proxyResources: (opts.proxyUrl || opts.proxy?.endsWith('+')) ? true : false,
private: Boolean(opts.doNotTrack),
};
if (opts.locale) {
@ -842,14 +961,15 @@ export class CrawlerHost extends RPCHost {
return crawlOpts;
}
formatSnapshot(
protected async formatSnapshot(
crawlerOptions: CrawlerOptions,
snapshot: PageSnapshot & {
screenshotUrl?: string;
pageshotUrl?: string;
},
nominalUrl?: URL,
urlValidMs?: number
urlValidMs?: number,
scrappingOptions?: ScrappingOptions
) {
const presumedURL = crawlerOptions.base === 'final' ? new URL(snapshot.href) : nominalUrl;
@ -870,7 +990,29 @@ export class CrawlerHost extends RPCHost {
return output;
}
return this.snapshotFormatter.formatSnapshot(respondWith, snapshot, presumedURL, urlValidMs);
return this.formatSnapshotWithPDFSideLoad(respondWith, snapshot, presumedURL, urlValidMs, scrappingOptions);
}
/**
 * Formats a snapshot, first trying to replace its leading remote PDF link
 * with a `file:` URL pointing at a locally available copy.
 *
 * Works on a deep copy, so the caller's snapshot object is never mutated.
 * Only the first entry of `pdfs` is considered, and only when it starts
 * with `http`. A body already present in `scrappingOptions.sideLoad.impersonate`
 * for that URL is preferred; otherwise the PDF is fetched through
 * `curlControl.sideLoad`. If that fetch yields no file, the original URL is kept.
 *
 * @param mode - Output mode passed through to the snapshot formatter.
 * @param snapshot - Snapshot to format.
 * @param nominalUrl - Optional URL passed through to the snapshot formatter.
 * @param urlValidMs - Optional validity window passed through to the snapshot formatter.
 * @param scrappingOptions - Optional scrapping options; supplies side-loaded bodies and curl settings.
 * @returns The result of `snapshotFormatter.formatSnapshot` for the (possibly rewritten) copy.
 */
async formatSnapshotWithPDFSideLoad(mode: string, snapshot: PageSnapshot, nominalUrl?: URL, urlValidMs?: number, scrappingOptions?: ScrappingOptions) {
    const draft = _.cloneDeep(snapshot);
    const render = () => this.snapshotFormatter.formatSnapshot(mode, draft, nominalUrl, urlValidMs);

    if (!draft.pdfs?.length) {
        return render();
    }

    const remotePdf = draft.pdfs[0];
    if (!remotePdf.startsWith('http')) {
        return render();
    }

    // Reuse a body that was already side-loaded for this exact URL, if any.
    const preloaded = scrappingOptions?.sideLoad?.impersonate[remotePdf];
    if (preloaded?.body) {
        draft.pdfs[0] = pathToFileURL(await preloaded.body.filePath).href;

        return render();
    }

    // Otherwise download it now; keep the remote URL when no file comes back.
    const fetched = await this.curlControl.sideLoad(new URL(remotePdf), scrappingOptions);
    if (fetched.file) {
        draft.pdfs[0] = pathToFileURL(await fetched.file.filePath).href;
    }

    return render();
}
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
@ -967,6 +1109,26 @@ export class CrawlerHost extends RPCHost {
return;
}
/**
 * Decides whether a snapshot is too thin to be returned yet.
 *
 * Rules, checked in order:
 *  1. Any PDF entry makes the snapshot acceptable.
 *  2. A missing title makes it not good enough.
 *  3. Parsed content makes it acceptable.
 *  4. With HTML available, it is not good enough when the token count
 *     reported by `jsdomControl.analyzeHTMLTextLite` is below 200.
 *  5. Otherwise it is acceptable.
 *
 * @param snapshot - Snapshot to evaluate.
 * @returns `true` when the snapshot should NOT be used yet, `false` otherwise.
 */
async snapshotNotGoodEnough(snapshot: PageSnapshot) {
    if (snapshot.pdfs?.length) {
        return false;
    }
    if (!snapshot.title) {
        return true;
    }
    // Parsed content is sufficient; without HTML there is nothing left to measure.
    if (snapshot.parsed?.content || !snapshot.html) {
        return false;
    }

    const { tokens } = await this.jsdomControl.analyzeHTMLTextLite(snapshot.html);

    return tokens < 200;
}
getDomainProfileUrlDigest(url: URL) {
const pathname = url.pathname;
const pathVec = pathname.split('/');
@ -981,4 +1143,29 @@ export class CrawlerHost extends RPCHost {
path: finalPath,
};
}
/**
 * Side-loads a URL through a freshly allocated proxy.
 *
 * A proxy is obtained from `proxyProvider.alloc(opts?.allocProxy)` and used
 * for a single `curlControl.sideLoad` call. When the caller asked for proxy
 * allocation (`opts.allocProxy` is truthy), the chosen proxy is also written
 * back to `opts.proxyUrl` unless one is already set.
 *
 * The retry predicate asks for another attempt on `ServiceBadAttemptError`,
 * gives up on any other `ApplicationError`, and returns `undefined` for
 * everything else.
 * NOTE(review): the `3` is presumably the maximum number of attempts — confirm against `retryWith`.
 *
 * @param url - Target URL to load.
 * @param opts - Optional scrapping options; may be mutated (`proxyUrl`) as described above.
 * @returns The side-load result spread together with the `proxy` that was used.
 */
@retryWith((err) => {
    if (err instanceof ServiceBadAttemptError) {
        // Keep trying
        return true;
    }

    // Quit with this error when it is an application error; otherwise leave the decision open.
    return err instanceof ApplicationError ? false : undefined;
}, 3)
async sideLoadWithAllocatedProxy(url: URL, opts?: ExtraScrappingOptions) {
    const proxy = await this.proxyProvider.alloc(opts?.allocProxy);
    const proxiedOpts = {
        ...opts,
        proxyUrl: proxy.href,
    };
    const loaded = await this.curlControl.sideLoad(url, proxiedOpts);

    if (opts?.allocProxy) {
        opts.proxyUrl ??= proxy.href;
    }

    return { ...loaded, proxy };
}
}

View File

@ -1,21 +1,25 @@
import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
AssertionFailureError,
objHashMd5B64Of,
assignMeta,
} from 'civkit';
import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import {
assignTransferProtocolMeta, RPCHost, RPCReflection, AssertionFailureError, assignMeta, RawString,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import { objHashMd5B64Of } from 'civkit/hash';
import _ from 'lodash';
import { Request, Response } from 'express';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
import { SerperSearchResult } from '../db/searched';
import { CrawlerOptions } from '../dto/scrapping-options';
import { CrawlerOptions } from '../dto/crawler-options';
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
import { GoogleSearchExplicitOperatorsDto, SerperSearchService } from '../services/serper-search';
import { GlobalLogger } from '../services/logger';
import { AsyncLocalContext } from '../services/async-context';
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { InsufficientBalanceError } from '../services/errors';
import { SerperSearchQueryParams, SerperSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES);
@ -33,9 +37,9 @@ export class SearcherHost extends RPCHost {
targetResultCount = 5;
constructor(
protected globalLogger: Logger,
protected globalLogger: GlobalLogger,
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncContext,
protected threadLocal: AsyncLocalContext,
protected serperSearchService: SerperSearchService,
protected crawler: CrawlerHost,
protected snapshotFormatter: SnapshotFormatter,
@ -49,39 +53,30 @@ export class SearcherHost extends RPCHost {
this.emit('ready');
}
@CloudHTTPv2({
name: 'search2',
runtime: {
cpu: 4,
memory: '4GiB',
timeoutSeconds: 300,
concurrency: 4,
@Method({
name: 'searchIndex',
ext: {
http: {
action: ['get', 'post'],
path: '/search'
}
},
tags: ['Searcher'],
httpMethod: ['get', 'post'],
tags: ['search'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
})
@CloudHTTPv2({
runtime: {
cpu: 4,
memory: '16GiB',
timeoutSeconds: 300,
concurrency: 4,
maxInstances: 200,
minInstances: 1,
@Method({
ext: {
http: {
action: ['get', 'post'],
path: '::q'
}
},
tags: ['Searcher'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
tags: ['search'],
returnType: [String, OutputServerEventStream, RawString],
})
async search(
@RPCReflect() rpcReflect: RPCReflection,
@Ctx() ctx: {
req: Request,
res: Response,
},
@Ctx() ctx: Context,
auth: JinaEmbeddingsAuthDTO,
crawlerOptions: CrawlerOptions,
searchExplicitOperators: GoogleSearchExplicitOperatorsDto,
@ -102,19 +97,17 @@ export class SearcherHost extends RPCHost {
const uid = await auth.solveUID();
// Return content by default
const respondWith = ctx.req.get('X-Respond-With') ?? 'content';
const crawlWithoutContent = respondWith.includes('no-content');
const withFavicon = ctx.req.get('X-With-Favicons') === 'true';
const crawlWithoutContent = crawlerOptions.respondWith.includes('no-content');
const withFavicon = Boolean(ctx.get('X-With-Favicons'));
let chargeAmount = 0;
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
if (!noSlashPath && !q) {
const latestUser = uid ? await auth.assertUser() : undefined;
const index = this.crawler.getIndex(latestUser);
const index = await this.crawler.getIndex(auth);
if (!uid) {
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
}
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
return index;
}
@ -189,7 +182,7 @@ export class SearcherHost extends RPCHost {
chargeAmount = 10000;
}
this.assignTokenUsage(lastScrapped, chargeAmount, crawlWithoutContent);
if ((!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) || count === 0) {
if ((!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) || count === 0) {
return lastScrapped;
}
return assignTransferProtocolMeta(`${lastScrapped}`, { contentType: 'text/plain', envelope: null });
@ -201,7 +194,7 @@ export class SearcherHost extends RPCHost {
withFavicon
);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream);
@ -210,6 +203,9 @@ export class SearcherHost extends RPCHost {
if (!scrapped) {
continue;
}
if (rpcReflect.signal.aborted) {
break;
}
chargeAmount = this.assignChargeAmount(scrapped);
sseStream.write({
@ -233,7 +229,7 @@ export class SearcherHost extends RPCHost {
}
let earlyReturn = false;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
const setEarlyReturnTimer = () => {
if (earlyReturnTimer) {
@ -251,6 +247,9 @@ export class SearcherHost extends RPCHost {
for await (const scrapped of it) {
lastScrapped = scrapped;
if (rpcReflect.signal.aborted) {
break;
}
if (_.some(scrapped, (x) => this.pageQualified(x))) {
setEarlyReturnTimer();
}
@ -299,7 +298,9 @@ export class SearcherHost extends RPCHost {
for await (const scrapped of it) {
lastScrapped = scrapped;
if (rpcReflect.signal.aborted) {
break;
}
if (_.some(scrapped, (x) => this.pageQualified(x))) {
setEarlyReturnTimer();
}
@ -367,8 +368,8 @@ export class SearcherHost extends RPCHost {
const dataItems = [
{ key: 'title', label: 'Title' },
{ key: 'url', label: 'URL Source' },
{ key: 'description', label: 'Description'},
]
{ key: 'description', label: 'Description' },
];
if (withContent) {
result.content = ['html', 'text', 'screenshot'].includes(mode) ? undefined : '';
@ -386,7 +387,7 @@ export class SearcherHost extends RPCHost {
result.toString = function () {
const self = this as any;
return dataItems.map((x) => `[${index + 1}] ${x.label}: ${self[x.key]}`).join('\n') + '\n';
}
};
return result;
}));
@ -408,7 +409,6 @@ export class SearcherHost extends RPCHost {
if (!searchResults) {
return;
}
const urls = searchResults.map((x) => new URL(x.link));
const snapshotMap = new WeakMap();
for await (const scrapped of this.crawler.scrapMany(urls, options, crawlerOptions)) {
@ -427,7 +427,7 @@ export class SearcherHost extends RPCHost {
if (snapshotMap.has(x)) {
return snapshotMap.get(x);
}
return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
return this.crawler.formatSnapshotWithPDFSideLoad(mode, x, urls[i], undefined, options).then((r) => {
r.title ??= upstreamSearchResult.title;
r.description = upstreamSearchResult.snippet;
snapshotMap.set(x, r);

View File

@ -1,22 +1,30 @@
import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
AssertionFailureError,
objHashMd5B64Of,
} from 'civkit';
import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, InsufficientBalanceError, Logger, OutputServerEventStream, Param, RPCReflect } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { Request, Response } from 'express';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
import { SearchResult } from '../db/searched';
import {
assignTransferProtocolMeta, RPCHost, RPCReflection,
AssertionFailureError,
RawString,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import { objHashMd5B64Of } from 'civkit/hash';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
import { CrawlerOptions } from '../dto/scrapping-options';
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
import { SearchResult } from '../db/searched';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { CrawlerOptions } from '../dto/crawler-options';
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
import { GlobalLogger } from '../services/logger';
import { AsyncLocalContext } from '../services/async-context';
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
import { InsufficientBalanceError } from '../services/errors';
@singleton()
@ -32,9 +40,9 @@ export class SearcherHost extends RPCHost {
targetResultCount = 5;
constructor(
protected globalLogger: Logger,
protected globalLogger: GlobalLogger,
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncContext,
protected threadLocal: AsyncLocalContext,
protected braveSearchService: BraveSearchService,
protected crawler: CrawlerHost,
protected snapshotFormatter: SnapshotFormatter,
@ -48,39 +56,30 @@ export class SearcherHost extends RPCHost {
this.emit('ready');
}
@CloudHTTPv2({
name: 'search2',
runtime: {
cpu: 4,
memory: '4GiB',
timeoutSeconds: 300,
concurrency: 4,
@Method({
name: 'searchIndex',
ext: {
http: {
action: ['get', 'post'],
path: '/search'
}
},
tags: ['Searcher'],
httpMethod: ['get', 'post'],
tags: ['search'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
})
@CloudHTTPv2({
runtime: {
cpu: 4,
memory: '16GiB',
timeoutSeconds: 300,
concurrency: 4,
maxInstances: 200,
minInstances: 1,
@Method({
ext: {
http: {
action: ['get', 'post'],
path: '::q'
}
},
tags: ['Searcher'],
httpMethod: ['get', 'post'],
returnType: [String, OutputServerEventStream],
exposeRoot: true,
tags: ['search'],
returnType: [String, OutputServerEventStream, RawString],
})
async search(
@RPCReflect() rpcReflect: RPCReflection,
@Ctx() ctx: {
req: Request,
res: Response,
},
@Ctx() ctx: Context,
auth: JinaEmbeddingsAuthDTO,
@Param('count', { default: 5, validate: (v) => v >= 0 && v <= 10 })
count: number,
@ -90,14 +89,13 @@ export class SearcherHost extends RPCHost {
) {
const uid = await auth.solveUID();
let chargeAmount = 0;
const noSlashPath = decodeURIComponent(ctx.req.path).slice(1);
const noSlashPath = decodeURIComponent(ctx.path).slice(1);
if (!noSlashPath && !q) {
const latestUser = uid ? await auth.assertUser() : undefined;
const index = this.crawler.getIndex(latestUser);
const index = await this.crawler.getIndex(auth);
if (!uid) {
index.note = 'Authentication is required to use this endpoint. Please provide a valid API key via Authorization header.';
}
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
return index;
}
@ -160,7 +158,7 @@ export class SearcherHost extends RPCHost {
count,
);
if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) {
if (!ctx.accepts('text/plain') && ctx.accepts('text/event-stream')) {
const sseStream = new OutputServerEventStream();
rpcReflect.return(sseStream);
@ -193,7 +191,7 @@ export class SearcherHost extends RPCHost {
let lastScrapped: any[] | undefined;
let earlyReturn = false;
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
if (!ctx.accepts('text/plain') && (ctx.accepts('text/json') || ctx.accepts('application/json'))) {
let earlyReturnTimer: ReturnType<typeof setTimeout> | undefined;
const setEarlyReturnTimer = () => {
if (earlyReturnTimer) {

View File

@ -14,7 +14,7 @@ import robotsParser from 'robots-parser';
import { DOMParser } from '@xmldom/xmldom';
import { AdaptiveCrawlerOptions } from '../dto/adaptive-crawler-options';
import { CrawlerOptions } from '../dto/scrapping-options';
import { CrawlerOptions } from '../dto/crawler-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { AdaptiveCrawlTask, AdaptiveCrawlTaskStatus } from '../db/adaptive-crawl-task';
import { getFunctions } from 'firebase-admin/functions';

View File

@ -9,7 +9,7 @@ import {
FirebaseStorageBucketControl, Logger, Param, TempFileManager
} from '../shared';
import _ from 'lodash';
import { CrawlerHost } from './crawler';
import { CrawlerHost } from '../api/crawler';
import { Crawled } from '../db/crawled';
import dayjs from 'dayjs';

View File

@ -1,6 +1,6 @@
import { Also, Prop } from 'civkit';
import { FirestoreRecord } from '../shared/lib/firestore';
import { ENGINE_TYPE } from '../dto/scrapping-options';
import { ENGINE_TYPE } from '../dto/crawler-options';
@Also({
dictOf: Object

View File

@ -1,6 +1,6 @@
import { Also, AutoCastable, ParamValidationError, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
import type { Request, Response } from 'express';
import { Cookie, parseString as parseSetCookieString } from 'set-cookie-parser';
import { Context } from '../services/registry';
export enum CONTENT_FORMAT {
CONTENT = 'content',
@ -19,6 +19,7 @@ export enum ENGINE_TYPE {
DIRECT = 'direct',
VLM = 'vlm',
READER_LM = 'readerlm-v2',
CF_BROWSER_RENDERING = 'cf-browser-rendering',
}
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
@ -125,6 +126,11 @@ class Viewport extends AutoCastable {
in: 'header',
schema: { type: 'string' }
},
'X-Proxy': {
description: `Use a proxy server provided by Jina AI.\n\nOptionally specify two-letter country code.`,
in: 'header',
schema: { type: 'string' }
},
'X-Set-Cookie': {
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
`Syntax is the same with standard Set-Cookie`,
@ -297,6 +303,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
proxyUrl?: string;
@Prop()
proxy?: string;
@Prop()
userAgent?: string;
@ -338,15 +347,18 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
jsonSchema?: object;
@Prop()
robotsTxt?: string;
@Prop()
doNotTrack?: number | null;
static override from(input: any) {
const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
req: Request,
res: Response,
} | undefined;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
const customMode = ctx?.req.get('x-respond-with') || ctx?.req.get('x-return-format');
if (customMode !== undefined) {
const customMode = ctx?.get('x-respond-with') || ctx?.get('x-return-format');
if (customMode) {
instance.respondWith = customMode;
}
if (instance.respondWith) {
@ -361,74 +373,74 @@ export class CrawlerOptions extends AutoCastable {
}
}
const locale = ctx?.req.get('x-locale');
if (locale !== undefined) {
const locale = ctx?.get('x-locale');
if (locale) {
instance.locale = locale;
}
const referer = ctx?.req.get('x-referer');
if (referer !== undefined) {
const referer = ctx?.get('x-referer');
if (referer) {
instance.referer = referer;
}
const withGeneratedAlt = ctx?.req.get('x-with-generated-alt');
if (withGeneratedAlt !== undefined) {
const withGeneratedAlt = ctx?.get('x-with-generated-alt');
if (withGeneratedAlt) {
instance.withGeneratedAlt = Boolean(withGeneratedAlt);
}
const withLinksSummary = ctx?.req.get('x-with-links-summary');
if (withLinksSummary !== undefined) {
const withLinksSummary = ctx?.get('x-with-links-summary');
if (withLinksSummary) {
if (withLinksSummary === 'all') {
instance.withLinksSummary = withLinksSummary;
} else {
instance.withLinksSummary = Boolean(withLinksSummary);
}
}
const withImagesSummary = ctx?.req.get('x-with-images-summary');
if (withImagesSummary !== undefined) {
const withImagesSummary = ctx?.get('x-with-images-summary');
if (withImagesSummary) {
instance.withImagesSummary = Boolean(withImagesSummary);
}
const retainImages = ctx?.req.get('x-retain-images');
const retainImages = ctx?.get('x-retain-images');
if (retainImages && IMAGE_RETENTION_MODE_VALUES.has(retainImages)) {
instance.retainImages = retainImages as any;
}
if (instance.withGeneratedAlt) {
instance.retainImages = 'all_p';
}
const noCache = ctx?.req.get('x-no-cache');
if (noCache !== undefined) {
const noCache = ctx?.get('x-no-cache');
if (noCache) {
instance.noCache = Boolean(noCache);
}
if (instance.noCache && instance.cacheTolerance === undefined) {
instance.cacheTolerance = 0;
}
let cacheTolerance = parseInt(ctx?.req.get('x-cache-tolerance') || '');
let cacheTolerance = parseInt(ctx?.get('x-cache-tolerance') || '');
if (!isNaN(cacheTolerance)) {
instance.cacheTolerance = cacheTolerance;
}
const noGfm = ctx?.req.get('x-no-gfm');
const noGfm = ctx?.get('x-no-gfm');
if (noGfm) {
instance.noGfm = noGfm === 'table' ? noGfm : Boolean(noGfm);
}
let timeoutSeconds = parseInt(ctx?.req.get('x-timeout') || '');
let timeoutSeconds = parseInt(ctx?.get('x-timeout') || '');
if (!isNaN(timeoutSeconds) && timeoutSeconds > 0) {
instance.timeout = timeoutSeconds <= 180 ? timeoutSeconds : 180;
} else if (ctx?.req.get('x-timeout')) {
} else if (ctx?.get('x-timeout')) {
instance.timeout = null;
}
const removeSelector = ctx?.req.get('x-remove-selector')?.split(', ');
instance.removeSelector ??= removeSelector;
const targetSelector = ctx?.req.get('x-target-selector')?.split(', ');
instance.targetSelector ??= targetSelector;
const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
const removeSelector = ctx?.get('x-remove-selector')?.split(', ').filter(Boolean);
instance.removeSelector ??= removeSelector?.length ? removeSelector : undefined;
const targetSelector = ctx?.get('x-target-selector')?.split(', ').filter(Boolean);
instance.targetSelector ??= targetSelector?.length ? targetSelector : undefined;
const waitForSelector = ctx?.get('x-wait-for-selector')?.split(', ').filter(Boolean);
instance.waitForSelector ??= (waitForSelector?.length ? waitForSelector : undefined) || instance.targetSelector;
instance.targetSelector = filterSelector(instance.targetSelector);
const overrideUserAgent = ctx?.req.get('x-user-agent');
const overrideUserAgent = ctx?.get('x-user-agent') || undefined;
instance.userAgent ??= overrideUserAgent;
const engine = ctx?.req.get('x-engine');
const engine = ctx?.get('x-engine');
if (engine) {
instance.engine = engine;
}
@ -443,18 +455,18 @@ export class CrawlerOptions extends AutoCastable {
instance.respondWith = CONTENT_FORMAT.READER_LM;
}
const keepImgDataUrl = ctx?.req.get('x-keep-img-data-url');
if (keepImgDataUrl !== undefined) {
const keepImgDataUrl = ctx?.get('x-keep-img-data-url');
if (keepImgDataUrl) {
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
}
const withIframe = ctx?.req.get('x-with-iframe');
if (withIframe !== undefined) {
const withIframe = ctx?.get('x-with-iframe');
if (withIframe) {
instance.withIframe = withIframe.toLowerCase() === 'quoted' ? 'quoted' : Boolean(withIframe);
}
if (instance.withIframe) {
instance.timeout ??= null;
}
const withShadowDom = ctx?.req.get('x-with-shadow-dom');
const withShadowDom = ctx?.get('x-with-shadow-dom');
if (withShadowDom) {
instance.withShadowDom = Boolean(withShadowDom);
}
@ -463,7 +475,7 @@ export class CrawlerOptions extends AutoCastable {
}
const cookies: Cookie[] = [];
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
const setCookieHeaders = (ctx?.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[])).filter(Boolean);
if (Array.isArray(setCookieHeaders)) {
for (const setCookie of setCookieHeaders) {
cookies.push({
@ -477,21 +489,24 @@ export class CrawlerOptions extends AutoCastable {
}
instance.setCookies = cookies;
const proxyUrl = ctx?.req.get('x-proxy-url');
instance.proxyUrl ??= proxyUrl;
const proxyUrl = ctx?.get('x-proxy-url');
instance.proxyUrl ??= proxyUrl || undefined;
const proxy = ctx?.get('x-proxy');
instance.proxy ??= proxy || undefined;
const robotsTxt = ctx?.get('x-robots-txt');
instance.robotsTxt ??= robotsTxt || undefined;
if (instance.cacheTolerance) {
instance.cacheTolerance = instance.cacheTolerance * 1000;
}
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
const tokenBudget = ctx?.get('x-token-budget');
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
const baseMode = ctx?.req.get('x-base') || undefined;
const baseMode = ctx?.get('x-base');
if (baseMode) {
instance.base = baseMode as any;
}
const dnt = ctx?.get('dnt');
instance.doNotTrack ??= (parseInt(dnt || '') || null);
if (instance.cacheTolerance) {
instance.cacheTolerance = instance.cacheTolerance * 1000;
}

View File

@ -0,0 +1,216 @@
import _ from 'lodash';
import {
Also, AuthenticationFailedError, AuthenticationRequiredError,
DownstreamServiceFailureError, RPC_CALL_ENVIRONMENT,
AutoCastable,
} from 'civkit/civ-rpc';
import { htmlEscape } from 'civkit/escape';
import { marshalErrorLike } from 'civkit/lang';
import type { Context } from 'koa';
import logger from '../services/logger';
import { InjectProperty } from '../services/registry';
import { AsyncLocalContext } from '../services/async-context';
import envConfig from '../shared/services/secrets';
import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
// Logger shared by every auth DTO created from this module.
const authDtoLogger = logger.child({ service: 'JinaAuthDTO' });

// One dashboard HTTP client, created at module load and reused by all DTO instances.
const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY);

/**
 * Request-scoped DTO that carries the caller's bearer token and resolves it
 * into a `JinaEmbeddingsTokenAccount`.
 *
 * The token is read from the `Authorization` header of the RPC call
 * environment, falling back to an `_token` field on the input.
 */
@Also({
    openapi: {
        operation: {
            parameters: {
                'Authorization': {
                    description: htmlEscape`Jina Token for authentication.\n\n` +
                        htmlEscape`- Member of <JinaEmbeddingsAuthDTO>\n\n` +
                        `- Authorization: Bearer {YOUR_JINA_TOKEN}`
                    ,
                    in: 'header',
                    schema: {
                        anyOf: [
                            { type: 'string', format: 'token' }
                        ]
                    }
                }
            }
        }
    }
})
export class JinaEmbeddingsAuthDTO extends AutoCastable {
    uid?: string;
    bearerToken?: string;
    user?: JinaEmbeddingsTokenAccount;

    @InjectProperty(AsyncLocalContext)
    ctxMgr!: AsyncLocalContext;

    jinaEmbeddingsDashboard = THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT;

    /**
     * Builds the DTO and extracts the bearer token.
     *
     * A header of the form `<scheme> <token>` yields `<token>`; a header with
     * no second space-separated part is used verbatim.
     */
    static override from(input: any) {
        const dto = super.from(input) as JinaEmbeddingsAuthDTO;
        const ctx = input[RPC_CALL_ENVIRONMENT] as Context;

        const authorization = ctx ? ctx.get('authorization') : undefined;
        if (authorization) {
            dto.bearerToken = authorization.split(' ')[1] || authorization;
        }

        if (!dto.bearerToken && input._token) {
            dto.bearerToken = input._token;
        }

        return dto;
    }

    /** Stores `account` as the current user, mirrors its `user_id` into `uid`, and returns it. */
    private rememberAccount(account: any) {
        this.user = account;
        this.uid = this.user?.user_id;

        return account;
    }

    /**
     * Resolves the account behind the bearer token.
     *
     * A stored account younger than 180 seconds is returned directly unless
     * `ignoreCache` is truthy. Otherwise the token is validated against the
     * dashboard and the merged result is saved back. If validation fails with
     * anything other than a 401, a previously stored account (of any age) is
     * used as a fallback.
     *
     * @throws AuthenticationRequiredError when no bearer token is present.
     * @throws AuthenticationFailedError when the dashboard answers 401.
     * @throws DownstreamServiceFailureError when validation fails and nothing is stored.
     */
    async getBrief(ignoreCache?: boolean | string) {
        if (!this.bearerToken) {
            throw new AuthenticationRequiredError({
                message: 'Jina API key is required to authenticate. Please get one from https://jina.ai'
            });
        }

        let storedAccount;
        try {
            storedAccount = await JinaEmbeddingsTokenAccount.fromFirestore(this.bearerToken);
        } catch {
            // FireStore does not accept arbitrary strings as input and may throw;
            // such a failure is deliberately treated as "nothing stored".
        }

        const ageMs = storedAccount?.lastSyncedAt ? Date.now() - storedAccount.lastSyncedAt.getTime() : Infinity;
        if (storedAccount && !ignoreCache && ageMs < 180_000) {
            return this.rememberAccount(storedAccount);
        }

        try {
            const brief = (await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken)).data;
            const refreshedAccount = JinaEmbeddingsTokenAccount.from({
                ...storedAccount, ...brief, _id: this.bearerToken,
                lastSyncedAt: new Date()
            });
            await JinaEmbeddingsTokenAccount.save(refreshedAccount.degradeForFireStore(), undefined, { merge: true });

            return this.rememberAccount(refreshedAccount);
        } catch (err: any) {
            authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) });

            if (err?.status === 401) {
                throw new AuthenticationFailedError({
                    message: 'Invalid API key, please get a new one from https://jina.ai'
                });
            }
            if (!storedAccount) {
                throw new DownstreamServiceFailureError(`Failed to authenticate: ${err}`);
            }

            // Validation is unavailable: fall back to whatever was stored.
            return this.rememberAccount(storedAccount);
        }
    }

    /**
     * Reports `tokenCount` consumed tokens to the dashboard.
     *
     * The in-memory balance is decremented up front and restored if the report
     * fails. On success the stored balance is decremented as well, without
     * waiting for that write. Failures are logged and never thrown; the method
     * then resolves to `undefined`.
     */
    async reportUsage(tokenCount: number, mdl: string, endpoint: string = '/encode') {
        const user = await this.assertUser();
        const uid = user.user_id;
        user.wallet.total_balance -= tokenCount;

        const pendingReport = this.jinaEmbeddingsDashboard.reportUsage(this.bearerToken!, {
            model_name: mdl,
            api_endpoint: endpoint,
            consumer: {
                id: uid,
                user_id: uid,
            },
            usage: {
                total_tokens: tokenCount
            },
            labels: {
                model_name: mdl
            }
        });

        try {
            const receipt = await pendingReport;

            // Fire-and-forget: keep the stored balance in step with the dashboard.
            JinaEmbeddingsTokenAccount.COLLECTION.doc(this.bearerToken!)
                .update({ 'wallet.total_balance': JinaEmbeddingsTokenAccount.OPS.increment(-tokenCount) })
                .catch((err) => {
                    authDtoLogger.warn(`Failed to update cache for ${uid}: ${err}`, { err: marshalErrorLike(err) });
                });

            return receipt;
        } catch (err: any) {
            user.wallet.total_balance += tokenCount;
            authDtoLogger.warn(`Failed to report usage for ${uid}: ${err}`, { err: marshalErrorLike(err) });

            return undefined;
        }
    }

    /**
     * Returns the uid, resolving it through `getBrief()` when only a token is
     * known, and publishes it on the async-local context. Returns `undefined`
     * when there is neither a uid nor a token.
     */
    async solveUID() {
        if (!this.uid) {
            if (!this.bearerToken) {
                return undefined;
            }
            await this.getBrief();
        }

        this.ctxMgr.set('uid', this.uid);

        return this.uid;
    }

    /** Like `solveUID()`, but throws `AuthenticationRequiredError` when no uid can be resolved. */
    async assertUID() {
        const uid = await this.solveUID();
        if (uid) {
            return uid;
        }

        throw new AuthenticationRequiredError('Authentication failed');
    }

    /** Returns the resolved account, calling `getBrief()` first if it has not been loaded yet. */
    async assertUser() {
        if (!this.user) {
            await this.getBrief();
        }

        return this.user!;
    }

    /**
     * Collects the user's custom rate-limit descriptors for the given tags,
     * keeping only those whose `isEffective()` is truthy.
     * Returns `undefined` when none apply.
     */
    getRateLimits(...tags: string[]) {
        const effective = tags
            .flatMap((tag) => this.user?.customRateLimits?.[tag] || [])
            .filter((desc) => desc.isEffective());

        return effective.length ? effective : undefined;
    }
}

View File

@ -0,0 +1,169 @@
import { TPM, parseJSONText } from 'civkit';
import { Transform, TransformCallback, TransformOptions } from 'stream';
/**
 * Transform stream that turns a Server-Sent Events text stream into event objects.
 *
 * The writable side accepts raw bytes or strings; the readable side is in
 * object mode and emits one `{ event?, data?, id?, retry? }` object per event
 * block (blocks are separated by a blank line). When `data` parses as a JSON
 * object or array it is replaced by the parsed value.
 */
export class InputServerEventStream extends Transform {
    /** Text received so far that has not yet been terminated by a blank line. */
    cache: string[] = [];

    /** Streaming decoder, so multi-byte characters split across chunks stay intact. */
    protected utf8Decoder = new TextDecoder('utf-8');

    constructor(options?: TransformOptions) {
        super({
            ...options,
            readableObjectMode: true
        });
    }

    /**
     * Parses every complete event block currently buffered in `cache` and
     * pushes the resulting objects downstream. The trailing, not yet
     * terminated block is kept in `cache` for the next call.
     */
    decodeRoutine() {
        if (!this.cache.length) {
            return;
        }

        const vecs = this.cache.join('').split(/\r?\n\r?\n/);
        this.cache.length = 0;
        // The last piece has no terminating blank line yet: keep it buffered.
        const lastVec = vecs.pop();
        if (lastVec) {
            this.cache.push(lastVec);
        }

        for (const x of vecs) {
            const event: {
                event?: string;
                data?: unknown;
                id?: string;
                retry?: number;
            } = {};
            const dataLines: string[] = [];

            for (const l of x.split(/\r?\n/)) {
                const columnPos = l.indexOf(':');
                // Position 0 is a comment line; -1 is a line without a field separator.
                if (columnPos <= 0) {
                    continue;
                }

                const key = l.substring(0, columnPos);
                const rawValue = l.substring(columnPos + 1);
                // A single leading space after the colon is not part of the value.
                const value = rawValue.startsWith(' ') ? rawValue.slice(1) : rawValue;

                if (key === 'data') {
                    dataLines.push(value);
                } else if (key === 'retry') {
                    const retry = parseInt(value, 10);
                    if (!Number.isNaN(retry)) {
                        event.retry = retry;
                    }
                } else {
                    Reflect.set(event, key, value);
                }
            }

            if (dataLines.length) {
                // Consecutive data lines form one payload, joined by a newline.
                event.data = dataLines.join('\n');
            }

            if (typeof event.data === 'string' && event.data) {
                const parsed = parseJSONText(event.data);
                if (parsed && typeof parsed === 'object') {
                    event.data = parsed;
                }
            }

            if (Object.keys(event).length) {
                this.push(event);
            }
        }
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null) {
            this.push(null);
            callback();

            return;
        }

        const text = typeof chunk === 'string' ? chunk : this.utf8Decoder.decode(chunk, { stream: true });
        this.cache.push(text);
        this.decodeRoutine();
        callback();
    }

    override _final(callback: (error?: Error | null | undefined) => void): void {
        // Flush whatever the streaming decoder is still holding back.
        const tail = this.utf8Decoder.decode();
        if (tail) {
            this.cache.push(tail);
        }
        this.decodeRoutine();
        callback();
    }
}
/**
 * Transform stream that serializes event objects (or plain strings) into the
 * Server-Sent Events wire format.
 *
 * The writable side is in object mode. Every written chunk becomes one event
 * block terminated by a blank line, and `n` counts the blocks emitted so far.
 */
@TPM({
    contentType: 'text/event-stream',
})
export class OutputServerEventStream extends Transform {
    /** Number of event blocks written so far. */
    n: number = 0;

    constructor(options?: TransformOptions) {
        super({
            ...options, writableObjectMode: true, encoding: 'utf-8'
        });
    }

    /**
     * Encodes one chunk and pushes it downstream.
     *
     * - A string is emitted as one `data:` line per line of text.
     * - An object contributes `event`, `data`, `id` and `retry` fields. String
     *   data is split per line, any other data is JSON-encoded. Falsy payloads
     *   such as `0`, `false` or `''`, and `retry: 0`, are emitted and not dropped.
     * - An object that yields no field at all is JSON-encoded as a whole.
     * - Any other chunk type is ignored.
     */
    encodeRoutine(chunk: {
        event?: string;
        data?: any;
        id?: string;
        retry?: number;
    } | string) {
        const lines: string[] = [];

        if (typeof chunk === 'string') {
            for (const x of chunk.split(/\r?\n/)) {
                lines.push(`data: ${x}`);
            }
        } else if (chunk !== null && typeof chunk === 'object') {
            if (chunk.event) {
                lines.push(`event: ${chunk.event}`);
            }
            // Only an absent payload is skipped; 0, false and '' are real payloads.
            if (chunk.data !== undefined && chunk.data !== null) {
                if (typeof chunk.data === 'string') {
                    for (const x of chunk.data.split(/\r?\n/)) {
                        lines.push(`data: ${x}`);
                    }
                } else {
                    lines.push(`data: ${JSON.stringify(chunk.data)}`);
                }
            }
            if (chunk.id) {
                lines.push(`id: ${chunk.id}`);
            }
            if (chunk.retry || chunk.retry === 0) {
                lines.push(`retry: ${chunk.retry}`);
            }
            if (!lines.length) {
                lines.push(`data: ${JSON.stringify(chunk)}`);
            }
        } else {
            return;
        }

        this.push(lines.join('\n'));
        this.push('\n\n');
        this.n++;
    }

    override _transform(chunk: any, encoding: BufferEncoding, callback: TransformCallback): void {
        if (chunk === null) {
            this.push(null);
            callback();

            return;
        }

        this.encodeRoutine(chunk);
        callback();
    }
}

// Declaration merge: narrows the accepted chunk shapes of `write()`.
export interface OutputServerEventStream extends Transform {
    write(chunk: string | {
        event?: string;
        data?: any;
        id?: string;
        retry?: number;
    }, callback?: (error: Error | null | undefined) => void): boolean;
    write(chunk: any, callback?: (error: Error | null | undefined) => void): boolean;
    write(chunk: any, encoding: BufferEncoding, callback?: (error: Error | null | undefined) => void): boolean;
}

View File

@ -0,0 +1,10 @@
import { GlobalAsyncContext } from 'civkit/async-context';
import { container, singleton } from 'tsyringe';
/** Application-wide singleton flavour of civkit's `GlobalAsyncContext`. */
@singleton()
export class AsyncLocalContext extends GlobalAsyncContext { }

// Resolve the singleton eagerly and also expose it on `process` as `asyncLocalContext`.
const asyncLocalContext = container.resolve(AsyncLocalContext);
Reflect.set(process, 'asyncLocalContext', asyncLocalContext);

export default asyncLocalContext;

View File

@ -0,0 +1,72 @@
import { singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { GlobalLogger } from './logger';
/**
 * Watchdog for the "requests keep arriving but nothing succeeds" situation.
 *
 * Callers report traffic through `incomingRequest()` / `doneWithRequest()` and
 * success through `itWorked()`. When `NODE_ENV` starts with `prod`, `routine()`
 * runs every 15 seconds: it records a strike whenever requests are in flight
 * and the last success is older than `maxDelay * (strikes + 1)`, and emits an
 * `error` event once three strikes have accumulated.
 */
@singleton()
export class BlackHoleDetector extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    /** Timestamp (ms) of the last reported success. */
    lastWorkedTs?: number;
    /** Timestamp (ms) of the last finished request. */
    lastDoneRequestTs?: number;
    /** Timestamp (ms) of the last request that came in. */
    lastIncomingRequestTs?: number;

    /** Base tolerance in ms; multiplied by `strikes + 1` on each check. */
    maxDelay = 1000 * 30;

    concurrentRequests = 0;

    strikes = 0;

    constructor(protected globalLogger: GlobalLogger) {
        super(...arguments);

        const isProduction = process.env.NODE_ENV?.startsWith('prod');
        if (isProduction) {
            // unref() so the timer alone never keeps the process alive.
            const timer = setInterval(() => {
                this.routine();
            }, 1000 * 15);
            timer.unref();
        }
    }

    override async init() {
        await this.dependencyReady();
        this.logger.debug('BlackHoleDetector started');
        this.emit('ready');
    }

    /** One watchdog pass; see the class description for the strike rules. */
    routine() {
        const checkedAt = Date.now();
        const lastWorked = this.lastWorkedTs;
        if (!lastWorked) {
            return;
        }

        const dt = checkedAt - lastWorked;
        const lastIncoming = this.lastIncomingRequestTs;

        const hasRequestsInFlight = this.concurrentRequests > 0;
        const requestedSinceLastSuccess = lastIncoming ? lastIncoming >= lastWorked : false;
        const isOverdue = dt > (this.maxDelay * (this.strikes + 1));

        if (hasRequestsInFlight && requestedSinceLastSuccess && isOverdue) {
            this.logger.warn(`BlackHole detected, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`);
            this.strikes += 1;
        }

        if (this.strikes >= 3) {
            const verdict = `BlackHole detected for ${this.strikes} strikes, last worked: ${Math.ceil(dt / 1000)}s ago, concurrentRequests: ${this.concurrentRequests}`;
            this.logger.error(verdict);
            this.emit('error', new Error(verdict));
        }
    }

    /** Records an arriving request; the first one also seeds `lastWorkedTs`. */
    incomingRequest() {
        this.lastIncomingRequestTs = Date.now();
        this.lastWorkedTs ??= Date.now();
        this.concurrentRequests += 1;
    }

    /** Records that a request has finished, regardless of its outcome. */
    doneWithRequest() {
        this.concurrentRequests -= 1;
        this.lastDoneRequestTs = Date.now();
    }

    /** Records a success and clears all strikes. */
    itWorked() {
        this.lastWorkedTs = Date.now();
        this.strikes = 0;
    }

}

View File

@ -7,6 +7,7 @@ import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
import { AsyncContext } from '../shared';
import { WebSearchOptionalHeaderOptions } from '../shared/3rd-party/brave-types';
import type { Request, Response } from 'express';
import { BlackHoleDetector } from './blackhole-detector';
@singleton()
export class BraveSearchService extends AsyncService {
@ -20,6 +21,7 @@ export class BraveSearchService extends AsyncService {
protected secretExposer: SecretExposer,
protected geoipControl: GeoIPService,
protected threadLocal: AsyncContext,
protected blackHoleDetector: BlackHoleDetector,
) {
super(...arguments);
}
@ -69,6 +71,7 @@ export class BraveSearchService extends AsyncService {
while (maxTries--) {
try {
const r = await this.braveSearchHTTP.webSearch(encoded, { headers: extraHeaders as Record<string, string> });
this.blackHoleDetector.itWorked();
return r.parsed;
} catch (err: any) {

View File

@ -0,0 +1,38 @@
import { container, singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { Logger, SecretExposer } from '../shared';
import { CloudFlareHTTP } from '../shared/3rd-party/cloud-flare';
/**
 * Thin service around the Cloudflare HTTP client used to fetch
 * browser-rendered HTML for a URL.
 */
@singleton()
export class CFBrowserRendering extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });
    client!: CloudFlareHTTP;

    constructor(
        protected globalLogger: Logger,
        protected secretExposer: SecretExposer,
    ) {
        super(...arguments);
    }

    /**
     * Creates the client from the `CLOUD_FLARE_API_KEY` secret, which is read
     * as `<account>:<key>`.
     *
     * A missing secret no longer aborts initialization with
     * "undefined is not iterable": a warning is logged and the client is
     * created without credentials.
     */
    override async init() {
        await this.dependencyReady();

        const credentials = this.secretExposer.CLOUD_FLARE_API_KEY;
        if (!credentials) {
            this.logger.warn('CLOUD_FLARE_API_KEY is not configured, Cloudflare browser rendering requests are expected to fail');
        }

        const [account, key] = credentials?.split(':') ?? [];
        this.client = new CloudFlareHTTP(account, key);

        this.emit('ready');
    }

    /** Fetches the browser-rendered HTML of `url` and returns the `result` field of the parsed response. */
    async fetchContent(url: string) {
        const r = await this.client.fetchBrowserRenderedHTML({ url });

        return r.parsed.result;
    }

}

const instance = container.resolve(CFBrowserRendering);

export default instance;

387
src/services/curl.ts Normal file
View File

@ -0,0 +1,387 @@
import { marshalErrorLike } from 'civkit/lang';
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { Curl, CurlCode, CurlFeature, HeaderInfo } from 'node-libcurl';
import { parseString as parseSetCookieString } from 'set-cookie-parser';
import { ScrappingOptions } from './puppeteer';
import { Logger } from '../shared/services/logger';
import { AssertionFailureError, FancyFile } from 'civkit';
import { ServiceBadAttemptError, TempFileManager } from '../shared';
import { createBrotliDecompress, createInflate, createGunzip } from 'zlib';
import { ZSTDDecompress } from 'simple-zstd';
import _ from 'lodash';
import { Readable } from 'stream';
import { AsyncLocalContext } from './async-context';
/** `ScrappingOptions` extended with the request method and body used by the curl-based fetcher. */
export interface CURLScrappingOptions extends ScrappingOptions {
    /** HTTP method for the request. */
    method?: string;
    /** Request body. */
    body?: string | Buffer;
}
@singleton()
export class CurlControl extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name });
chromeVersion: string = `132`;
safariVersion: string = `537.36`;
platform: string = `Linux`;
ua: string = `Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/${this.safariVersion} (KHTML, like Gecko) Chrome/${this.chromeVersion}.0.0.0 Safari/${this.safariVersion}`;
lifeCycleTrack = new WeakMap();
// Dependencies are declared as parameter properties; all received arguments are
// forwarded unchanged to the AsyncService base constructor.
constructor(
protected globalLogger: Logger,
protected tempFileManager: TempFileManager,
protected asyncLocalContext: AsyncLocalContext,
) {
super(...arguments);
}
/**
 * Waits for dependencies, then sets `platform` from the host OS.
 * The field's initial value is kept for anything other than macOS and Windows.
 */
override async init() {
    await this.dependencyReady();

    switch (process.platform) {
        case 'darwin': {
            this.platform = `macOS`;
            break;
        }
        case 'win32': {
            this.platform = `Windows`;
            break;
        }
        default: {
            break;
        }
    }

    this.emit('ready');
}
/**
 * Adopts `ua` as the user agent and derives the Chrome and AppleWebKit
 * versions used for the impersonation headers from it.
 *
 * Both versions are extracted before anything is assigned, so a user agent
 * that does not look like Chrome leaves the instance untouched.
 *
 * @throws TypeError when `ua` lacks a `Chrome/<n>` or `AppleWebKit/<n>` token.
 */
impersonateChrome(ua: string) {
    const chromeVersion = ua.match(/Chrome\/(\d+)/)?.[1];
    const safariVersion = ua.match(/AppleWebKit\/([\d\.]+)/)?.[1];
    if (!chromeVersion || !safariVersion) {
        throw new TypeError(`Cannot impersonate Chrome with a non-Chrome user agent: ${ua}`);
    }

    this.chromeVersion = chromeVersion;
    this.safariVersion = safariVersion;
    this.ua = ua;
}
/**
 * Installs a Chrome-like request header set on `curl`.
 *
 * The built-in headers come first, in a fixed order. A caller-supplied header
 * whose lower-cased name matches a built-in one replaces its value in place;
 * every other caller header is appended. Caller headers whose value is
 * `undefined` or `null` are skipped and not sent as the text "undefined".
 *
 * @param curl Handle to configure; also returned for chaining.
 * @param headers Optional extra or overriding headers.
 */
curlImpersonateHeader(curl: Curl, headers?: object) {
    const mixinHeaders: Record<string, string> = {
        // Client-hint values are quoted strings, including the first brand and the platform.
        'sec-ch-ua': `"Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': `"${this.platform}"`,
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': this.ua,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Sec-Fetch-Site': 'none',
        'Sec-Fetch-Mode': 'navigate',
        'Sec-Fetch-User': '?1',
        'Sec-Fetch-Dest': 'document',
        'Accept-Encoding': 'gzip, deflate, br, zstd',
        'Accept-Language': 'en-US,en;q=0.9',
    };
    const headersCopy: Record<string, string | undefined> = { ...headers };

    // Lower-cased caller headers override the built-in value but keep its position and casing.
    for (const k of Object.keys(mixinHeaders)) {
        const lowerK = k.toLowerCase();
        const override = headersCopy[lowerK];
        if (override) {
            mixinHeaders[k] = override;
            delete headersCopy[lowerK];
        }
    }

    for (const [k, v] of Object.entries(headersCopy)) {
        if (v === undefined || v === null) {
            continue;
        }
        mixinHeaders[k] = v;
    }

    curl.setOpt(Curl.option.HTTPHEADER, Object.entries(mixinHeaders).flatMap(([k, v]) => {
        // Despite the declared type, a value may arrive as an array: emit one line per entry.
        if (Array.isArray(v) && v.length) {
            return v.map((v2) => `${k}: ${v2}`);
        }

        return [`${k}: ${v}`];
    }));

    return curl;
}
/**
 * Performs exactly one HTTP request with libcurl; redirects are NOT followed here.
 *
 * Resolves as soon as the response headers are available with the status code,
 * the header sets reported by curl, and - unless the response is a redirect or
 * has no body stream - a FancyFile fed from the (decompressed) response stream.
 * Rejects when curl reports an error.
 */
urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
return new Promise<{
statusCode: number,
data?: FancyFile,
headers: HeaderInfo[],
}>((resolve, reject) => {
// NOTE(review): contentType is assigned below but never ends up in the resolved value.
let contentType = '';
const curl = new Curl();
curl.enable(CurlFeature.StreamResponse);
curl.setOpt('URL', urlToCrawl.toString());
// Redirects are left to the caller.
curl.setOpt(Curl.option.FOLLOWLOCATION, false);
// TLS peer certificate verification is switched off.
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
// Overall timeout is capped at 30s; connecting may take at most 3s.
curl.setOpt(Curl.option.TIMEOUT_MS, Math.min(30_000, crawlOpts?.timeoutMs || 30_000));
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
if (crawlOpts?.method) {
curl.setOpt(Curl.option.CUSTOMREQUEST, crawlOpts.method.toUpperCase());
}
if (crawlOpts?.body) {
curl.setOpt(Curl.option.POSTFIELDS, crawlOpts.body.toString());
}
// cookie / referer / user-agent are only filled in when extraHeaders did not already set them (??=).
const headersToSet = { ...crawlOpts?.extraHeaders };
if (crawlOpts?.cookies?.length) {
const cookieChunks = crawlOpts.cookies.map((cookie) => `${cookie.name}=${encodeURIComponent(cookie.value)}`);
headersToSet.cookie ??= cookieChunks.join('; ');
}
if (crawlOpts?.referer) {
headersToSet.referer ??= crawlOpts.referer;
}
if (crawlOpts?.overrideUserAgent) {
headersToSet['user-agent'] ??= crawlOpts.overrideUserAgent;
}
this.curlImpersonateHeader(curl, headersToSet);
if (crawlOpts?.proxyUrl) {
const proxyUrlCopy = new URL(crawlOpts.proxyUrl);
curl.setOpt(Curl.option.PROXY, proxyUrlCopy.href);
}
// Set by the 'stream' handler; the error and end handlers need it for cleanup.
let curlStream: Readable | undefined;
curl.on('error', (err, errCode) => {
curl.close();
this.logger.warn(`Curl ${urlToCrawl.origin}: ${err}`, { err: marshalErrorLike(err), urlToCrawl });
if (curlStream) {
// For some reason, manually emitting error event is required for curlStream.
curlStream.emit('error', err);
curlStream.destroy(err);
}
// Known curl codes are mapped to typed errors; everything else becomes an AssertionFailureError.
const err2 = this.digestCurlCode(errCode, err.message);
if (err2) {
reject(err2);
return;
}
reject(new AssertionFailureError(`Failed to access ${urlToCrawl.origin}: ${err.message}`));
});
curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
let status = -1;
let contentEncoding = '';
// When a body stream exists, the handle is closed only after that stream has ended.
curl.once('end', () => {
if (curlStream) {
curlStream.once('end', () => curl.close());
return;
}
curl.close();
});
curl.on('stream', (stream, statusCode, headers) => {
this.logger.debug(`CURL: [${statusCode}] ${urlToCrawl.origin}`, { statusCode });
status = statusCode;
curlStream = stream;
// Normalize the header sets in place: keys ending with ':' are re-keyed without it,
// and undefined values become empty strings.
for (const headerSet of (headers as HeaderInfo[])) {
for (const [k, v] of Object.entries(headerSet)) {
if (k.trim().endsWith(':')) {
Reflect.set(headerSet, k.slice(0, k.indexOf(':')), v || '');
Reflect.deleteProperty(headerSet, k);
continue;
}
if (v === undefined) {
Reflect.set(headerSet, k, '');
continue;
}
if (k.toLowerCase() === 'content-type' && typeof v === 'string') {
contentType = v.toLowerCase();
}
}
}
// Only the last header set decides the content encoding of the body.
const lastResHeaders = headers[headers.length - 1];
for (const [k, v] of Object.entries(lastResHeaders)) {
const kl = k.toLowerCase();
if (kl === 'content-type') {
contentType = v.toLowerCase();
}
if (kl === 'content-encoding') {
contentEncoding = v.toLowerCase();
}
if (contentType && contentEncoding) {
break;
}
}
// Redirect: discard the body (resume() lets the stream drain) and report headers only.
if ([301, 302, 307, 308].includes(statusCode)) {
if (stream) {
stream.resume();
}
resolve({
statusCode: status,
data: undefined,
headers: headers as HeaderInfo[],
});
return;
}
if (!stream) {
resolve({
statusCode: status,
data: undefined,
headers: headers as HeaderInfo[],
});
return;
}
// Wrap the raw stream in the matching decompressor; source errors are forwarded to it.
switch (contentEncoding) {
case 'gzip': {
const decompressed = createGunzip();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
case 'deflate': {
const decompressed = createInflate();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
case 'br': {
const decompressed = createBrotliDecompress();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
case 'zstd': {
const decompressed = ZSTDDecompress();
stream.pipe(decompressed);
stream.once('error', (err) => {
decompressed.destroy(err);
});
stream = decompressed;
break;
}
default: {
break;
}
}
// Back the body with a temp file path and bind that path to the FancyFile.
const fpath = this.tempFileManager.alloc();
const fancyFile = FancyFile.auto(stream, fpath);
this.tempFileManager.bindPathTo(fancyFile, fpath);
resolve({
statusCode: status,
data: fancyFile,
headers: headers as HeaderInfo[],
});
});
curl.perform();
});
}
/**
 * Fetches `urlToCrawl`, following up to 10 redirects (301/302/307/308) by hand.
 *
 * Cookies set by a redirect response are appended to the cookies used for the
 * following hops. The returned `headers` hold the header sets of every hop in
 * order, ending with those of the final response.
 *
 * @throws AssertionFailureError on a redirect without `Location`, or after too many redirects.
 */
async urlToFile(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
    const redirectCodes = [301, 302, 307, 308];
    const hopOpts = { ...crawlOpts };
    const headersOfPastHops: HeaderInfo[] = [];
    let currentUrl = urlToCrawl;

    for (let hopsLeft = 10; hopsLeft > 0; hopsLeft -= 1) {
        const r = await this.urlToFile1Shot(currentUrl, hopOpts);

        if (!redirectCodes.includes(r.statusCode)) {
            return {
                statusCode: r.statusCode,
                data: r.data,
                headers: headersOfPastHops.concat(r.headers),
            };
        }

        const headers = r.headers[r.headers.length - 1];
        const location = headers.Location || headers.location;
        if (!location) {
            throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Bad redirection from ${currentUrl}`);
        }

        const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie'];
        if (setCookieHeader) {
            const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
            const parsed = cookieAssignments
                .filter(Boolean)
                .map((x) => parseSetCookieString(x, { decodeValues: true }));
            if (parsed.length) {
                hopOpts.cookies = [...(hopOpts.cookies || []), ...parsed];
            }
        }

        // Location may be relative: resolve it against the URL that produced it.
        currentUrl = new URL(location, currentUrl);
        headersOfPastHops.push(...r.headers);
    }

    throw new AssertionFailureError(`Failed to access ${urlToCrawl}: Too many redirections.`);
}
/**
 * Fetches `targetUrl` with curl and packages the result as side-load options.
 *
 * Each header set in the redirect chain is recorded under the URL it was
 * received for. The downloaded body is attached to the final URL's entry when
 * the file has a non-zero size.
 *
 * `contentType` is the lower-cased `Content-Type` header of the final
 * response. If that header is absent it falls back to the file's detected
 * mime type, then to `application/octet-stream`.
 */
async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
    const curlResult = await this.urlToFile(targetUrl, crawlOpts);

    let finalURL = targetUrl;
    const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
        impersonate: {},
        proxyOrigin: {},
    };
    for (const headers of curlResult.headers) {
        sideLoadOpts.impersonate[finalURL.href] = {
            status: headers.result?.code || -1,
            headers: _.omit(headers, 'result'),
            contentType: headers['Content-Type'] || headers['content-type'],
        };
        if (crawlOpts?.proxyUrl) {
            sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
        }
        if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
            const location = headers.Location || headers.location;
            if (!location) {
                throw new Error(`Bad redirection: ${curlResult.headers.length} times`);
            }
            finalURL = new URL(location, finalURL);
        }
    }

    const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
    const declaredContentType = lastHeaders['Content-Type'] || lastHeaders['content-type'];
    // The header may be missing entirely: only then consult the file itself.
    const contentType = declaredContentType?.toLowerCase() || (await curlResult.data?.mimeType) || 'application/octet-stream';
    const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
    const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();

    if (sideLoadOpts.impersonate[finalURL.href] && (await curlResult.data?.size)) {
        sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
    }

    // This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
    this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);

    return {
        finalURL,
        sideLoadOpts,
        chain: curlResult.headers,
        status: curlResult.statusCode,
        headers: lastHeaders,
        contentType,
        contentDisposition,
        fileName,
        file: curlResult.data
    };
}
/**
 * Maps a curl result code to an application error.
 *
 * @returns `AssertionFailureError` for the codes treated as caller errors,
 * `ServiceBadAttemptError` for the codes treated as retryable, and
 * `undefined` for every other code.
 */
digestCurlCode(code: CurlCode, msg: string) {
    // 400 User errors
    const userErrorCodes: CurlCode[] = [
        CurlCode.CURLE_GOT_NOTHING,
        CurlCode.CURLE_COULDNT_RESOLVE_HOST,
        CurlCode.CURLE_REMOTE_ACCESS_DENIED,
    ];
    // Retryable errors
    const retryableCodes: CurlCode[] = [
        CurlCode.CURLE_SSL_CONNECT_ERROR,
        CurlCode.CURLE_QUIC_CONNECT_ERROR,
        CurlCode.CURLE_COULDNT_RESOLVE_PROXY,
        CurlCode.CURLE_COULDNT_CONNECT,
        CurlCode.CURLE_PARTIAL_FILE,
        CurlCode.CURLE_OPERATION_TIMEDOUT,
    ];

    if (userErrorCodes.includes(code)) {
        return new AssertionFailureError(msg);
    }
    if (retryableCodes.includes(code)) {
        return new ServiceBadAttemptError(msg);
    }

    return undefined;
}
}

70
src/services/errors.ts Normal file
View File

@ -0,0 +1,70 @@
import { ApplicationError, Prop, RPC_TRANSFER_PROTOCOL_META_SYMBOL, StatusCode } from 'civkit/civ-rpc';
import _ from 'lodash';
import dayjs from 'dayjs';
import utc from 'dayjs/plugin/utc';
dayjs.extend(utc);
// Application error types without extra fields: each class only attaches a numeric
// code through @StatusCode and inherits everything else from ApplicationError.
// NOTE(review): the codes look like an HTTP status followed by a two-digit sub-code
// (e.g. 50301 -> 503 / 01) - confirm against the civkit StatusCode documentation.
@StatusCode(50301)
export class ServiceDisabledError extends ApplicationError { }
@StatusCode(50302)
export class ServiceCrashedError extends ApplicationError { }
@StatusCode(50303)
export class ServiceNodeResourceDrainError extends ApplicationError { }
@StatusCode(40104)
export class EmailUnverifiedError extends ApplicationError { }
@StatusCode(40201)
export class InsufficientCreditsError extends ApplicationError { }
@StatusCode(40202)
export class FreeFeatureLimitError extends ApplicationError { }
@StatusCode(40203)
export class InsufficientBalanceError extends ApplicationError { }
@StatusCode(40903)
export class LockConflictError extends ApplicationError { }
@StatusCode(40904)
export class BudgetExceededError extends ApplicationError { }
@StatusCode(45101)
export class HarmfulContentError extends ApplicationError { }
@StatusCode(45102)
export class SecurityCompromiseError extends ApplicationError { }
@StatusCode(41201)
export class BatchSizeTooLargeError extends ApplicationError { }
/**
 * Rate-limit error that can tell the client when to retry.
 *
 * When `retryAfter` (seconds) or `retryAfterDate` is set, the transfer
 * protocol metadata gains a `Retry-After` header; `retryAfter` wins when both
 * are present.
 */
@StatusCode(42903)
export class RateLimitTriggeredError extends ApplicationError {

    @Prop({
        desc: 'Retry after seconds',
    })
    retryAfter?: number;

    @Prop({
        desc: 'Retry after date',
    })
    retryAfterDate?: Date;

    protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() {
        const inheritedMeta = super[RPC_TRANSFER_PROTOCOL_META_SYMBOL];
        const retryHint = this.retryAfter || this.retryAfterDate;
        if (!retryHint) {
            return inheritedMeta;
        }

        // Dates are rendered in the HTTP date layout (UTC); plain numbers are sent as-is.
        const headerValue = retryHint instanceof Date
            ? dayjs(retryHint).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]')
            : retryHint;

        // Merge into a deep copy so the inherited metadata object is never mutated.
        return _.merge(_.cloneDeep(inheritedMeta), {
            headers: {
                'Retry-After': `${headerValue}`,
            }
        });
    }
}

24
src/services/finalizer.ts Normal file
View File

@ -0,0 +1,24 @@
import { AbstractFinalizerService } from 'civkit/finalizer';
import { container, singleton } from 'tsyringe';
import { isMainThread } from 'worker_threads';
import { GlobalLogger } from './logger';
/** Finalizer service bound to the tsyringe root container and the global logger. */
@singleton()
export class FinalizerService extends AbstractFinalizerService {

    container = container;
    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(protected globalLogger: GlobalLogger) {
        super(...arguments);
    }

}

const finalizerService = container.resolve(FinalizerService);

// Decorator produced by the resolved service instance.
export const { Finalizer } = finalizerService.decorators();
export default finalizerService;

// The service is only started on the main thread, not in worker threads.
if (isMainThread) {
    finalizerService.serviceReady();
}

View File

@ -4,9 +4,10 @@ import { Logger } from '../shared/services/logger';
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
import { Threaded } from '../shared/services/threaded';
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
import { Threaded } from '../services/threaded';
import type { ExtraScrappingOptions } from '../api/crawler';
import { tailwindClasses } from '../utils/tailwind-classes';
import { countGPTToken } from '../shared';
const pLinkedom = import('linkedom');
@ -37,7 +38,8 @@ export class JSDomControl extends AsyncService {
return snapshot;
}
return this.actualNarrowSnapshot(snapshot, options);
// SideLoad contains native objects that cannot go through thread boundaries.
return this.actualNarrowSnapshot(snapshot, { ...options, sideLoad: undefined });
}
@Threaded()
@ -348,6 +350,22 @@ export class JSDomControl extends AsyncService {
}
}
}
/**
 * Cheap text extraction for an HTML string.
 *
 * Parses the markup with linkedom (re-parsing inside an `<html><body>`
 * wrapper when the first parse yields no document element), drops
 * script/style/link/svg nodes, and reports the document title, the body's
 * `innerText`, and a GPT token count of the whitespace-collapsed text.
 */
@Threaded()
async analyzeHTMLTextLite(sourceHTML: string) {
    let parsed = this.linkedom.parseHTML(sourceHTML);
    const hasRoot = Boolean(parsed.window.document.documentElement);
    if (!hasRoot) {
        // Bare fragments have no root element; wrap them so `body` exists.
        parsed = this.linkedom.parseHTML(`<html><body>${sourceHTML}</body></html>`);
    }

    const doc = parsed.window.document;
    doc.querySelectorAll('script,style,link,svg').forEach((node) => node.remove());

    const text = doc.body.innerText || '';
    const collapsed = text.replaceAll(/[\s\r\n\t]+/g, ' ');

    return {
        title: doc.title,
        text,
        tokens: countGPTToken(collapsed),
    };
}
}
const jsdomControl = container.resolve(JSDomControl);

57
src/services/logger.ts Normal file
View File

@ -0,0 +1,57 @@
import { AbstractPinoLogger } from 'civkit/pino-logger';
import { singleton, container } from 'tsyringe';
import { threadId } from 'node:worker_threads';
import { getTraceCtx } from 'civkit/async-context';
// Maps a log level name to the `severity` string attached to each log record by GlobalLogger.log().
// NOTE(review): the values look like Google Cloud Logging severities — confirm against the log sink.
const levelToSeverityMap: { [k: string]: string | undefined; } = {
trace: 'DEFAULT',
debug: 'DEBUG',
info: 'INFO',
warn: 'WARNING',
error: 'ERROR',
fatal: 'CRITICAL',
};
/**
 * Process-wide pino logger, registered as a tsyringe singleton.
 *
 * In production (`NODE_ENV` starting with "prod") records go to stdout as-is;
 * otherwise they are rendered through pino-pretty. Every record is tagged
 * with the worker thread id, a `severity` field, and — when a trace context
 * and `GCLOUD_PROJECT` are available — a `logging.googleapis.com/trace` field.
 */
@singleton()
export class GlobalLogger extends AbstractPinoLogger {

    loggerOptions = {
        level: 'debug',
        base: {
            tid: threadId,
        }
    };

    /** Picks the output stream for the current environment and signals readiness. */
    override init(): void {
        const isProduction = process.env['NODE_ENV']?.startsWith('prod');

        if (isProduction) {
            super.init(process.stdout);
        } else {
            // Loaded lazily so production never has to require pino-pretty.
            const { PinoPretty } = require('pino-pretty');
            const prettyStream = PinoPretty({
                singleLine: true,
                colorize: true,
                messageFormat(log: any, messageKey: any) {
                    const threadTag = log['tid'] ? `[${log['tid']}]` : '';
                    const serviceTag = log['service'] || 'ROOT';

                    return `${threadTag}[${serviceTag}] ${log[messageKey]}`;
                },
            });
            super.init(prettyStream);
        }

        this.emit('ready');
    }

    /**
     * Decorates the leading log object with `severity` and, when possible,
     * the trace resource name, then delegates to the base logger.
     */
    override log(...args: any[]) {
        const levelObj = args[0];
        const rest = args.slice(1);

        const patched: any = {
            ...levelObj,
            severity: levelToSeverityMap[levelObj?.level],
        };

        const traceCtx = getTraceCtx();
        const traceId = traceCtx?.googleTraceId || traceCtx?.traceId;
        const project = process.env['GCLOUD_PROJECT'];
        if (traceId && project) {
            patched['logging.googleapis.com/trace'] = `projects/${project}/traces/${traceId}`;
        }

        return super.log(patched, ...rest);
    }
}
// Resolve the logger singleton at module load and expose it as the default export.
const instance = container.resolve(GlobalLogger);
export default instance;

View File

@ -8,14 +8,15 @@ import { PDFContent } from '../db/pdf';
import dayjs from 'dayjs';
import { FirebaseStorageBucketControl } from '../shared';
import { randomUUID } from 'crypto';
import { PDFDocumentLoadingTask } from 'pdfjs-dist';
import type { PDFDocumentLoadingTask } from 'pdfjs-dist';
import path from 'path';
const utc = require('dayjs/plugin/utc'); // Import the UTC plugin
dayjs.extend(utc); // Extend dayjs with the UTC plugin
const timezone = require('dayjs/plugin/timezone');
dayjs.extend(timezone);
const pPdfjs = import('pdfjs-dist');
const pPdfjs = import('pdfjs-dist/legacy/build/pdf.mjs');
const nodeCmapUrl = path.resolve(require.resolve('pdfjs-dist'), '../../cmaps') + '/';
const md5Hasher = new HashManager('md5', 'hex');
@ -26,7 +27,10 @@ function stdDev(numbers: number[]) {
return Math.sqrt(avgSquareDiff);
}
function isRotatedByAtLeast35Degrees(transform: [number, number, number, number, number, number]): boolean {
function isRotatedByAtLeast35Degrees(transform?: [number, number, number, number, number, number]): boolean {
if (!transform) {
return false;
}
const [a, b, c, d, _e, _f] = transform;
// Calculate the rotation angles using arctan(b/a) and arctan(-c/d)
@ -94,13 +98,15 @@ export class PDFExtractor extends AsyncService {
loadingTask = this.pdfjs.getDocument({
data: binary,
disableFontFace: true,
verbosity: 0
verbosity: 0,
cMapUrl: nodeCmapUrl,
});
} else {
loadingTask = this.pdfjs.getDocument({
url,
disableFontFace: true,
verbosity: 0
verbosity: 0,
cMapUrl: nodeCmapUrl,
});
}
@ -112,7 +118,7 @@ export class PDFExtractor extends AsyncService {
for (const pg of _.range(0, doc.numPages)) {
const page = await doc.getPage(pg + 1);
const textContent = await page.getTextContent();
const textContent = await page.getTextContent({ includeMarkedContent: true });
textItems.push((textContent.items as TextItem[]));
}
@ -335,6 +341,7 @@ export class PDFExtractor extends AsyncService {
});
} catch (err) {
this.logger.warn(`Unable to extract from pdf ${nameUrl}`, { err });
throw err;
}
return extracted;

View File

@ -0,0 +1,65 @@
import { marshalErrorLike } from 'civkit';
import { AbstractPseudoTransfer, SYM_PSEUDO_TRANSFERABLE } from 'civkit/pseudo-transfer';
import { container, singleton } from 'tsyringe';
/**
 * Singleton binding of civkit's AbstractPseudoTransfer.
 * init() waits for dependencies and then signals readiness; it adds no other setup.
 */
@singleton()
export class PseudoTransfer extends AbstractPseudoTransfer {
override async init() {
await this.dependencyReady();
this.emit('ready');
}
}
// Resolve the singleton at module load; the registrations below all go through it.
const instance = container.resolve(PseudoTransfer);
// --- Error ---------------------------------------------------------------
// Install a non-enumerable SYM_PSEUDO_TRANSFERABLE hook on Error.prototype.
// Marshalling goes through marshalErrorLike; un-marshalling re-attaches a prototype
// to the plain object that arrived.
Object.defineProperty(Error.prototype, SYM_PSEUDO_TRANSFERABLE, {
value: function () {
// `this` is whatever the hook is invoked on; it becomes the prototype restored in unMarshall.
// NOTE(review): presumably the registered class's prototype — confirm against civkit's call site.
const prototype = this;
return {
copyOwnProperty: 'all',
marshall: (input: Error) => marshalErrorLike(input),
unMarshall: (input: object) => {
Object.setPrototypeOf(input, prototype);
return input;
},
};
},
enumerable: false,
});
instance.expectPseudoTransferableType(Error);
// Register every exported Error subclass from the local errors module and from civkit/civ-rpc.
for (const x of [...Object.values(require('./errors')), ...Object.values(require('civkit/civ-rpc'))]) {
if (typeof x === 'function' && x.prototype instanceof Error) {
instance.expectPseudoTransferableType(x as any);
}
}
// --- URL -----------------------------------------------------------------
// A URL is reduced to its href and rebuilt with `new URL(href)` on the other side.
Object.defineProperty(URL.prototype, SYM_PSEUDO_TRANSFERABLE, {
value: function () {
return {
copyOwnProperty: 'none',
marshall: (input: URL) => ({ href: input.href }),
unMarshall: (input: { href: string; }) => new URL(input.href),
};
},
enumerable: false,
});
instance.expectPseudoTransferableType(URL);
// --- Buffer --------------------------------------------------------------
// Buffers are passed through as-is when marshalling; on arrival anything that is not
// already a Buffer (e.g. a plain Uint8Array) is wrapped back into one.
Object.defineProperty(Buffer.prototype, SYM_PSEUDO_TRANSFERABLE, {
value: function () {
return {
copyOwnProperty: 'none',
unMarshall: (input: Uint8Array | Buffer) => Buffer.isBuffer(input) ? input : Buffer.from(input),
marshall: (input: Uint8Array | Buffer) => input,
};
},
enumerable: false,
});
instance.expectPseudoTransferableType(Buffer);
export default instance;

View File

@ -1,7 +1,7 @@
import os from 'os';
import fs from 'fs';
import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError } from 'civkit';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit';
import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page, Viewport } from 'puppeteer';
@ -14,6 +14,9 @@ import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainE
import { TimeoutError } from 'puppeteer';
import _ from 'lodash';
import { isIP } from 'net';
import { CurlControl } from './curl';
import { readFile } from 'fs/promises';
import { BlackHoleDetector } from './blackhole-detector';
const tldExtract = require('tld-extract');
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@ -53,6 +56,8 @@ export interface PageSnapshot {
text: string;
status?: number;
statusText?: string;
isIntermediate?: boolean;
isFromCache?: boolean;
parsed?: Partial<ReadabilityParsed> | null;
screenshot?: Buffer;
pageshot?: Buffer;
@ -82,17 +87,22 @@ export interface ScrappingOptions {
injectFrameScripts?: string[];
injectPageScripts?: string[];
viewport?: Viewport;
proxyResources?: boolean;
sideLoad?: {
impersonate: {
[url: string]: {
status: number;
headers: { [k: string]: string | string[]; };
contentType?: string;
body?: FancyFile;
};
};
proxyOrigin: { [origin: string]: string; };
};
}
// const puppeteerStealth = require('puppeteer-extra-plugin-stealth');
// puppeteer.use(puppeteerStealth());
// const puppeteerUAOverride = require('puppeteer-extra-plugin-stealth/evasions/user-agent-override');
// puppeteer.use(puppeteerUAOverride({
// userAgent: `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`,
// platform: `Linux`,
// }))
puppeteer.use(puppeteerBlockResources({
blockedTypes: new Set(['media']),
interceptResolutionPriority: 1,
@ -460,6 +470,8 @@ export class PuppeteerControl extends AsyncService {
constructor(
protected globalLogger: Logger,
protected curlControl: CurlControl,
protected blackHoleDetector: BlackHoleDetector,
) {
super(...arguments);
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
@ -514,10 +526,11 @@ export class PuppeteerControl extends AsyncService {
});
this.ua = await this.browser.userAgent();
this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, ''));
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
this.emit('ready');
this.newPage().then((r) => this.__loadedPage.push(r));
}
@perNextTick()
@ -538,8 +551,10 @@ export class PuppeteerControl extends AsyncService {
}
}
async newPage() {
await this.serviceReady();
async newPage(bewareDeadLock: any = false) {
if (!bewareDeadLock) {
await this.serviceReady();
}
const sn = this._sn++;
let page;
try {
@ -687,7 +702,7 @@ export class PuppeteerControl extends AsyncService {
`);
this.snMap.set(page, sn);
this.logger.info(`Page ${sn} created.`);
this.logger.debug(`Page ${sn} created.`);
this.lastPageCratedAt = Date.now();
this.livePages.add(page);
this.pagePhase.set(page, 'idle');
@ -731,7 +746,7 @@ export class PuppeteerControl extends AsyncService {
return;
}
const sn = this.snMap.get(page);
this.logger.info(`Closing page ${sn}`);
this.logger.debug(`Closing page ${sn}`);
await Promise.race([
(async () => {
const ctx = page.browserContext();
@ -749,7 +764,7 @@ export class PuppeteerControl extends AsyncService {
this.pagePhase.delete(page);
}
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
async *scrap(parsedUrl: URL, options: ScrappingOptions = {}): AsyncGenerator<PageSnapshot | undefined> {
// parsedUrl.search = '';
const url = parsedUrl.toString();
@ -761,7 +776,9 @@ export class PuppeteerControl extends AsyncService {
const page = await this.getNextPage();
this.pagePhase.set(page, 'active');
page.on('response', (resp) => {
if (resp.request().isNavigationRequest()) {
this.blackHoleDetector.itWorked();
const req = resp.request();
if (req.frame() === page.mainFrame() && req.isNavigationRequest()) {
navigationResponse = resp;
}
if (!resp.ok()) {
@ -774,7 +791,111 @@ export class PuppeteerControl extends AsyncService {
pdfUrls.push(url);
}
});
if (options?.extraHeaders) {
page.on('request', async (req) => {
if (req.isInterceptResolutionHandled()) {
return;
};
const reqUrlParsed = new URL(req.url());
if (!reqUrlParsed.protocol.startsWith('http')) {
const overrides = req.continueRequestOverrides();
return req.continue(overrides, 0);
}
const typ = req.resourceType();
if (!options.proxyResources) {
const isDocRequest = ['document', 'xhr', 'fetch', 'websocket', 'prefetch', 'eventsource', 'ping'].includes(typ);
if (!isDocRequest) {
const overrides = req.continueRequestOverrides();
return req.continue(overrides, 0);
}
}
const sideload = options.sideLoad;
const impersonate = sideload?.impersonate[reqUrlParsed.href];
if (impersonate) {
let body;
if (impersonate.body) {
body = await readFile(await impersonate.body.filePath);
if (req.isInterceptResolutionHandled()) {
return;
}
}
return req.respond({
status: impersonate.status,
headers: impersonate.headers,
contentType: impersonate.contentType,
body: body ? Uint8Array.from(body) : undefined,
}, 999);
}
const proxy = options.proxyUrl || sideload?.proxyOrigin?.[reqUrlParsed.origin];
if (proxy) {
try {
const curled = await this.curlControl.sideLoad(reqUrlParsed, {
...options,
method: req.method(),
body: req.postData(),
extraHeaders: {
...req.headers(),
...options.extraHeaders,
},
proxyUrl: proxy
});
if (req.isInterceptResolutionHandled()) {
return;
};
if (curled.chain.length === 1) {
if (!curled.file) {
return req.respond({
status: curled.status,
headers: _.omit(curled.headers, 'result'),
contentType: curled.contentType,
}, 999);
}
const body = await readFile(await curled.file.filePath);
if (req.isInterceptResolutionHandled()) {
return;
};
return req.respond({
status: curled.status,
headers: _.omit(curled.headers, 'result'),
contentType: curled.contentType,
body: Uint8Array.from(body),
}, 999);
}
options.sideLoad ??= curled.sideLoadOpts;
_.merge(options.sideLoad, curled.sideLoadOpts);
const firstReq = curled.chain[0];
return req.respond({
status: firstReq.result!.code,
headers: _.omit(firstReq, 'result'),
}, 999);
} catch (err: any) {
this.logger.warn(`Failed to sideload ${reqUrlParsed.origin}`, { href: reqUrlParsed.href, err: marshalErrorLike(err) });
}
}
if (req.isInterceptResolutionHandled()) {
return;
};
const overrides = req.continueRequestOverrides();
const continueArgs = [{
...overrides,
headers: {
...req.headers(),
...overrides?.headers,
...options.extraHeaders,
}
}, 1] as const;
return req.continue(continueArgs[0], continueArgs[1]);
});
if (options.extraHeaders) {
page.on('request', async (req) => {
if (req.isInterceptResolutionHandled()) {
return;
@ -795,7 +916,7 @@ export class PuppeteerControl extends AsyncService {
}
let pageScriptEvaluations: Promise<unknown>[] = [];
let frameScriptEvaluations: Promise<unknown>[] = [];
if (options?.injectPageScripts?.length) {
if (options.injectPageScripts?.length) {
page.on('framenavigated', (frame) => {
if (frame !== page.mainFrame()) {
return;
@ -808,7 +929,7 @@ export class PuppeteerControl extends AsyncService {
);
});
}
if (options?.injectFrameScripts?.length) {
if (options.injectFrameScripts?.length) {
page.on('framenavigated', (frame) => {
frameScriptEvaluations.push(
Promise.allSettled(options.injectFrameScripts!.map((x) => frame.evaluate(x).catch((err) => {
@ -819,34 +940,28 @@ export class PuppeteerControl extends AsyncService {
}
const sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
if (options?.locale) {
if (options.locale) {
// Add headers via request interception to walk around this bug
// https://github.com/puppeteer/puppeteer/issues/10235
// await page.setExtraHTTPHeaders({
// 'Accept-Language': options?.locale
// 'Accept-Language': options.locale
// });
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, "language", {
get: function () {
return options?.locale;
return options.locale;
}
});
Object.defineProperty(navigator, "languages", {
get: function () {
return [options?.locale];
return [options.locale];
}
});
});
}
if (options?.proxyUrl) {
await page.useProxy(options.proxyUrl, {
headers: options.extraHeaders,
interceptResolutionPriority: 2,
});
}
if (options?.cookies) {
if (options.cookies) {
const mapped = options.cookies.map((x) => {
const draft: CookieParam = {
name: x.name,
@ -876,10 +991,10 @@ export class PuppeteerControl extends AsyncService {
});
}
}
if (options?.overrideUserAgent) {
if (options.overrideUserAgent) {
await page.setUserAgent(options.overrideUserAgent);
}
if (options?.viewport) {
if (options.viewport) {
await page.setViewport(options.viewport);
}
@ -921,13 +1036,13 @@ export class PuppeteerControl extends AsyncService {
);
});
const timeout = options?.timeoutMs || 30_000;
const timeout = options.timeoutMs || 30_000;
const goToOptions: GoToOptions = {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout,
};
if (options?.referer) {
if (options.referer) {
goToOptions.referer = options.referer;
}
@ -1019,7 +1134,7 @@ export class PuppeteerControl extends AsyncService {
});
gotoPromise.catch(() => 'just dont crash anything');
let waitForPromise: Promise<any> | undefined;
if (options?.waitForSelector) {
if (options.waitForSelector) {
const t0 = Date.now();
waitForPromise = nextSnapshotDeferred.promise.then(() => {
const t1 = Date.now();
@ -1054,7 +1169,7 @@ export class PuppeteerControl extends AsyncService {
if (waitForPromise) {
ckpt.push(waitForPromise);
}
if (options?.minIntervalMs) {
if (options.minIntervalMs) {
ckpt.push(delay(options.minIntervalMs));
}
let error;
@ -1074,7 +1189,7 @@ export class PuppeteerControl extends AsyncService {
} as PageSnapshot;
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
if (options.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = Buffer.from(await page.screenshot());
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
lastHTML = snapshot.html;
@ -1084,7 +1199,8 @@ export class PuppeteerControl extends AsyncService {
...snapshot,
status: navigationResponse?.status(),
statusText: navigationResponse?.statusText(),
pdfs: _.uniq(pdfUrls), screenshot, pageshot
pdfs: _.uniq(pdfUrls), screenshot, pageshot,
isIntermediate: true,
} as PageSnapshot;
}
if (error) {

60
src/services/registry.ts Normal file
View File

@ -0,0 +1,60 @@
import { propertyInjectorFactory } from 'civkit/property-injector';
import { KoaRPCRegistry } from 'civkit/civ-rpc/koa';
import { container, singleton } from 'tsyringe';
import { IntegrityEnvelope } from 'civkit/civ-rpc';
import bodyParser from '@koa/bodyparser';
import { GlobalLogger } from './logger';
import { TempFileManager } from './temp-file';
import { AsyncLocalContext } from './async-context';
import { BlackHoleDetector } from './blackhole-detector';
export { Context } from 'koa';
// Property-injection decorator built by civkit's propertyInjectorFactory from the shared tsyringe container.
export const InjectProperty = propertyInjectorFactory(container);
/**
 * Koa-based RPC registry for the service, registered as a tsyringe singleton.
 *
 * Configures the body-size limit, the response stream mode and the Koa
 * middleware chain, and notifies the BlackHoleDetector whenever an RPC
 * starts ('run') or finishes ('ran' / 'fail').
 */
@singleton()
export class RPCRegistry extends KoaRPCRegistry {
title = 'Jina Reader API';
container = container;
// Child logger tagged with this class's name.
logger = this.globalLogger.child({ service: this.constructor.name });
static override envelope = IntegrityEnvelope;
// Declared before koaMiddlewares because the body parser options below read it.
override _BODY_PARSER_LIMIT = '102mb';
override _RESPONSE_STREAM_MODE = 'koa' as const;
// Middleware order: CORS, JSON/form body parsing, then the __multiParse and __binaryParse handlers from the base class.
override koaMiddlewares = [
this.__CORSAllowAllMiddleware.bind(this),
bodyParser({
encoding: 'utf-8',
enableTypes: ['json', 'form'],
jsonLimit: this._BODY_PARSER_LIMIT,
xmlLimit: this._BODY_PARSER_LIMIT,
formLimit: this._BODY_PARSER_LIMIT,
}),
this.__multiParse.bind(this),
this.__binaryParse.bind(this),
];
constructor(
protected globalLogger: GlobalLogger,
protected ctxMgr: AsyncLocalContext,
protected tempFileManager: TempFileManager,
protected blackHoleDetector: BlackHoleDetector,
) {
super(...arguments);
// Every started request must be balanced by exactly one doneWithRequest(),
// whether the call succeeded ('ran') or failed ('fail').
this.on('run', () => this.blackHoleDetector.incomingRequest());
this.on('ran', () => this.blackHoleDetector.doneWithRequest());
this.on('fail', () => this.blackHoleDetector.doneWithRequest());
}
// Waits for injected dependencies, then signals readiness.
override async init() {
await this.dependencyReady();
this.emit('ready');
}
}
// Resolve the registry singleton at module load; the exported decorators are bound to this instance.
const instance = container.resolve(RPCRegistry);
export default instance;
export const { Method, RPCMethod, RPCReflect, Param, Ctx, } = instance.decorators();

129
src/services/robots-text.ts Normal file
View File

@ -0,0 +1,129 @@
import { singleton } from 'tsyringe';
import { DownstreamServiceFailureError, ResourcePolicyDenyError } from 'civkit/civ-rpc';
import { AsyncService } from 'civkit/async-service';
import { HashManager } from 'civkit/hash';
import { marshalErrorLike } from 'civkit/lang';
import { Logger } from '../shared/services/logger';
import { BraveSearchHTTP } from '../shared/3rd-party/brave-search';
import { FirebaseStorageBucketControl } from '../shared';
import { URL } from 'url';
import { Threaded } from '../services/threaded';
// Hex-encoded MD5 hasher; used below to derive the robots.txt cache key from an origin.
export const md5Hasher = new HashManager('md5', 'hex');
/**
 * Compiles a robots.txt path pattern into a regular expression.
 *
 * `*` matches any run of characters and a trailing `$` anchors the pattern
 * at the end of the path; everything else is matched literally. Patterns are
 * always anchored at the start, so without `$` they behave as prefixes.
 */
function compileRobotsPattern(pattern: string): RegExp {
    const anchoredAtEnd = pattern.endsWith('$');
    const body = anchoredAtEnd ? pattern.slice(0, -1) : pattern;
    const source = body
        .split('*')
        .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
        .join('.*');

    return new RegExp(`^${source}${anchoredAtEnd ? '$' : ''}`);
}

/**
 * Fetches (and caches) a site's robots.txt and checks whether a URL may be
 * accessed by a given user agent.
 */
@singleton()
export class RobotsTxtService extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    braveSearchHTTP!: BraveSearchHTTP;

    constructor(
        protected globalLogger: Logger,
        protected firebaseStorageBucketControl: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Returns the robots.txt body for `origin`.
     *
     * The storage bucket is consulted first (keyed by the MD5 of the
     * lower-cased origin); a failed cache read is treated as a miss. On a miss
     * the file is fetched with a 5 second timeout and written back to the
     * cache in the background.
     *
     * @throws DownstreamServiceFailureError when the site answers with a non-OK status.
     *         Network errors and timeouts from `fetch` propagate unchanged.
     */
    async getCachedRobotTxt(origin: string) {
        const digest = md5Hasher.hash(origin.toLowerCase());
        const cacheLoc = `/robot-txt/${digest}`;
        let buff;
        buff = await this.firebaseStorageBucketControl.downloadFile(cacheLoc).catch(() => undefined);
        if (buff) {
            return buff.toString();
        }

        const r = await fetch(new URL('robots.txt', origin).href, { signal: AbortSignal.timeout(5000) });
        if (!r.ok) {
            throw new DownstreamServiceFailureError(`Failed to fetch robots.txt from ${origin}`);
        }
        buff = Buffer.from(await r.arrayBuffer());

        // Fire-and-forget: a failed cache write must not fail the request.
        this.firebaseStorageBucketControl.saveFile(cacheLoc, buff, {
            contentType: 'text/plain'
        }).catch((err) => {
            this.logger.warn(`Failed to save robots.txt to cache: ${err}`, { err: marshalErrorLike(err) });
        });

        return buff.toString();
    }

    /**
     * Asserts that `url` may be accessed by `inputMyUa` according to the
     * site's robots.txt.
     *
     * Rules are evaluated in file order and the first matching `Allow` /
     * `Disallow` of an applicable group decides. A group applies when any of
     * its `User-agent` lines is `*` or equals the given agent
     * (case-insensitive). A missing robots.txt (non-OK response) means access
     * is allowed.
     *
     * @param url       Target URL; its origin selects the robots.txt.
     * @param inputMyUa User agent token to match groups against, `*` by default.
     * @returns `true` when access is allowed.
     * @throws ResourcePolicyDenyError when a `Disallow` rule matches first.
     */
    @Threaded()
    async assertAccessAllowed(url: URL, inputMyUa = '*') {
        let robotTxt: string = '';
        try {
            robotTxt = await this.getCachedRobotTxt(url.origin);
        } catch (err) {
            if (err instanceof DownstreamServiceFailureError) {
                // No robots.txt available: nothing restricts access.
                return true;
            }
            throw err;
        }
        const myUa = inputMyUa.toLowerCase() || '*';
        const lines = robotTxt.split(/\r?\n/g);

        // Rules that appear before any User-agent line are treated as applicable.
        let groupApplies = true;
        // True once an allow/disallow rule has been seen since the last
        // User-agent line; the next User-agent line then opens a NEW group
        // instead of extending the current one.
        let sawRuleSinceUa = true;
        let uaLine = 'User-Agent: *';
        // Plain (non-pattern) rules are compared against the path followed by
        // `?`, so a rule such as `/search?` also covers `/search`.
        const pathNormalized = `${url.pathname}?`;
        // Patterns with `*` / `$` are matched against the real path and query.
        const pathWithQuery = `${url.pathname}${url.search}`;

        const ruleMatches = (value: string) => {
            if (value.includes('*') || value.endsWith('$')) {
                return compileRobotsPattern(value).test(pathWithQuery);
            }

            return pathNormalized.startsWith(value);
        };

        for (const line of lines) {
            // `#` starts a comment anywhere on the line, not only at column 0.
            const trimmed = line.replace(/#.*$/, '').trim();
            if (!trimmed) {
                continue;
            }
            const [k, ...rest] = trimmed.split(':');
            const key = k.trim().toLowerCase();
            const value = rest.join(':').trim();

            if (key === 'user-agent') {
                if (sawRuleSinceUa) {
                    groupApplies = false;
                    sawRuleSinceUa = false;
                }
                // Consecutive User-agent lines share one group, so a single
                // match is enough for the whole group to apply.
                if (value === '*' || value.toLowerCase() === myUa) {
                    groupApplies = true;
                    uaLine = line;
                }
                continue;
            }

            if (key !== 'allow' && key !== 'disallow') {
                // Other directives (sitemap, crawl-delay, ...) neither match
                // anything nor terminate a run of User-agent lines.
                continue;
            }
            sawRuleSinceUa = true;

            if (!groupApplies) {
                continue;
            }

            if (!value) {
                // An empty rule value is treated as "everything allowed".
                return true;
            }

            if (key === 'disallow') {
                if (ruleMatches(value)) {
                    throw new ResourcePolicyDenyError(`Access to ${url.href} is disallowed by site robots.txt: For ${uaLine}, ${line}`);
                }
                continue;
            }

            // key === 'allow'
            if (ruleMatches(value)) {
                return true;
            }
        }

        return true;
    }

}

View File

@ -1,11 +1,12 @@
import { AsyncService, AutoCastable, DownstreamServiceFailureError, Prop, RPC_CALL_ENVIRONMENT, delay, marshalErrorLike } from 'civkit';
import type { Request, Response } from 'express';
import { singleton } from 'tsyringe';
import { Logger } from '../shared/services/logger';
import { SecretExposer } from '../shared/services/secrets';
import { GEOIP_SUPPORTED_LANGUAGES, GeoIPService } from './geoip';
import { AsyncContext } from '../shared';
import { SerperGoogleHTTP, SerperSearchQueryParams, WORLD_COUNTRIES } from '../shared/3rd-party/serper-search';
import { BlackHoleDetector } from './blackhole-detector';
import { Context } from './registry';
@singleton()
export class SerperSearchService extends AsyncService {
@ -19,6 +20,7 @@ export class SerperSearchService extends AsyncService {
protected secretExposer: SecretExposer,
protected geoipControl: GeoIPService,
protected threadLocal: AsyncContext,
protected blackHoleDetector: BlackHoleDetector,
) {
super(...arguments);
}
@ -61,6 +63,7 @@ export class SerperSearchService extends AsyncService {
try {
this.logger.debug(`Doing external search`, query);
const r = await this.serperSearchHTTP.webSearch(query);
this.blackHoleDetector.itWorked();
return r.parsed;
} catch (err: any) {
@ -132,15 +135,12 @@ export class GoogleSearchExplicitOperatorsDto extends AutoCastable {
static override from(input: any) {
const instance = super.from(input) as GoogleSearchExplicitOperatorsDto;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
req: Request,
res: Response,
} | undefined;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as Context | undefined;
const params = ['ext', 'filetype', 'intitle', 'loc', 'site'];
for (const p of params) {
const customValue = ctx?.req.get(`x-${p}`) || ctx?.req.get(`${p}`);
const customValue = ctx?.get(`x-${p}`) || ctx?.get(`${p}`);
if (!customValue) {
continue;
}

View File

@ -1,19 +1,22 @@
import { randomUUID } from 'crypto';
import { container, singleton } from 'tsyringe';
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
import { AssertionFailureError, AsyncService, FancyFile, HashManager, marshalErrorLike } from 'civkit';
import TurndownService, { Filter, Rule } from 'turndown';
import { Logger } from '../shared/services/logger';
import { PageSnapshot } from './puppeteer';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { AsyncContext } from '../shared/services/async-context';
import { Threaded } from '../shared/services/threaded';
import { Threaded } from '../services/threaded';
import { JSDomControl } from './jsdom';
import { AltTextService } from './alt-text';
import { PDFExtractor } from './pdf-extract';
import { cleanAttribute } from '../utils/misc';
import _ from 'lodash';
import { STATUS_CODES } from 'http';
import type { CrawlerOptions } from '../dto/scrapping-options';
import type { CrawlerOptions } from '../dto/crawler-options';
import { readFile } from 'fs/promises';
import { pathToFileURL } from 'url';
import { countGPTToken } from '../shared';
export interface FormattedPage {
@ -189,7 +192,7 @@ export class SnapshotFormatter extends AsyncService {
(!mode.includes('markdown') && !mode.includes('content')))
) {
const dt = Date.now() - t0;
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
const formatted: FormattedPage = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
@ -401,7 +404,9 @@ export class SnapshotFormatter extends AsyncService {
const n = code - 200;
if (n < 0 || n >= 200) {
const text = snapshot.statusText || STATUS_CODES[code];
formatted.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
formatted.warning ??= '';
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
}
}
@ -428,7 +433,31 @@ export class SnapshotFormatter extends AsyncService {
if (this.threadLocal.get('withLinksSummary') === 'all') {
formatted.links = links;
} else {
formatted.links = _.fromPairs(links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
formatted.links = _(links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value();
}
}
if (countGPTToken(formatted.content) < 200) {
formatted.warning ??= '';
if (snapshot.isIntermediate) {
const msg = 'This page maybe not yet fully loaded, consider explicitly specify a timeout.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
}
if (snapshot.childFrames?.length && !this.threadLocal.get('withIframe')) {
const msg = 'This page contains iframe that are currently hidden, consider enabling iframe processing.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
}
if (snapshot.shadowExpanded && !this.threadLocal.get('withShadowDom')) {
const msg = 'This page contains shadow DOM that are currently hidden, consider enabling shadow DOM processing.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
}
if (snapshot.html.includes('captcha') || snapshot.html.includes('cf-turnstile-response')) {
const msg = 'This page maybe requiring CAPTCHA, please make sure you are authorized to access this page.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
}
if (snapshot.isFromCache) {
const msg = 'This is a cached snapshot of the original page, consider retry with caching opt-out.';
formatted.warning = `${formatted.warning}${formatted.warning ? '\n': ''}${msg}`;
}
}
@ -468,7 +497,7 @@ export class SnapshotFormatter extends AsyncService {
}
if (this.warning) {
mixins.push(`Warning: ${this.warning}`);
mixins.push(this.warning.split('\n').map((v) => `Warning: ${v}`).join('\n'));
}
if (mode.includes('markdown')) {
@ -488,7 +517,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
Object.defineProperty(f, 'textRepresentation', { value: textRepresentation, enumerable: false });
const dt = Date.now() - t0;
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
this.logger.debug(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
return f as FormattedPage;
}
@ -526,7 +555,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if (this.threadLocal.get('withLinksSummary') === 'all') {
mixin.links = inferred.links;
} else {
mixin.links = _.fromPairs(inferred.links.filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')));
mixin.links = _(inferred.links).filter(([_label, href]) => !href.startsWith('file:') && !href.startsWith('javascript:')).uniqBy(1).fromPairs().value();
}
}
if (snapshot.status) {
@ -534,7 +563,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const n = code - 200;
if (n < 0 || n >= 200) {
const text = snapshot.statusText || STATUS_CODES[code];
mixin.warning = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
mixin.warning ??= '';
const msg = `Target URL returned error ${code}${text ? `: ${text}` : ''}`;
mixin.warning = `${mixin.warning}${mixin.warning ? '\n': ''}${msg}`;
}
}
@ -697,6 +728,52 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
return false;
}
/**
 * Builds a synthetic PageSnapshot for a file that was fetched without a
 * browser.
 *
 * - `image/*`: a minimal HTML page embedding the image by URL.
 * - `text/html`: the file content becomes `snapshot.html`.
 * - other `text/*` and `application/json`: the content becomes
 *   `snapshot.text` and is wrapped in a `<pre>` page for `snapshot.html`.
 * - `application/pdf`: the local file URL is listed in `snapshot.pdfs`.
 *
 * @param url                 Nominal URL of the resource.
 * @param file                The downloaded file.
 * @param overrideContentType Content type to trust instead of the sniffed one;
 *                            `application/octet-stream` is ignored as uninformative.
 * @param overrideFileName    Name to use as the title of image snapshots.
 * @throws AssertionFailureError when a text file exceeds 32 MiB or the content type is unsupported.
 */
async createSnapshotFromFile(url: URL, file: FancyFile, overrideContentType?: string, overrideFileName?: string) {
    const maxTextFileSize = 1024 * 1024 * 32;

    if (overrideContentType === 'application/octet-stream') {
        overrideContentType = undefined;
    }
    const contentType = (overrideContentType || await file.mimeType).toLowerCase();
    const fileName = overrideFileName || `${url.origin}${url.pathname}`;
    const snapshot: PageSnapshot = {
        title: '',
        href: url.href,
        html: '',
        text: ''
    };

    if (contentType.startsWith('image/')) {
        // File name and URL are escaped so characters such as `<`, `&` or `"`
        // cannot break out of the <title> element or the src attribute.
        snapshot.html = `<html style="height: 100%;"><head><meta name="viewport" content="width=device-width, minimum-scale=0.1"><title>${_.escape(fileName)}</title></head><body style="margin: 0px; height: 100%; background-color: rgb(14, 14, 14);"><img style="display: block;-webkit-user-select: none;margin: auto;background-color: hsl(0, 0%, 90%);transition: background-color 300ms;" src="${_.escape(url.href)}"></body></html>`;
        snapshot.title = fileName;

        return snapshot;
    }
    if (contentType.startsWith('text/html')) {
        if ((await file.size) > maxTextFileSize) {
            throw new AssertionFailureError(`Failed to access ${url}: file too large`);
        }
        snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' });

        return snapshot;
    }
    if (contentType.startsWith('text/') || contentType.startsWith('application/json')) {
        if ((await file.size) > maxTextFileSize) {
            throw new AssertionFailureError(`Failed to access ${url}: file too large`);
        }
        snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' });
        // The raw text stays in `snapshot.text`; the HTML rendition must be
        // escaped, otherwise any `<` or `&` in the file is parsed as markup
        // and the content is mangled or lost downstream.
        snapshot.html = `<html><head><meta name="color-scheme" content="light dark"></head><body><pre style="word-wrap: break-word; white-space: pre-wrap;">${_.escape(snapshot.text)}</pre></body></html>`;

        return snapshot;
    }
    if (contentType.startsWith('application/pdf')) {
        snapshot.pdfs = [pathToFileURL(await file.filePath).href];

        return snapshot;
    }

    throw new AssertionFailureError(`Failed to access ${url}: unexpected type ${contentType}`);
}
}
const snapshotFormatter = container.resolve(SnapshotFormatter);

22
src/services/temp-file.ts Normal file
View File

@ -0,0 +1,22 @@
import { AbstractTempFileManger } from 'civkit/temp';
import { unlink } from 'fs/promises';
import { singleton } from 'tsyringe';
/**
 * Temp-file manager for the service, registered as a tsyringe singleton.
 *
 * `rootDir` starts empty and is left for the civkit base class to manage
 * during init(); on stand-down the root location is removed.
 */
@singleton()
export class TempFileManager extends AbstractTempFileManger {

    rootDir = '';

    override async init() {
        await this.dependencyReady();
        await super.init();
        this.emit('ready');
    }

    /**
     * Runs the base stand-down, then removes `rootDir`.
     *
     * `unlink()` cannot remove a directory (it fails with EISDIR on Linux and
     * EPERM on macOS/Windows), so a directory is removed recursively instead.
     * `unlink()` is still tried first so a root that is a plain file or a
     * symlink is handled exactly as before. A root that is unset or already
     * gone is not an error.
     */
    override async standDown() {
        await super.standDown();
        if (!this.rootDir) {
            return;
        }
        try {
            await unlink(this.rootDir);
        } catch (err: any) {
            if (err?.code === 'ENOENT') {
                return;
            }
            if (err?.code !== 'EISDIR' && err?.code !== 'EPERM') {
                throw err;
            }
            const { rm } = await import('fs/promises');
            await rm(this.rootDir, { recursive: true, force: true });
        }
    }
}

66
src/services/threaded.ts Normal file
View File

@ -0,0 +1,66 @@
import 'reflect-metadata';
import { singleton, container } from 'tsyringe';
import { AbstractThreadedServiceRegistry } from 'civkit/threaded';
import _ from 'lodash';
import { GlobalLogger } from './logger';
import { AsyncLocalContext } from './async-context';
import { PseudoTransfer } from './pseudo-transfer';
import { cpus } from 'os';
import { isMainThread } from 'worker_threads';
/**
 * Container-managed (tsyringe `@singleton()`) registry for threaded services,
 * built on civkit's `AbstractThreadedServiceRegistry`.
 *
 * On the main thread it sizes the worker pool from the CPU topology and
 * pre-spawns two workers before declaring itself ready.
 */
@singleton()
export class ThreadedServiceRegistry extends AbstractThreadedServiceRegistry {
    container = container;

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: GlobalLogger,
        public asyncContext: AsyncLocalContext,
        public pseudoTransfer: PseudoTransfer,
    ) {
        super(...arguments);
    }

    /**
     * Sets `maxWorkers` from the logical CPU list reported by `os.cpus()`.
     *
     * Heuristic: if the odd-indexed logical CPUs have accumulated less than half
     * the user+sys time of the even-indexed ones, the machine is assumed to be
     * hyper-threaded and only half of the logical CPUs are counted.
     *
     * The result is always a whole number >= 1.
     */
    setMaxWorkersByCpu() {
        const cpuStat = cpus();

        let evenCpuCycles = 0;
        let oddCpuCycles = 0;
        cpuStat.forEach((cpu, i) => {
            const busy = cpu.times.user + cpu.times.sys;
            if (i % 2 === 0) {
                evenCpuCycles += busy;
            } else {
                oddCpuCycles += busy;
            }
        });

        // With a single logical CPU there is no odd sibling to compare against;
        // the ratio would be 0 and wrongly flag the machine as hyper-threaded.
        const isLikelyHyperThreaded = cpuStat.length > 1 && (oddCpuCycles / evenCpuCycles) < 0.5;
        const estimated = isLikelyHyperThreaded ? Math.ceil(cpuStat.length / 2) : cpuStat.length;

        // os.cpus() may return an empty array when CPU info is unavailable;
        // never let the pool size drop below one worker.
        this.maxWorkers = Math.max(1, estimated);
    }

    /**
     * Waits for dependencies and the base-class initialisation. On the main thread
     * it then sizes the pool and creates two workers, waiting until each has
     * emitted its first 'message' (an 'error' from either rejects init).
     * Emits 'ready' when done.
     */
    override async init() {
        await this.dependencyReady();
        await super.init();

        if (isMainThread) {
            this.setMaxWorkersByCpu();
            await Promise.all(
                _.range(0, 2).map(
                    (_n) =>
                        new Promise<void>(
                            (resolve, reject) => {
                                this.createWorker()
                                    .once('message', resolve)
                                    .once('error', reject);
                            }
                        )
                )
            );
        }

        this.emit('ready');
    }
}
// Resolve the registry from the tsyringe container at module load and expose it as the default export.
const instance = container.resolve(ThreadedServiceRegistry);
export default instance;
// Re-export the decorators produced by this registry instance so other modules can import them by name.
export const { Method, Param, Ctx, RPCReflect, Threaded } = instance.decorators();

1
src/shared Symbolic link
View File

@ -0,0 +1 @@
../thinapps-shared/backend

139
src/stand-alone/crawl.ts Normal file
View File

@ -0,0 +1,139 @@
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { KoaServer } from 'civkit/civ-rpc/koa';
import http2 from 'http2';
import { CrawlerHost } from '../api/crawler';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { Context, Next } from 'koa';
import { RPCRegistry } from '../services/registry';
import { AsyncResource } from 'async_hooks';
import { runOnce } from 'civkit/decorators';
import { randomUUID } from 'crypto';
import { ThreadedServiceRegistry } from '../services/threaded';
import globalLogger, { GlobalLogger } from '../services/logger';
import { AsyncLocalContext } from '../services/async-context';
// Process-level safety nets, installed when this module is loaded.
// Unhandled promise rejections are only logged as warnings; nothing else is done here.
process.on('unhandledRejection', (err) => {
globalLogger.warn('Unhandled rejection', err);
});
// Uncaught exceptions are logged, then the process is terminated with exit code 1.
process.on('uncaughtException', (err) => {
globalLogger.error('Uncaught exception', err);
// Original rationale: the Firebase runtime did not appear to handle errors properly,
// so the process is explicitly made to quit.
// NOTE(review): this code base is no longer a Firebase application — confirm the remark still applies.
globalLogger.error('Uncaught exception, process quit.');
// Exit is scheduled on the next tick instead of being called synchronously.
process.nextTick(() => process.exit(1));
});
/**
 * Stand-alone Koa server for the crawler API.
 *
 * Besides the RPC routes provided by the registry, it serves the files found
 * under the `public` directory and can be switched to an HTTP/2 cleartext (h2c)
 * server, keeping the previous server available on the next port.
 */
@singleton()
export class CrawlStandAloneServer extends KoaServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** The server that was in place before `h2c()` swapped in the HTTP/2 one. */
    httpAlternativeServer?: typeof this['httpServer'];

    /** Static assets keyed by their path relative to the `public` directory. */
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: GlobalLogger,
        protected registry: RPCRegistry,
        protected crawlerHost: CrawlerHost,
        protected threadLocal: AsyncLocalContext,
        protected threads: ThreadedServiceRegistry,
    ) {
        super(...arguments);
    }

    /**
     * Replaces `httpServer` with an HTTP/2 server (created without TLS options)
     * that dispatches to the Koa app. The previous server is kept as
     * `httpAlternativeServer`.
     *
     * @returns this server, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const handleRequest = this.koaApp.callback();

        this.httpServer = http2.createServer((req, res) => {
            // Run every request inside its own AsyncResource scope.
            new AsyncResource('HTTP2ServerRequest').runInAsyncScope(handleRequest, this.koaApp, req, res);
        });

        return this;
    }

    /** Indexes the static assets first, then runs the base-class initialisation. */
    override async init() {
        await this.walkForAssets();
        await super.init();
    }

    /** Walks the `public` directory (two levels above this file) and records every regular file. */
    async walkForAssets() {
        const publicDir = path.resolve(__dirname, '..', '..', 'public');
        const entries = await FsWalk.walkOut(publicDir);

        for (const entry of entries) {
            if (entry.type === 'file') {
                this.assets.set(entry.relativePath.toString(), entry);
            }
        }
    }

    /**
     * Starts listening on `port`. When an alternative server exists (after `h2c()`),
     * it is started on `port + 1` as well.
     *
     * @param port port for the primary server.
     * @returns whatever the base-class `listen` returns.
     */
    override listen(port: number) {
        const result = super.listen(port);

        if (!this.httpAlternativeServer) {
            return result;
        }

        const altPort = port + 1;
        this.httpAlternativeServer.listen(altPort, () => {
            this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
        });

        return result;
    }

    /**
     * Builds a Koa middleware that answers requests whose path (minus the leading
     * slash) matches an indexed asset; everything else is passed to `next()`.
     */
    makeAssetsServingController() {
        return (ctx: Context, next: Next) => {
            const assetKey = ctx.path.slice(1);
            // An empty key (request for "/") is never looked up.
            const asset = assetKey ? this.assets.get(assetKey) : undefined;
            if (asset?.type !== 'file') {
                return next();
            }

            ctx.body = fs.createReadStream(asset.path);
            ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream';
            // Size comes from the stats captured when the assets were indexed.
            ctx.set('Content-Length', asset.stats.size.toString());

            return;
        };
    }

    /** Mounts the static asset middleware ahead of the RPC shim controller. */
    registerRoutes(): void {
        const serveAssets = this.makeAssetsServingController();
        this.koaApp.use(serveAssets);

        const rpcShim = this.registry.makeShimController();
        this.koaApp.use(rpcShim);
    }

    // With an h2c server, several requests may share one connection and one x-cloud-trace-context.
    // The trace id is expected to be request-bound and unique, so it is generated per request
    // and the Google trace id is kept as a separate field.
    @runOnce()
    override insertAsyncHookMiddleware() {
        const asyncHookMiddleware = async (ctx: Context, next: () => Promise<void>) => {
            // Keep only the part of the header before the first '/'.
            const [googleTraceId] = ctx.get('x-cloud-trace-context').split('/');

            this.threadLocal.setup({
                traceId: randomUUID(),
                traceT0: new Date(),
                googleTraceId,
            });

            return next();
        };

        this.koaApp.use(asyncHookMiddleware);
    }
}
// Resolve the server from the tsyringe container at module load and expose it as the default export.
const instance = container.resolve(CrawlStandAloneServer);
export default instance;
// Once the service is ready, switch to the h2c server and listen on $PORT;
// falls back to 3000 when PORT is unset, not a number, or 0.
instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000));

148
src/stand-alone/search.ts Normal file
View File

@ -0,0 +1,148 @@
import 'reflect-metadata';
import { container, singleton } from 'tsyringe';
import { KoaServer } from 'civkit/civ-rpc/koa';
import http2 from 'http2';
import { SearcherHost } from '../api/searcher-serper';
import { FsWalk, WalkOutEntity } from 'civkit/fswalk';
import path from 'path';
import fs from 'fs';
import { mimeOfExt } from 'civkit/mime';
import { Context, Next } from 'koa';
import { RPCRegistry } from '../services/registry';
import { AsyncResource } from 'async_hooks';
import { runOnce } from 'civkit/decorators';
import { randomUUID } from 'crypto';
import { ThreadedServiceRegistry } from '../services/threaded';
import globalLogger, { GlobalLogger } from '../services/logger';
import { AsyncLocalContext } from '../services/async-context';
// Process-level safety nets, installed when this module is loaded.
// Unhandled promise rejections are only logged as warnings; nothing else is done here.
process.on('unhandledRejection', (err) => {
globalLogger.warn('Unhandled rejection', err);
});
// Uncaught exceptions are logged, then the process is terminated with exit code 1.
process.on('uncaughtException', (err) => {
globalLogger.error('Uncaught exception', err);
// Original rationale: the Firebase runtime did not appear to handle errors properly,
// so the process is explicitly made to quit.
// NOTE(review): this code base is no longer a Firebase application — confirm the remark still applies.
globalLogger.error('Uncaught exception, process quit.');
// Exit is scheduled on the next tick instead of being called synchronously.
process.nextTick(() => process.exit(1));
});
/**
 * Stand-alone Koa server for the search API.
 *
 * Besides the RPC routes provided by the registry (with every entry tagged
 * 'crawl' removed), it serves the files found under the `public` directory and
 * can be switched to an HTTP/2 cleartext (h2c) server, keeping the previous
 * server available on the next port.
 */
@singleton()
export class SearchStandAloneServer extends KoaServer {
    logger = this.globalLogger.child({ service: this.constructor.name });

    /** The server that was in place before `h2c()` swapped in the HTTP/2 one. */
    httpAlternativeServer?: typeof this['httpServer'];

    /** Static assets keyed by their path relative to the `public` directory. */
    assets = new Map<string, WalkOutEntity>();

    constructor(
        protected globalLogger: GlobalLogger,
        protected registry: RPCRegistry,
        protected searcherHost: SearcherHost,
        protected threadLocal: AsyncLocalContext,
        protected threads: ThreadedServiceRegistry,
    ) {
        super(...arguments);
    }

    /**
     * Replaces `httpServer` with an HTTP/2 server (created without TLS options)
     * that dispatches to the Koa app. The previous server is kept as
     * `httpAlternativeServer`.
     *
     * @returns this server, for chaining.
     */
    h2c() {
        this.httpAlternativeServer = this.httpServer;
        const handleRequest = this.koaApp.callback();

        this.httpServer = http2.createServer((req, res) => {
            // Run every request inside its own AsyncResource scope.
            new AsyncResource('HTTP2ServerRequest').runInAsyncScope(handleRequest, this.koaApp, req, res);
        });

        return this;
    }

    /**
     * Indexes the static assets, waits for dependencies, drops every registry
     * entry tagged 'crawl', and finally runs the base-class initialisation.
     */
    override async init() {
        await this.walkForAssets();
        await this.dependencyReady();

        // Entries are removed before super.init() so the crawl endpoints do not end up in this server.
        for (const [entryKey, entryConf] of this.registry.conf.entries()) {
            const isCrawlEntry = entryConf.tags?.includes('crawl');
            if (isCrawlEntry) {
                this.registry.conf.delete(entryKey);
            }
        }

        await super.init();
    }

    /** Walks the `public` directory (two levels above this file) and records every regular file. */
    async walkForAssets() {
        const publicDir = path.resolve(__dirname, '..', '..', 'public');
        const entries = await FsWalk.walkOut(publicDir);

        for (const entry of entries) {
            if (entry.type === 'file') {
                this.assets.set(entry.relativePath.toString(), entry);
            }
        }
    }

    /**
     * Starts listening on `port`. When an alternative server exists (after `h2c()`),
     * it is started on `port + 1` as well.
     *
     * @param port port for the primary server.
     * @returns whatever the base-class `listen` returns.
     */
    override listen(port: number) {
        const result = super.listen(port);

        if (!this.httpAlternativeServer) {
            return result;
        }

        const altPort = port + 1;
        this.httpAlternativeServer.listen(altPort, () => {
            this.logger.info(`Alternative ${this.httpAlternativeServer!.constructor.name} listening on port ${altPort}`);
        });

        return result;
    }

    /**
     * Builds a Koa middleware that answers requests whose path (minus the leading
     * slash) matches an indexed asset; everything else is passed to `next()`.
     */
    makeAssetsServingController() {
        return (ctx: Context, next: Next) => {
            const assetKey = ctx.path.slice(1);
            // An empty key (request for "/") is never looked up.
            const asset = assetKey ? this.assets.get(assetKey) : undefined;
            if (asset?.type !== 'file') {
                return next();
            }

            ctx.body = fs.createReadStream(asset.path);
            ctx.type = mimeOfExt(path.extname(asset.path.toString())) || 'application/octet-stream';
            // Size comes from the stats captured when the assets were indexed.
            ctx.set('Content-Length', asset.stats.size.toString());

            return;
        };
    }

    /** Mounts the static asset middleware ahead of the RPC shim controller. */
    registerRoutes(): void {
        const serveAssets = this.makeAssetsServingController();
        this.koaApp.use(serveAssets);

        const rpcShim = this.registry.makeShimController();
        this.koaApp.use(rpcShim);
    }

    // With an h2c server, several requests may share one connection and one x-cloud-trace-context.
    // The trace id is expected to be request-bound and unique, so it is generated per request
    // and the Google trace id is kept as a separate field.
    @runOnce()
    override insertAsyncHookMiddleware() {
        const asyncHookMiddleware = async (ctx: Context, next: () => Promise<void>) => {
            // Keep only the part of the header before the first '/'.
            const [googleTraceId] = ctx.get('x-cloud-trace-context').split('/');

            this.threadLocal.setup({
                traceId: randomUUID(),
                traceT0: new Date(),
                googleTraceId,
            });

            return next();
        };

        this.koaApp.use(asyncHookMiddleware);
    }
}
// Resolve the server from the tsyringe container at module load and expose it as the default export.
const instance = container.resolve(SearchStandAloneServer);
export default instance;
// Once the service is ready, switch to the h2c server and listen on $PORT;
// falls back to 3000 when PORT is unset, not a number, or 0.
instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000));

@ -1 +1 @@
Subproject commit b80a917835031da9ab7073b6b4005402eece0746
Subproject commit 0c62acf45e4749ecf4bb7f4bfc7ed49533e239cb