diff --git a/Dockerfile b/Dockerfile index 1c16980..223d807 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,8 +29,11 @@ COPY licensed ./licensed RUN rm -rf ~/.config/chromium && mkdir -p ~/.config/chromium +RUN NODE_COMPILE_CACHE=node_modules npm run dry-run + ENV OVERRIDE_CHROME_EXECUTABLE_PATH=/usr/bin/google-chrome-stable ENV LD_PRELOAD=/usr/local/lib/libcurl-impersonate.so CURL_IMPERSONATE=chrome116 CURL_IMPERSONATE_HEADERS=no +ENV NODE_CACHE_DIR=node_modules ENV PORT=8080 EXPOSE 3000 3001 8080 8081 diff --git a/package-lock.json b/package-lock.json index feb3d0f..f490f0f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -17,7 +17,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-5f839a7", + "civkit": "^0.8.4-f65f570", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", @@ -4095,9 +4095,9 @@ } }, "node_modules/civkit": { - "version": "0.8.4-5f839a7", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-5f839a7.tgz", - "integrity": "sha512-wF9Sm0dKBNGTXtueYtmwqreciilEw2+H3uAZgJNK/B+MoeQecvQ1alrqPqIP/Xf64H1ik6mD0Z47cez8jkayGA==", + "version": "0.8.4-f65f570", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.4-f65f570.tgz", + "integrity": "sha512-s04b5ca/wYaBI1cCfSxNS5t170y+vEHXkc7CqriZA+qLJY3mI5hAiFSeWoOBwt4/0nc+lY4UaHuhmjjpRXYSqw==", "license": "AGPL", "dependencies": { "lodash": "^4.17.21", diff --git a/package.json b/package.json index c5a584a..84f365e 100644 --- a/package.json +++ b/package.json @@ -7,7 +7,8 @@ "build:clean": "rm -rf ./build", "serve": "npm run build && npm run start", "debug": "npm run build && npm run dev", - "start": "npm run shell" + "start": "node ./build/stand-alone/crawl.js", + "dry-run": "NODE_ENV=dry-run node ./build/stand-alone/search.js" }, "engines": { "node": ">=18" @@ -25,7 +26,7 @@ "axios": "^1.3.3", "bcrypt": "^5.1.0", "busboy": "^1.6.0", - "civkit": "^0.8.4-5f839a7", + "civkit": "^0.8.4-f65f570", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 5d41824..d214ce1 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -137,7 +137,9 @@ export class CrawlerHost extends RPCHost { override async init() { await this.dependencyReady(); - this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, '')); + if (this.puppeteerControl.ua) { + this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, '')); + } this.emit('ready'); } diff --git a/src/index.ts b/src/index.ts deleted file mode 100644 index 3c65cd0..0000000 --- a/src/index.ts +++ /dev/null @@ -1,31 +0,0 @@ -import 'reflect-metadata'; -import './shared/lib/doom-domain'; -import { initializeApp } from 'firebase-admin/app'; -initializeApp(); - - -import { loadModulesDynamically, registry } from './shared'; -import path from 'path'; -loadModulesDynamically(path.resolve(__dirname, 'cloud-functions')); -loadModulesDynamically(path.resolve(__dirname, 'shared', 'cloud-functions')); - -Object.assign(exports, registry.exportAll()); -Object.assign(exports, registry.exportGrouped({ - memory: '4GiB', - timeoutSeconds: 540, -})); -registry.allHandsOnDeck().catch(() => void 0); -registry.title = 'reader'; -registry.version = '0.1.0'; - -process.on('unhandledRejection', (_err) => `Somehow is false alarm in firebase`); - -process.on('uncaughtException', (err) => { - console.log('Uncaught exception', err); - - // Looks like Firebase runtime does not handle error properly. - // Make sure to quit the process. - process.nextTick(() => process.exit(1)); - console.error('Uncaught exception, process quit.'); - throw err; -}); diff --git a/src/services/blackhole-detector.ts b/src/services/blackhole-detector.ts index fded4f4..9769720 100644 --- a/src/services/blackhole-detector.ts +++ b/src/services/blackhole-detector.ts @@ -1,6 +1,7 @@ import { singleton } from 'tsyringe'; import { AsyncService } from 'civkit/async-service'; import { GlobalLogger } from './logger'; +import { delay } from 'civkit/timeout'; @singleton() @@ -22,7 +23,7 @@ export class BlackHoleDetector extends AsyncService { if (process.env.NODE_ENV?.startsWith('prod')) { setInterval(() => { this.routine(); - }, 1000 * 15).unref(); + }, 1000 * 30).unref(); } } @@ -32,14 +33,16 @@ export class BlackHoleDetector extends AsyncService { this.emit('ready'); } - routine() { + async routine() { + // We give routine a 3s grace period for potentially paused CPU to spin up and process some requests + await delay(3000); const now = Date.now(); const lastWorked = this.lastWorkedTs; if (!lastWorked) { return; } const dt = (now - lastWorked); - if (this.concurrentRequests > 0 && + if (this.concurrentRequests > 1 && this.lastIncomingRequestTs && lastWorked && this.lastIncomingRequestTs >= lastWorked && (dt > (this.maxDelay * (this.strikes + 1))) diff --git a/src/services/curl.ts b/src/services/curl.ts index 55453c8..320a09f 100644 --- a/src/services/curl.ts +++ b/src/services/curl.ts @@ -14,6 +14,7 @@ import { ZSTDDecompress } from 'simple-zstd'; import _ from 'lodash'; import { Readable } from 'stream'; import { AsyncLocalContext } from './async-context'; +import { BlackHoleDetector } from './blackhole-detector'; export interface CURLScrappingOptions extends ScrappingOptions { method?: string; @@ -36,6 +37,7 @@ export class CurlControl extends AsyncService { protected globalLogger: GlobalLogger, protected tempFileManager: TempFileManager, protected asyncLocalContext: AsyncLocalContext, + protected blackHoleDetector: BlackHoleDetector, ) { super(...arguments); } @@ -349,7 +351,7 @@ export class CurlControl extends AsyncService { async sideLoad(targetUrl: URL, crawlOpts?: CURLScrappingOptions) { const curlResult = await this.urlToFile(targetUrl, crawlOpts); - + this.blackHoleDetector.itWorked(); let finalURL = targetUrl; const sideLoadOpts: CURLScrappingOptions['sideLoad'] = { impersonate: {}, @@ -406,13 +408,13 @@ export class CurlControl extends AsyncService { } // Retryable errors + case CurlCode.CURLE_RECV_ERROR: case CurlCode.CURLE_OPERATION_TIMEDOUT: case CurlCode.CURLE_SSL_CONNECT_ERROR: case CurlCode.CURLE_QUIC_CONNECT_ERROR: case CurlCode.CURLE_COULDNT_RESOLVE_PROXY: case CurlCode.CURLE_COULDNT_CONNECT: - case CurlCode.CURLE_PARTIAL_FILE: - case CurlCode.CURLE_OPERATION_TIMEDOUT: { + case CurlCode.CURLE_PARTIAL_FILE: { return new ServiceBadAttemptError(msg); } diff --git a/src/services/finalizer.ts b/src/services/finalizer.ts index 85bc565..9fe4c41 100644 --- a/src/services/finalizer.ts +++ b/src/services/finalizer.ts @@ -3,12 +3,24 @@ import { container, singleton } from 'tsyringe'; import { isMainThread } from 'worker_threads'; import { GlobalLogger } from './logger'; +const realProcessExit = process.exit; +process.exit = ((code?: number) => { + if (isMainThread) { + return; + } + return realProcessExit(code); +}) as typeof process.exit; + @singleton() export class FinalizerService extends AbstractFinalizerService { container = container; logger = this.globalLogger.child({ service: this.constructor.name }); + override quitProcess(code?: string | number | null | undefined): never { + return realProcessExit(code); + } + constructor(protected globalLogger: GlobalLogger) { super(...arguments); } diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index ed851a5..9c84983 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -1,4 +1,3 @@ -import os from 'os'; import fs from 'fs'; import { container, singleton } from 'tsyringe'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit'; @@ -474,7 +473,7 @@ export class PuppeteerControl extends AsyncService { protected blackHoleDetector: BlackHoleDetector, ) { super(...arguments); - this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95; + this.setMaxListeners(Infinity); let crippledTimes = 0; this.on('crippled', () => { @@ -496,6 +495,10 @@ export class PuppeteerControl extends AsyncService { this.__reqCapInterval = undefined; } await this.dependencyReady(); + if (process.env.NODE_ENV?.includes('dry-run')) { + this.emit('ready'); + return; + } if (this.browser) { if (this.browser.connected) { diff --git a/src/services/temp-file.ts b/src/services/temp-file.ts index 857f06b..74f10ba 100644 --- a/src/services/temp-file.ts +++ b/src/services/temp-file.ts @@ -1,6 +1,7 @@ import { AbstractTempFileManger } from 'civkit/temp'; -import { unlink } from 'fs/promises'; +import { rm } from 'fs/promises'; import { singleton } from 'tsyringe'; +import { Finalizer } from './finalizer'; @singleton() export class TempFileManager extends AbstractTempFileManger { @@ -13,10 +14,10 @@ export class TempFileManager extends AbstractTempFileManger { this.emit('ready'); } + @Finalizer() override async standDown() { await super.standDown(); - await unlink(this.rootDir); - + await rm(this.rootDir, { recursive: true, force: true }); } } diff --git a/src/stand-alone/crawl.ts b/src/stand-alone/crawl.ts index 59e8fa4..e2e5576 100644 --- a/src/stand-alone/crawl.ts +++ b/src/stand-alone/crawl.ts @@ -3,6 +3,7 @@ import { container, singleton } from 'tsyringe'; import { KoaServer } from 'civkit/civ-rpc/koa'; import http2 from 'http2'; +import http from 'http'; import { CrawlerHost } from '../api/crawler'; import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; import path from 'path'; @@ -14,21 +15,9 @@ import { AsyncResource } from 'async_hooks'; import { runOnce } from 'civkit/decorators'; import { randomUUID } from 'crypto'; import { ThreadedServiceRegistry } from '../services/threaded'; -import globalLogger, { GlobalLogger } from '../services/logger'; +import { GlobalLogger } from '../services/logger'; import { AsyncLocalContext } from '../services/async-context'; - -process.on('unhandledRejection', (err) => { - globalLogger.warn('Unhandled rejection', err); -}); - -process.on('uncaughtException', (err) => { - globalLogger.error('Uncaught exception', err); - - // Looks like Firebase runtime does not handle error properly. - // Make sure to quit the process. - globalLogger.error('Uncaught exception, process quit.'); - process.nextTick(() => process.exit(1)); -}); +import finalizer, { Finalizer } from '../services/finalizer'; @singleton() export class CrawlStandAloneServer extends KoaServer { @@ -131,9 +120,32 @@ export class CrawlStandAloneServer extends KoaServer { this.koaApp.use(asyncHookMiddleware); } + @Finalizer() + override async standDown() { + const tasks: Promise[] = []; + if (this.httpAlternativeServer?.listening) { + (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); + this.httpAlternativeServer.close(); + tasks.push(new Promise((resolve, reject) => { + this.httpAlternativeServer!.close((err) => { + if (err) { + return reject(err); + } + resolve(); + }); + })); + } + tasks.push(super.standDown()); + await Promise.all(tasks); + } + } const instance = container.resolve(CrawlStandAloneServer); export default instance; -instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); +if (process.env.NODE_ENV?.includes('dry-run')) { + instance.serviceReady().then(() => finalizer.terminate()); +} else { + instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); +} \ No newline at end of file diff --git a/src/stand-alone/search.ts b/src/stand-alone/search.ts index c3c7cc8..b86840f 100644 --- a/src/stand-alone/search.ts +++ b/src/stand-alone/search.ts @@ -3,6 +3,7 @@ import { container, singleton } from 'tsyringe'; import { KoaServer } from 'civkit/civ-rpc/koa'; import http2 from 'http2'; +import http from 'http'; import { SearcherHost } from '../api/searcher-serper'; import { FsWalk, WalkOutEntity } from 'civkit/fswalk'; import path from 'path'; @@ -14,21 +15,9 @@ import { AsyncResource } from 'async_hooks'; import { runOnce } from 'civkit/decorators'; import { randomUUID } from 'crypto'; import { ThreadedServiceRegistry } from '../services/threaded'; -import globalLogger, { GlobalLogger } from '../services/logger'; +import { GlobalLogger } from '../services/logger'; import { AsyncLocalContext } from '../services/async-context'; - -process.on('unhandledRejection', (err) => { - globalLogger.warn('Unhandled rejection', err); -}); - -process.on('uncaughtException', (err) => { - globalLogger.error('Uncaught exception', err); - - // Looks like Firebase runtime does not handle error properly. - // Make sure to quit the process. - globalLogger.error('Uncaught exception, process quit.'); - process.nextTick(() => process.exit(1)); -}); +import finalizer, { Finalizer } from '../services/finalizer'; @singleton() export class SearchStandAloneServer extends KoaServer { @@ -63,7 +52,7 @@ export class SearchStandAloneServer extends KoaServer { await this.walkForAssets(); await this.dependencyReady(); - for (const [k,v] of this.registry.conf.entries()) { + for (const [k, v] of this.registry.conf.entries()) { if (v.tags?.includes('crawl')) { this.registry.conf.delete(k); } @@ -140,9 +129,32 @@ export class SearchStandAloneServer extends KoaServer { this.koaApp.use(asyncHookMiddleware); } + @Finalizer() + override async standDown() { + const tasks: Promise[] = []; + if (this.httpAlternativeServer?.listening) { + (this.httpAlternativeServer as http.Server).closeIdleConnections?.(); + this.httpAlternativeServer.close(); + tasks.push(new Promise((resolve, reject) => { + this.httpAlternativeServer!.close((err) => { + if (err) { + return reject(err); + } + resolve(); + }); + })); + } + tasks.push(super.standDown()); + await Promise.all(tasks); + } + } const instance = container.resolve(SearchStandAloneServer); export default instance; -instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); +if (process.env.NODE_ENV?.includes('dry-run')) { + instance.serviceReady().then(() => finalizer.terminate()); +} else { + instance.serviceReady().then((s) => s.h2c().listen(parseInt(process.env.PORT || '') || 3000)); +} diff --git a/thinapps-shared b/thinapps-shared index ed90522..f7d65a8 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit ed905226dac8e465b85b957f295eab02d260fc83 +Subproject commit f7d65a8b12fa32d3d6fa46585d73693cba7b14e3