feat: control concurrent request per page instead of server bucket

This commit is contained in:
yanlong.wang 2025-03-10 16:45:56 +08:00
parent c064fcf77e
commit 3b3a0265df
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@ -1,21 +1,26 @@
import _ from 'lodash';
import { isIP } from 'net';
import { readFile } from 'fs/promises';
import fs from 'fs'; import fs from 'fs';
import { container, singleton } from 'tsyringe'; import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit';
import { GlobalLogger } from './logger';
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page, Viewport } from 'puppeteer'; import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
import type { Cookie } from 'set-cookie-parser'; import type { Cookie } from 'set-cookie-parser';
import puppeteer from 'puppeteer-extra'; import puppeteer from 'puppeteer-extra';
import { TimeoutError } from 'puppeteer';
import { Defer, Deferred } from 'civkit/defer';
import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
import { AsyncService } from 'civkit/async-service';
import { FancyFile } from 'civkit/fancy-file';
import { delay } from 'civkit/timeout';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors'; import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
import { TimeoutError } from 'puppeteer';
import _ from 'lodash';
import { isIP } from 'net';
import { CurlControl } from './curl'; import { CurlControl } from './curl';
import { readFile } from 'fs/promises';
import { BlackHoleDetector } from './blackhole-detector'; import { BlackHoleDetector } from './blackhole-detector';
import { AsyncLocalContext } from './async-context'; import { AsyncLocalContext } from './async-context';
import { GlobalLogger } from './logger';
const tldExtract = require('tld-extract'); const tldExtract = require('tld-extract');
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
@ -440,6 +445,39 @@ window.briefImgs = briefImgs;
})(); })();
`; `;
class PageReqCtrlKit {
reqSet: Set<HTTPRequest> = new Set();
blockers: Deferred<void>[] = [];
constructor(
public concurrency: number,
) {
if (isNaN(concurrency) || concurrency < 1) {
throw new AssertionFailureError(`Invalid concurrency: ${concurrency}`);
}
}
onNewRequest(req: HTTPRequest) {
this.reqSet.add(req);
if (this.reqSet.size <= this.concurrency) {
return Promise.resolve();
}
const deferred = Defer();
this.blockers.push(deferred);
return deferred.promise;
}
onFinishRequest(req: HTTPRequest) {
this.reqSet.delete(req);
if (this.reqSet.size > this.concurrency) {
return;
}
const deferred = this.blockers.shift();
deferred?.resolve();
}
}
@singleton() @singleton()
export class PuppeteerControl extends AsyncService { export class PuppeteerControl extends AsyncService {
@ -447,8 +485,6 @@ export class PuppeteerControl extends AsyncService {
browser!: Browser; browser!: Browser;
logger = this.globalLogger.child({ service: this.constructor.name }); logger = this.globalLogger.child({ service: this.constructor.name });
private __reqCapInterval?: NodeJS.Timeout;
__loadedPage: Page[] = []; __loadedPage: Page[] = [];
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>(); finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
@ -458,9 +494,10 @@ export class PuppeteerControl extends AsyncService {
lastPageCratedAt: number = 0; lastPageCratedAt: number = 0;
ua: string = ''; ua: string = '';
rpsCap: number = 500; concurrentRequestsPerPage: number = 16;
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
lastReqSentAt: number = 0; lastReqSentAt: number = 0;
requestDeferredQueue: Deferred<boolean>[] = [];
circuitBreakerHosts: Set<string> = new Set(); circuitBreakerHosts: Set<string> = new Set();
@ -490,10 +527,6 @@ export class PuppeteerControl extends AsyncService {
} }
override async init() { override async init() {
if (this.__reqCapInterval) {
clearInterval(this.__reqCapInterval);
this.__reqCapInterval = undefined;
}
await this.dependencyReady(); await this.dependencyReady();
if (process.env.NODE_ENV?.includes('dry-run')) { if (process.env.NODE_ENV?.includes('dry-run')) {
this.emit('ready'); this.emit('ready');
@ -536,22 +569,14 @@ export class PuppeteerControl extends AsyncService {
this.emit('ready'); this.emit('ready');
} }
@perNextTick() protected getRpsControlKit(page: Page) {
reqCapRoutine() { let kit = this.pageReqCtrl.get(page);
const now = Date.now(); if (!kit) {
const numToPass = Math.round((now - this.lastReqSentAt) / 1000 * this.rpsCap); kit = new PageReqCtrlKit(this.concurrentRequestsPerPage);
this.requestDeferredQueue.splice(0, numToPass).forEach((x) => x.resolve(true)); this.pageReqCtrl.set(page, kit);
if (numToPass) {
this.lastReqSentAt = now;
}
if (!this.requestDeferredQueue.length) {
if (this.__reqCapInterval) {
clearInterval(this.__reqCapInterval);
this.__reqCapInterval = undefined;
}
} else if (!this.__reqCapInterval) {
this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), 1000 / this.rpsCap).unref();
} }
return kit;
} }
async newPage(bewareDeadLock: any = false) { async newPage(bewareDeadLock: any = false) {
@ -564,7 +589,7 @@ export class PuppeteerControl extends AsyncService {
const dedicatedContext = await this.browser.createBrowserContext(); const dedicatedContext = await this.browser.createBrowserContext();
page = await dedicatedContext.newPage(); page = await dedicatedContext.newPage();
} catch (err: any) { } catch (err: any) {
this.logger.warn(`Failed to create page ${sn}`, { err: marshalErrorLike(err) }); this.logger.warn(`Failed to create page ${sn}`, { err });
this.browser.process()?.kill('SIGKILL'); this.browser.process()?.kill('SIGKILL');
throw new ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`); throw new ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`);
} }
@ -661,10 +686,8 @@ export class PuppeteerControl extends AsyncService {
} }
if (requestUrl.startsWith('http')) { if (requestUrl.startsWith('http')) {
const d = Defer(); const kit = this.getRpsControlKit(page);
this.requestDeferredQueue.push(d); await kit.onNewRequest(req);
this.reqCapRoutine();
await d.promise;
} }
if (req.isInterceptResolutionHandled()) { if (req.isInterceptResolutionHandled()) {
@ -677,6 +700,13 @@ export class PuppeteerControl extends AsyncService {
return req.continue(continueArgs[0], continueArgs[1]); return req.continue(continueArgs[0], continueArgs[1]);
}); });
const reqFinishHandler = (req: HTTPRequest) => {
const kit = this.getRpsControlKit(page);
kit.onFinishRequest(req);
};
page.on('requestfinished', reqFinishHandler);
page.on('requestfailed', reqFinishHandler);
page.on('requestservedfromcache', reqFinishHandler);
await page.evaluateOnNewDocument(` await page.evaluateOnNewDocument(`
(function () { (function () {
@ -721,7 +751,7 @@ export class PuppeteerControl extends AsyncService {
this.newPage() this.newPage()
.then((r) => this.__loadedPage.push(r)) .then((r) => this.__loadedPage.push(r))
.catch((err) => { .catch((err) => {
this.logger.warn(`Failed to load new page ahead of time`, { err: marshalErrorLike(err) }); this.logger.warn(`Failed to load new page ahead of time`, { err });
}); });
} }
} }
@ -761,7 +791,7 @@ export class PuppeteerControl extends AsyncService {
})(), })(),
delay(5000) delay(5000)
]).catch((err) => { ]).catch((err) => {
this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) }); this.logger.error(`Failed to destroy page ${sn}`, { err });
}); });
this.livePages.delete(page); this.livePages.delete(page);
this.pagePhase.delete(page); this.pagePhase.delete(page);
@ -997,7 +1027,7 @@ export class PuppeteerControl extends AsyncService {
try { try {
await page.setCookie(...mapped); await page.setCookie(...mapped);
} catch (err: any) { } catch (err: any) {
this.logger.warn(`Page ${sn}: Failed to set cookies`, { err: marshalErrorLike(err) }); this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
throw new ParamValidationError({ throw new ParamValidationError({
path: 'cookies', path: 'cookies',
message: `Failed to set cookies: ${err?.message}` message: `Failed to set cookies: ${err?.message}`
@ -1062,7 +1092,7 @@ export class PuppeteerControl extends AsyncService {
const gotoPromise = page.goto(url, goToOptions) const gotoPromise = page.goto(url, goToOptions)
.catch((err) => { .catch((err) => {
if (err instanceof TimeoutError) { if (err instanceof TimeoutError) {
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) }); this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
return new AssertionFailureError({ return new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`, message: `Failed to goto ${url}: ${err}`,
cause: err, cause: err,
@ -1075,7 +1105,7 @@ export class PuppeteerControl extends AsyncService {
} }
} }
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) }); this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
return new AssertionFailureError({ return new AssertionFailureError({
message: `Failed to goto ${url}: ${err}`, message: `Failed to goto ${url}: ${err}`,
cause: err, cause: err,
@ -1126,7 +1156,7 @@ export class PuppeteerControl extends AsyncService {
// } // }
// } // }
// } catch (err: any) { // } catch (err: any) {
// this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) }); // this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err });
// } // }
finalized = true; finalized = true;
@ -1166,7 +1196,7 @@ export class PuppeteerControl extends AsyncService {
finalized = true; finalized = true;
}) })
.catch((err) => { .catch((err) => {
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) }); this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
waitForPromise = undefined; waitForPromise = undefined;
}); });
return p as any; return p as any;
@ -1243,7 +1273,7 @@ export class PuppeteerControl extends AsyncService {
// } // }
// await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => { // await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => {
// this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) }); // this.logger.warn(`Page salvation did not fully succeed.`, { err });
// }); // });
// this.logger.info(`Salvation completed.`); // this.logger.info(`Salvation completed.`);