mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 05:25:55 +08:00
feat: control concurrent request per page instead of server bucket
This commit is contained in:
parent
c064fcf77e
commit
3b3a0265df
@ -1,21 +1,26 @@
|
|||||||
|
import _ from 'lodash';
|
||||||
|
import { isIP } from 'net';
|
||||||
|
import { readFile } from 'fs/promises';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick, ParamValidationError, FancyFile } from 'civkit';
|
|
||||||
import { GlobalLogger } from './logger';
|
|
||||||
|
|
||||||
import type { Browser, CookieParam, GoToOptions, HTTPResponse, Page, Viewport } from 'puppeteer';
|
import type { Browser, CookieParam, GoToOptions, HTTPRequest, HTTPResponse, Page, Viewport } from 'puppeteer';
|
||||||
import type { Cookie } from 'set-cookie-parser';
|
import type { Cookie } from 'set-cookie-parser';
|
||||||
import puppeteer from 'puppeteer-extra';
|
import puppeteer from 'puppeteer-extra';
|
||||||
|
import { TimeoutError } from 'puppeteer';
|
||||||
|
|
||||||
|
import { Defer, Deferred } from 'civkit/defer';
|
||||||
|
import { AssertionFailureError, ParamValidationError } from 'civkit/civ-rpc';
|
||||||
|
import { AsyncService } from 'civkit/async-service';
|
||||||
|
import { FancyFile } from 'civkit/fancy-file';
|
||||||
|
import { delay } from 'civkit/timeout';
|
||||||
|
|
||||||
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
|
||||||
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
|
import { SecurityCompromiseError, ServiceCrashedError, ServiceNodeResourceDrainError } from '../shared/lib/errors';
|
||||||
import { TimeoutError } from 'puppeteer';
|
|
||||||
import _ from 'lodash';
|
|
||||||
import { isIP } from 'net';
|
|
||||||
import { CurlControl } from './curl';
|
import { CurlControl } from './curl';
|
||||||
import { readFile } from 'fs/promises';
|
|
||||||
import { BlackHoleDetector } from './blackhole-detector';
|
import { BlackHoleDetector } from './blackhole-detector';
|
||||||
import { AsyncLocalContext } from './async-context';
|
import { AsyncLocalContext } from './async-context';
|
||||||
|
import { GlobalLogger } from './logger';
|
||||||
const tldExtract = require('tld-extract');
|
const tldExtract = require('tld-extract');
|
||||||
|
|
||||||
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
|
||||||
@ -440,6 +445,39 @@ window.briefImgs = briefImgs;
|
|||||||
})();
|
})();
|
||||||
`;
|
`;
|
||||||
|
|
||||||
|
class PageReqCtrlKit {
|
||||||
|
reqSet: Set<HTTPRequest> = new Set();
|
||||||
|
blockers: Deferred<void>[] = [];
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
public concurrency: number,
|
||||||
|
) {
|
||||||
|
if (isNaN(concurrency) || concurrency < 1) {
|
||||||
|
throw new AssertionFailureError(`Invalid concurrency: ${concurrency}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
onNewRequest(req: HTTPRequest) {
|
||||||
|
this.reqSet.add(req);
|
||||||
|
if (this.reqSet.size <= this.concurrency) {
|
||||||
|
return Promise.resolve();
|
||||||
|
}
|
||||||
|
const deferred = Defer();
|
||||||
|
this.blockers.push(deferred);
|
||||||
|
|
||||||
|
return deferred.promise;
|
||||||
|
}
|
||||||
|
|
||||||
|
onFinishRequest(req: HTTPRequest) {
|
||||||
|
this.reqSet.delete(req);
|
||||||
|
if (this.reqSet.size > this.concurrency) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const deferred = this.blockers.shift();
|
||||||
|
deferred?.resolve();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
export class PuppeteerControl extends AsyncService {
|
export class PuppeteerControl extends AsyncService {
|
||||||
|
|
||||||
@ -447,8 +485,6 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
browser!: Browser;
|
browser!: Browser;
|
||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
private __reqCapInterval?: NodeJS.Timeout;
|
|
||||||
|
|
||||||
__loadedPage: Page[] = [];
|
__loadedPage: Page[] = [];
|
||||||
|
|
||||||
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
|
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
|
||||||
@ -458,9 +494,10 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
ua: string = '';
|
ua: string = '';
|
||||||
|
|
||||||
rpsCap: number = 500;
|
concurrentRequestsPerPage: number = 16;
|
||||||
|
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
||||||
|
|
||||||
lastReqSentAt: number = 0;
|
lastReqSentAt: number = 0;
|
||||||
requestDeferredQueue: Deferred<boolean>[] = [];
|
|
||||||
|
|
||||||
circuitBreakerHosts: Set<string> = new Set();
|
circuitBreakerHosts: Set<string> = new Set();
|
||||||
|
|
||||||
@ -490,10 +527,6 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
override async init() {
|
override async init() {
|
||||||
if (this.__reqCapInterval) {
|
|
||||||
clearInterval(this.__reqCapInterval);
|
|
||||||
this.__reqCapInterval = undefined;
|
|
||||||
}
|
|
||||||
await this.dependencyReady();
|
await this.dependencyReady();
|
||||||
if (process.env.NODE_ENV?.includes('dry-run')) {
|
if (process.env.NODE_ENV?.includes('dry-run')) {
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
@ -536,22 +569,14 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
@perNextTick()
|
protected getRpsControlKit(page: Page) {
|
||||||
reqCapRoutine() {
|
let kit = this.pageReqCtrl.get(page);
|
||||||
const now = Date.now();
|
if (!kit) {
|
||||||
const numToPass = Math.round((now - this.lastReqSentAt) / 1000 * this.rpsCap);
|
kit = new PageReqCtrlKit(this.concurrentRequestsPerPage);
|
||||||
this.requestDeferredQueue.splice(0, numToPass).forEach((x) => x.resolve(true));
|
this.pageReqCtrl.set(page, kit);
|
||||||
if (numToPass) {
|
|
||||||
this.lastReqSentAt = now;
|
|
||||||
}
|
|
||||||
if (!this.requestDeferredQueue.length) {
|
|
||||||
if (this.__reqCapInterval) {
|
|
||||||
clearInterval(this.__reqCapInterval);
|
|
||||||
this.__reqCapInterval = undefined;
|
|
||||||
}
|
|
||||||
} else if (!this.__reqCapInterval) {
|
|
||||||
this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), 1000 / this.rpsCap).unref();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return kit;
|
||||||
}
|
}
|
||||||
|
|
||||||
async newPage(bewareDeadLock: any = false) {
|
async newPage(bewareDeadLock: any = false) {
|
||||||
@ -564,7 +589,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
const dedicatedContext = await this.browser.createBrowserContext();
|
const dedicatedContext = await this.browser.createBrowserContext();
|
||||||
page = await dedicatedContext.newPage();
|
page = await dedicatedContext.newPage();
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.warn(`Failed to create page ${sn}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to create page ${sn}`, { err });
|
||||||
this.browser.process()?.kill('SIGKILL');
|
this.browser.process()?.kill('SIGKILL');
|
||||||
throw new ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`);
|
throw new ServiceNodeResourceDrainError(`This specific worker node failed to open a new page, try again.`);
|
||||||
}
|
}
|
||||||
@ -661,10 +686,8 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (requestUrl.startsWith('http')) {
|
if (requestUrl.startsWith('http')) {
|
||||||
const d = Defer();
|
const kit = this.getRpsControlKit(page);
|
||||||
this.requestDeferredQueue.push(d);
|
await kit.onNewRequest(req);
|
||||||
this.reqCapRoutine();
|
|
||||||
await d.promise;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (req.isInterceptResolutionHandled()) {
|
if (req.isInterceptResolutionHandled()) {
|
||||||
@ -677,6 +700,13 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
|
|
||||||
return req.continue(continueArgs[0], continueArgs[1]);
|
return req.continue(continueArgs[0], continueArgs[1]);
|
||||||
});
|
});
|
||||||
|
const reqFinishHandler = (req: HTTPRequest) => {
|
||||||
|
const kit = this.getRpsControlKit(page);
|
||||||
|
kit.onFinishRequest(req);
|
||||||
|
};
|
||||||
|
page.on('requestfinished', reqFinishHandler);
|
||||||
|
page.on('requestfailed', reqFinishHandler);
|
||||||
|
page.on('requestservedfromcache', reqFinishHandler);
|
||||||
|
|
||||||
await page.evaluateOnNewDocument(`
|
await page.evaluateOnNewDocument(`
|
||||||
(function () {
|
(function () {
|
||||||
@ -721,7 +751,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
this.newPage()
|
this.newPage()
|
||||||
.then((r) => this.__loadedPage.push(r))
|
.then((r) => this.__loadedPage.push(r))
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
this.logger.warn(`Failed to load new page ahead of time`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to load new page ahead of time`, { err });
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -761,7 +791,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
})(),
|
})(),
|
||||||
delay(5000)
|
delay(5000)
|
||||||
]).catch((err) => {
|
]).catch((err) => {
|
||||||
this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
|
this.logger.error(`Failed to destroy page ${sn}`, { err });
|
||||||
});
|
});
|
||||||
this.livePages.delete(page);
|
this.livePages.delete(page);
|
||||||
this.pagePhase.delete(page);
|
this.pagePhase.delete(page);
|
||||||
@ -997,7 +1027,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
try {
|
try {
|
||||||
await page.setCookie(...mapped);
|
await page.setCookie(...mapped);
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.warn(`Page ${sn}: Failed to set cookies`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Failed to set cookies`, { err });
|
||||||
throw new ParamValidationError({
|
throw new ParamValidationError({
|
||||||
path: 'cookies',
|
path: 'cookies',
|
||||||
message: `Failed to set cookies: ${err?.message}`
|
message: `Failed to set cookies: ${err?.message}`
|
||||||
@ -1062,7 +1092,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
const gotoPromise = page.goto(url, goToOptions)
|
const gotoPromise = page.goto(url, goToOptions)
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
if (err instanceof TimeoutError) {
|
if (err instanceof TimeoutError) {
|
||||||
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err });
|
||||||
return new AssertionFailureError({
|
return new AssertionFailureError({
|
||||||
message: `Failed to goto ${url}: ${err}`,
|
message: `Failed to goto ${url}: ${err}`,
|
||||||
cause: err,
|
cause: err,
|
||||||
@ -1075,7 +1105,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Browsing of ${url} failed`, { err });
|
||||||
return new AssertionFailureError({
|
return new AssertionFailureError({
|
||||||
message: `Failed to goto ${url}: ${err}`,
|
message: `Failed to goto ${url}: ${err}`,
|
||||||
cause: err,
|
cause: err,
|
||||||
@ -1126,7 +1156,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
// }
|
// }
|
||||||
// }
|
// }
|
||||||
// } catch (err: any) {
|
// } catch (err: any) {
|
||||||
// this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) });
|
// this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err });
|
||||||
// }
|
// }
|
||||||
|
|
||||||
finalized = true;
|
finalized = true;
|
||||||
@ -1166,7 +1196,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
finalized = true;
|
finalized = true;
|
||||||
})
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err });
|
||||||
waitForPromise = undefined;
|
waitForPromise = undefined;
|
||||||
});
|
});
|
||||||
return p as any;
|
return p as any;
|
||||||
@ -1243,7 +1273,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
// }
|
// }
|
||||||
|
|
||||||
// await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => {
|
// await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => {
|
||||||
// this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
|
// this.logger.warn(`Page salvation did not fully succeed.`, { err });
|
||||||
// });
|
// });
|
||||||
|
|
||||||
// this.logger.info(`Salvation completed.`);
|
// this.logger.info(`Salvation completed.`);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user