fix: cap browser request frequency to avoid being blocked by Google

This commit is contained in:
Yanlong Wang 2024-08-29 09:28:17 +08:00
parent 7e6c2fcf48
commit 9a514cd473
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 47 additions and 7 deletions

View File

@ -753,6 +753,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
event: 'data',
data: formatted,
});
if (chargeAmount && scrapped.pdfs?.length) {
break;
}
}
} catch (err: any) {
this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
@ -781,6 +784,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if (crawlerOptions.timeout === undefined) {
return formatted;
}
if (chargeAmount && scrapped.pdfs?.length) {
return formatted;
}
}
if (!lastScrapped) {

View File

@ -1,7 +1,7 @@
import os from 'os';
import fs from 'fs';
import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred } from 'civkit';
import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
@ -208,6 +208,7 @@ export class PuppeteerControl extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name });
private __healthCheckInterval?: NodeJS.Timeout;
private __reqCapInterval?: NodeJS.Timeout;
__loadedPage: Page[] = [];
@ -216,6 +217,10 @@ export class PuppeteerControl extends AsyncService {
livePages = new Set<Page>();
lastPageCratedAt: number = 0;
rpsCap: number = 300;
lastReqSentAt: number = 0;
requestDeferredQueue: Deferred<boolean>[] = [];
circuitBreakerHosts: Set<string> = new Set();
constructor(
@ -239,6 +244,10 @@ export class PuppeteerControl extends AsyncService {
clearInterval(this.__healthCheckInterval);
this.__healthCheckInterval = undefined;
}
if (this.__reqCapInterval) {
clearInterval(this.__reqCapInterval);
this.__reqCapInterval = undefined;
}
await this.dependencyReady();
if (this.browser) {
@ -267,7 +276,7 @@ export class PuppeteerControl extends AsyncService {
this.emit('ready');
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
this.newPage().then((r) => this.__loadedPage.push(r));
}
@ -301,6 +310,21 @@ export class PuppeteerControl extends AsyncService {
this.logger.warn(`Browser killed`);
}
/**
 * Releases queued browser requests at a rate bounded by `rpsCap`.
 *
 * Each entry in `requestDeferredQueue` is a pending deferred; resolving it
 * with `true` lets the waiting request proceed. The number of entries
 * released per call is proportional to the wall-clock time elapsed since
 * `lastReqSentAt`.
 *
 * While entries remain queued, a repeating timer keeps draining them; the
 * timer is cleared as soon as the queue is empty.
 */
reqCapRoutine() {
    const now = Date.now();
    const numToPass = Math.round((now - this.lastReqSentAt) / 1000 * this.rpsCap);

    if (numToPass > 0) {
        this.requestDeferredQueue.splice(0, numToPass).forEach((x) => x.resolve(true));
    }

    // Only restart the measuring window when budget was actually granted.
    // Resetting it on every call would throw away the partially accrued
    // budget whenever calls arrive closer together than one request slot,
    // which could starve the queue under a steady stream of invocations.
    // A backwards wall-clock jump also restarts the window so that the
    // queue is not stalled until the clock catches up again.
    if (numToPass > 0 || now < this.lastReqSentAt) {
        this.lastReqSentAt = now;
    }

    if (!this.requestDeferredQueue.length) {
        // Nothing left to release: stop the periodic drain.
        if (this.__reqCapInterval) {
            clearInterval(this.__reqCapInterval);
            this.__reqCapInterval = undefined;
        }
    } else if (!this.__reqCapInterval) {
        // Work remains: drain roughly one slot per tick. The timer is
        // unref'd so it never keeps the process alive on its own.
        this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), 1000 / this.rpsCap).unref();
    }
}
async newPage() {
await this.serviceReady();
const dedicatedContext = await this.browser.createBrowserContext();
@ -330,7 +354,7 @@ export class PuppeteerControl extends AsyncService {
let t0: number | undefined;
let halt = false;
page.on('request', (req) => {
page.on('request', async (req) => {
reqCounter++;
if (halt) {
return req.abort('blockedbyclient', 1000);
@ -379,6 +403,15 @@ export class PuppeteerControl extends AsyncService {
return req.abort('blockedbyclient', 1000);
}
const d = Defer();
this.requestDeferredQueue.push(d);
process.nextTick(() => this.reqCapRoutine());
await d.promise;
if (req.isInterceptResolutionHandled()) {
return;
};
const continueArgs = req.continueRequestOverrides
? [req.continueRequestOverrides(), 0] as const
: [];
@ -483,16 +516,16 @@ document.addEventListener('load', handlePageLoad);
await page.evaluateOnNewDocument(() => {
Object.defineProperty(navigator, "language", {
get: function() {
get: function () {
return options?.locale;
}
});
Object.defineProperty(navigator, "languages", {
get: function() {
get: function () {
return [options?.locale];
}
});
})
});
}
if (options?.proxyUrl) {

@ -1 +1 @@
Subproject commit fe71cc2433f60ada86622f1670a752da40806e4d
Subproject commit fb511e6e7af482577ef321b99ccacac51b99df5b