fix: cap browser request frequency to avoid being blocked by Google

This commit is contained in:
Yanlong Wang 2024-08-29 09:28:17 +08:00
parent 7e6c2fcf48
commit 9a514cd473
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 47 additions and 7 deletions

View File

@ -753,6 +753,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
event: 'data', event: 'data',
data: formatted, data: formatted,
}); });
if (chargeAmount && scrapped.pdfs?.length) {
break;
}
} }
} catch (err: any) { } catch (err: any) {
this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) }); this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
@ -781,6 +784,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
if (crawlerOptions.timeout === undefined) { if (crawlerOptions.timeout === undefined) {
return formatted; return formatted;
} }
if (chargeAmount && scrapped.pdfs?.length) {
return formatted;
}
} }
if (!lastScrapped) { if (!lastScrapped) {

View File

@ -1,7 +1,7 @@
import os from 'os'; import os from 'os';
import fs from 'fs'; import fs from 'fs';
import { container, singleton } from 'tsyringe'; import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred } from 'civkit';
import { Logger } from '../shared/services/logger'; import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer'; import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
@ -208,6 +208,7 @@ export class PuppeteerControl extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name }); logger = this.globalLogger.child({ service: this.constructor.name });
private __healthCheckInterval?: NodeJS.Timeout; private __healthCheckInterval?: NodeJS.Timeout;
private __reqCapInterval?: NodeJS.Timeout;
__loadedPage: Page[] = []; __loadedPage: Page[] = [];
@ -216,6 +217,10 @@ export class PuppeteerControl extends AsyncService {
livePages = new Set<Page>(); livePages = new Set<Page>();
lastPageCratedAt: number = 0; lastPageCratedAt: number = 0;
rpsCap: number = 300;
lastReqSentAt: number = 0;
requestDeferredQueue: Deferred<boolean>[] = [];
circuitBreakerHosts: Set<string> = new Set(); circuitBreakerHosts: Set<string> = new Set();
constructor( constructor(
@ -239,6 +244,10 @@ export class PuppeteerControl extends AsyncService {
clearInterval(this.__healthCheckInterval); clearInterval(this.__healthCheckInterval);
this.__healthCheckInterval = undefined; this.__healthCheckInterval = undefined;
} }
if (this.__reqCapInterval) {
clearInterval(this.__reqCapInterval);
this.__reqCapInterval = undefined;
}
await this.dependencyReady(); await this.dependencyReady();
if (this.browser) { if (this.browser) {
@ -267,7 +276,7 @@ export class PuppeteerControl extends AsyncService {
this.emit('ready'); this.emit('ready');
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000); this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
this.newPage().then((r) => this.__loadedPage.push(r)); this.newPage().then((r) => this.__loadedPage.push(r));
} }
@ -301,6 +310,21 @@ export class PuppeteerControl extends AsyncService {
this.logger.warn(`Browser killed`); this.logger.warn(`Browser killed`);
} }
/**
 * Releases queued browser requests at a rate bounded by `rpsCap` requests per second.
 *
 * Each entry in `requestDeferredQueue` is a deferred that a pending request is
 * awaiting; resolving it with `true` lets that request proceed. The number of
 * entries released per call is derived from the time elapsed since
 * `lastReqSentAt`.
 *
 * While entries remain queued, a self-rescheduling interval keeps calling this
 * routine; once the queue drains the interval is cleared, so no timer is left
 * running while idle.
 */
reqCapRoutine() {
    const now = Date.now();
    const elapsedMs = now - this.lastReqSentAt;
    // Whole requests that the elapsed time pays for. Flooring (instead of
    // rounding) guarantees a request is never released before a full
    // 1000 / rpsCap ms interval has passed.
    const budget = Math.floor(elapsedMs / 1000 * this.rpsCap);

    if (budget > 0) {
        const released = this.requestDeferredQueue.splice(0, budget);
        released.forEach((x) => x.resolve(true));

        if (released.length < budget) {
            // Queue drained before the budget was used up: discard the
            // remainder and restart the clock from now.
            this.lastReqSentAt = now;
        } else {
            // Carry the unused fraction of an interval forward so the
            // effective rate converges on rpsCap rather than below it.
            const leftoverMs = Math.max(0, elapsedMs - budget * 1000 / this.rpsCap);
            this.lastReqSentAt = now - leftoverMs;
        }
    }
    // When budget is 0 the clock is deliberately left untouched. Resetting it
    // on every call would let frequent invocations (one per incoming request)
    // keep the elapsed time near zero forever and starve the queue.

    if (!this.requestDeferredQueue.length) {
        if (this.__reqCapInterval) {
            clearInterval(this.__reqCapInterval);
            this.__reqCapInterval = undefined;
        }
    } else if (!this.__reqCapInterval) {
        // unref() so this timer alone does not keep the process alive.
        this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), 1000 / this.rpsCap).unref();
    }
}
async newPage() { async newPage() {
await this.serviceReady(); await this.serviceReady();
const dedicatedContext = await this.browser.createBrowserContext(); const dedicatedContext = await this.browser.createBrowserContext();
@ -330,7 +354,7 @@ export class PuppeteerControl extends AsyncService {
let t0: number | undefined; let t0: number | undefined;
let halt = false; let halt = false;
page.on('request', (req) => { page.on('request', async (req) => {
reqCounter++; reqCounter++;
if (halt) { if (halt) {
return req.abort('blockedbyclient', 1000); return req.abort('blockedbyclient', 1000);
@ -379,6 +403,15 @@ export class PuppeteerControl extends AsyncService {
return req.abort('blockedbyclient', 1000); return req.abort('blockedbyclient', 1000);
} }
const d = Defer();
this.requestDeferredQueue.push(d);
process.nextTick(() => this.reqCapRoutine());
await d.promise;
if (req.isInterceptResolutionHandled()) {
return;
};
const continueArgs = req.continueRequestOverrides const continueArgs = req.continueRequestOverrides
? [req.continueRequestOverrides(), 0] as const ? [req.continueRequestOverrides(), 0] as const
: []; : [];
@ -492,7 +525,7 @@ document.addEventListener('load', handlePageLoad);
return [options?.locale]; return [options?.locale];
} }
}); });
}) });
} }
if (options?.proxyUrl) { if (options?.proxyUrl) {

@ -1 +1 @@
Subproject commit fe71cc2433f60ada86622f1670a752da40806e4d Subproject commit fb511e6e7af482577ef321b99ccacac51b99df5b