From 9a514cd473d5e61b317a313ba45831a24a992547 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 29 Aug 2024 09:28:17 +0800 Subject: [PATCH] fix: cap browser request freq to avoid block from google --- .../functions/src/cloud-functions/crawler.ts | 7 +++ backend/functions/src/services/puppeteer.ts | 45 ++++++++++++++++--- thinapps-shared | 2 +- 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 2e02b22..2f36263 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -753,6 +753,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; event: 'data', data: formatted, }); + if (chargeAmount && scrapped.pdfs?.length) { + break; + } } } catch (err: any) { this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) }); @@ -781,6 +784,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; if (crawlerOptions.timeout === undefined) { return formatted; } + + if (chargeAmount && scrapped.pdfs?.length) { + return formatted; + } } if (!lastScrapped) { diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 21568d2..a7ae25e 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -1,7 +1,7 @@ import os from 'os'; import fs from 'fs'; import { container, singleton } from 'tsyringe'; -import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; +import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred } from 'civkit'; import { Logger } from '../shared/services/logger'; import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer'; @@ -208,6 +208,7 @@ export class PuppeteerControl extends AsyncService { logger = this.globalLogger.child({ service: this.constructor.name }); private __healthCheckInterval?: NodeJS.Timeout; + private __reqCapInterval?: NodeJS.Timeout; __loadedPage: Page[] = []; @@ -216,6 +217,10 @@ export class PuppeteerControl extends AsyncService { livePages = new Set(); lastPageCratedAt: number = 0; + rpsCap: number = 300; + lastReqSentAt: number = 0; + requestDeferredQueue: Deferred[] = []; + circuitBreakerHosts: Set = new Set(); constructor( @@ -239,6 +244,10 @@ export class PuppeteerControl extends AsyncService { clearInterval(this.__healthCheckInterval); this.__healthCheckInterval = undefined; } + if (this.__reqCapInterval) { + clearInterval(this.__reqCapInterval); + this.__reqCapInterval = undefined; + } await this.dependencyReady(); if (this.browser) { @@ -267,7 +276,7 @@ export class PuppeteerControl extends AsyncService { this.emit('ready'); - this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000); + this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref(); this.newPage().then((r) => this.__loadedPage.push(r)); } @@ -301,6 +310,21 @@ export class PuppeteerControl extends AsyncService { this.logger.warn(`Browser killed`); } + reqCapRoutine() { + const now = Date.now(); + const numToPass = Math.round((now - this.lastReqSentAt) / 1000 * this.rpsCap); + this.requestDeferredQueue.splice(0, numToPass).forEach((x) => x.resolve(true)); + this.lastReqSentAt = now; + if (!this.requestDeferredQueue.length) { + if (this.__reqCapInterval) { + clearInterval(this.__reqCapInterval); + this.__reqCapInterval = undefined; + } + } else if (!this.__reqCapInterval) { + this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), 1000 / this.rpsCap).unref(); + } + } + async newPage() { await this.serviceReady(); const dedicatedContext = await this.browser.createBrowserContext(); @@ -330,7 +354,7 @@ export class PuppeteerControl extends AsyncService { let t0: number | undefined; let halt = false; - page.on('request', (req) => { + page.on('request', async (req) => { reqCounter++; if (halt) { return req.abort('blockedbyclient', 1000); @@ -379,6 +403,15 @@ export class PuppeteerControl extends AsyncService { return req.abort('blockedbyclient', 1000); } + const d = Defer(); + this.requestDeferredQueue.push(d); + process.nextTick(() => this.reqCapRoutine()); + await d.promise; + + if (req.isInterceptResolutionHandled()) { + return; + }; + const continueArgs = req.continueRequestOverrides ? [req.continueRequestOverrides(), 0] as const : []; @@ -483,16 +516,16 @@ document.addEventListener('load', handlePageLoad); await page.evaluateOnNewDocument(() => { Object.defineProperty(navigator, "language", { - get: function() { + get: function () { return options?.locale; } }); Object.defineProperty(navigator, "languages", { - get: function() { + get: function () { return [options?.locale]; } }); - }) + }); } if (options?.proxyUrl) { diff --git a/thinapps-shared b/thinapps-shared index fe71cc2..fb511e6 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit fe71cc2433f60ada86622f1670a752da40806e4d +Subproject commit fb511e6e7af482577ef321b99ccacac51b99df5b