mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 07:06:05 +08:00
fix: cap browser request frequency to avoid being blocked by Google
This commit is contained in:
parent
7e6c2fcf48
commit
9a514cd473
@ -753,6 +753,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
event: 'data',
|
||||
data: formatted,
|
||||
});
|
||||
if (chargeAmount && scrapped.pdfs?.length) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (err: any) {
|
||||
this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
||||
@ -781,6 +784,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
if (crawlerOptions.timeout === undefined) {
|
||||
return formatted;
|
||||
}
|
||||
|
||||
if (chargeAmount && scrapped.pdfs?.length) {
|
||||
return formatted;
|
||||
}
|
||||
}
|
||||
|
||||
if (!lastScrapped) {
|
||||
|
@ -1,7 +1,7 @@
|
||||
import os from 'os';
|
||||
import fs from 'fs';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred } from 'civkit';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
|
||||
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
||||
@ -208,6 +208,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
private __healthCheckInterval?: NodeJS.Timeout;
|
||||
private __reqCapInterval?: NodeJS.Timeout;
|
||||
|
||||
__loadedPage: Page[] = [];
|
||||
|
||||
@ -216,6 +217,10 @@ export class PuppeteerControl extends AsyncService {
|
||||
livePages = new Set<Page>();
|
||||
lastPageCratedAt: number = 0;
|
||||
|
||||
rpsCap: number = 300;
|
||||
lastReqSentAt: number = 0;
|
||||
requestDeferredQueue: Deferred<boolean>[] = [];
|
||||
|
||||
circuitBreakerHosts: Set<string> = new Set();
|
||||
|
||||
constructor(
|
||||
@ -239,6 +244,10 @@ export class PuppeteerControl extends AsyncService {
|
||||
clearInterval(this.__healthCheckInterval);
|
||||
this.__healthCheckInterval = undefined;
|
||||
}
|
||||
if (this.__reqCapInterval) {
|
||||
clearInterval(this.__reqCapInterval);
|
||||
this.__reqCapInterval = undefined;
|
||||
}
|
||||
await this.dependencyReady();
|
||||
|
||||
if (this.browser) {
|
||||
@ -267,7 +276,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
|
||||
this.emit('ready');
|
||||
|
||||
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
|
||||
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
|
||||
this.newPage().then((r) => this.__loadedPage.push(r));
|
||||
}
|
||||
|
||||
@ -301,6 +310,21 @@ export class PuppeteerControl extends AsyncService {
|
||||
this.logger.warn(`Browser killed`);
|
||||
}
|
||||
|
||||
/**
 * Releases queued browser requests at a rate bounded by `rpsCap`.
 *
 * Every intercepted request pushes a deferred onto `requestDeferredQueue`
 * and schedules this routine; the routine resolves as many deferreds as the
 * time elapsed since `lastReqSentAt` allows. While a backlog remains, an
 * unref'ed interval keeps draining the queue; once the queue is empty the
 * interval is cleared.
 *
 * The time budget is only consumed for requests that are actually released.
 * Resetting the clock on every call (including calls that release nothing)
 * would throw away accrued budget whenever the routine is invoked more often
 * than once per release slot, which can starve the queue under sustained load.
 */
reqCapRoutine() {
    const now = Date.now();
    const msPerRequest = 1000 / this.rpsCap;
    const elapsedMs = now - this.lastReqSentAt;

    // Whole release slots only; a partial slot stays banked for the next call.
    const numToPass = Math.max(0, Math.floor(elapsedMs / msPerRequest));
    const released = this.requestDeferredQueue.splice(0, numToPass);
    for (const deferred of released) {
        deferred.resolve(true);
    }

    if (!this.requestDeferredQueue.length) {
        // Nothing is waiting: restart the clock so idle time is measured from now.
        this.lastReqSentAt = now;
        if (this.__reqCapInterval) {
            clearInterval(this.__reqCapInterval);
            this.__reqCapInterval = undefined;
        }

        return;
    }

    if (elapsedMs < 0) {
        // Wall clock moved backwards; resynchronise instead of stalling the queue.
        this.lastReqSentAt = now;
    } else if (released.length) {
        // Spend exactly the budget used by the released requests.
        this.lastReqSentAt += released.length * msPerRequest;
    }

    if (!this.__reqCapInterval) {
        this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), msPerRequest).unref();
    }
}
|
||||
|
||||
async newPage() {
|
||||
await this.serviceReady();
|
||||
const dedicatedContext = await this.browser.createBrowserContext();
|
||||
@ -330,7 +354,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
let t0: number | undefined;
|
||||
let halt = false;
|
||||
|
||||
page.on('request', (req) => {
|
||||
page.on('request', async (req) => {
|
||||
reqCounter++;
|
||||
if (halt) {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
@ -379,6 +403,15 @@ export class PuppeteerControl extends AsyncService {
|
||||
return req.abort('blockedbyclient', 1000);
|
||||
}
|
||||
|
||||
const d = Defer();
|
||||
this.requestDeferredQueue.push(d);
|
||||
process.nextTick(() => this.reqCapRoutine());
|
||||
await d.promise;
|
||||
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
};
|
||||
|
||||
const continueArgs = req.continueRequestOverrides
|
||||
? [req.continueRequestOverrides(), 0] as const
|
||||
: [];
|
||||
@ -483,16 +516,16 @@ document.addEventListener('load', handlePageLoad);
|
||||
|
||||
await page.evaluateOnNewDocument(() => {
|
||||
Object.defineProperty(navigator, "language", {
|
||||
get: function() {
|
||||
get: function () {
|
||||
return options?.locale;
|
||||
}
|
||||
});
|
||||
Object.defineProperty(navigator, "languages", {
|
||||
get: function() {
|
||||
get: function () {
|
||||
return [options?.locale];
|
||||
}
|
||||
});
|
||||
})
|
||||
});
|
||||
}
|
||||
|
||||
if (options?.proxyUrl) {
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit fe71cc2433f60ada86622f1670a752da40806e4d
|
||||
Subproject commit fb511e6e7af482577ef321b99ccacac51b99df5b
|
Loading…
x
Reference in New Issue
Block a user