mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 01:59:17 +08:00
fix: cap browser request freq to avoid block from google
This commit is contained in:
parent
7e6c2fcf48
commit
9a514cd473
@ -753,6 +753,9 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
event: 'data',
|
event: 'data',
|
||||||
data: formatted,
|
data: formatted,
|
||||||
});
|
});
|
||||||
|
if (chargeAmount && scrapped.pdfs?.length) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
this.logger.error(`Failed to crawl ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
||||||
@ -781,6 +784,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
if (crawlerOptions.timeout === undefined) {
|
if (crawlerOptions.timeout === undefined) {
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (chargeAmount && scrapped.pdfs?.length) {
|
||||||
|
return formatted;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!lastScrapped) {
|
if (!lastScrapped) {
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import os from 'os';
|
import os from 'os';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
|
|
||||||
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
||||||
@ -208,6 +208,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
private __healthCheckInterval?: NodeJS.Timeout;
|
private __healthCheckInterval?: NodeJS.Timeout;
|
||||||
|
private __reqCapInterval?: NodeJS.Timeout;
|
||||||
|
|
||||||
__loadedPage: Page[] = [];
|
__loadedPage: Page[] = [];
|
||||||
|
|
||||||
@ -216,6 +217,10 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
livePages = new Set<Page>();
|
livePages = new Set<Page>();
|
||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
|
|
||||||
|
rpsCap: number = 300;
|
||||||
|
lastReqSentAt: number = 0;
|
||||||
|
requestDeferredQueue: Deferred<boolean>[] = [];
|
||||||
|
|
||||||
circuitBreakerHosts: Set<string> = new Set();
|
circuitBreakerHosts: Set<string> = new Set();
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
@ -239,6 +244,10 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
clearInterval(this.__healthCheckInterval);
|
clearInterval(this.__healthCheckInterval);
|
||||||
this.__healthCheckInterval = undefined;
|
this.__healthCheckInterval = undefined;
|
||||||
}
|
}
|
||||||
|
if (this.__reqCapInterval) {
|
||||||
|
clearInterval(this.__reqCapInterval);
|
||||||
|
this.__reqCapInterval = undefined;
|
||||||
|
}
|
||||||
await this.dependencyReady();
|
await this.dependencyReady();
|
||||||
|
|
||||||
if (this.browser) {
|
if (this.browser) {
|
||||||
@ -267,7 +276,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
|
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
|
|
||||||
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
|
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
|
||||||
this.newPage().then((r) => this.__loadedPage.push(r));
|
this.newPage().then((r) => this.__loadedPage.push(r));
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -301,6 +310,21 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
this.logger.warn(`Browser killed`);
|
this.logger.warn(`Browser killed`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Releases queued browser requests at a rate bounded by `rpsCap`.
 *
 * Each call computes how many requests the time elapsed since
 * `lastReqSentAt` allows (`elapsedMs / 1000 * rpsCap`, rounded), resolves
 * that many deferreds from the head of `requestDeferredQueue`, and then
 * starts or stops the pacing interval depending on whether requests are
 * still waiting.
 *
 * The reference timestamp only advances when at least one slot was granted.
 * This routine is invoked once per intercepted request in addition to the
 * interval; resetting the timestamp on calls that grant nothing would
 * discard the elapsed time, and calls arriving faster than one slot apart
 * could then keep the queue from ever draining.
 */
reqCapRoutine() {
    const now = Date.now();

    // Date.now() is wall-clock time and may step backwards; resync so the
    // conditional timestamp update below cannot wait on a future timestamp.
    if (now < this.lastReqSentAt) {
        this.lastReqSentAt = now;
    }

    const numToPass = Math.round((now - this.lastReqSentAt) / 1000 * this.rpsCap);

    if (numToPass > 0) {
        for (const deferred of this.requestDeferredQueue.splice(0, numToPass)) {
            deferred.resolve(true);
        }
        // Elapsed time has been spent on released requests (or dropped
        // because the queue was shorter); start a fresh window.
        this.lastReqSentAt = now;
    }

    if (!this.requestDeferredQueue.length) {
        // Nothing left to pace: stop ticking until the next request arrives.
        if (this.__reqCapInterval) {
            clearInterval(this.__reqCapInterval);
            this.__reqCapInterval = undefined;
        }

        return;
    }

    if (!this.__reqCapInterval) {
        // Requests are still waiting: tick roughly once per allowed request.
        // unref() keeps this timer from holding the process open.
        this.__reqCapInterval = setInterval(() => this.reqCapRoutine(), 1000 / this.rpsCap).unref();
    }
}
|
||||||
|
|
||||||
async newPage() {
|
async newPage() {
|
||||||
await this.serviceReady();
|
await this.serviceReady();
|
||||||
const dedicatedContext = await this.browser.createBrowserContext();
|
const dedicatedContext = await this.browser.createBrowserContext();
|
||||||
@ -330,7 +354,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
let t0: number | undefined;
|
let t0: number | undefined;
|
||||||
let halt = false;
|
let halt = false;
|
||||||
|
|
||||||
page.on('request', (req) => {
|
page.on('request', async (req) => {
|
||||||
reqCounter++;
|
reqCounter++;
|
||||||
if (halt) {
|
if (halt) {
|
||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
@ -379,6 +403,15 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
return req.abort('blockedbyclient', 1000);
|
return req.abort('blockedbyclient', 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const d = Defer();
|
||||||
|
this.requestDeferredQueue.push(d);
|
||||||
|
process.nextTick(() => this.reqCapRoutine());
|
||||||
|
await d.promise;
|
||||||
|
|
||||||
|
if (req.isInterceptResolutionHandled()) {
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
|
||||||
const continueArgs = req.continueRequestOverrides
|
const continueArgs = req.continueRequestOverrides
|
||||||
? [req.continueRequestOverrides(), 0] as const
|
? [req.continueRequestOverrides(), 0] as const
|
||||||
: [];
|
: [];
|
||||||
@ -483,16 +516,16 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
|
|
||||||
await page.evaluateOnNewDocument(() => {
|
await page.evaluateOnNewDocument(() => {
|
||||||
Object.defineProperty(navigator, "language", {
|
Object.defineProperty(navigator, "language", {
|
||||||
get: function() {
|
get: function () {
|
||||||
return options?.locale;
|
return options?.locale;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
Object.defineProperty(navigator, "languages", {
|
Object.defineProperty(navigator, "languages", {
|
||||||
get: function() {
|
get: function () {
|
||||||
return [options?.locale];
|
return [options?.locale];
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
})
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
if (options?.proxyUrl) {
|
if (options?.proxyUrl) {
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit fe71cc2433f60ada86622f1670a752da40806e4d
|
Subproject commit fb511e6e7af482577ef321b99ccacac51b99df5b
|
Loading…
x
Reference in New Issue
Block a user