mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 17:35:58 +08:00
fix: stop using pool
This commit is contained in:
parent
ba8ab88811
commit
df71c9a534
2
backend/functions/package-lock.json
generated
2
backend/functions/package-lock.json
generated
@ -20,7 +20,6 @@
|
|||||||
"express": "^4.19.2",
|
"express": "^4.19.2",
|
||||||
"firebase-admin": "^12.1.0",
|
"firebase-admin": "^12.1.0",
|
||||||
"firebase-functions": "^4.9.0",
|
"firebase-functions": "^4.9.0",
|
||||||
"generic-pool": "^3.9.0",
|
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
"jsdom": "^24.0.0",
|
"jsdom": "^24.0.0",
|
||||||
@ -5796,6 +5795,7 @@
|
|||||||
"version": "3.9.0",
|
"version": "3.9.0",
|
||||||
"resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz",
|
"resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz",
|
||||||
"integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==",
|
"integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==",
|
||||||
|
"devOptional": true,
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">= 4"
|
"node": ">= 4"
|
||||||
}
|
}
|
||||||
|
@ -40,7 +40,6 @@
|
|||||||
"express": "^4.19.2",
|
"express": "^4.19.2",
|
||||||
"firebase-admin": "^12.1.0",
|
"firebase-admin": "^12.1.0",
|
||||||
"firebase-functions": "^4.9.0",
|
"firebase-functions": "^4.9.0",
|
||||||
"generic-pool": "^3.9.0",
|
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
"jsdom": "^24.0.0",
|
"jsdom": "^24.0.0",
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import os from 'os';
|
import os from 'os';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import genericPool from 'generic-pool';
|
|
||||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
import { JSDOM } from 'jsdom';
|
import { JSDOM } from 'jsdom';
|
||||||
@ -76,43 +75,32 @@ puppeteer.use(puppeteerPageProxy({
|
|||||||
@singleton()
|
@singleton()
|
||||||
export class PuppeteerControl extends AsyncService {
|
export class PuppeteerControl extends AsyncService {
|
||||||
|
|
||||||
|
_sn = 0;
|
||||||
browser!: Browser;
|
browser!: Browser;
|
||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
pagePool = genericPool.createPool({
|
|
||||||
create: async () => {
|
|
||||||
const page = await this.newPage();
|
|
||||||
return page;
|
|
||||||
},
|
|
||||||
destroy: async (page) => {
|
|
||||||
await Promise.race([
|
|
||||||
(async () => {
|
|
||||||
const ctx = page.browserContext();
|
|
||||||
await page.close();
|
|
||||||
await ctx.close();
|
|
||||||
})(), delay(5000)
|
|
||||||
]).catch((err) => {
|
|
||||||
this.logger.error(`Failed to destroy page`, { err: marshalErrorLike(err) });
|
|
||||||
});
|
|
||||||
},
|
|
||||||
validate: async (page) => {
|
|
||||||
return page.browser().connected && !page.isClosed();
|
|
||||||
}
|
|
||||||
}, {
|
|
||||||
max: Math.max(1 + Math.floor(os.totalmem() / (256 * 1024 * 1024)), 16),
|
|
||||||
min: 1,
|
|
||||||
acquireTimeoutMillis: 60_000,
|
|
||||||
testOnBorrow: true,
|
|
||||||
testOnReturn: true,
|
|
||||||
autostart: false,
|
|
||||||
priorityRange: 3
|
|
||||||
});
|
|
||||||
|
|
||||||
private __healthCheckInterval?: NodeJS.Timeout;
|
private __healthCheckInterval?: NodeJS.Timeout;
|
||||||
|
|
||||||
|
__loadedPage: Page[] = [];
|
||||||
|
|
||||||
|
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
|
||||||
|
snMap = new WeakMap<Page, number>();
|
||||||
|
livePages = new Set<Page>();
|
||||||
|
lastPageCratedAt: number = 0;
|
||||||
|
|
||||||
constructor(protected globalLogger: Logger) {
|
constructor(protected globalLogger: Logger) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
this.setMaxListeners(2 * this.pagePool.max + 1);
|
this.setMaxListeners(2 * Math.floor(os.totalmem() / (256 * 1024 * 1024)) + 1); 148 - 95;
|
||||||
|
|
||||||
|
this.on('crippled', () => {
|
||||||
|
this.__loadedPage.length = 0;
|
||||||
|
this.livePages.clear();
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
briefPages() {
|
||||||
|
this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`);
|
||||||
|
this.logger.info(``);
|
||||||
}
|
}
|
||||||
|
|
||||||
override async init() {
|
override async init() {
|
||||||
@ -121,8 +109,6 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
this.__healthCheckInterval = undefined;
|
this.__healthCheckInterval = undefined;
|
||||||
}
|
}
|
||||||
await this.dependencyReady();
|
await this.dependencyReady();
|
||||||
this.logger.info(`PuppeteerControl initializing with pool size ${this.pagePool.max}`, { poolSize: this.pagePool.max });
|
|
||||||
this.pagePool.start();
|
|
||||||
|
|
||||||
if (this.browser) {
|
if (this.browser) {
|
||||||
if (this.browser.connected) {
|
if (this.browser.connected) {
|
||||||
@ -151,24 +137,33 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
|
|
||||||
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
|
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000);
|
||||||
|
this.newPage().then((r) => this.__loadedPage.push(r));
|
||||||
}
|
}
|
||||||
|
|
||||||
@maxConcurrency(1)
|
@maxConcurrency(1)
|
||||||
async healthCheck() {
|
async healthCheck() {
|
||||||
this.pagePool.max += 1;
|
if (Date.now() - this.lastPageCratedAt <= 10_000) {
|
||||||
const healthyPage = await this.pagePool.acquire(3).catch((err) => {
|
this.briefPages();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
const healthyPage = await this.newPage().catch((err) => {
|
||||||
this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
|
||||||
return null;
|
return null;
|
||||||
});
|
});
|
||||||
this.pagePool.max -= 1;
|
|
||||||
|
|
||||||
if (healthyPage) {
|
if (healthyPage) {
|
||||||
this.pagePool.release(healthyPage);
|
this.__loadedPage.push(healthyPage);
|
||||||
|
|
||||||
|
if (this.__loadedPage.length > 3) {
|
||||||
|
this.ditchPage(this.__loadedPage.shift()!);
|
||||||
|
}
|
||||||
|
|
||||||
|
this.briefPages();
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
this.logger.warn(`Trying to clean up...`);
|
this.logger.warn(`Trying to clean up...`);
|
||||||
await this.pagePool.clear();
|
|
||||||
this.browser.process()?.kill('SIGKILL');
|
this.browser.process()?.kill('SIGKILL');
|
||||||
Reflect.deleteProperty(this, 'browser');
|
Reflect.deleteProperty(this, 'browser');
|
||||||
this.emit('crippled');
|
this.emit('crippled');
|
||||||
@ -178,7 +173,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
async newPage() {
|
async newPage() {
|
||||||
await this.serviceReady();
|
await this.serviceReady();
|
||||||
const dedicatedContext = await this.browser.createBrowserContext();
|
const dedicatedContext = await this.browser.createBrowserContext();
|
||||||
|
const sn = this._sn++;
|
||||||
const page = await dedicatedContext.newPage();
|
const page = await dedicatedContext.newPage();
|
||||||
const preparations = [];
|
const preparations = [];
|
||||||
|
|
||||||
@ -300,18 +295,72 @@ document.addEventListener('readystatechange', handlePageLoad);
|
|||||||
document.addEventListener('load', handlePageLoad);
|
document.addEventListener('load', handlePageLoad);
|
||||||
`);
|
`);
|
||||||
|
|
||||||
|
this.snMap.set(page, sn);
|
||||||
|
this.logger.warn(`Page ${sn} created.`);
|
||||||
|
this.lastPageCratedAt = Date.now();
|
||||||
|
this.livePages.add(page);
|
||||||
|
|
||||||
return page;
|
return page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Hands out a page for one scraping job.
 *
 * Pages prepared ahead of time in `__loadedPage` are preferred. Idle pages
 * that have been closed, or whose browser is no longer connected, are
 * skipped and ditched instead of being returned to the caller. When no
 * usable idle page exists, a fresh one is created via `newPage()`.
 *
 * Every page handed out gets a finalizer timer (300 * 1000 ms) that ditches
 * the page if it is still around by then; `ditchPage()` clears that timer
 * through `finalizerMap`.
 *
 * @returns A page that was open and connected at the time it was selected.
 */
async getNextPage() {
    let thePage: Page | undefined;

    // Drain the idle queue until a usable page turns up. A page may have
    // been closed or lost its browser while sitting idle, so it is checked
    // before being handed out.
    while (!thePage && this.__loadedPage.length) {
        const candidate = this.__loadedPage.shift()!;
        if (!candidate.isClosed() && candidate.browser().connected) {
            thePage = candidate;
            break;
        }

        const staleSn = this.snMap.get(candidate);
        this.logger.warn(`Idle page ${staleSn} is no longer usable, ditching it...`);
        this.ditchPage(candidate).catch((err) => {
            this.logger.warn(`Failed to ditch stale idle page ${staleSn}`, { err: marshalErrorLike(err) });
        });
    }

    // Top the idle queue up in the background once it runs low. This is
    // fire-and-forget: a failure here must not fail the current request.
    if (thePage && this.__loadedPage.length <= 1) {
        this.newPage()
            .then((r) => this.__loadedPage.push(r))
            .catch((err) => {
                this.logger.warn(`Failed to load new page ahead of time`, { err: marshalErrorLike(err) });
            });
    }

    if (!thePage) {
        thePage = await this.newPage();
    }

    // Bind to a const so the timer callback closes over a definite Page.
    const page = thePage;
    const timer = setTimeout(() => {
        this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(page)}...`);
        this.ditchPage(page);
    }, 300 * 1000);

    this.finalizerMap.set(page, timer);

    return page;
}
|
||||||
|
|
||||||
|
/**
 * Retires a page: cancels its pending finalizer timer, removes it from the
 * `livePages` registry, and closes the page followed by its browser context.
 *
 * Closing is best-effort. It is raced against `delay(5000)` (presumably
 * 5 seconds - confirm against civkit), and a rejection is logged rather than
 * propagated to the caller.
 *
 * @param page - The page to dispose of. Passing an already closed page is
 *   allowed; only the bookkeeping is cleaned up in that case.
 */
async ditchPage(page: Page) {
    const finalizer = this.finalizerMap.get(page);
    if (finalizer !== undefined) {
        clearTimeout(finalizer);
        this.finalizerMap.delete(page);
    }

    // Unregister before the closed-check: a page that closed on its own
    // would otherwise stay in `livePages` and keep showing up as alive.
    this.livePages.delete(page);

    if (page.isClosed()) {
        // NOTE(review): the browser context of an already closed page is not
        // closed here - confirm whether it is released elsewhere.
        return;
    }

    const sn = this.snMap.get(page);
    this.logger.info(`Closing page ${sn}`);

    await Promise.race([
        (async () => {
            // Grab the context first, then close the page and its context in order.
            const ctx = page.browserContext();
            await page.close();
            await ctx.close();
        })(),
        delay(5000),
    ]).catch((err) => {
        this.logger.error(`Failed to destroy page ${sn}`, { err: marshalErrorLike(err) });
    });
}
|
||||||
|
|
||||||
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
async *scrap(parsedUrl: URL, options?: ScrappingOptions): AsyncGenerator<PageSnapshot | undefined> {
|
||||||
// parsedUrl.search = '';
|
// parsedUrl.search = '';
|
||||||
const url = parsedUrl.toString();
|
const url = parsedUrl.toString();
|
||||||
|
|
||||||
this.logger.info(`Scraping ${url}`, { url });
|
|
||||||
let snapshot: PageSnapshot | undefined;
|
let snapshot: PageSnapshot | undefined;
|
||||||
let screenshot: Buffer | undefined;
|
let screenshot: Buffer | undefined;
|
||||||
|
const page = await this.getNextPage();
|
||||||
const page = await this.pagePool.acquire();
|
const sn = this.snMap.get(page);
|
||||||
|
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||||
if (options?.proxyUrl) {
|
if (options?.proxyUrl) {
|
||||||
await page.useProxy(options.proxyUrl);
|
await page.useProxy(options.proxyUrl);
|
||||||
}
|
}
|
||||||
@ -342,7 +391,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
|
|
||||||
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
const gotoPromise = page.goto(url, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 30_000 })
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
this.logger.warn(`Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Browsing of ${url} did not fully succeed`, { err: marshalErrorLike(err) });
|
||||||
return Promise.reject(new AssertionFailureError({
|
return Promise.reject(new AssertionFailureError({
|
||||||
message: `Failed to goto ${url}: ${err}`,
|
message: `Failed to goto ${url}: ${err}`,
|
||||||
cause: err,
|
cause: err,
|
||||||
@ -362,7 +411,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
finalized = true;
|
finalized = true;
|
||||||
this.logger.info(`Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||||
this.emit(
|
this.emit(
|
||||||
'crawled',
|
'crawled',
|
||||||
{ ...snapshot, screenshot },
|
{ ...snapshot, screenshot },
|
||||||
@ -378,7 +427,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
nextSnapshotDeferred.resolve(snapshot);
|
nextSnapshotDeferred.resolve(snapshot);
|
||||||
})
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
this.logger.warn(`Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Failed to wait for selector ${options.waitForSelector}`, { err: marshalErrorLike(err) });
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -401,9 +450,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
} finally {
|
} finally {
|
||||||
gotoPromise.finally(() => {
|
gotoPromise.finally(() => {
|
||||||
page.off('snapshot', hdl);
|
page.off('snapshot', hdl);
|
||||||
this.pagePool.destroy(page).catch((err) => {
|
this.ditchPage(page);
|
||||||
this.logger.warn(`Failed to destroy page`, { err: marshalErrorLike(err) });
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
nextSnapshotDeferred.resolve();
|
nextSnapshotDeferred.resolve();
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user