debug: log jsdom and turndown operations

This commit is contained in:
Yanlong Wang 2024-07-31 11:12:12 +08:00
parent 0f239793d2
commit 4e5aff3332
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 36 additions and 9 deletions

View File

@ -251,9 +251,10 @@ export class CrawlerHost extends RPCHost {
} }
getGeneralSnapshotMixins(snapshot: PageSnapshot) { getGeneralSnapshotMixins(snapshot: PageSnapshot) {
const inferred = this.jsdomControl.inferSnapshot(snapshot); let inferred;
const mixin: any = {}; const mixin: any = {};
if (this.threadLocal.get('withImagesSummary')) { if (this.threadLocal.get('withImagesSummary')) {
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
const imageSummary = {} as { [k: string]: string; }; const imageSummary = {} as { [k: string]: string; };
const imageIdxTrack = new Map<string, number[]>(); const imageIdxTrack = new Map<string, number[]>();
@ -278,6 +279,7 @@ export class CrawlerHost extends RPCHost {
.value(); .value();
} }
if (this.threadLocal.get('withLinksSummary')) { if (this.threadLocal.get('withLinksSummary')) {
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
mixin.links = _.invert(inferred.links || {}); mixin.links = _.invert(inferred.links || {});
} }
@ -384,8 +386,8 @@ export class CrawlerHost extends RPCHost {
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl }); let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
if (mode !== 'markdown' && snapshot.parsed?.content) { if (mode !== 'markdown' && snapshot.parsed?.content) {
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
const par1 = turnDownService.turndown(jsDomElementOfHTML); const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
const par2 = snapshot.parsed.content ? turnDownService.turndown(jsDomElementOfParsed) : ''; const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
// If Readability did its job // If Readability did its job
if (par2.length >= 0.3 * par1.length) { if (par2.length >= 0.3 * par1.length) {
@ -469,12 +471,12 @@ export class CrawlerHost extends RPCHost {
if (toBeTurnedToMd) { if (toBeTurnedToMd) {
try { try {
contentText = turnDownService.turndown(toBeTurnedToMd).trim(); contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
} catch (err) { } catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
try { try {
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim(); contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
} catch (err2) { } catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
} }
@ -486,12 +488,12 @@ export class CrawlerHost extends RPCHost {
&& toBeTurnedToMd !== jsDomElementOfHTML && toBeTurnedToMd !== jsDomElementOfHTML
) { ) {
try { try {
contentText = turnDownService.turndown(snapshot.html); contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
} catch (err) { } catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
try { try {
contentText = vanillaTurnDownService.turndown(snapshot.html); contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
} catch (err2) { } catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
} }
@ -959,7 +961,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
cache = await this.queryCache(urlToCrawl, cacheTolerance); cache = await this.queryCache(urlToCrawl, cacheTolerance);
} }
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) { if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable)))) {
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts); yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
return; return;

View File

@ -4,6 +4,7 @@ import { Logger } from '../shared/services/logger';
import { ExtendedSnapshot, PageSnapshot } from './puppeteer'; import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
import { JSDOM, VirtualConsole } from 'jsdom'; import { JSDOM, VirtualConsole } from 'jsdom';
import { Readability } from '@mozilla/readability'; import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
const virtualConsole = new VirtualConsole(); const virtualConsole = new VirtualConsole();
virtualConsole.on('error', () => void 0); virtualConsole.on('error', () => void 0);
@ -35,7 +36,7 @@ export class JSDomControl extends AsyncService {
if (!snapshot?.html) { if (!snapshot?.html) {
return snapshot; return snapshot;
} }
const t0 = Date.now();
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const allNodes: Node[] = []; const allNodes: Node[] = [];
if (options?.withIframe) { if (options?.withIframe) {
@ -137,10 +138,16 @@ export class JSDomControl extends AsyncService {
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [], imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
} as PageSnapshot; } as PageSnapshot;
const dt = Date.now() - t0;
if (dt > 1000) {
this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt });
}
return r; return r;
} }
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot { inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
const t0 = Date.now();
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot; const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
try { try {
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
@ -191,6 +198,11 @@ export class JSDomControl extends AsyncService {
void 0; void 0;
} }
const dt = Date.now() - t0;
if (dt > 1000) {
this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt });
}
return extendedSnapshot; return extendedSnapshot;
} }
@ -199,6 +211,19 @@ export class JSDomControl extends AsyncService {
return parsed.window.document.documentElement; return parsed.window.document.documentElement;
} }
/**
 * Runs `turndownService.turndown(html)` and reports slow conversions.
 *
 * The elapsed wall-clock time is measured around the call. When it exceeds
 * 1000 ms a warning is written to `this.logger`, regardless of whether the
 * conversion returned normally or threw.
 *
 * @param turndownService - The Turndown instance used for the conversion.
 * @param html - The input handed to `turndown()`: a node or an HTML string.
 * @returns Whatever `turndownService.turndown(html)` returns.
 * @throws Any error raised by `turndown()` propagates to the caller unchanged.
 */
runTurndown(turndownService: TurndownService, html: TurndownService.Node | string) {
    const startedAt = Date.now();

    // Called from `finally`, so a failing conversion is timed as well.
    const warnIfSlow = () => {
        const dt = Date.now() - startedAt;
        if (dt > 1000) {
            this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt });
        }
    };

    try {
        const converted = turndownService.turndown(html);

        return converted;
    } finally {
        warnIfSlow();
    }
}
} }
const jsdomControl = container.resolve(JSDomControl); const jsdomControl = container.resolve(JSDomControl);