From 4e5aff33329d9bde6476b5bb2046c79f2bafdec0 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 31 Jul 2024 11:12:12 +0800 Subject: [PATCH] debug: log jsdom and turndown operations --- .../functions/src/cloud-functions/crawler.ts | 18 +++++++------ backend/functions/src/services/jsdom.ts | 27 ++++++++++++++++++- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 321d069..9f8bd93 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -251,9 +251,10 @@ export class CrawlerHost extends RPCHost { } getGeneralSnapshotMixins(snapshot: PageSnapshot) { - const inferred = this.jsdomControl.inferSnapshot(snapshot); + let inferred; const mixin: any = {}; if (this.threadLocal.get('withImagesSummary')) { + inferred ??= this.jsdomControl.inferSnapshot(snapshot); const imageSummary = {} as { [k: string]: string; }; const imageIdxTrack = new Map(); @@ -278,6 +279,7 @@ export class CrawlerHost extends RPCHost { .value(); } if (this.threadLocal.get('withLinksSummary')) { + inferred ??= this.jsdomControl.inferSnapshot(snapshot); mixin.links = _.invert(inferred.links || {}); } @@ -384,8 +386,8 @@ export class CrawlerHost extends RPCHost { let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl }); if (mode !== 'markdown' && snapshot.parsed?.content) { const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); - const par1 = turnDownService.turndown(jsDomElementOfHTML); - const par2 = snapshot.parsed.content ? turnDownService.turndown(jsDomElementOfParsed) : ''; + const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); + const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : ''; // If Readability did its job if (par2.length >= 0.3 * par1.length) { @@ -469,12 +471,12 @@ export class CrawlerHost extends RPCHost { if (toBeTurnedToMd) { try { - contentText = turnDownService.turndown(toBeTurnedToMd).trim(); + contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); try { - contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim(); + contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); } catch (err2) { this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); } @@ -486,12 +488,12 @@ export class CrawlerHost extends RPCHost { && toBeTurnedToMd !== jsDomElementOfHTML ) { try { - contentText = turnDownService.turndown(snapshot.html); + contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); try { - contentText = vanillaTurnDownService.turndown(snapshot.html); + contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); } catch (err2) { this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); } @@ -959,7 +961,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; cache = await this.queryCache(urlToCrawl, cacheTolerance); } - if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) { + if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable)))) { yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts); return; diff --git a/backend/functions/src/services/jsdom.ts b/backend/functions/src/services/jsdom.ts index 0b632fe..0924c1e 100644 --- a/backend/functions/src/services/jsdom.ts +++ b/backend/functions/src/services/jsdom.ts @@ -4,6 +4,7 @@ import { Logger } from '../shared/services/logger'; import { ExtendedSnapshot, PageSnapshot } from './puppeteer'; import { JSDOM, VirtualConsole } from 'jsdom'; import { Readability } from '@mozilla/readability'; +import TurndownService from 'turndown'; const virtualConsole = new VirtualConsole(); virtualConsole.on('error', () => void 0); @@ -35,7 +36,7 @@ export class JSDomControl extends AsyncService { if (!snapshot?.html) { return snapshot; } - + const t0 = Date.now(); const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); const allNodes: Node[] = []; if (options?.withIframe) { @@ -137,10 +138,16 @@ export class JSDomControl extends AsyncService { imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [], } as PageSnapshot; + const dt = Date.now() - t0; + if (dt > 1000) { + this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt }); + } + return r; } inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot { + const t0 = Date.now(); const extendedSnapshot = { ...snapshot } as ExtendedSnapshot; try { const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); @@ -191,6 +198,11 @@ export class JSDomControl extends AsyncService { void 0; } + const dt = Date.now() - t0; + if (dt > 1000) { + this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt }); + } + return extendedSnapshot; } @@ -199,6 +211,19 @@ export class JSDomControl extends AsyncService { return parsed.window.document.documentElement; } + + runTurndown(turndownService: TurndownService, html: TurndownService.Node | string) { + const t0 = Date.now(); + + try { + return turndownService.turndown(html); + } finally { + const dt = Date.now() - t0; + if (dt > 1000) { + this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt }); + } + } + } } const jsdomControl = container.resolve(JSDomControl);