mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 22:25:57 +08:00
debug: log jsdom and turndown operations
This commit is contained in:
parent
0f239793d2
commit
4e5aff3332
@ -251,9 +251,10 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
||||||
const inferred = this.jsdomControl.inferSnapshot(snapshot);
|
let inferred;
|
||||||
const mixin: any = {};
|
const mixin: any = {};
|
||||||
if (this.threadLocal.get('withImagesSummary')) {
|
if (this.threadLocal.get('withImagesSummary')) {
|
||||||
|
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
||||||
const imageSummary = {} as { [k: string]: string; };
|
const imageSummary = {} as { [k: string]: string; };
|
||||||
const imageIdxTrack = new Map<string, number[]>();
|
const imageIdxTrack = new Map<string, number[]>();
|
||||||
|
|
||||||
@ -278,6 +279,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
.value();
|
.value();
|
||||||
}
|
}
|
||||||
if (this.threadLocal.get('withLinksSummary')) {
|
if (this.threadLocal.get('withLinksSummary')) {
|
||||||
|
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
||||||
mixin.links = _.invert(inferred.links || {});
|
mixin.links = _.invert(inferred.links || {});
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -384,8 +386,8 @@ export class CrawlerHost extends RPCHost {
|
|||||||
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
||||||
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
||||||
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
||||||
const par1 = turnDownService.turndown(jsDomElementOfHTML);
|
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||||
const par2 = snapshot.parsed.content ? turnDownService.turndown(jsDomElementOfParsed) : '';
|
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
||||||
|
|
||||||
// If Readability did its job
|
// If Readability did its job
|
||||||
if (par2.length >= 0.3 * par1.length) {
|
if (par2.length >= 0.3 * par1.length) {
|
||||||
@ -469,12 +471,12 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
if (toBeTurnedToMd) {
|
if (toBeTurnedToMd) {
|
||||||
try {
|
try {
|
||||||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
}
|
}
|
||||||
@ -486,12 +488,12 @@ export class CrawlerHost extends RPCHost {
|
|||||||
&& toBeTurnedToMd !== jsDomElementOfHTML
|
&& toBeTurnedToMd !== jsDomElementOfHTML
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
contentText = turnDownService.turndown(snapshot.html);
|
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
}
|
}
|
||||||
@ -959,7 +961,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
cache = await this.queryCache(urlToCrawl, cacheTolerance);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && cache?.screenshotAvailable))) {
|
if (cache?.isFresh && (!crawlOpts?.favorScreenshot || (crawlOpts?.favorScreenshot && (cache.screenshotAvailable && cache.pageshotAvailable)))) {
|
||||||
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
yield this.jsdomControl.narrowSnapshot(cache.snapshot, crawlOpts);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
|
@ -4,6 +4,7 @@ import { Logger } from '../shared/services/logger';
|
|||||||
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
||||||
import { JSDOM, VirtualConsole } from 'jsdom';
|
import { JSDOM, VirtualConsole } from 'jsdom';
|
||||||
import { Readability } from '@mozilla/readability';
|
import { Readability } from '@mozilla/readability';
|
||||||
|
import TurndownService from 'turndown';
|
||||||
|
|
||||||
const virtualConsole = new VirtualConsole();
|
const virtualConsole = new VirtualConsole();
|
||||||
virtualConsole.on('error', () => void 0);
|
virtualConsole.on('error', () => void 0);
|
||||||
@ -35,7 +36,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
if (!snapshot?.html) {
|
if (!snapshot?.html) {
|
||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
|
const t0 = Date.now();
|
||||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||||
const allNodes: Node[] = [];
|
const allNodes: Node[] = [];
|
||||||
if (options?.withIframe) {
|
if (options?.withIframe) {
|
||||||
@ -137,10 +138,16 @@ export class JSDomControl extends AsyncService {
|
|||||||
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
if (dt > 1000) {
|
||||||
|
this.logger.warn(`Performance issue: Narrowing snapshot took ${dt}ms`, { url: snapshot.href, dt });
|
||||||
|
}
|
||||||
|
|
||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
||||||
|
const t0 = Date.now();
|
||||||
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
||||||
try {
|
try {
|
||||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||||
@ -191,6 +198,11 @@ export class JSDomControl extends AsyncService {
|
|||||||
void 0;
|
void 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
if (dt > 1000) {
|
||||||
|
this.logger.warn(`Performance issue: Inferring snapshot took ${dt}ms`, { url: snapshot.href, dt });
|
||||||
|
}
|
||||||
|
|
||||||
return extendedSnapshot;
|
return extendedSnapshot;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -199,6 +211,19 @@ export class JSDomControl extends AsyncService {
|
|||||||
|
|
||||||
return parsed.window.document.documentElement;
|
return parsed.window.document.documentElement;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
runTurndown(turndownService: TurndownService, html: TurndownService.Node | string) {
|
||||||
|
const t0 = Date.now();
|
||||||
|
|
||||||
|
try {
|
||||||
|
return turndownService.turndown(html);
|
||||||
|
} finally {
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
if (dt > 1000) {
|
||||||
|
this.logger.warn(`Performance issue: Turndown took ${dt}ms`, { dt });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const jsdomControl = container.resolve(JSDomControl);
|
const jsdomControl = container.resolve(JSDomControl);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user