fix: performance issue of jsdom

This commit is contained in:
Yanlong Wang 2024-09-08 00:49:38 +08:00
parent 5171e5f94b
commit 94170db060
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
8 changed files with 873 additions and 960 deletions

File diff suppressed because it is too large Load Diff

View File

@ -34,7 +34,7 @@
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"civkit": "^0.6.5-047c0d8",
"civkit": "^0.7.0-0f8889a",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
@ -43,13 +43,13 @@
"firebase-functions": "^4.9.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"jsdom": "^24.0.0",
"langdetect": "^0.2.1",
"linkedom": "^0.18.4",
"maxmind": "^4.3.18",
"minio": "^7.1.3",
"openai": "^4.20.0",
"pdfjs-dist": "^4.2.67",
"puppeteer": "^22.7.1",
"puppeteer": "^23.3.0",
"puppeteer-extra": "^3.3.6",
"puppeteer-extra-plugin-block-resources": "^2.4.3",
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
@ -68,7 +68,7 @@
"@types/bcrypt": "^5.0.0",
"@types/cors": "^2.8.17",
"@types/generic-pool": "^3.8.1",
"@types/node": "^18",
"@types/node": "^20.14.13",
"@types/set-cookie-parser": "^2.4.7",
"@typescript-eslint/eslint-plugin": "^5.12.0",
"@typescript-eslint/parser": "^5.12.0",
@ -77,7 +77,7 @@
"eslint-plugin-import": "^2.25.4",
"firebase-functions-test": "^3.0.0",
"replicate": "^0.16.1",
"typescript": "^5.1.6"
"typescript": "^5.5.4"
},
"private": true,
"exports": {

View File

@ -1,7 +1,6 @@
import {
assignTransferProtocolMeta, marshalErrorLike,
RPCHost, RPCReflection,
HashManager,
AssertionFailureError, ParamValidationError, Defer,
} from 'civkit';
import { singleton } from 'tsyringe';
@ -11,22 +10,17 @@ import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
import { Request, Response } from 'express';
const pNormalizeUrl = import("@esm2cjs/normalize-url");
import { AltTextService } from '../services/alt-text';
import TurndownService from 'turndown';
import { Crawled } from '../db/crawled';
import { cleanAttribute } from '../utils/misc';
import { randomUUID } from 'crypto';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { PDFExtractor } from '../services/pdf-extract';
import { DomainBlockade } from '../db/domain-blockade';
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
import { JSDomControl } from '../services/jsdom';
const md5Hasher = new HashManager('md5', 'hex');
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean;
@ -35,29 +29,6 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
keepImgDataUrl?: boolean;
}
/**
 * Shape of a crawl result after it has been formatted for a response.
 *
 * Every data field is optional; which ones are populated depends on the
 * formatting mode that produced the object. `toString` is the only required
 * member and yields the plain-text rendering of the page.
 */
export interface FormattedPage {
    /** Page title. */
    title?: string;
    /** Short description of the page. */
    description?: string;
    /** URL reported for the page. */
    url?: string;
    /** Main page content. */
    content?: string;
    /** Publication timestamp, as a string. */
    publishedTime?: string;
    /** HTML of the page. */
    html?: string;
    /** Plain text of the page. */
    text?: string;

    /** Download URL and raw bytes of the screenshot. */
    screenshotUrl?: string;
    screenshot?: Buffer;
    /** Download URL and raw bytes of the pageshot. */
    pageshotUrl?: string;
    pageshot?: Buffer;

    /** String-to-string map describing the links found on the page. */
    links?: Record<string, string>;
    /** String-to-string map describing the images found on the page. */
    images?: Record<string, string>;

    /** Token accounting; the same kind of figure under several spellings. */
    usage?: {
        total_tokens?: number;
        totalTokens?: number;
        tokens?: number;
    };

    /** Plain-text rendering of this page. */
    toString: () => string;
}
const indexProto = {
toString: function (): string {
return _(this)
@ -72,8 +43,6 @@ const indexProto = {
export class CrawlerHost extends RPCHost {
logger = this.globalLogger.child({ service: this.constructor.name });
turnDownPlugins = [require('turndown-plugin-gfm').tables];
cacheRetentionMs = 1000 * 3600 * 24 * 7;
cacheValidMs = 1000 * 3600;
urlValidMs = 1000 * 3600 * 4;
@ -83,8 +52,7 @@ export class CrawlerHost extends RPCHost {
protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl,
protected jsdomControl: JSDomControl,
protected altTextService: AltTextService,
protected pdfExtractor: PDFExtractor,
protected snapshotFormatter: SnapshotFormatter,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncContext,
@ -148,448 +116,6 @@ export class CrawlerHost extends RPCHost {
return indexObject;
}
/**
 * Create a TurndownService configured with this crawler's custom rules.
 *
 * @param options.noRules - when truthy, the three clean-up rules
 *   ('remove-irrelevant', 'truncate-svg', 'title-as-h1') are not registered.
 * @param options.url - base used to absolutise link hrefs, and whose origin
 *   is embedded in the pseudo `blob:` URLs generated for data-URL images.
 * @param options.imgDataUrlToObjectUrl - when truthy, `<img>` tags whose
 *   `src` is a `data:` URL are rendered with a `blob:` URL built from the
 *   md5 hash of the src.
 * @returns the configured service.
 */
getTurndown(options?: {
    noRules?: boolean | string,
    url?: string | URL;
    imgDataUrlToObjectUrl?: boolean;
}) {
    const service = new TurndownService({
        codeBlockStyle: 'fenced',
        preformattedCode: true,
    } as any);

    const dropContent = () => '';

    if (!options?.noRules) {
        service.addRule('remove-irrelevant', {
            filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
            replacement: dropContent,
        });
        service.addRule('truncate-svg', {
            filter: 'svg' as any,
            replacement: dropContent,
        });
        service.addRule('title-as-h1', {
            filter: ['title'],
            replacement: (innerText) => `${innerText}\n===============\n`,
        });
    }

    if (options?.imgDataUrlToObjectUrl) {
        service.addRule('data-url-to-pseudo-object-url', {
            filter: (node) => {
                const isImg = node.tagName === 'IMG';

                return Boolean(isImg && node.getAttribute('src')?.startsWith('data:'));
            },
            replacement: (_content, node: any) => {
                const src = (node.getAttribute('src') || '').trim();
                const alt = cleanAttribute(node.getAttribute('alt')) || '';

                if (!options.url) {
                    return `![${alt}](blob:${md5Hasher.hash(src)})`;
                }

                // Scope the pseudo object URL to the origin of the reference URL.
                const refUrl = new URL(options.url);
                const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);

                return `![${alt}](${mappedUrl})`;
            },
        });
    }

    service.addRule('improved-paragraph', {
        filter: 'p',
        replacement: (innerText) => {
            const body = innerText.trim();

            // Empty paragraphs vanish; runs of 3+ newlines collapse to one blank line.
            return body ? `${body.replace(/\n{3,}/g, '\n\n')}\n\n` : '';
        },
    });

    service.addRule('improved-inline-link', {
        filter: function (node, ruleOptions) {
            const isInlinedStyle = ruleOptions.linkStyle === 'inlined';

            return Boolean(isInlinedStyle && node.nodeName === 'A' && node.getAttribute('href'));
        },
        replacement: function (content, node: any) {
            // Escape parentheses so they cannot terminate the markdown link target.
            const rawHref = node.getAttribute('href');
            const escapedHref = rawHref ? rawHref.replace(/([()])/g, '\\$1') : rawHref;

            const rawTitle = cleanAttribute(node.getAttribute('title'));
            const titlePart = rawTitle ? ` "${rawTitle.replace(/"/g, '\\"')}"` : rawTitle;

            const label = content.replace(/\s+/g, ' ').trim();
            let target = escapedHref.replace(/\s+/g, '').trim();

            if (options?.url) {
                try {
                    target = new URL(target, options.url).toString();
                } catch (_err) {
                    // Not resolvable against the base: keep the href as written.
                    void 0;
                }
            }

            return `[${label}](${target}${titlePart || ''})`;
        },
    });

    service.addRule('improved-code', {
        filter: function (node: any) {
            // A lone <code> directly inside <pre> is a code block; this rule leaves it alone.
            const standsAlone = !node.previousSibling && !node.nextSibling;
            const wrappedInPre = node.parentNode.nodeName === 'PRE';

            if (node.nodeName !== 'CODE') {
                return false;
            }

            return !(wrappedInPre && standsAlone);
        },
        replacement: function (inputContent: any) {
            if (!inputContent) {
                return '';
            }
            const code = inputContent;

            // Grow the delimiter until it differs from every backtick run in the content.
            const backtickRuns = code.match(/`+/gm) || [];
            let fence = '`';
            while (backtickRuns.includes(fence)) {
                fence += '`';
            }
            if (code.includes('\n')) {
                fence = '```';
            }

            const isTripleFence = fence === '```';
            let padding = '';
            if (isTripleFence) {
                padding = '\n';
            } else if (/^`|^ .*?[^ ].* $|`$/.test(code)) {
                padding = ' ';
            }
            const closingPadding = isTripleFence && !code.endsWith(padding) ? padding : '';

            return `${fence}${padding}${code}${closingPadding}${fence}`;
        },
    });

    return service;
}
/**
 * Build the optional `images` / `links` summary fields shared by every
 * response format.
 *
 * Which fields are produced is driven by the 'withImagesSummary' and
 * 'withLinksSummary' values in `threadLocal`. The snapshot is run through
 * `jsdomControl.inferSnapshot` at most once, and only when a summary is
 * actually requested.
 *
 * @param snapshot - the page snapshot to summarise.
 * @returns an object that may carry `images` and/or `links`.
 */
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
    const mixin: any = {};
    let inferred;

    if (this.threadLocal.get('withImagesSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);

        const altBySrc = {} as { [k: string]: string; };
        // src -> every 1-based position at which that image occurs
        const serialsBySrc = new Map<string, number[]>();

        let serial = 0;
        for (const img of inferred.imgs) {
            serial += 1;
            const seenAt = serialsBySrc.get(img.src) ?? [];
            seenAt.push(serial);
            serialsBySrc.set(img.src, seenAt);
            altBySrc[img.src] = img.alt || '';
        }

        const labelledPairs = _.map(_.toPairs(altBySrc), ([url, alt], i) => {
            const serials = serialsBySrc.get(url) || [i + 1];
            const altSuffix = alt ? `: ${alt}` : '';

            return [`Image ${serials.join(',')}${altSuffix}`, url];
        });
        mixin.images = _.fromPairs(labelledPairs);
    }

    if (this.threadLocal.get('withLinksSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);
        // Swap keys and values of the inferred links map.
        mixin.links = _.invert(inferred.links || {});
    }

    return mixin;
}
/**
 * Turn a page snapshot into a FormattedPage for the requested response mode.
 *
 * - 'screenshot' / 'pageshot': if the snapshot has an image buffer but no URL
 *   yet, the buffer is saved to object storage and a signed download URL is
 *   stored back on the snapshot; the result stringifies to that URL.
 * - 'html': returns `snapshot.html`.
 * - 'text': returns `snapshot.text` (which may have been replaced by extracted
 *   PDF text, see below).
 * - any other mode: converts HTML to markdown with Turndown. 'markdown' always
 *   converts the full page HTML; other modes may use the parsed (Readability)
 *   content instead.
 *
 * @param mode - response format selector.
 * @param snapshot - the page snapshot. NOTE: it is mutated in place
 *   (`screenshotUrl`, `pageshotUrl`, and for PDFs `title`, `text`, `parsed`).
 * @param nominalUrl - reported as the page URL, and used as the base for
 *   resolving relative URLs when `snapshot.rebase` is not set.
 * @returns the formatted page; its `toString()` yields the textual response.
 */
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
screenshotUrl?: string;
pageshotUrl?: string;
}, nominalUrl?: URL) {
if (mode === 'screenshot') {
// Upload only when there is a buffer and no URL has been assigned yet.
if (snapshot.screenshot && !snapshot.screenshotUrl) {
const fid = `instant-screenshots/${randomUUID()}`;
await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
metadata: {
contentType: 'image/png',
}
});
// presumably the second argument is the expiry timestamp of the signed URL — confirm against signDownloadUrl
snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
}
return {
...this.getGeneralSnapshotMixins(snapshot),
// html: snapshot.html,
screenshotUrl: snapshot.screenshotUrl,
toString() {
return this.screenshotUrl;
}
} as FormattedPage;
}
if (mode === 'pageshot') {
// Same flow as 'screenshot', but for the pageshot buffer; the result also carries the HTML.
// The file id uses the same 'instant-screenshots/' prefix as screenshots.
if (snapshot.pageshot && !snapshot.pageshotUrl) {
const fid = `instant-screenshots/${randomUUID()}`;
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
metadata: {
contentType: 'image/png',
}
});
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
}
return {
...this.getGeneralSnapshotMixins(snapshot),
html: snapshot.html,
pageshotUrl: snapshot.pageshotUrl,
toString() {
return this.pageshotUrl;
}
} as FormattedPage;
}
if (mode === 'html') {
return {
...this.getGeneralSnapshotMixins(snapshot),
html: snapshot.html,
toString() {
return this.html;
}
} as FormattedPage;
}
// A snapshot that lists PDFs and has no title is treated as a PDF document:
// the first PDF is extracted and its metadata/text overwrite the snapshot fields.
let pdfMode = false;
if (snapshot.pdfs?.length && !snapshot.title) {
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
this.threadLocal.get('cacheTolerance')
);
if (pdf) {
pdfMode = true;
snapshot.title = pdf.meta?.Title;
snapshot.text = pdf.text || snapshot.text;
snapshot.parsed = {
content: pdf.content,
textContent: pdf.content,
length: pdf.content?.length,
byline: pdf.meta?.Author,
lang: pdf.meta?.Language || undefined,
title: pdf.meta?.Title,
publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
};
}
}
// 'text' is handled after the PDF step so that it can return the extracted PDF text.
if (mode === 'text') {
return {
...this.getGeneralSnapshotMixins(snapshot),
text: snapshot.text,
toString() {
return this.text;
}
} as FormattedPage;
}
// Unless 'keepImgDataUrl' is set, data: image URLs are rendered as blob: URLs built from an md5 hash.
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
let contentText = '';
// Resolved image URL -> alt text, and resolved image URL -> 1-based positions; both filled by the 'img-generated-alt' rule.
const imageSummary = {} as { [k: string]: string; };
const imageIdxTrack = new Map<string, number[]>();
const uid = this.threadLocal.get('uid');
// Single-pass block: do { ... } while (false) exists only so that `break` can be used as an early exit.
do {
if (pdfMode) {
contentText = snapshot.parsed?.content || snapshot.text;
break;
}
// Skip DOM-based conversion for very deep or very large documents.
// Requests without a uid get the lower element-count limit (10,000 instead of 70,000).
if (
snapshot.maxElemDepth! > 256 ||
(!uid && snapshot.elemCount! > 10_000) ||
snapshot.elemCount! > 70_000
) {
this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
contentText = snapshot.text;
break;
}
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
let toBeTurnedToMd = jsDomElementOfHTML;
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
if (mode !== 'markdown' && snapshot.parsed?.content) {
// Render both the full page and the parsed content, then compare their lengths.
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
// NOTE(review): the `snapshot.parsed.content ?` check here (and the `if` a few lines below) is already
// guaranteed by the enclosing condition — looks redundant.
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
// If Readability did its job
// i.e. the parsed content renders to at least 30% of the full page's markdown length:
// switch to the parsed content and to a Turndown instance without the clean-up rules.
if (par2.length >= 0.3 * par1.length) {
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
if (snapshot.parsed.content) {
toBeTurnedToMd = jsDomElementOfParsed;
}
}
}
for (const plugin of this.turnDownPlugins) {
turnDownService = turnDownService.use(plugin);
}
// Image src -> generated alt text. Only filled when 'withGeneratedAlt' is set.
const urlToAltMap: { [k: string]: string | undefined; } = {};
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
// One request per distinct src, all awaited together; a failed request is logged and leaves no entry.
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
const r = await this.altTextService.getAltText(x).catch((err: any) => {
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
return undefined;
});
if (r && x.src) {
urlToAltMap[x.src.trim()] = r;
}
});
await Promise.all(tasks);
}
let imgIdx = 0;
turnDownService.addRule('img-generated-alt', {
filter: 'img',
replacement: (_content, node: any) => {
// Prefer `src`; fall back to `data-src` when `src` is missing or is a data: URL
// and `data-src` is a non-data URL.
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
const dataSrc = (node.getAttribute('data-src') || '').trim();
if (dataSrc && !dataSrc.startsWith('data:')) {
linkPreferredSrc = dataSrc;
}
}
let src;
try {
src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
} catch (_err) {
void 0;
}
const alt = cleanAttribute(node.getAttribute('alt'));
// Images whose URL cannot be resolved are dropped from the output and do not consume an index.
if (!src) {
return '';
}
// NOTE(review): urlToAltMap is keyed by the trimmed `x.src` from snapshot.imgs, while this lookup uses the
// resolved absolute URL. If snapshot.imgs can hold relative srcs the lookup would miss — confirm.
const mapped = urlToAltMap[src];
const imgSerial = ++imgIdx;
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
idxArr.push(imgSerial);
imageIdxTrack.set(src, idxArr);
if (mapped) {
// NOTE(review): inside this branch `mapped` is truthy, so `mapped || alt` always evaluates to `mapped`.
imageSummary[src] = mapped || alt;
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
mappedUrl.protocol = 'blob:';
return `![Image ${imgIdx}: ${mapped || alt}](${mappedUrl})`;
}
return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
}
imageSummary[src] = alt || '';
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
mappedUrl.protocol = 'blob:';
return alt ? `![Image ${imgIdx}: ${alt}](${mappedUrl})` : `![Image ${imgIdx}](${mappedUrl})`;
}
return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
}
});
// First attempt: convert the chosen element. On failure retry once with a fresh
// Turndown instance that has no plugins and no 'img-generated-alt' rule.
if (toBeTurnedToMd) {
try {
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
} catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
}
}
}
// Second attempt: convert the raw HTML string when the first result is empty or still looks like markup.
// NOTE(review): `&&` binds tighter than `||`, so this reads as
//   !contentText || (looksLikeHtml && toBeTurnedToMd !== jsDomElementOfHTML)
// i.e. an empty result retries even when the full HTML was already the input. The line break suggests
//   (!contentText || looksLikeHtml) && toBeTurnedToMd !== jsDomElementOfHTML
// may have been intended — confirm before changing.
if (
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
&& toBeTurnedToMd !== jsDomElementOfHTML
) {
try {
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
} catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
}
}
}
// Last resort: fall back to the plain text of the snapshot.
// NOTE(review): this check uses `||` where the check above uses `&&`, so any markdown that merely starts
// with '<' or ends with '>' is replaced by plain text — confirm this is intended.
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
contentText = snapshot.text;
}
} while (false);
const cleanText = (contentText || '').trim();
const formatted: FormattedPage = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: nominalUrl?.toString() || snapshot.href?.trim(),
content: cleanText,
publishedTime: snapshot.parsed?.publishedTime || undefined,
// 'markdown' mode stringifies to the bare content. Every other mode gets a header
// (title, URL, optional published time), the content, and optional image/link summaries.
toString() {
if (mode === 'markdown') {
return this.content as string;
}
const mixins = [];
if (this.publishedTime) {
mixins.push(`Published Time: ${this.publishedTime}`);
}
const suffixMixins = [];
if (this.images) {
const imageSummaryChunks = ['Images:'];
for (const [k, v] of Object.entries(this.images)) {
imageSummaryChunks.push(`- ![${k}](${v})`);
}
// Only the heading is present, i.e. no images were recorded.
if (imageSummaryChunks.length === 1) {
imageSummaryChunks.push('This page does not seem to contain any images.');
}
suffixMixins.push(imageSummaryChunks.join('\n'));
}
if (this.links) {
const linkSummaryChunks = ['Links/Buttons:'];
for (const [k, v] of Object.entries(this.links)) {
linkSummaryChunks.push(`- [${k}](${v})`);
}
// Only the heading is present, i.e. no links were recorded.
if (linkSummaryChunks.length === 1) {
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
}
suffixMixins.push(linkSummaryChunks.join('\n'));
}
return `Title: ${this.title}
URL Source: ${this.url}
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
Markdown Content:
${this.content}
${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}
};
// Image summary is built from the images the Turndown rule actually rendered above;
// the key is "Image <positions>[: alt]" and the value is the image URL.
if (this.threadLocal.get('withImagesSummary')) {
formatted.images =
_(imageSummary)
.toPairs()
.map(
([url, alt], i) => {
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
}
).fromPairs()
.value();
}
// Link summary: the inferred links map with keys and values swapped.
if (this.threadLocal.get('withLinksSummary')) {
formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
}
return formatted as FormattedPage;
}
@CloudHTTPv2({
name: 'crawl2',
runtime: {
@ -604,7 +130,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
})
@CloudHTTPv2({
runtime: {
memory: '4GiB',
memory: '8GiB',
cpu: 4,
timeoutSeconds: 300,
concurrency: 22,
@ -723,7 +249,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
sseStream.write({
event: 'data',
@ -754,7 +280,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.timeout === undefined) {
@ -770,7 +296,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
return formatted;
@ -782,24 +308,24 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
continue;
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.timeout === undefined) {
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
}
}
@ -807,22 +333,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
}
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
@ -1181,7 +707,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} catch (err) {
if (lastSnapshot) {
return this.formatSnapshot(mode, lastSnapshot, url);
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
}
throw err;
@ -1191,6 +717,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
throw new AssertionFailureError(`No content available`);
}
return this.formatSnapshot(mode, lastSnapshot, url);
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
}
}

View File

@ -18,6 +18,7 @@ import { appendFile } from 'fs/promises';
import { createGzip } from 'zlib';
import { getFunctions } from 'firebase-admin/functions';
import { GoogleAuth } from 'google-auth-library';
import { SnapshotFormatter } from '../services/snapshot-formatter';
dayjs.extend(require('dayjs/plugin/utc'));
@ -57,6 +58,7 @@ export class DataCrunchingHost extends RPCHost {
protected globalLogger: Logger,
protected crawler: CrawlerHost,
protected snapshotFormatter: SnapshotFormatter,
protected tempFileManager: TempFileManager,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
) {
@ -265,9 +267,9 @@ export class DataCrunchingHost extends RPCHost {
try {
const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
let formatted = await this.crawler.formatSnapshot('default', snapshot);
let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot);
if (!formatted.content) {
formatted = await this.crawler.formatSnapshot('markdown', snapshot);
formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
}
await nextDrainDeferred.promise;

View File

@ -11,11 +11,12 @@ import _ from 'lodash';
import { Request, Response } from 'express';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
import { SearchResult } from '../db/searched';
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
import { CrawlerOptions } from '../dto/scrapping-options';
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
@singleton()
@ -36,6 +37,7 @@ export class SearcherHost extends RPCHost {
protected threadLocal: AsyncContext,
protected braveSearchService: BraveSearchService,
protected crawler: CrawlerHost,
protected snapshotFormatter: SnapshotFormatter,
) {
super(...arguments);
}
@ -324,7 +326,7 @@ export class SearcherHost extends RPCHost {
if (snapshotMap.has(x)) {
return snapshotMap.get(x);
}
return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
r.title ??= upstreamSearchResult.title;
r.description = upstreamSearchResult.description;
snapshotMap.set(x, r);

View File

@ -2,18 +2,19 @@ import { container, singleton } from 'tsyringe';
import { AsyncService, marshalErrorLike } from 'civkit';
import { Logger } from '../shared/services/logger';
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
import { JSDOM, VirtualConsole } from 'jsdom';
import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
import { Threaded } from '../shared/services/threaded';
const virtualConsole = new VirtualConsole();
virtualConsole.on('error', () => void 0);
const pLinkedom = import('linkedom');
@singleton()
export class JSDomControl extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name });
linkedom!: Awaited<typeof pLinkedom>;
constructor(
protected globalLogger: Logger,
) {
@ -22,22 +23,34 @@ export class JSDomControl extends AsyncService {
/**
 * Service start-up: wait for dependencies, resolve the dynamically imported
 * `linkedom` module into `this.linkedom`, then emit 'ready'.
 */
override async init() {
    await this.dependencyReady();

    const linkedomModule = await pLinkedom;
    this.linkedom = linkedomModule;

    this.emit('ready');
}
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
targetSelector?: string | string[];
removeSelector?: string | string[];
withIframe?: boolean;
}): PageSnapshot | undefined {
}) {
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
return snapshot;
}
if (!snapshot?.html) {
return snapshot;
}
return this.actualNarrowSnapshot(snapshot, options);
}
@Threaded()
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
targetSelector?: string | string[];
removeSelector?: string | string[];
withIframe?: boolean;
}): Promise<PageSnapshot | undefined> {
const t0 = Date.now();
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const jsdom = this.linkedom.parseHTML(snapshot.html);
const allNodes: Node[] = [];
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
if (options?.withIframe) {
@ -90,16 +103,16 @@ export class JSDomControl extends AsyncService {
let rootDoc: Document;
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
rootDoc = allNodes[0] as any;
if (rootDoc.body.textContent) {
textChunks.push(rootDoc.body.textContent);
if (rootDoc.body.innerText) {
textChunks.push(rootDoc.body.innerText);
}
} else {
rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
for (const n of allNodes) {
rootDoc.body.appendChild(n);
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
if (n.textContent) {
textChunks.push(n.textContent);
if ((n as HTMLElement).innerText) {
textChunks.push((n as HTMLElement).innerText);
}
}
}
@ -111,11 +124,6 @@ export class JSDomControl extends AsyncService {
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
}
// No innerText in jsdom
// https://github.com/jsdom/jsdom/issues/1245
const textContent = textChunks.join('\n\n');
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
.flat()
@ -135,7 +143,7 @@ export class JSDomControl extends AsyncService {
title: snapshot.title || jsdom.window.document.title,
parsed,
html: rootDoc.documentElement.outerHTML,
text: cleanedText,
text: textChunks.join('\n'),
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
} as PageSnapshot;
@ -147,11 +155,13 @@ export class JSDomControl extends AsyncService {
return r;
}
@Threaded()
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
const t0 = Date.now();
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
try {
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const jsdom = this.linkedom.parseHTML(snapshot.html);
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
.map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
@ -207,9 +217,8 @@ export class JSDomControl extends AsyncService {
return extendedSnapshot;
}
snippetToElement(snippet?: string, url?: string) {
const parsed = new JSDOM(snippet || '', { url, virtualConsole });
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
return parsed.window.document.documentElement;
}

View File

@ -1,7 +1,7 @@
import os from 'os';
import fs from 'fs';
import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred, perNextTick } from 'civkit';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
@ -207,7 +207,6 @@ export class PuppeteerControl extends AsyncService {
browser!: Browser;
logger = this.globalLogger.child({ service: this.constructor.name });
private __healthCheckInterval?: NodeJS.Timeout;
private __reqCapInterval?: NodeJS.Timeout;
__loadedPage: Page[] = [];
@ -217,7 +216,7 @@ export class PuppeteerControl extends AsyncService {
livePages = new Set<Page>();
lastPageCratedAt: number = 0;
rpsCap: number = 300;
rpsCap: number = 500;
lastReqSentAt: number = 0;
requestDeferredQueue: Deferred<boolean>[] = [];
@ -235,15 +234,7 @@ export class PuppeteerControl extends AsyncService {
});
}
/**
 * Log a one-line status report: how many pages are live and how many are
 * idle, each followed by the sorted serial numbers looked up in `snMap`.
 */
briefPages() {
    const liveSerials = Array.from(this.livePages).map((page) => this.snMap.get(page)).sort();
    const idleSerials = this.__loadedPage.map((page) => this.snMap.get(page)).sort();
    const liveCount = this.livePages.size;
    const idleCount = this.__loadedPage.length;

    this.logger.info(`Status: ${liveCount} pages alive: ${liveSerials.join(', ')}; ${idleCount} idle pages: ${idleSerials.join(', ')}`);
}
override async init() {
if (this.__healthCheckInterval) {
clearInterval(this.__healthCheckInterval);
this.__healthCheckInterval = undefined;
}
if (this.__reqCapInterval) {
clearInterval(this.__reqCapInterval);
this.__reqCapInterval = undefined;
@ -276,40 +267,9 @@ export class PuppeteerControl extends AsyncService {
this.emit('ready');
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
this.newPage().then((r) => this.__loadedPage.push(r));
}
/**
 * Periodic browser health probe.
 *
 * If a page was created within the last 10 seconds, only the page status is
 * logged. Otherwise the probe tries to open a new page:
 * - on success the page is added to the idle pool (the oldest idle page is
 *   ditched once the pool holds more than 3) and the status is logged;
 * - on failure the browser process is sent SIGKILL, the `browser` property
 *   is removed from this instance and 'crippled' is emitted.
 */
@maxConcurrency(1)
async healthCheck() {
    const msSinceLastPage = Date.now() - this.lastPageCratedAt;
    if (msSinceLastPage <= 10_000) {
        this.briefPages();

        return;
    }

    const probePage = await this.newPage().catch((err) => {
        this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });

        return null;
    });

    if (!probePage) {
        // Could not open a page: tear the browser down and signal the failure.
        this.logger.warn(`Trying to clean up...`);
        this.browser.process()?.kill('SIGKILL');
        Reflect.deleteProperty(this, 'browser');
        this.emit('crippled');
        this.logger.warn(`Browser killed`);

        return;
    }

    this.__loadedPage.push(probePage);
    if (this.__loadedPage.length > 3) {
        this.ditchPage(this.__loadedPage.shift()!);
    }
    this.briefPages();
}
@perNextTick()
reqCapRoutine() {
const now = Date.now();
@ -620,7 +580,7 @@ document.addEventListener('load', handlePageLoad);
try {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
screenshot = Buffer.from(await page.screenshot());
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
@ -643,8 +603,8 @@ document.addEventListener('load', handlePageLoad);
if (salvaged) {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
screenshot = Buffer.from(await page.screenshot());
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
@ -678,8 +638,8 @@ document.addEventListener('load', handlePageLoad);
.then(async () => {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
screenshot = Buffer.from(await page.screenshot());
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
@ -716,8 +676,8 @@ document.addEventListener('load', handlePageLoad);
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
screenshot = Buffer.from(await page.screenshot());
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {

View File

@ -0,0 +1,539 @@
import { randomUUID } from 'crypto';
import { container, singleton } from 'tsyringe';
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
import TurndownService from 'turndown';
import { Logger } from '../shared/services/logger';
import { PageSnapshot } from './puppeteer';
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
import { AsyncContext } from '../shared/services/async-context';
import { Threaded } from '../shared/services/threaded';
import { JSDomControl } from './jsdom';
import { AltTextService } from './alt-text';
import { PDFExtractor } from './pdf-extract';
import { cleanAttribute } from '../utils/misc';
import _ from 'lodash';
/**
 * Result object produced by SnapshotFormatter.formatSnapshot.
 * Which optional fields are present depends on the requested mode.
 */
export interface FormattedPage {
// Page title; formatSnapshot prefers the parsed title over the raw snapshot title.
title?: string;
description?: string;
// The nominal URL when one was supplied, otherwise the snapshot's href.
url?: string;
// Converted page content (markdown, or the plain-text fallback).
content?: string;
// Taken from snapshot.parsed.publishedTime when available.
publishedTime?: string;
// Raw snapshot HTML ('html' and 'pageshot' modes).
html?: string;
// Snapshot text ('text' mode).
text?: string;
// Signed download URL of the uploaded viewport screenshot ('screenshot' mode).
screenshotUrl?: string;
screenshot?: Buffer;
// Signed download URL of the uploaded full-page screenshot ('pageshot' mode).
pageshotUrl?: string;
pageshot?: Buffer;
// Links summary, built by inverting the links map inferred from the snapshot.
links?: { [k: string]: string; };
// Images summary: "Image N[: alt]" label -> image URL.
images?: { [k: string]: string; };
// Not populated in this file; presumably token accounting filled in by callers — confirm.
usage?: {
total_tokens?: number;
totalTokens?: number;
tokens?: number;
};
// Plain-text rendering of the result; formatSnapshot defines it as a non-enumerable property.
textRepresentation?: string;
// Disposal hook; formatSnapshot installs a no-op on the markdown/default result.
[Symbol.dispose]: () => void;
}
// Shared HashManager configured with 'md5' / 'hex'; used in this file to derive
// pseudo blob: URLs from the content of data: image URLs.
export const md5Hasher = new HashManager('md5', 'hex');
@singleton()
export class SnapshotFormatter extends AsyncService {
// Child logger tagged with this service's class name.
logger = this.globalLogger.child({ service: this.constructor.name });
// Turndown plugins applied to the markdown converter in formatSnapshot (GFM tables).
turnDownPlugins = [require('turndown-plugin-gfm').tables];
/**
 * All dependencies are declared as protected parameter properties; the class
 * is a tsyringe @singleton and is resolved through the container.
 */
constructor(
protected globalLogger: Logger,
protected jsdomControl: JSDomControl,
protected altTextService: AltTextService,
protected pdfExtractor: PDFExtractor,
protected threadLocal: AsyncContext,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
) {
// Forward every injected dependency to AsyncService
// (presumably so dependencyReady() can wait on them — confirm against civkit).
super(...arguments);
}
/**
 * Service start-up: waits for dependencyReady() to resolve, then emits 'ready'.
 */
override async init() {
await this.dependencyReady();
this.emit('ready');
}
/**
 * Turn a page snapshot into a FormattedPage for the requested output mode.
 *
 * Modes:
 * - 'screenshot' / 'pageshot': upload the image buffer (unless a URL is
 *   already present on the snapshot) and return its signed download URL.
 * - 'html': return the raw snapshot HTML.
 * - 'text': return the snapshot text (or extracted PDF text).
 * - 'markdown': return the whole page converted to markdown.
 * - anything else: markdown of the Readability-parsed content when it looks
 *   good enough, wrapped with title / URL / optional summaries.
 *
 * Side effects: `snapshot` is mutated (screenshotUrl / pageshotUrl are filled
 * in after an upload; title, text and parsed are overwritten when the snapshot
 * turns out to be a PDF).
 *
 * @param mode Output mode, see above.
 * @param snapshot Page snapshot to format.
 * @param nominalUrl URL reported to the caller and used as the base for
 *   relative references when the snapshot has no `rebase`.
 * @param urlValidMs Lifetime of signed screenshot URLs in milliseconds
 *   (default: 4 hours).
 * @returns The formatted page; `textRepresentation` is attached as a
 *   non-enumerable property.
 */
@Threaded()
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
    screenshotUrl?: string;
    pageshotUrl?: string;
}, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
    const t0 = Date.now();
    // Every exit path reports how long formatting took.
    const logTiming = () => {
        const dt = Date.now() - t0;
        this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
    };

    if (mode === 'screenshot') {
        if (snapshot.screenshot && !snapshot.screenshotUrl) {
            const fid = `instant-screenshots/${randomUUID()}`;
            await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
        }
        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            // html: snapshot.html,
            screenshotUrl: snapshot.screenshotUrl,
        };
        Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false });
        logTiming();

        return f as FormattedPage;
    }

    if (mode === 'pageshot') {
        if (snapshot.pageshot && !snapshot.pageshotUrl) {
            const fid = `instant-screenshots/${randomUUID()}`;
            await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
        }
        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            html: snapshot.html,
            pageshotUrl: snapshot.pageshotUrl,
        } as FormattedPage;
        Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false });
        logTiming();

        return f;
    }

    if (mode === 'html') {
        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            html: snapshot.html,
        } as FormattedPage;
        Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false });
        logTiming();

        return f;
    }

    // A snapshot that carries PDFs but no title is treated as a PDF document:
    // its extracted text and metadata replace the (empty) page content.
    let pdfMode = false;
    if (snapshot.pdfs?.length && !snapshot.title) {
        const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
            this.threadLocal.get('cacheTolerance')
        );
        if (pdf) {
            pdfMode = true;
            snapshot.title = pdf.meta?.Title;
            snapshot.text = pdf.text || snapshot.text;
            snapshot.parsed = {
                content: pdf.content,
                textContent: pdf.content,
                length: pdf.content?.length,
                byline: pdf.meta?.Author,
                lang: pdf.meta?.Language || undefined,
                title: pdf.meta?.Title,
                publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
            };
        }
    }

    if (mode === 'text') {
        const f = {
            ...this.getGeneralSnapshotMixins(snapshot),
            text: snapshot.text,
        } as FormattedPage;
        Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false });
        logTiming();

        return f;
    }

    const imgDataUrlToObjectUrl = !this.threadLocal.get('keepImgDataUrl');
    let contentText = '';
    const imageSummary = {} as { [k: string]: string; };
    const imageIdxTrack = new Map<string, number[]>();
    const uid = this.threadLocal.get('uid');

    // Single-pass block: `break` leaves as soon as contentText is settled.
    do {
        if (pdfMode) {
            contentText = snapshot.parsed?.content || snapshot.text;
            break;
        }

        if (
            snapshot.maxElemDepth! > 256 ||
            (!uid && snapshot.elemCount! > 10_000) ||
            snapshot.elemCount! > 70_000
        ) {
            this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
            contentText = snapshot.text;
            break;
        }

        const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
        let toBeTurnedToMd = jsDomElementOfHTML;
        let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
        if (mode !== 'markdown' && snapshot.parsed?.content) {
            const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
            const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
            const par2 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed);

            // If Readability did its job
            if (par2.length >= 0.3 * par1.length) {
                turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                toBeTurnedToMd = jsDomElementOfParsed;
            }
        }

        for (const plugin of this.turnDownPlugins) {
            turnDownService = turnDownService.use(plugin);
        }

        // Optionally generate alt texts, keyed by the (trimmed) image src.
        const urlToAltMap: { [k: string]: string | undefined; } = {};
        if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
            const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
                const r = await this.altTextService.getAltText(x).catch((err: any) => {
                    this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });

                    return undefined;
                });
                if (r && x.src) {
                    urlToAltMap[x.src.trim()] = r;
                }
            });

            await Promise.all(tasks);
        }

        let imgIdx = 0;
        turnDownService.addRule('img-generated-alt', {
            filter: 'img',
            replacement: (_content, node: any) => {
                // Prefer `src`; fall back to a non-data `data-src` (lazy-loaded images).
                let linkPreferredSrc = (node.getAttribute('src') || '').trim();
                if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
                    const dataSrc = (node.getAttribute('data-src') || '').trim();
                    if (dataSrc && !dataSrc.startsWith('data:')) {
                        linkPreferredSrc = dataSrc;
                    }
                }
                if (!linkPreferredSrc) {
                    // An empty reference would resolve to the base (page) URL itself
                    // and yield a bogus image link, so the image is skipped instead.
                    return '';
                }

                let src: string | undefined;
                try {
                    src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
                } catch (_err) {
                    void 0;
                }
                if (!src) {
                    return '';
                }

                const alt = cleanAttribute(node.getAttribute('alt'));
                const imgSerial = ++imgIdx;
                const idxArr = imageIdxTrack.get(src) ?? [];
                idxArr.push(imgSerial);
                imageIdxTrack.set(src, idxArr);

                // A generated alt text wins over the alt attribute.
                const caption = urlToAltMap[src] || alt;
                imageSummary[src] = caption || '';

                let target: string = src;
                if (src.startsWith('data:') && imgDataUrlToObjectUrl) {
                    // Replace bulky data: URLs by a short pseudo blob: URL derived from their hash.
                    const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
                    mappedUrl.protocol = 'blob:';
                    target = mappedUrl.toString();
                }

                return caption ? `![Image ${imgSerial}: ${caption}](${target})` : `![Image ${imgSerial}](${target})`;
            }
        });

        if (toBeTurnedToMd) {
            try {
                contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
            } catch (err) {
                this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                try {
                    contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
                } catch (err2) {
                    this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                }
            }
        }

        // Retry on the raw HTML string when nothing was produced, or when the
        // result of a partial (parsed) conversion still looks like raw HTML.
        // The parentheses spell out the precedence the original expression had.
        const looksLikeRawHtml = contentText.startsWith('<') && contentText.endsWith('>');
        if (!contentText || (looksLikeRawHtml && toBeTurnedToMd !== jsDomElementOfHTML)) {
            try {
                contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
            } catch (err) {
                this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                try {
                    contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
                } catch (err2) {
                    this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                }
            }
        }

        // NOTE(review): this last check uses `||` where the one above uses `&&`;
        // kept as is — confirm whether the looser test is intended.
        if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
            contentText = snapshot.text;
        }
    } while (false);

    const cleanText = (contentText || '').trim();

    const formatted: FormattedPage = {
        title: (snapshot.parsed?.title || snapshot.title || '').trim(),
        url: nominalUrl?.toString() || snapshot.href?.trim(),
        content: cleanText,
        publishedTime: snapshot.parsed?.publishedTime || undefined,
        [Symbol.dispose]: () => { },
    };

    if (this.threadLocal.get('withImagesSummary')) {
        formatted.images =
            _(imageSummary)
                .toPairs()
                .map(
                    ([url, alt], i) => {
                        return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
                    }
                ).fromPairs()
                .value();
    }
    if (this.threadLocal.get('withLinksSummary')) {
        formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
    }

    // Plain-text rendering: bare markdown in 'markdown' mode, otherwise the
    // content framed by title / URL headers and the optional summaries.
    const renderText = (): string => {
        if (mode === 'markdown') {
            return formatted.content as string;
        }

        const mixins: string[] = [];
        if (formatted.publishedTime) {
            mixins.push(`Published Time: ${formatted.publishedTime}`);
        }

        const suffixMixins: string[] = [];
        if (formatted.images) {
            const imageSummaryChunks = ['Images:'];
            for (const [k, v] of Object.entries(formatted.images)) {
                imageSummaryChunks.push(`- ![${k}](${v})`);
            }
            if (imageSummaryChunks.length === 1) {
                imageSummaryChunks.push('This page does not seem to contain any images.');
            }
            suffixMixins.push(imageSummaryChunks.join('\n'));
        }
        if (formatted.links) {
            const linkSummaryChunks = ['Links/Buttons:'];
            for (const [k, v] of Object.entries(formatted.links)) {
                linkSummaryChunks.push(`- [${k}](${v})`);
            }
            if (linkSummaryChunks.length === 1) {
                linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
            }
            suffixMixins.push(linkSummaryChunks.join('\n'));
        }

        return `Title: ${formatted.title}
URL Source: ${formatted.url}
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
Markdown Content:
${formatted.content}
${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
    };

    Object.defineProperty(formatted, 'textRepresentation', { value: renderText(), enumerable: false });
    logTiming();

    return formatted as FormattedPage;
}
/**
 * Build the optional summaries shared by the simple output modes.
 *
 * Depending on the 'withImagesSummary' / 'withLinksSummary' flags of the
 * current async context, the returned object carries:
 * - `images`: "Image N[,M...][: alt]" label -> image src, where the numbers
 *   are the 1-based positions at which that src occurs;
 * - `links`: the inferred links map, inverted.
 *
 * The snapshot is inferred at most once, and only when a summary is requested.
 *
 * @param snapshot Page snapshot to summarise.
 * @returns An object holding zero, one or both of `images` and `links`.
 */
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
    const mixin: any = {};
    let inferred;

    if (this.threadLocal.get('withImagesSummary')) {
        inferred = this.jsdomControl.inferSnapshot(snapshot);

        const altBySrc: { [k: string]: string; } = {};
        const serialsBySrc = new Map<string, number[]>();
        let serial = 0;
        for (const img of inferred.imgs) {
            serial += 1;
            const seen = serialsBySrc.get(img.src) ?? [];
            seen.push(serial);
            serialsBySrc.set(img.src, seen);
            // The last occurrence of a src decides its alt text.
            altBySrc[img.src] = img.alt || '';
        }

        const labelledPairs = _.toPairs(altBySrc).map(([url, alt], i) => {
            const serials = (serialsBySrc.get(url) || [i + 1]).join(',');
            const label = `Image ${serials}${alt ? `: ${alt}` : ''}`;

            return [label, url];
        });
        mixin.images = _.fromPairs(labelledPairs);
    }

    if (this.threadLocal.get('withLinksSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);
        const inferredLinks = inferred.links || {};
        mixin.links = _.invert(inferredLinks);
    }

    return mixin;
}
/**
 * Create a TurndownService (fenced code blocks, preformatted code kept)
 * preloaded with this project's conversion rules.
 *
 * @param options.noRules When truthy, the clean-up rules (dropping
 *   meta/style/script/etc. and svg, rendering <title> as a heading) are not
 *   installed.
 * @param options.url Base URL used to absolutize link targets and to derive
 *   the origin of pseudo blob: URLs.
 * @param options.imgDataUrlToObjectUrl When true, images with a data: src are
 *   rendered with a short `blob:` URL built from the MD5 of the data URL.
 * @returns The configured TurndownService instance.
 */
getTurndown(options?: {
    noRules?: boolean | string,
    url?: string | URL;
    imgDataUrlToObjectUrl?: boolean;
}) {
    const turnDownService = new TurndownService({
        codeBlockStyle: 'fenced',
        preformattedCode: true,
    } as any);
    if (!options?.noRules) {
        turnDownService.addRule('remove-irrelevant', {
            filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
            replacement: () => ''
        });
        turnDownService.addRule('truncate-svg', {
            filter: 'svg' as any,
            replacement: () => ''
        });
        turnDownService.addRule('title-as-h1', {
            filter: ['title'],
            replacement: (innerText) => `${innerText}\n===============\n`
        });
    }

    if (options?.imgDataUrlToObjectUrl) {
        turnDownService.addRule('data-url-to-pseudo-object-url', {
            filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
            replacement: (_content, node: any) => {
                const src = (node.getAttribute('src') || '').trim();
                const alt = cleanAttribute(node.getAttribute('alt')) || '';
                const digest = md5Hasher.hash(src);

                if (options.url) {
                    try {
                        const refUrl = new URL(options.url);
                        const mappedUrl = new URL(`blob:${refUrl.origin}/${digest}`);

                        return `![${alt}](${mappedUrl})`;
                    } catch (_err) {
                        // An unparsable base URL must not abort the whole conversion;
                        // fall through to the origin-less form below.
                        void 0;
                    }
                }

                return `![${alt}](blob:${digest})`;
            }
        });
    }

    turnDownService.addRule('improved-paragraph', {
        filter: 'p',
        replacement: (innerText) => {
            const trimmed = innerText.trim();
            if (!trimmed) {
                return '';
            }

            // Collapse runs of 3+ newlines so paragraphs are separated by one blank line.
            return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
        }
    });
    turnDownService.addRule('improved-inline-link', {
        // `options` here is Turndown's own options object (it shadows the outer one).
        filter: function (node, options) {
            return Boolean(
                options.linkStyle === 'inlined' &&
                node.nodeName === 'A' &&
                node.getAttribute('href')
            );
        },

        replacement: function (content, node: any) {
            const rawHref: string = node.getAttribute('href') || '';
            let title = cleanAttribute(node.getAttribute('title'));
            if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';

            const fixedContent = content.replace(/\s+/g, ' ').trim();
            let fixedHref = rawHref.replace(/\s+/g, '').trim();
            if (options?.url) {
                try {
                    fixedHref = new URL(fixedHref, options.url).toString();
                } catch (_err) {
                    void 0;
                }
            }
            // Escape parentheses only AFTER URL resolution: the URL parser treats a
            // backslash in an http(s) path as '/', so escaping first would turn
            // `/wiki/Foo_(bar)` into `/wiki/Foo_/(bar/)`.
            fixedHref = fixedHref.replace(/([()])/g, '\\$1');

            return `[${fixedContent}](${fixedHref}${title || ''})`;
        }
    });
    turnDownService.addRule('improved-code', {
        // Every <code> except the sole child of a <pre> (that case is a code block).
        filter: function (node: any) {
            const hasSiblings = node.previousSibling || node.nextSibling;
            const isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;

            return node.nodeName === 'CODE' && !isCodeBlock;
        },

        replacement: function (inputContent: any) {
            if (!inputContent) return '';
            const content = inputContent;

            // Pick a backtick run that does not occur inside the content...
            let delimiter = '`';
            const matches = content.match(/`+/gm) || [];
            while (matches.includes(delimiter)) delimiter = delimiter + '`';
            // ...but multi-line code is always fenced.
            if (content.includes('\n')) {
                delimiter = '```';
            }

            const extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';

            return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
        }
    });

    return turnDownService;
}
}
// Resolve the singleton through the tsyringe container at module load time
// and expose it as the module's default export.
const snapshotFormatter = container.resolve(SnapshotFormatter);
export default snapshotFormatter;