mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-14 14:35:53 +08:00
fix: performance issue of jsdom
This commit is contained in:
parent
5171e5f94b
commit
94170db060
659
backend/functions/package-lock.json
generated
659
backend/functions/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -34,7 +34,7 @@
|
|||||||
"archiver": "^6.0.1",
|
"archiver": "^6.0.1",
|
||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"civkit": "^0.6.5-047c0d8",
|
"civkit": "^0.7.0-0f8889a",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
@ -43,13 +43,13 @@
|
|||||||
"firebase-functions": "^4.9.0",
|
"firebase-functions": "^4.9.0",
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
"jsdom": "^24.0.0",
|
|
||||||
"langdetect": "^0.2.1",
|
"langdetect": "^0.2.1",
|
||||||
|
"linkedom": "^0.18.4",
|
||||||
"maxmind": "^4.3.18",
|
"maxmind": "^4.3.18",
|
||||||
"minio": "^7.1.3",
|
"minio": "^7.1.3",
|
||||||
"openai": "^4.20.0",
|
"openai": "^4.20.0",
|
||||||
"pdfjs-dist": "^4.2.67",
|
"pdfjs-dist": "^4.2.67",
|
||||||
"puppeteer": "^22.7.1",
|
"puppeteer": "^23.3.0",
|
||||||
"puppeteer-extra": "^3.3.6",
|
"puppeteer-extra": "^3.3.6",
|
||||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||||
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
||||||
@ -68,7 +68,7 @@
|
|||||||
"@types/bcrypt": "^5.0.0",
|
"@types/bcrypt": "^5.0.0",
|
||||||
"@types/cors": "^2.8.17",
|
"@types/cors": "^2.8.17",
|
||||||
"@types/generic-pool": "^3.8.1",
|
"@types/generic-pool": "^3.8.1",
|
||||||
"@types/node": "^18",
|
"@types/node": "^20.14.13",
|
||||||
"@types/set-cookie-parser": "^2.4.7",
|
"@types/set-cookie-parser": "^2.4.7",
|
||||||
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
||||||
"@typescript-eslint/parser": "^5.12.0",
|
"@typescript-eslint/parser": "^5.12.0",
|
||||||
@ -77,7 +77,7 @@
|
|||||||
"eslint-plugin-import": "^2.25.4",
|
"eslint-plugin-import": "^2.25.4",
|
||||||
"firebase-functions-test": "^3.0.0",
|
"firebase-functions-test": "^3.0.0",
|
||||||
"replicate": "^0.16.1",
|
"replicate": "^0.16.1",
|
||||||
"typescript": "^5.1.6"
|
"typescript": "^5.5.4"
|
||||||
},
|
},
|
||||||
"private": true,
|
"private": true,
|
||||||
"exports": {
|
"exports": {
|
||||||
|
@ -1,7 +1,6 @@
|
|||||||
import {
|
import {
|
||||||
assignTransferProtocolMeta, marshalErrorLike,
|
assignTransferProtocolMeta, marshalErrorLike,
|
||||||
RPCHost, RPCReflection,
|
RPCHost, RPCReflection,
|
||||||
HashManager,
|
|
||||||
AssertionFailureError, ParamValidationError, Defer,
|
AssertionFailureError, ParamValidationError, Defer,
|
||||||
} from 'civkit';
|
} from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
@ -11,22 +10,17 @@ import _ from 'lodash';
|
|||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||||
import { AltTextService } from '../services/alt-text';
|
|
||||||
import TurndownService from 'turndown';
|
|
||||||
import { Crawled } from '../db/crawled';
|
import { Crawled } from '../db/crawled';
|
||||||
import { cleanAttribute } from '../utils/misc';
|
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
|
|
||||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||||
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
||||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||||
import { PDFExtractor } from '../services/pdf-extract';
|
|
||||||
import { DomainBlockade } from '../db/domain-blockade';
|
import { DomainBlockade } from '../db/domain-blockade';
|
||||||
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
||||||
import { JSDomControl } from '../services/jsdom';
|
import { JSDomControl } from '../services/jsdom';
|
||||||
|
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||||
const md5Hasher = new HashManager('md5', 'hex');
|
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean;
|
withIframe?: boolean;
|
||||||
@ -35,29 +29,6 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
|||||||
keepImgDataUrl?: boolean;
|
keepImgDataUrl?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface FormattedPage {
|
|
||||||
title?: string;
|
|
||||||
description?: string;
|
|
||||||
url?: string;
|
|
||||||
content?: string;
|
|
||||||
publishedTime?: string;
|
|
||||||
html?: string;
|
|
||||||
text?: string;
|
|
||||||
screenshotUrl?: string;
|
|
||||||
screenshot?: Buffer;
|
|
||||||
pageshotUrl?: string;
|
|
||||||
pageshot?: Buffer;
|
|
||||||
links?: { [k: string]: string; };
|
|
||||||
images?: { [k: string]: string; };
|
|
||||||
usage?: {
|
|
||||||
total_tokens?: number;
|
|
||||||
totalTokens?: number;
|
|
||||||
tokens?: number;
|
|
||||||
};
|
|
||||||
|
|
||||||
toString: () => string;
|
|
||||||
}
|
|
||||||
|
|
||||||
const indexProto = {
|
const indexProto = {
|
||||||
toString: function (): string {
|
toString: function (): string {
|
||||||
return _(this)
|
return _(this)
|
||||||
@ -72,8 +43,6 @@ const indexProto = {
|
|||||||
export class CrawlerHost extends RPCHost {
|
export class CrawlerHost extends RPCHost {
|
||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
|
||||||
|
|
||||||
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
||||||
cacheValidMs = 1000 * 3600;
|
cacheValidMs = 1000 * 3600;
|
||||||
urlValidMs = 1000 * 3600 * 4;
|
urlValidMs = 1000 * 3600 * 4;
|
||||||
@ -83,8 +52,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
protected jsdomControl: JSDomControl,
|
protected jsdomControl: JSDomControl,
|
||||||
protected altTextService: AltTextService,
|
protected snapshotFormatter: SnapshotFormatter,
|
||||||
protected pdfExtractor: PDFExtractor,
|
|
||||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
protected rateLimitControl: RateLimitControl,
|
protected rateLimitControl: RateLimitControl,
|
||||||
protected threadLocal: AsyncContext,
|
protected threadLocal: AsyncContext,
|
||||||
@ -148,448 +116,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return indexObject;
|
return indexObject;
|
||||||
}
|
}
|
||||||
|
|
||||||
getTurndown(options?: {
|
|
||||||
noRules?: boolean | string,
|
|
||||||
url?: string | URL;
|
|
||||||
imgDataUrlToObjectUrl?: boolean;
|
|
||||||
}) {
|
|
||||||
const turnDownService = new TurndownService({
|
|
||||||
codeBlockStyle: 'fenced',
|
|
||||||
preformattedCode: true,
|
|
||||||
} as any);
|
|
||||||
if (!options?.noRules) {
|
|
||||||
turnDownService.addRule('remove-irrelevant', {
|
|
||||||
filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
|
|
||||||
replacement: () => ''
|
|
||||||
});
|
|
||||||
turnDownService.addRule('truncate-svg', {
|
|
||||||
filter: 'svg' as any,
|
|
||||||
replacement: () => ''
|
|
||||||
});
|
|
||||||
turnDownService.addRule('title-as-h1', {
|
|
||||||
filter: ['title'],
|
|
||||||
replacement: (innerText) => `${innerText}\n===============\n`
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
if (options?.imgDataUrlToObjectUrl) {
|
|
||||||
turnDownService.addRule('data-url-to-pseudo-object-url', {
|
|
||||||
filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
|
|
||||||
replacement: (_content, node: any) => {
|
|
||||||
const src = (node.getAttribute('src') || '').trim();
|
|
||||||
const alt = cleanAttribute(node.getAttribute('alt')) || '';
|
|
||||||
|
|
||||||
if (options.url) {
|
|
||||||
const refUrl = new URL(options.url);
|
|
||||||
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);
|
|
||||||
|
|
||||||
return ``;
|
|
||||||
}
|
|
||||||
|
|
||||||
return `})`;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
turnDownService.addRule('improved-paragraph', {
|
|
||||||
filter: 'p',
|
|
||||||
replacement: (innerText) => {
|
|
||||||
const trimmed = innerText.trim();
|
|
||||||
if (!trimmed) {
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
|
|
||||||
return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
turnDownService.addRule('improved-inline-link', {
|
|
||||||
filter: function (node, options) {
|
|
||||||
return Boolean(
|
|
||||||
options.linkStyle === 'inlined' &&
|
|
||||||
node.nodeName === 'A' &&
|
|
||||||
node.getAttribute('href')
|
|
||||||
);
|
|
||||||
},
|
|
||||||
|
|
||||||
replacement: function (content, node: any) {
|
|
||||||
let href = node.getAttribute('href');
|
|
||||||
if (href) href = href.replace(/([()])/g, '\\$1');
|
|
||||||
let title = cleanAttribute(node.getAttribute('title'));
|
|
||||||
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
|
|
||||||
|
|
||||||
const fixedContent = content.replace(/\s+/g, ' ').trim();
|
|
||||||
let fixedHref = href.replace(/\s+/g, '').trim();
|
|
||||||
if (options?.url) {
|
|
||||||
try {
|
|
||||||
fixedHref = new URL(fixedHref, options.url).toString();
|
|
||||||
} catch (_err) {
|
|
||||||
void 0;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return `[${fixedContent}](${fixedHref}${title || ''})`;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
turnDownService.addRule('improved-code', {
|
|
||||||
filter: function (node: any) {
|
|
||||||
let hasSiblings = node.previousSibling || node.nextSibling;
|
|
||||||
let isCodeBlock = node.parentNode.nodeName === 'PRE' && !hasSiblings;
|
|
||||||
|
|
||||||
return node.nodeName === 'CODE' && !isCodeBlock;
|
|
||||||
},
|
|
||||||
|
|
||||||
replacement: function (inputContent: any) {
|
|
||||||
if (!inputContent) return '';
|
|
||||||
let content = inputContent;
|
|
||||||
|
|
||||||
let delimiter = '`';
|
|
||||||
let matches = content.match(/`+/gm) || [];
|
|
||||||
while (matches.indexOf(delimiter) !== -1) delimiter = delimiter + '`';
|
|
||||||
if (content.includes('\n')) {
|
|
||||||
delimiter = '```';
|
|
||||||
}
|
|
||||||
|
|
||||||
let extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';
|
|
||||||
|
|
||||||
return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
return turnDownService;
|
|
||||||
}
|
|
||||||
|
|
||||||
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
|
|
||||||
let inferred;
|
|
||||||
const mixin: any = {};
|
|
||||||
if (this.threadLocal.get('withImagesSummary')) {
|
|
||||||
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
|
||||||
const imageSummary = {} as { [k: string]: string; };
|
|
||||||
const imageIdxTrack = new Map<string, number[]>();
|
|
||||||
|
|
||||||
let imgIdx = 0;
|
|
||||||
|
|
||||||
for (const img of inferred.imgs) {
|
|
||||||
const imgSerial = ++imgIdx;
|
|
||||||
const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : [];
|
|
||||||
idxArr.push(imgSerial);
|
|
||||||
imageIdxTrack.set(img.src, idxArr);
|
|
||||||
imageSummary[img.src] = img.alt || '';
|
|
||||||
}
|
|
||||||
|
|
||||||
mixin.images =
|
|
||||||
_(imageSummary)
|
|
||||||
.toPairs()
|
|
||||||
.map(
|
|
||||||
([url, alt], i) => {
|
|
||||||
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
|
||||||
}
|
|
||||||
).fromPairs()
|
|
||||||
.value();
|
|
||||||
}
|
|
||||||
if (this.threadLocal.get('withLinksSummary')) {
|
|
||||||
inferred ??= this.jsdomControl.inferSnapshot(snapshot);
|
|
||||||
mixin.links = _.invert(inferred.links || {});
|
|
||||||
}
|
|
||||||
|
|
||||||
return mixin;
|
|
||||||
}
|
|
||||||
|
|
||||||
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
|
|
||||||
screenshotUrl?: string;
|
|
||||||
pageshotUrl?: string;
|
|
||||||
}, nominalUrl?: URL) {
|
|
||||||
if (mode === 'screenshot') {
|
|
||||||
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
|
||||||
const fid = `instant-screenshots/${randomUUID()}`;
|
|
||||||
await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
|
|
||||||
metadata: {
|
|
||||||
contentType: 'image/png',
|
|
||||||
}
|
|
||||||
});
|
|
||||||
snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
...this.getGeneralSnapshotMixins(snapshot),
|
|
||||||
// html: snapshot.html,
|
|
||||||
screenshotUrl: snapshot.screenshotUrl,
|
|
||||||
toString() {
|
|
||||||
return this.screenshotUrl;
|
|
||||||
}
|
|
||||||
} as FormattedPage;
|
|
||||||
}
|
|
||||||
if (mode === 'pageshot') {
|
|
||||||
if (snapshot.pageshot && !snapshot.pageshotUrl) {
|
|
||||||
const fid = `instant-screenshots/${randomUUID()}`;
|
|
||||||
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
|
|
||||||
metadata: {
|
|
||||||
contentType: 'image/png',
|
|
||||||
}
|
|
||||||
});
|
|
||||||
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
...this.getGeneralSnapshotMixins(snapshot),
|
|
||||||
html: snapshot.html,
|
|
||||||
pageshotUrl: snapshot.pageshotUrl,
|
|
||||||
toString() {
|
|
||||||
return this.pageshotUrl;
|
|
||||||
}
|
|
||||||
} as FormattedPage;
|
|
||||||
}
|
|
||||||
if (mode === 'html') {
|
|
||||||
return {
|
|
||||||
...this.getGeneralSnapshotMixins(snapshot),
|
|
||||||
html: snapshot.html,
|
|
||||||
toString() {
|
|
||||||
return this.html;
|
|
||||||
}
|
|
||||||
} as FormattedPage;
|
|
||||||
}
|
|
||||||
|
|
||||||
let pdfMode = false;
|
|
||||||
if (snapshot.pdfs?.length && !snapshot.title) {
|
|
||||||
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
|
|
||||||
this.threadLocal.get('cacheTolerance')
|
|
||||||
);
|
|
||||||
if (pdf) {
|
|
||||||
pdfMode = true;
|
|
||||||
snapshot.title = pdf.meta?.Title;
|
|
||||||
snapshot.text = pdf.text || snapshot.text;
|
|
||||||
snapshot.parsed = {
|
|
||||||
content: pdf.content,
|
|
||||||
textContent: pdf.content,
|
|
||||||
length: pdf.content?.length,
|
|
||||||
byline: pdf.meta?.Author,
|
|
||||||
lang: pdf.meta?.Language || undefined,
|
|
||||||
title: pdf.meta?.Title,
|
|
||||||
publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (mode === 'text') {
|
|
||||||
return {
|
|
||||||
...this.getGeneralSnapshotMixins(snapshot),
|
|
||||||
text: snapshot.text,
|
|
||||||
toString() {
|
|
||||||
return this.text;
|
|
||||||
}
|
|
||||||
} as FormattedPage;
|
|
||||||
}
|
|
||||||
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
|
|
||||||
|
|
||||||
let contentText = '';
|
|
||||||
const imageSummary = {} as { [k: string]: string; };
|
|
||||||
const imageIdxTrack = new Map<string, number[]>();
|
|
||||||
const uid = this.threadLocal.get('uid');
|
|
||||||
do {
|
|
||||||
if (pdfMode) {
|
|
||||||
contentText = snapshot.parsed?.content || snapshot.text;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
snapshot.maxElemDepth! > 256 ||
|
|
||||||
(!uid && snapshot.elemCount! > 10_000) ||
|
|
||||||
snapshot.elemCount! > 70_000
|
|
||||||
) {
|
|
||||||
this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
|
|
||||||
contentText = snapshot.text;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
|
||||||
let toBeTurnedToMd = jsDomElementOfHTML;
|
|
||||||
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
|
||||||
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
|
||||||
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
|
||||||
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
|
||||||
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
|
||||||
|
|
||||||
// If Readability did its job
|
|
||||||
if (par2.length >= 0.3 * par1.length) {
|
|
||||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
|
||||||
if (snapshot.parsed.content) {
|
|
||||||
toBeTurnedToMd = jsDomElementOfParsed;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
for (const plugin of this.turnDownPlugins) {
|
|
||||||
turnDownService = turnDownService.use(plugin);
|
|
||||||
}
|
|
||||||
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
|
||||||
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
|
||||||
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
|
||||||
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
|
||||||
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
|
||||||
return undefined;
|
|
||||||
});
|
|
||||||
if (r && x.src) {
|
|
||||||
urlToAltMap[x.src.trim()] = r;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
await Promise.all(tasks);
|
|
||||||
}
|
|
||||||
let imgIdx = 0;
|
|
||||||
turnDownService.addRule('img-generated-alt', {
|
|
||||||
filter: 'img',
|
|
||||||
replacement: (_content, node: any) => {
|
|
||||||
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
|
||||||
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
|
||||||
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
|
||||||
if (dataSrc && !dataSrc.startsWith('data:')) {
|
|
||||||
linkPreferredSrc = dataSrc;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
let src;
|
|
||||||
try {
|
|
||||||
src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
|
|
||||||
} catch (_err) {
|
|
||||||
void 0;
|
|
||||||
}
|
|
||||||
const alt = cleanAttribute(node.getAttribute('alt'));
|
|
||||||
if (!src) {
|
|
||||||
return '';
|
|
||||||
}
|
|
||||||
const mapped = urlToAltMap[src];
|
|
||||||
const imgSerial = ++imgIdx;
|
|
||||||
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
|
||||||
idxArr.push(imgSerial);
|
|
||||||
imageIdxTrack.set(src, idxArr);
|
|
||||||
|
|
||||||
if (mapped) {
|
|
||||||
imageSummary[src] = mapped || alt;
|
|
||||||
|
|
||||||
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
|
||||||
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
|
||||||
mappedUrl.protocol = 'blob:';
|
|
||||||
|
|
||||||
return ``;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ``;
|
|
||||||
}
|
|
||||||
|
|
||||||
imageSummary[src] = alt || '';
|
|
||||||
|
|
||||||
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
|
||||||
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
|
||||||
mappedUrl.protocol = 'blob:';
|
|
||||||
|
|
||||||
return alt ? `` : ``;
|
|
||||||
}
|
|
||||||
|
|
||||||
return alt ? `` : ``;
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
if (toBeTurnedToMd) {
|
|
||||||
try {
|
|
||||||
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
|
||||||
} catch (err) {
|
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
|
||||||
try {
|
|
||||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
|
||||||
} catch (err2) {
|
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
|
||||||
&& toBeTurnedToMd !== jsDomElementOfHTML
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
|
||||||
} catch (err) {
|
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
|
||||||
try {
|
|
||||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
|
||||||
} catch (err2) {
|
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
|
||||||
contentText = snapshot.text;
|
|
||||||
}
|
|
||||||
} while (false);
|
|
||||||
|
|
||||||
const cleanText = (contentText || '').trim();
|
|
||||||
|
|
||||||
const formatted: FormattedPage = {
|
|
||||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
|
||||||
url: nominalUrl?.toString() || snapshot.href?.trim(),
|
|
||||||
content: cleanText,
|
|
||||||
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
|
||||||
|
|
||||||
toString() {
|
|
||||||
if (mode === 'markdown') {
|
|
||||||
return this.content as string;
|
|
||||||
}
|
|
||||||
|
|
||||||
const mixins = [];
|
|
||||||
if (this.publishedTime) {
|
|
||||||
mixins.push(`Published Time: ${this.publishedTime}`);
|
|
||||||
}
|
|
||||||
const suffixMixins = [];
|
|
||||||
if (this.images) {
|
|
||||||
const imageSummaryChunks = ['Images:'];
|
|
||||||
for (const [k, v] of Object.entries(this.images)) {
|
|
||||||
imageSummaryChunks.push(`- `);
|
|
||||||
}
|
|
||||||
if (imageSummaryChunks.length === 1) {
|
|
||||||
imageSummaryChunks.push('This page does not seem to contain any images.');
|
|
||||||
}
|
|
||||||
suffixMixins.push(imageSummaryChunks.join('\n'));
|
|
||||||
}
|
|
||||||
if (this.links) {
|
|
||||||
const linkSummaryChunks = ['Links/Buttons:'];
|
|
||||||
for (const [k, v] of Object.entries(this.links)) {
|
|
||||||
linkSummaryChunks.push(`- [${k}](${v})`);
|
|
||||||
}
|
|
||||||
if (linkSummaryChunks.length === 1) {
|
|
||||||
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
|
|
||||||
}
|
|
||||||
suffixMixins.push(linkSummaryChunks.join('\n'));
|
|
||||||
}
|
|
||||||
|
|
||||||
return `Title: ${this.title}
|
|
||||||
|
|
||||||
URL Source: ${this.url}
|
|
||||||
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
|
|
||||||
Markdown Content:
|
|
||||||
${this.content}
|
|
||||||
${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
if (this.threadLocal.get('withImagesSummary')) {
|
|
||||||
formatted.images =
|
|
||||||
_(imageSummary)
|
|
||||||
.toPairs()
|
|
||||||
.map(
|
|
||||||
([url, alt], i) => {
|
|
||||||
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
|
||||||
}
|
|
||||||
).fromPairs()
|
|
||||||
.value();
|
|
||||||
}
|
|
||||||
if (this.threadLocal.get('withLinksSummary')) {
|
|
||||||
formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
|
|
||||||
}
|
|
||||||
|
|
||||||
return formatted as FormattedPage;
|
|
||||||
}
|
|
||||||
|
|
||||||
@CloudHTTPv2({
|
@CloudHTTPv2({
|
||||||
name: 'crawl2',
|
name: 'crawl2',
|
||||||
runtime: {
|
runtime: {
|
||||||
@ -604,7 +130,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
})
|
})
|
||||||
@CloudHTTPv2({
|
@CloudHTTPv2({
|
||||||
runtime: {
|
runtime: {
|
||||||
memory: '4GiB',
|
memory: '8GiB',
|
||||||
cpu: 4,
|
cpu: 4,
|
||||||
timeoutSeconds: 300,
|
timeoutSeconds: 300,
|
||||||
concurrency: 22,
|
concurrency: 22,
|
||||||
@ -723,7 +249,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
sseStream.write({
|
sseStream.write({
|
||||||
event: 'data',
|
event: 'data',
|
||||||
@ -754,7 +280,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
|
||||||
if (crawlerOptions.timeout === undefined) {
|
if (crawlerOptions.timeout === undefined) {
|
||||||
@ -770,7 +296,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
@ -782,24 +308,24 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
|
||||||
if (crawlerOptions.timeout === undefined) {
|
if (crawlerOptions.timeout === undefined) {
|
||||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -807,22 +333,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
|
|
||||||
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
||||||
@ -1181,7 +707,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
if (lastSnapshot) {
|
if (lastSnapshot) {
|
||||||
return this.formatSnapshot(mode, lastSnapshot, url);
|
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
||||||
}
|
}
|
||||||
|
|
||||||
throw err;
|
throw err;
|
||||||
@ -1191,6 +717,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
throw new AssertionFailureError(`No content available`);
|
throw new AssertionFailureError(`No content available`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return this.formatSnapshot(mode, lastSnapshot, url);
|
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,7 @@ import { appendFile } from 'fs/promises';
|
|||||||
import { createGzip } from 'zlib';
|
import { createGzip } from 'zlib';
|
||||||
import { getFunctions } from 'firebase-admin/functions';
|
import { getFunctions } from 'firebase-admin/functions';
|
||||||
import { GoogleAuth } from 'google-auth-library';
|
import { GoogleAuth } from 'google-auth-library';
|
||||||
|
import { SnapshotFormatter } from '../services/snapshot-formatter';
|
||||||
|
|
||||||
dayjs.extend(require('dayjs/plugin/utc'));
|
dayjs.extend(require('dayjs/plugin/utc'));
|
||||||
|
|
||||||
@ -57,6 +58,7 @@ export class DataCrunchingHost extends RPCHost {
|
|||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
|
|
||||||
protected crawler: CrawlerHost,
|
protected crawler: CrawlerHost,
|
||||||
|
protected snapshotFormatter: SnapshotFormatter,
|
||||||
protected tempFileManager: TempFileManager,
|
protected tempFileManager: TempFileManager,
|
||||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
) {
|
) {
|
||||||
@ -265,9 +267,9 @@ export class DataCrunchingHost extends RPCHost {
|
|||||||
try {
|
try {
|
||||||
const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
|
const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
|
||||||
|
|
||||||
let formatted = await this.crawler.formatSnapshot('default', snapshot);
|
let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot);
|
||||||
if (!formatted.content) {
|
if (!formatted.content) {
|
||||||
formatted = await this.crawler.formatSnapshot('markdown', snapshot);
|
formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
|
||||||
}
|
}
|
||||||
|
|
||||||
await nextDrainDeferred.promise;
|
await nextDrainDeferred.promise;
|
||||||
|
@ -11,11 +11,12 @@ import _ from 'lodash';
|
|||||||
import { Request, Response } from 'express';
|
import { Request, Response } from 'express';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
||||||
import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
|
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
||||||
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||||
import { SearchResult } from '../db/searched';
|
import { SearchResult } from '../db/searched';
|
||||||
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
||||||
import { CrawlerOptions } from '../dto/scrapping-options';
|
import { CrawlerOptions } from '../dto/scrapping-options';
|
||||||
|
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
||||||
|
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
@ -36,6 +37,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
protected threadLocal: AsyncContext,
|
protected threadLocal: AsyncContext,
|
||||||
protected braveSearchService: BraveSearchService,
|
protected braveSearchService: BraveSearchService,
|
||||||
protected crawler: CrawlerHost,
|
protected crawler: CrawlerHost,
|
||||||
|
protected snapshotFormatter: SnapshotFormatter,
|
||||||
) {
|
) {
|
||||||
super(...arguments);
|
super(...arguments);
|
||||||
}
|
}
|
||||||
@ -324,7 +326,7 @@ export class SearcherHost extends RPCHost {
|
|||||||
if (snapshotMap.has(x)) {
|
if (snapshotMap.has(x)) {
|
||||||
return snapshotMap.get(x);
|
return snapshotMap.get(x);
|
||||||
}
|
}
|
||||||
return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
|
return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
|
||||||
r.title ??= upstreamSearchResult.title;
|
r.title ??= upstreamSearchResult.title;
|
||||||
r.description = upstreamSearchResult.description;
|
r.description = upstreamSearchResult.description;
|
||||||
snapshotMap.set(x, r);
|
snapshotMap.set(x, r);
|
||||||
|
@ -2,18 +2,19 @@ import { container, singleton } from 'tsyringe';
|
|||||||
import { AsyncService, marshalErrorLike } from 'civkit';
|
import { AsyncService, marshalErrorLike } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
||||||
import { JSDOM, VirtualConsole } from 'jsdom';
|
|
||||||
import { Readability } from '@mozilla/readability';
|
import { Readability } from '@mozilla/readability';
|
||||||
import TurndownService from 'turndown';
|
import TurndownService from 'turndown';
|
||||||
|
import { Threaded } from '../shared/services/threaded';
|
||||||
|
|
||||||
const virtualConsole = new VirtualConsole();
|
const pLinkedom = import('linkedom');
|
||||||
virtualConsole.on('error', () => void 0);
|
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
export class JSDomControl extends AsyncService {
|
export class JSDomControl extends AsyncService {
|
||||||
|
|
||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
|
linkedom!: Awaited<typeof pLinkedom>;
|
||||||
|
|
||||||
constructor(
|
constructor(
|
||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
) {
|
) {
|
||||||
@ -22,22 +23,34 @@ export class JSDomControl extends AsyncService {
|
|||||||
|
|
||||||
override async init() {
|
override async init() {
|
||||||
await this.dependencyReady();
|
await this.dependencyReady();
|
||||||
|
this.linkedom = await pLinkedom;
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
||||||
targetSelector?: string | string[];
|
targetSelector?: string | string[];
|
||||||
removeSelector?: string | string[];
|
removeSelector?: string | string[];
|
||||||
withIframe?: boolean;
|
withIframe?: boolean;
|
||||||
}): PageSnapshot | undefined {
|
}) {
|
||||||
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
|
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
|
||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
if (!snapshot?.html) {
|
if (!snapshot?.html) {
|
||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return this.actualNarrowSnapshot(snapshot, options);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Threaded()
|
||||||
|
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
|
||||||
|
targetSelector?: string | string[];
|
||||||
|
removeSelector?: string | string[];
|
||||||
|
withIframe?: boolean;
|
||||||
|
}): Promise<PageSnapshot | undefined> {
|
||||||
|
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
const jsdom = this.linkedom.parseHTML(snapshot.html);
|
||||||
const allNodes: Node[] = [];
|
const allNodes: Node[] = [];
|
||||||
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
||||||
if (options?.withIframe) {
|
if (options?.withIframe) {
|
||||||
@ -90,16 +103,16 @@ export class JSDomControl extends AsyncService {
|
|||||||
let rootDoc: Document;
|
let rootDoc: Document;
|
||||||
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
||||||
rootDoc = allNodes[0] as any;
|
rootDoc = allNodes[0] as any;
|
||||||
if (rootDoc.body.textContent) {
|
if (rootDoc.body.innerText) {
|
||||||
textChunks.push(rootDoc.body.textContent);
|
textChunks.push(rootDoc.body.innerText);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
|
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
|
||||||
for (const n of allNodes) {
|
for (const n of allNodes) {
|
||||||
rootDoc.body.appendChild(n);
|
rootDoc.body.appendChild(n);
|
||||||
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
||||||
if (n.textContent) {
|
if ((n as HTMLElement).innerText) {
|
||||||
textChunks.push(n.textContent);
|
textChunks.push((n as HTMLElement).innerText);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -111,11 +124,6 @@ export class JSDomControl extends AsyncService {
|
|||||||
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
||||||
}
|
}
|
||||||
|
|
||||||
// No innerText in jsdom
|
|
||||||
// https://github.com/jsdom/jsdom/issues/1245
|
|
||||||
const textContent = textChunks.join('\n\n');
|
|
||||||
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
|
||||||
|
|
||||||
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
||||||
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
||||||
.flat()
|
.flat()
|
||||||
@ -135,7 +143,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
title: snapshot.title || jsdom.window.document.title,
|
title: snapshot.title || jsdom.window.document.title,
|
||||||
parsed,
|
parsed,
|
||||||
html: rootDoc.documentElement.outerHTML,
|
html: rootDoc.documentElement.outerHTML,
|
||||||
text: cleanedText,
|
text: textChunks.join('\n'),
|
||||||
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
|
||||||
@ -147,11 +155,13 @@ export class JSDomControl extends AsyncService {
|
|||||||
return r;
|
return r;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Threaded()
|
||||||
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
||||||
try {
|
try {
|
||||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
const jsdom = this.linkedom.parseHTML(snapshot.html);
|
||||||
|
|
||||||
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
||||||
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
||||||
.map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
|
.map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
|
||||||
@ -207,9 +217,8 @@ export class JSDomControl extends AsyncService {
|
|||||||
|
|
||||||
return extendedSnapshot;
|
return extendedSnapshot;
|
||||||
}
|
}
|
||||||
|
|
||||||
snippetToElement(snippet?: string, url?: string) {
|
snippetToElement(snippet?: string, url?: string) {
|
||||||
const parsed = new JSDOM(snippet || '', { url, virtualConsole });
|
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
||||||
|
|
||||||
return parsed.window.document.documentElement;
|
return parsed.window.document.documentElement;
|
||||||
}
|
}
|
||||||
|
@ -1,7 +1,7 @@
|
|||||||
import os from 'os';
|
import os from 'os';
|
||||||
import fs from 'fs';
|
import fs from 'fs';
|
||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred, perNextTick } from 'civkit';
|
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
|
|
||||||
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
||||||
@ -207,7 +207,6 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
browser!: Browser;
|
browser!: Browser;
|
||||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
private __healthCheckInterval?: NodeJS.Timeout;
|
|
||||||
private __reqCapInterval?: NodeJS.Timeout;
|
private __reqCapInterval?: NodeJS.Timeout;
|
||||||
|
|
||||||
__loadedPage: Page[] = [];
|
__loadedPage: Page[] = [];
|
||||||
@ -217,7 +216,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
livePages = new Set<Page>();
|
livePages = new Set<Page>();
|
||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
|
|
||||||
rpsCap: number = 300;
|
rpsCap: number = 500;
|
||||||
lastReqSentAt: number = 0;
|
lastReqSentAt: number = 0;
|
||||||
requestDeferredQueue: Deferred<boolean>[] = [];
|
requestDeferredQueue: Deferred<boolean>[] = [];
|
||||||
|
|
||||||
@ -235,15 +234,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
briefPages() {
|
|
||||||
this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`);
|
|
||||||
}
|
|
||||||
|
|
||||||
override async init() {
|
override async init() {
|
||||||
if (this.__healthCheckInterval) {
|
|
||||||
clearInterval(this.__healthCheckInterval);
|
|
||||||
this.__healthCheckInterval = undefined;
|
|
||||||
}
|
|
||||||
if (this.__reqCapInterval) {
|
if (this.__reqCapInterval) {
|
||||||
clearInterval(this.__reqCapInterval);
|
clearInterval(this.__reqCapInterval);
|
||||||
this.__reqCapInterval = undefined;
|
this.__reqCapInterval = undefined;
|
||||||
@ -276,40 +267,9 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
|
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
|
|
||||||
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
|
|
||||||
this.newPage().then((r) => this.__loadedPage.push(r));
|
this.newPage().then((r) => this.__loadedPage.push(r));
|
||||||
}
|
}
|
||||||
|
|
||||||
@maxConcurrency(1)
|
|
||||||
async healthCheck() {
|
|
||||||
if (Date.now() - this.lastPageCratedAt <= 10_000) {
|
|
||||||
this.briefPages();
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
const healthyPage = await this.newPage().catch((err) => {
|
|
||||||
this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
|
|
||||||
return null;
|
|
||||||
});
|
|
||||||
|
|
||||||
if (healthyPage) {
|
|
||||||
this.__loadedPage.push(healthyPage);
|
|
||||||
|
|
||||||
if (this.__loadedPage.length > 3) {
|
|
||||||
this.ditchPage(this.__loadedPage.shift()!);
|
|
||||||
}
|
|
||||||
|
|
||||||
this.briefPages();
|
|
||||||
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
this.logger.warn(`Trying to clean up...`);
|
|
||||||
this.browser.process()?.kill('SIGKILL');
|
|
||||||
Reflect.deleteProperty(this, 'browser');
|
|
||||||
this.emit('crippled');
|
|
||||||
this.logger.warn(`Browser killed`);
|
|
||||||
}
|
|
||||||
|
|
||||||
@perNextTick()
|
@perNextTick()
|
||||||
reqCapRoutine() {
|
reqCapRoutine() {
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
@ -620,7 +580,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
try {
|
try {
|
||||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = Buffer.from(await page.screenshot());
|
||||||
if (snapshot) {
|
if (snapshot) {
|
||||||
snapshot.childFrames = await pSubFrameSnapshots;
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
}
|
}
|
||||||
@ -643,8 +603,8 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
if (salvaged) {
|
if (salvaged) {
|
||||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = Buffer.from(await page.screenshot());
|
||||||
pageshot = await page.screenshot({ fullPage: true });
|
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
||||||
if (snapshot) {
|
if (snapshot) {
|
||||||
snapshot.childFrames = await pSubFrameSnapshots;
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
}
|
}
|
||||||
@ -678,8 +638,8 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
.then(async () => {
|
.then(async () => {
|
||||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = Buffer.from(await page.screenshot());
|
||||||
pageshot = await page.screenshot({ fullPage: true });
|
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
||||||
if (snapshot) {
|
if (snapshot) {
|
||||||
snapshot.childFrames = await pSubFrameSnapshots;
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
}
|
}
|
||||||
@ -716,8 +676,8 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||||
screenshot = await page.screenshot();
|
screenshot = Buffer.from(await page.screenshot());
|
||||||
pageshot = await page.screenshot({ fullPage: true });
|
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
||||||
lastHTML = snapshot.html;
|
lastHTML = snapshot.html;
|
||||||
}
|
}
|
||||||
if (snapshot || screenshot) {
|
if (snapshot || screenshot) {
|
||||||
|
539
backend/functions/src/services/snapshot-formatter.ts
Normal file
539
backend/functions/src/services/snapshot-formatter.ts
Normal file
@ -0,0 +1,539 @@
|
|||||||
|
import { randomUUID } from 'crypto';
|
||||||
|
import { container, singleton } from 'tsyringe';
|
||||||
|
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
|
||||||
|
import TurndownService from 'turndown';
|
||||||
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import { PageSnapshot } from './puppeteer';
|
||||||
|
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||||
|
import { AsyncContext } from '../shared/services/async-context';
|
||||||
|
import { Threaded } from '../shared/services/threaded';
|
||||||
|
import { JSDomControl } from './jsdom';
|
||||||
|
import { AltTextService } from './alt-text';
|
||||||
|
import { PDFExtractor } from './pdf-extract';
|
||||||
|
import { cleanAttribute } from '../utils/misc';
|
||||||
|
import _ from 'lodash';
|
||||||
|
|
||||||
|
|
||||||
|
export interface FormattedPage {
|
||||||
|
title?: string;
|
||||||
|
description?: string;
|
||||||
|
url?: string;
|
||||||
|
content?: string;
|
||||||
|
publishedTime?: string;
|
||||||
|
html?: string;
|
||||||
|
text?: string;
|
||||||
|
screenshotUrl?: string;
|
||||||
|
screenshot?: Buffer;
|
||||||
|
pageshotUrl?: string;
|
||||||
|
pageshot?: Buffer;
|
||||||
|
links?: { [k: string]: string; };
|
||||||
|
images?: { [k: string]: string; };
|
||||||
|
usage?: {
|
||||||
|
total_tokens?: number;
|
||||||
|
totalTokens?: number;
|
||||||
|
tokens?: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
textRepresentation?: string;
|
||||||
|
|
||||||
|
[Symbol.dispose]: () => void;
|
||||||
|
}
|
||||||
|
|
||||||
|
export const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
|
@singleton()
|
||||||
|
export class SnapshotFormatter extends AsyncService {
|
||||||
|
|
||||||
|
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||||
|
|
||||||
|
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
||||||
|
|
||||||
|
constructor(
|
||||||
|
protected globalLogger: Logger,
|
||||||
|
protected jsdomControl: JSDomControl,
|
||||||
|
protected altTextService: AltTextService,
|
||||||
|
protected pdfExtractor: PDFExtractor,
|
||||||
|
protected threadLocal: AsyncContext,
|
||||||
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
|
) {
|
||||||
|
super(...arguments);
|
||||||
|
}
|
||||||
|
|
||||||
|
override async init() {
|
||||||
|
await this.dependencyReady();
|
||||||
|
this.emit('ready');
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@Threaded()
|
||||||
|
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
|
||||||
|
screenshotUrl?: string;
|
||||||
|
pageshotUrl?: string;
|
||||||
|
}, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
|
||||||
|
const t0 = Date.now();
|
||||||
|
if (mode === 'screenshot') {
|
||||||
|
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
||||||
|
const fid = `instant-screenshots/${randomUUID()}`;
|
||||||
|
await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
|
||||||
|
metadata: {
|
||||||
|
contentType: 'image/png',
|
||||||
|
}
|
||||||
|
});
|
||||||
|
snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
|
||||||
|
}
|
||||||
|
|
||||||
|
const f = {
|
||||||
|
...this.getGeneralSnapshotMixins(snapshot),
|
||||||
|
// html: snapshot.html,
|
||||||
|
screenshotUrl: snapshot.screenshotUrl,
|
||||||
|
};
|
||||||
|
|
||||||
|
Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false });
|
||||||
|
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||||
|
|
||||||
|
return f as FormattedPage;
|
||||||
|
}
|
||||||
|
if (mode === 'pageshot') {
|
||||||
|
if (snapshot.pageshot && !snapshot.pageshotUrl) {
|
||||||
|
const fid = `instant-screenshots/${randomUUID()}`;
|
||||||
|
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
|
||||||
|
metadata: {
|
||||||
|
contentType: 'image/png',
|
||||||
|
}
|
||||||
|
});
|
||||||
|
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
|
||||||
|
}
|
||||||
|
|
||||||
|
const f = {
|
||||||
|
...this.getGeneralSnapshotMixins(snapshot),
|
||||||
|
html: snapshot.html,
|
||||||
|
pageshotUrl: snapshot.pageshotUrl,
|
||||||
|
} as FormattedPage;
|
||||||
|
|
||||||
|
Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false });
|
||||||
|
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||||
|
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
if (mode === 'html') {
|
||||||
|
const f = {
|
||||||
|
...this.getGeneralSnapshotMixins(snapshot),
|
||||||
|
html: snapshot.html,
|
||||||
|
} as FormattedPage;
|
||||||
|
|
||||||
|
Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false });
|
||||||
|
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||||
|
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
|
||||||
|
let pdfMode = false;
|
||||||
|
if (snapshot.pdfs?.length && !snapshot.title) {
|
||||||
|
const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
|
||||||
|
this.threadLocal.get('cacheTolerance')
|
||||||
|
);
|
||||||
|
if (pdf) {
|
||||||
|
pdfMode = true;
|
||||||
|
snapshot.title = pdf.meta?.Title;
|
||||||
|
snapshot.text = pdf.text || snapshot.text;
|
||||||
|
snapshot.parsed = {
|
||||||
|
content: pdf.content,
|
||||||
|
textContent: pdf.content,
|
||||||
|
length: pdf.content?.length,
|
||||||
|
byline: pdf.meta?.Author,
|
||||||
|
lang: pdf.meta?.Language || undefined,
|
||||||
|
title: pdf.meta?.Title,
|
||||||
|
publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mode === 'text') {
|
||||||
|
const f = {
|
||||||
|
...this.getGeneralSnapshotMixins(snapshot),
|
||||||
|
text: snapshot.text,
|
||||||
|
} as FormattedPage;
|
||||||
|
|
||||||
|
Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false });
|
||||||
|
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||||
|
|
||||||
|
return f;
|
||||||
|
}
|
||||||
|
const imgDataUrlToObjectUrl = !Boolean(this.threadLocal.get('keepImgDataUrl'));
|
||||||
|
|
||||||
|
let contentText = '';
|
||||||
|
const imageSummary = {} as { [k: string]: string; };
|
||||||
|
const imageIdxTrack = new Map<string, number[]>();
|
||||||
|
const uid = this.threadLocal.get('uid');
|
||||||
|
do {
|
||||||
|
if (pdfMode) {
|
||||||
|
contentText = snapshot.parsed?.content || snapshot.text;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
snapshot.maxElemDepth! > 256 ||
|
||||||
|
(!uid && snapshot.elemCount! > 10_000) ||
|
||||||
|
snapshot.elemCount! > 70_000
|
||||||
|
) {
|
||||||
|
this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
|
||||||
|
contentText = snapshot.text;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
||||||
|
let toBeTurnedToMd = jsDomElementOfHTML;
|
||||||
|
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
|
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
||||||
|
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
||||||
|
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||||
|
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
||||||
|
|
||||||
|
// If Readability did its job
|
||||||
|
if (par2.length >= 0.3 * par1.length) {
|
||||||
|
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
|
if (snapshot.parsed.content) {
|
||||||
|
toBeTurnedToMd = jsDomElementOfParsed;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (const plugin of this.turnDownPlugins) {
|
||||||
|
turnDownService = turnDownService.use(plugin);
|
||||||
|
}
|
||||||
|
const urlToAltMap: { [k: string]: string | undefined; } = {};
|
||||||
|
if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
|
||||||
|
const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
|
||||||
|
const r = await this.altTextService.getAltText(x).catch((err: any) => {
|
||||||
|
this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
|
||||||
|
return undefined;
|
||||||
|
});
|
||||||
|
if (r && x.src) {
|
||||||
|
urlToAltMap[x.src.trim()] = r;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
await Promise.all(tasks);
|
||||||
|
}
|
||||||
|
let imgIdx = 0;
|
||||||
|
turnDownService.addRule('img-generated-alt', {
|
||||||
|
filter: 'img',
|
||||||
|
replacement: (_content, node: any) => {
|
||||||
|
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
||||||
|
if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
|
||||||
|
const dataSrc = (node.getAttribute('data-src') || '').trim();
|
||||||
|
if (dataSrc && !dataSrc.startsWith('data:')) {
|
||||||
|
linkPreferredSrc = dataSrc;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let src;
|
||||||
|
try {
|
||||||
|
src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
|
||||||
|
} catch (_err) {
|
||||||
|
void 0;
|
||||||
|
}
|
||||||
|
const alt = cleanAttribute(node.getAttribute('alt'));
|
||||||
|
if (!src) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
const mapped = urlToAltMap[src];
|
||||||
|
const imgSerial = ++imgIdx;
|
||||||
|
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
||||||
|
idxArr.push(imgSerial);
|
||||||
|
imageIdxTrack.set(src, idxArr);
|
||||||
|
|
||||||
|
if (mapped) {
|
||||||
|
imageSummary[src] = mapped || alt;
|
||||||
|
|
||||||
|
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
||||||
|
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
||||||
|
mappedUrl.protocol = 'blob:';
|
||||||
|
|
||||||
|
return ``;
|
||||||
|
}
|
||||||
|
|
||||||
|
return ``;
|
||||||
|
}
|
||||||
|
|
||||||
|
imageSummary[src] = alt || '';
|
||||||
|
|
||||||
|
if (src?.startsWith('data:') && imgDataUrlToObjectUrl) {
|
||||||
|
const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
|
||||||
|
mappedUrl.protocol = 'blob:';
|
||||||
|
|
||||||
|
return alt ? `` : ``;
|
||||||
|
}
|
||||||
|
|
||||||
|
return alt ? `` : ``;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
if (toBeTurnedToMd) {
|
||||||
|
try {
|
||||||
|
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
||||||
|
} catch (err) {
|
||||||
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
|
try {
|
||||||
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
||||||
|
} catch (err2) {
|
||||||
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
||||||
|
&& toBeTurnedToMd !== jsDomElementOfHTML
|
||||||
|
) {
|
||||||
|
try {
|
||||||
|
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
||||||
|
} catch (err) {
|
||||||
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
|
try {
|
||||||
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
||||||
|
} catch (err2) {
|
||||||
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
|
||||||
|
contentText = snapshot.text;
|
||||||
|
}
|
||||||
|
} while (false);
|
||||||
|
|
||||||
|
const cleanText = (contentText || '').trim();
|
||||||
|
|
||||||
|
const formatted: FormattedPage = {
|
||||||
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
|
url: nominalUrl?.toString() || snapshot.href?.trim(),
|
||||||
|
content: cleanText,
|
||||||
|
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
||||||
|
[Symbol.dispose]: () => { },
|
||||||
|
};
|
||||||
|
|
||||||
|
if (this.threadLocal.get('withImagesSummary')) {
|
||||||
|
formatted.images =
|
||||||
|
_(imageSummary)
|
||||||
|
.toPairs()
|
||||||
|
.map(
|
||||||
|
([url, alt], i) => {
|
||||||
|
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
||||||
|
}
|
||||||
|
).fromPairs()
|
||||||
|
.value();
|
||||||
|
}
|
||||||
|
if (this.threadLocal.get('withLinksSummary')) {
|
||||||
|
formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
|
||||||
|
}
|
||||||
|
|
||||||
|
const textRepresentation = (function (this: typeof formatted) {
|
||||||
|
if (mode === 'markdown') {
|
||||||
|
return this.content as string;
|
||||||
|
}
|
||||||
|
|
||||||
|
const mixins = [];
|
||||||
|
if (this.publishedTime) {
|
||||||
|
mixins.push(`Published Time: ${this.publishedTime}`);
|
||||||
|
}
|
||||||
|
const suffixMixins = [];
|
||||||
|
if (this.images) {
|
||||||
|
const imageSummaryChunks = ['Images:'];
|
||||||
|
for (const [k, v] of Object.entries(this.images)) {
|
||||||
|
imageSummaryChunks.push(`- `);
|
||||||
|
}
|
||||||
|
if (imageSummaryChunks.length === 1) {
|
||||||
|
imageSummaryChunks.push('This page does not seem to contain any images.');
|
||||||
|
}
|
||||||
|
suffixMixins.push(imageSummaryChunks.join('\n'));
|
||||||
|
}
|
||||||
|
if (this.links) {
|
||||||
|
const linkSummaryChunks = ['Links/Buttons:'];
|
||||||
|
for (const [k, v] of Object.entries(this.links)) {
|
||||||
|
linkSummaryChunks.push(`- [${k}](${v})`);
|
||||||
|
}
|
||||||
|
if (linkSummaryChunks.length === 1) {
|
||||||
|
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
|
||||||
|
}
|
||||||
|
suffixMixins.push(linkSummaryChunks.join('\n'));
|
||||||
|
}
|
||||||
|
|
||||||
|
return `Title: ${this.title}
|
||||||
|
|
||||||
|
URL Source: ${this.url}
|
||||||
|
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
|
||||||
|
Markdown Content:
|
||||||
|
${this.content}
|
||||||
|
${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||||
|
}).call(formatted);
|
||||||
|
|
||||||
|
Object.defineProperty(formatted, 'textRepresentation', { value: textRepresentation, enumerable: false });
|
||||||
|
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
|
||||||
|
|
||||||
|
return formatted as FormattedPage;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
 * Collects the optional image/link summaries for a snapshot.
 *
 * Each summary is produced only when its thread-local flag
 * (`withImagesSummary` / `withLinksSummary`) is truthy. The snapshot is run
 * through `jsdomControl.inferSnapshot` at most once and the result is shared
 * by both summaries.
 *
 * @param snapshot Page snapshot to summarise.
 * @returns An object that may carry `images` (label -> image src, where the
 *   label looks like `Image 1,3: alt text`) and `links` (the inverted
 *   `links` map of the inferred snapshot).
 */
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
    const mixin: any = {};
    let inferred;

    if (this.threadLocal.get('withImagesSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);

        // src -> alt text; when a src repeats, the alt of its last occurrence wins.
        const altBySrc = {} as { [k: string]: string; };
        // src -> every 1-based position at which that src occurs.
        const serialsBySrc = new Map<string, number[]>();

        let serial = 0;
        for (const { src, alt } of inferred.imgs) {
            serial += 1;
            const serials = serialsBySrc.get(src) ?? [];
            serials.push(serial);
            serialsBySrc.set(src, serials);
            altBySrc[src] = alt || '';
        }

        const labelledPairs = _.toPairs(altBySrc).map(
            ([url, alt], position): [string, string] => {
                // Every key of altBySrc is also tracked in serialsBySrc; the
                // positional fallback is kept only as a safety net.
                const serials = serialsBySrc.get(url) || [position + 1];
                const altSuffix = alt ? `: ${alt}` : '';

                return [`Image ${serials.join(',')}${altSuffix}`, url];
            }
        );

        mixin.images = _.fromPairs(labelledPairs);
    }

    if (this.threadLocal.get('withLinksSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);
        mixin.links = _.invert(inferred.links || {});
    }

    return mixin;
}
|
||||||
|
|
||||||
|
/**
 * Builds a TurndownService (HTML -> Markdown converter) with this
 * formatter's custom rules installed.
 *
 * @param options.noRules When truthy, the `remove-irrelevant`,
 *   `truncate-svg` and `title-as-h1` rules are not installed.
 * @param options.url Base URL used to absolutize inline link hrefs and to
 *   derive the origin of pseudo `blob:` URLs for data-URL images.
 * @param options.imgDataUrlToObjectUrl When truthy, `<img>` elements whose
 *   `src` is a `data:` URL are rendered with a short pseudo `blob:` URL
 *   (built from an md5 hash of the src) instead of the raw data URL.
 * @returns The configured TurndownService instance.
 */
getTurndown(options?: {
    noRules?: boolean | string,
    url?: string | URL;
    imgDataUrlToObjectUrl?: boolean;
}) {
    const turnDownService = new TurndownService({
        codeBlockStyle: 'fenced',
        preformattedCode: true,
    } as any);

    if (!options?.noRules) {
        // Elements that never contribute readable content are dropped.
        turnDownService.addRule('remove-irrelevant', {
            filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
            replacement: () => ''
        });
        turnDownService.addRule('truncate-svg', {
            filter: 'svg' as any,
            replacement: () => ''
        });
        // The document <title> is rendered as a setext-style H1.
        turnDownService.addRule('title-as-h1', {
            filter: ['title'],
            replacement: (innerText) => `${innerText}\n===============\n`
        });
    }

    if (options?.imgDataUrlToObjectUrl) {
        turnDownService.addRule('data-url-to-pseudo-object-url', {
            filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
            replacement: (_content, node: any) => {
                const src = (node.getAttribute('src') || '').trim();
                const alt = cleanAttribute(node.getAttribute('alt')) || '';

                if (options.url) {
                    // Scope the pseudo object URL to the origin of the reference URL.
                    const refUrl = new URL(options.url);
                    const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);

                    return `![${alt}](${mappedUrl})`;
                }

                // No reference URL available: fall back to an origin-less blob URL.
                return `![${alt}](blob:${md5Hasher.hash(src)})`;
            }
        });
    }

    turnDownService.addRule('improved-paragraph', {
        filter: 'p',
        replacement: (innerText) => {
            const trimmed = innerText.trim();
            if (!trimmed) {
                return '';
            }

            // Collapse runs of 3+ newlines inside the paragraph to one blank line.
            return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
        }
    });

    turnDownService.addRule('improved-inline-link', {
        // NOTE: the `options` parameter here is Turndown's own options object
        // and shadows the outer `options` inside this filter only.
        filter: function (node, options) {
            return Boolean(
                options.linkStyle === 'inlined' &&
                node.nodeName === 'A' &&
                node.getAttribute('href')
            );
        },

        replacement: function (content, node: any) {
            // The filter guarantees a truthy href; `?? ''` is purely defensive.
            let href = node.getAttribute('href') ?? '';
            if (href) href = href.replace(/([()])/g, '\\$1');
            let title = cleanAttribute(node.getAttribute('title'));
            if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';

            const fixedContent = content.replace(/\s+/g, ' ').trim();
            let fixedHref = href.replace(/\s+/g, '').trim();
            if (options?.url) {
                try {
                    fixedHref = new URL(fixedHref, options.url).toString();
                } catch (_err) {
                    // Best effort: keep the cleaned-up href when it cannot be resolved.
                    void 0;
                }
            }

            return `[${fixedContent}](${fixedHref}${title || ''})`;
        }
    });

    turnDownService.addRule('improved-code', {
        // Matches every <code> element except one that is the only child of a <pre>.
        filter: function (node: any) {
            const hasSiblings = node.previousSibling || node.nextSibling;
            const isCodeBlock = node.parentNode?.nodeName === 'PRE' && !hasSiblings;

            return node.nodeName === 'CODE' && !isCodeBlock;
        },

        replacement: function (inputContent: any) {
            if (!inputContent) return '';
            const content = inputContent;

            // Grow the delimiter until it differs from every backtick run in the content.
            let delimiter = '`';
            const matches = content.match(/`+/gm) || [];
            while (matches.includes(delimiter)) delimiter = delimiter + '`';
            // Content containing a newline is always fenced with triple backticks.
            if (content.includes('\n')) {
                delimiter = '```';
            }

            const extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';

            return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
        }
    });

    return turnDownService;
}
|
||||||
|
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// Module-level SnapshotFormatter instance, obtained through `container.resolve`.
const snapshotFormatter = container.resolve(SnapshotFormatter);
|
||||||
|
|
||||||
|
// The resolved formatter instance is this module's default export.
export default snapshotFormatter;
|
Loading…
x
Reference in New Issue
Block a user