mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-13 23:36:00 +08:00
fix: performance issue of jsdom
This commit is contained in:
parent
5171e5f94b
commit
94170db060
659
backend/functions/package-lock.json
generated
659
backend/functions/package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -34,7 +34,7 @@
|
||||
"archiver": "^6.0.1",
|
||||
"axios": "^1.3.3",
|
||||
"bcrypt": "^5.1.0",
|
||||
"civkit": "^0.6.5-047c0d8",
|
||||
"civkit": "^0.7.0-0f8889a",
|
||||
"core-js": "^3.37.1",
|
||||
"cors": "^2.8.5",
|
||||
"dayjs": "^1.11.9",
|
||||
@ -43,13 +43,13 @@
|
||||
"firebase-functions": "^4.9.0",
|
||||
"htmlparser2": "^9.0.0",
|
||||
"jose": "^5.1.0",
|
||||
"jsdom": "^24.0.0",
|
||||
"langdetect": "^0.2.1",
|
||||
"linkedom": "^0.18.4",
|
||||
"maxmind": "^4.3.18",
|
||||
"minio": "^7.1.3",
|
||||
"openai": "^4.20.0",
|
||||
"pdfjs-dist": "^4.2.67",
|
||||
"puppeteer": "^22.7.1",
|
||||
"puppeteer": "^23.3.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"puppeteer-extra-plugin-block-resources": "^2.4.3",
|
||||
"puppeteer-extra-plugin-page-proxy": "^2.0.0",
|
||||
@ -68,7 +68,7 @@
|
||||
"@types/bcrypt": "^5.0.0",
|
||||
"@types/cors": "^2.8.17",
|
||||
"@types/generic-pool": "^3.8.1",
|
||||
"@types/node": "^18",
|
||||
"@types/node": "^20.14.13",
|
||||
"@types/set-cookie-parser": "^2.4.7",
|
||||
"@typescript-eslint/eslint-plugin": "^5.12.0",
|
||||
"@typescript-eslint/parser": "^5.12.0",
|
||||
@ -77,7 +77,7 @@
|
||||
"eslint-plugin-import": "^2.25.4",
|
||||
"firebase-functions-test": "^3.0.0",
|
||||
"replicate": "^0.16.1",
|
||||
"typescript": "^5.1.6"
|
||||
"typescript": "^5.5.4"
|
||||
},
|
||||
"private": true,
|
||||
"exports": {
|
||||
|
@ -1,7 +1,6 @@
|
||||
import {
|
||||
assignTransferProtocolMeta, marshalErrorLike,
|
||||
RPCHost, RPCReflection,
|
||||
HashManager,
|
||||
AssertionFailureError, ParamValidationError, Defer,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
@ -11,22 +10,17 @@ import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
import { Request, Response } from 'express';
|
||||
const pNormalizeUrl = import("@esm2cjs/normalize-url");
|
||||
import { AltTextService } from '../services/alt-text';
|
||||
import TurndownService from 'turndown';
|
||||
import { Crawled } from '../db/crawled';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
import { randomUUID } from 'crypto';
|
||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||
|
||||
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||
import { CrawlerOptions, CrawlerOptionsHeaderOnly } from '../dto/scrapping-options';
|
||||
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||
import { PDFExtractor } from '../services/pdf-extract';
|
||||
import { DomainBlockade } from '../db/domain-blockade';
|
||||
import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-checker';
|
||||
import { JSDomControl } from '../services/jsdom';
|
||||
|
||||
const md5Hasher = new HashManager('md5', 'hex');
|
||||
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||
|
||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
withIframe?: boolean;
|
||||
@ -35,29 +29,6 @@ export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
keepImgDataUrl?: boolean;
|
||||
}
|
||||
|
||||
/**
 * Result of formatting a page snapshot for a response.
 *
 * Every data field is optional; which ones are populated depends on the
 * formatting mode that produced the object.
 */
export interface FormattedPage {
    /** Page title. */
    title?: string;
    /** Page description. */
    description?: string;
    /** URL the content is attributed to. */
    url?: string;
    /** Main textual content of the page. */
    content?: string;
    /** Publication timestamp, when one is known. */
    publishedTime?: string;
    /** Raw HTML of the page. */
    html?: string;
    /** Plain-text rendition of the page. */
    text?: string;
    /** Download URL of the viewport screenshot. */
    screenshotUrl?: string;
    /** Raw viewport screenshot bytes. */
    screenshot?: Buffer;
    /** Download URL of the page shot. */
    pageshotUrl?: string;
    /** Raw page shot bytes. */
    pageshot?: Buffer;
    /** Link summary as a string-to-string map. */
    links?: { [k: string]: string; };
    /** Image summary as a string-to-string map. */
    images?: { [k: string]: string; };
    /** Token accounting; the three spellings are alternative names for a token count. */
    usage?: {
        total_tokens?: number;
        totalTokens?: number;
        tokens?: number;
    };

    /** Renders the page as the plain-text body of a response. */
    toString: () => string;
}
|
||||
|
||||
const indexProto = {
|
||||
toString: function (): string {
|
||||
return _(this)
|
||||
@ -72,8 +43,6 @@ const indexProto = {
|
||||
export class CrawlerHost extends RPCHost {
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
turnDownPlugins = [require('turndown-plugin-gfm').tables];
|
||||
|
||||
cacheRetentionMs = 1000 * 3600 * 24 * 7;
|
||||
cacheValidMs = 1000 * 3600;
|
||||
urlValidMs = 1000 * 3600 * 4;
|
||||
@ -83,8 +52,7 @@ export class CrawlerHost extends RPCHost {
|
||||
protected globalLogger: Logger,
|
||||
protected puppeteerControl: PuppeteerControl,
|
||||
protected jsdomControl: JSDomControl,
|
||||
protected altTextService: AltTextService,
|
||||
protected pdfExtractor: PDFExtractor,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||
protected rateLimitControl: RateLimitControl,
|
||||
protected threadLocal: AsyncContext,
|
||||
@ -148,448 +116,6 @@ export class CrawlerHost extends RPCHost {
|
||||
return indexObject;
|
||||
}
|
||||
|
||||
/**
 * Build a TurndownService preloaded with this crawler's conversion rules.
 *
 * @param options.noRules When truthy, skips the rules that drop
 *   meta/style/script-like elements, blank out `<svg>`, and render `<title>`
 *   as a heading.
 * @param options.url Base URL used to resolve link hrefs and to derive the
 *   origin of pseudo `blob:` URLs.
 * @param options.imgDataUrlToObjectUrl When truthy, `<img>` elements whose
 *   `src` is a `data:` URL are emitted with a short `blob:` URL built from the
 *   MD5 hash of the data URL instead of the data URL itself.
 * @returns The configured TurndownService instance.
 */
getTurndown(options?: {
    noRules?: boolean | string,
    url?: string | URL;
    imgDataUrlToObjectUrl?: boolean;
}) {
    const turnDownService = new TurndownService({
        codeBlockStyle: 'fenced',
        preformattedCode: true,
    } as any);

    if (!options?.noRules) {
        turnDownService.addRule('remove-irrelevant', {
            filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
            replacement: () => ''
        });
        turnDownService.addRule('truncate-svg', {
            filter: 'svg' as any,
            replacement: () => ''
        });
        turnDownService.addRule('title-as-h1', {
            filter: ['title'],
            replacement: (innerText) => `${innerText}\n===============\n`
        });
    }

    if (options?.imgDataUrlToObjectUrl) {
        turnDownService.addRule('data-url-to-pseudo-object-url', {
            filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
            replacement: (_content, node: any) => {
                const src = (node.getAttribute('src') || '').trim();
                const alt = cleanAttribute(node.getAttribute('alt')) || '';

                if (options.url) {
                    // Scope the pseudo object URL to the origin of the reference URL.
                    const refUrl = new URL(options.url);
                    const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);

                    return `![${alt}](${mappedUrl})`;
                }

                return `![${alt}](blob:${md5Hasher.hash(src)})`;
            }
        });
    }

    turnDownService.addRule('improved-paragraph', {
        filter: 'p',
        replacement: (innerText) => {
            const trimmed = innerText.trim();
            if (!trimmed) {
                return '';
            }

            // Collapse runs of 3+ newlines so a paragraph never contains more than one blank line in a row.
            return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
        }
    });

    turnDownService.addRule('improved-inline-link', {
        filter: function (node, turndownOptions) {
            return Boolean(
                turndownOptions.linkStyle === 'inlined' &&
                node.nodeName === 'A' &&
                node.getAttribute('href')
            );
        },

        replacement: function (content, node: any) {
            // The filter only admits anchors with a truthy href; the fallback keeps
            // this callback from throwing if it is ever invoked without one.
            const escapedHref: string = (node.getAttribute('href') || '').replace(/([()])/g, '\\$1');
            let title = cleanAttribute(node.getAttribute('title'));
            if (title) {
                title = ' "' + title.replace(/"/g, '\\"') + '"';
            }

            const fixedContent = content.replace(/\s+/g, ' ').trim();
            let fixedHref = escapedHref.replace(/\s+/g, '').trim();
            if (options?.url) {
                try {
                    fixedHref = new URL(fixedHref, options.url).toString();
                } catch (_err) {
                    // Unresolvable href: keep it as written in the document.
                    void 0;
                }
            }

            return `[${fixedContent}](${fixedHref}${title || ''})`;
        }
    });

    turnDownService.addRule('improved-code', {
        filter: function (node: any) {
            const hasSiblings = node.previousSibling || node.nextSibling;
            // A lone <code> inside <pre> is a code block and is not handled by this rule.
            const isCodeBlock = node.parentNode?.nodeName === 'PRE' && !hasSiblings;

            return node.nodeName === 'CODE' && !isCodeBlock;
        },

        replacement: function (inputContent: any) {
            if (!inputContent) {
                return '';
            }
            const content = inputContent;

            // Grow the backtick fence until it differs from every backtick run in the content.
            let delimiter = '`';
            const matches = content.match(/`+/gm) || [];
            while (matches.indexOf(delimiter) !== -1) {
                delimiter = delimiter + '`';
            }
            if (content.includes('\n')) {
                delimiter = '```';
            }

            const extraSpace = delimiter === '```' ? '\n' : /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';

            return delimiter + extraSpace + content + (delimiter === '```' && !content.endsWith(extraSpace) ? extraSpace : '') + delimiter;
        }
    });

    return turnDownService;
}
|
||||
|
||||
/**
 * Build the optional `images` / `links` summaries that can accompany any
 * formatted snapshot.
 *
 * A summary is only produced when the matching thread-local flag
 * (`withImagesSummary` / `withLinksSummary`) is truthy. The snapshot is run
 * through `jsdomControl.inferSnapshot` at most once, even when both summaries
 * are requested.
 *
 * @param snapshot The page snapshot to summarize.
 * @returns An object carrying `images` and/or `links`, or an empty object.
 */
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
    let inferred;
    const mixin: any = {};

    if (this.threadLocal.get('withImagesSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);
        const imageSummary: { [k: string]: string; } = {};
        // src -> 1-based positions at which that image occurs on the page.
        const imageIdxTrack = new Map<string, number[]>();

        let imgIdx = 0;
        for (const img of inferred.imgs) {
            const imgSerial = ++imgIdx;
            const idxArr = imageIdxTrack.get(img.src) ?? [];
            idxArr.push(imgSerial);
            imageIdxTrack.set(img.src, idxArr);
            imageSummary[img.src] = img.alt || '';
        }

        mixin.images =
            _(imageSummary)
                .toPairs()
                .map(
                    ([url, alt], i) => {
                        return [`Image ${(imageIdxTrack.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
                    }
                ).fromPairs()
                .value();
    }

    if (this.threadLocal.get('withLinksSummary')) {
        inferred ??= this.jsdomControl.inferSnapshot(snapshot);
        // Swap keys and values of the inferred link map.
        mixin.links = _.invert(inferred.links || {});
    }

    return mixin;
}
|
||||
|
||||
/**
 * Turn a page snapshot into a `FormattedPage` for the requested mode.
 *
 * - `screenshot` / `pageshot`: stores the image in object storage when the
 *   snapshot has image bytes but no URL yet, then returns the signed URL.
 * - `html` / `text`: returns the snapshot's html / text.
 * - anything else: converts the page to markdown through Turndown, falling
 *   back to `snapshot.text` when conversion is skipped or fails.
 *
 * Side effects: may assign `screenshotUrl` / `pageshotUrl` on `snapshot`, and
 * overwrites `title`, `text` and `parsed` on it when the content is taken
 * from an extracted PDF.
 *
 * @param mode Output mode. `'markdown'` converts the full page; other
 *   markdown-producing modes may prefer `snapshot.parsed.content`.
 * @param snapshot The page snapshot to format.
 * @param nominalUrl URL reported as the source and used as the base for
 *   relative URLs when the snapshot has no `rebase`.
 * @returns The formatted page.
 */
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
    screenshotUrl?: string;
    pageshotUrl?: string;
}, nominalUrl?: URL) {
    if (mode === 'screenshot') {
        if (snapshot.screenshot && !snapshot.screenshotUrl) {
            const fid = `instant-screenshots/${randomUUID()}`;
            await this.firebaseObjectStorage.saveFile(fid, snapshot.screenshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            snapshot.screenshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
        }

        return {
            ...this.getGeneralSnapshotMixins(snapshot),
            // html: snapshot.html,
            screenshotUrl: snapshot.screenshotUrl,
            toString() {
                return this.screenshotUrl;
            }
        } as FormattedPage;
    }
    if (mode === 'pageshot') {
        if (snapshot.pageshot && !snapshot.pageshotUrl) {
            const fid = `instant-screenshots/${randomUUID()}`;
            await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
                metadata: {
                    contentType: 'image/png',
                }
            });
            snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
        }

        return {
            ...this.getGeneralSnapshotMixins(snapshot),
            html: snapshot.html,
            pageshotUrl: snapshot.pageshotUrl,
            toString() {
                return this.pageshotUrl;
            }
        } as FormattedPage;
    }
    if (mode === 'html') {
        return {
            ...this.getGeneralSnapshotMixins(snapshot),
            html: snapshot.html,
            toString() {
                return this.html;
            }
        } as FormattedPage;
    }

    // A snapshot that lists PDFs and has no title is treated as a PDF document.
    let pdfMode = false;
    if (snapshot.pdfs?.length && !snapshot.title) {
        const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
            this.threadLocal.get('cacheTolerance')
        );
        if (pdf) {
            pdfMode = true;
            snapshot.title = pdf.meta?.Title;
            snapshot.text = pdf.text || snapshot.text;
            snapshot.parsed = {
                content: pdf.content,
                textContent: pdf.content,
                length: pdf.content?.length,
                byline: pdf.meta?.Author,
                lang: pdf.meta?.Language || undefined,
                title: pdf.meta?.Title,
                publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
            };
        }
    }

    if (mode === 'text') {
        return {
            ...this.getGeneralSnapshotMixins(snapshot),
            text: snapshot.text,
            toString() {
                return this.text;
            }
        } as FormattedPage;
    }

    const imgDataUrlToObjectUrl = !this.threadLocal.get('keepImgDataUrl');
    const baseUrl = snapshot.rebase || nominalUrl;

    let contentText = '';
    const imageSummary = {} as { [k: string]: string; };
    // Resolved image URL -> 1-based positions at which it was emitted.
    const imageIdxTrack = new Map<string, number[]>();
    const uid = this.threadLocal.get('uid');

    // Single-pass block: `break` leaves as soon as contentText is settled.
    do {
        if (pdfMode) {
            contentText = snapshot.parsed?.content || snapshot.text;
            break;
        }

        if (
            snapshot.maxElemDepth! > 256 ||
            (!uid && snapshot.elemCount! > 10_000) ||
            snapshot.elemCount! > 70_000
        ) {
            this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
            contentText = snapshot.text;
            break;
        }

        const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
        let toBeTurnedToMd = jsDomElementOfHTML;
        let turnDownService = this.getTurndown({ url: baseUrl, imgDataUrlToObjectUrl });
        if (mode !== 'markdown' && snapshot.parsed?.content) {
            const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
            const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
            const par2 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed);

            // If Readability did its job: the parsed content yields at least
            // 30% as much markdown as the whole page, so convert that instead.
            if (par2.length >= 0.3 * par1.length) {
                turnDownService = this.getTurndown({ noRules: true, url: baseUrl, imgDataUrlToObjectUrl });
                toBeTurnedToMd = jsDomElementOfParsed;
            }
        }

        for (const plugin of this.turnDownPlugins) {
            turnDownService = turnDownService.use(plugin);
        }

        const urlToAltMap: { [k: string]: string | undefined; } = {};
        if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
            const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
                const r = await this.altTextService.getAltText(x).catch((err: any) => {
                    this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
                    return undefined;
                });
                if (r && x.src) {
                    urlToAltMap[x.src.trim()] = r;
                }
            });

            await Promise.all(tasks);
        }

        let imgIdx = 0;
        turnDownService.addRule('img-generated-alt', {
            filter: 'img',
            replacement: (_content, node: any) => {
                // Prefer `src`; fall back to a non-data `data-src` when `src` is missing or a data URL.
                let linkPreferredSrc = (node.getAttribute('src') || '').trim();
                if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
                    const dataSrc = (node.getAttribute('data-src') || '').trim();
                    if (dataSrc && !dataSrc.startsWith('data:')) {
                        linkPreferredSrc = dataSrc;
                    }
                }

                let src: string | undefined;
                try {
                    src = new URL(linkPreferredSrc, baseUrl).toString();
                } catch (_err) {
                    void 0;
                }
                const alt = cleanAttribute(node.getAttribute('alt'));
                if (!src) {
                    return '';
                }

                const imgSerial = ++imgIdx;
                const idxArr = imageIdxTrack.get(src) ?? [];
                idxArr.push(imgSerial);
                imageIdxTrack.set(src, idxArr);

                // Generated alt text wins over the alt attribute of the element.
                const label = urlToAltMap[src] || alt;
                imageSummary[src] = label || '';

                let target: string = src;
                if (src.startsWith('data:') && imgDataUrlToObjectUrl) {
                    const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
                    mappedUrl.protocol = 'blob:';
                    target = `${mappedUrl}`;
                }

                return label ? `![Image ${imgSerial}: ${label}](${target})` : `![Image ${imgSerial}](${target})`;
            }
        });

        // Runs turndown with the configured service, then once more with a
        // plugin-free service; returns undefined when both attempts throw.
        const convertWithRetry = (input: any): string | undefined => {
            try {
                return this.jsdomControl.runTurndown(turnDownService, input);
            } catch (err) {
                this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                const vanillaTurnDownService = this.getTurndown({ url: baseUrl, imgDataUrlToObjectUrl });
                try {
                    return this.jsdomControl.runTurndown(vanillaTurnDownService, input);
                } catch (err2) {
                    this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                }
            }

            return undefined;
        };

        if (toBeTurnedToMd) {
            const converted = convertWithRetry(toBeTurnedToMd);
            if (converted !== undefined) {
                contentText = converted.trim();
            }
        }

        // NOTE(review): `&&` binds tighter than `||`, so an empty result retries on the
        // full HTML even when the full HTML was already the input. The parentheses
        // spell out that existing behaviour; confirm it is the intended one.
        if (
            !contentText ||
            ((contentText.startsWith('<') && contentText.endsWith('>')) && toBeTurnedToMd !== jsDomElementOfHTML)
        ) {
            const converted = convertWithRetry(snapshot.html);
            if (converted !== undefined) {
                contentText = converted;
            }
        }
        // Output that is empty, or that starts or ends with an angle bracket, is replaced by plain text.
        if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
            contentText = snapshot.text;
        }
    } while (false);

    const cleanText = (contentText || '').trim();

    const formatted: FormattedPage = {
        title: (snapshot.parsed?.title || snapshot.title || '').trim(),
        url: nominalUrl?.toString() || snapshot.href?.trim(),
        content: cleanText,
        publishedTime: snapshot.parsed?.publishedTime || undefined,

        toString() {
            if (mode === 'markdown') {
                return this.content as string;
            }

            const mixins = [];
            if (this.publishedTime) {
                mixins.push(`Published Time: ${this.publishedTime}`);
            }
            const suffixMixins = [];
            if (this.images) {
                const imageSummaryChunks = ['Images:'];
                for (const [k, v] of Object.entries(this.images)) {
                    imageSummaryChunks.push(`- ![${k}](${v})`);
                }
                if (imageSummaryChunks.length === 1) {
                    imageSummaryChunks.push('This page does not seem to contain any images.');
                }
                suffixMixins.push(imageSummaryChunks.join('\n'));
            }
            if (this.links) {
                const linkSummaryChunks = ['Links/Buttons:'];
                for (const [k, v] of Object.entries(this.links)) {
                    linkSummaryChunks.push(`- [${k}](${v})`);
                }
                if (linkSummaryChunks.length === 1) {
                    linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
                }
                suffixMixins.push(linkSummaryChunks.join('\n'));
            }

            return `Title: ${this.title}

URL Source: ${this.url}
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
Markdown Content:
${this.content}
${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
        }
    };

    if (this.threadLocal.get('withImagesSummary')) {
        formatted.images =
            _(imageSummary)
                .toPairs()
                .map(
                    ([url, alt], i) => {
                        return [`Image ${(imageIdxTrack.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
                    }
                ).fromPairs()
                .value();
    }
    if (this.threadLocal.get('withLinksSummary')) {
        formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
    }

    return formatted as FormattedPage;
}
|
||||
|
||||
@CloudHTTPv2({
|
||||
name: 'crawl2',
|
||||
runtime: {
|
||||
@ -604,7 +130,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
})
|
||||
@CloudHTTPv2({
|
||||
runtime: {
|
||||
memory: '4GiB',
|
||||
memory: '8GiB',
|
||||
cpu: 4,
|
||||
timeoutSeconds: 300,
|
||||
concurrency: 22,
|
||||
@ -723,7 +249,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
sseStream.write({
|
||||
event: 'data',
|
||||
@ -754,7 +280,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
|
||||
if (crawlerOptions.timeout === undefined) {
|
||||
@ -770,7 +296,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
|
||||
return formatted;
|
||||
@ -782,24 +308,24 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl);
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
|
||||
if (crawlerOptions.timeout === undefined) {
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||
);
|
||||
}
|
||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||
}
|
||||
}
|
||||
|
||||
@ -807,22 +333,22 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl);
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||
);
|
||||
}
|
||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||
}
|
||||
|
||||
async getTargetUrl(originPath: string, crawlerOptions: CrawlerOptions) {
|
||||
@ -1181,7 +707,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
|
||||
} catch (err) {
|
||||
if (lastSnapshot) {
|
||||
return this.formatSnapshot(mode, lastSnapshot, url);
|
||||
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
||||
}
|
||||
|
||||
throw err;
|
||||
@ -1191,6 +717,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
throw new AssertionFailureError(`No content available`);
|
||||
}
|
||||
|
||||
return this.formatSnapshot(mode, lastSnapshot, url);
|
||||
return this.snapshotFormatter.formatSnapshot(mode, lastSnapshot, url, this.urlValidMs);
|
||||
}
|
||||
}
|
||||
|
@ -18,6 +18,7 @@ import { appendFile } from 'fs/promises';
|
||||
import { createGzip } from 'zlib';
|
||||
import { getFunctions } from 'firebase-admin/functions';
|
||||
import { GoogleAuth } from 'google-auth-library';
|
||||
import { SnapshotFormatter } from '../services/snapshot-formatter';
|
||||
|
||||
dayjs.extend(require('dayjs/plugin/utc'));
|
||||
|
||||
@ -57,6 +58,7 @@ export class DataCrunchingHost extends RPCHost {
|
||||
protected globalLogger: Logger,
|
||||
|
||||
protected crawler: CrawlerHost,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
protected tempFileManager: TempFileManager,
|
||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||
) {
|
||||
@ -265,9 +267,9 @@ export class DataCrunchingHost extends RPCHost {
|
||||
try {
|
||||
const snapshot = JSON.parse(snapshotTxt.toString('utf-8'));
|
||||
|
||||
let formatted = await this.crawler.formatSnapshot('default', snapshot);
|
||||
let formatted = await this.snapshotFormatter.formatSnapshot('default', snapshot);
|
||||
if (!formatted.content) {
|
||||
formatted = await this.crawler.formatSnapshot('markdown', snapshot);
|
||||
formatted = await this.snapshotFormatter.formatSnapshot('markdown', snapshot);
|
||||
}
|
||||
|
||||
await nextDrainDeferred.promise;
|
||||
|
@ -11,11 +11,12 @@ import _ from 'lodash';
|
||||
import { Request, Response } from 'express';
|
||||
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||
import { BraveSearchExplicitOperatorsDto, BraveSearchService } from '../services/brave-search';
|
||||
import { CrawlerHost, ExtraScrappingOptions, FormattedPage } from './crawler';
|
||||
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
||||
import { WebSearchQueryParams } from '../shared/3rd-party/brave-search';
|
||||
import { SearchResult } from '../db/searched';
|
||||
import { WebSearchApiResponse, SearchResult as WebSearchResult } from '../shared/3rd-party/brave-types';
|
||||
import { CrawlerOptions } from '../dto/scrapping-options';
|
||||
import { SnapshotFormatter, FormattedPage } from '../services/snapshot-formatter';
|
||||
|
||||
|
||||
@singleton()
|
||||
@ -36,6 +37,7 @@ export class SearcherHost extends RPCHost {
|
||||
protected threadLocal: AsyncContext,
|
||||
protected braveSearchService: BraveSearchService,
|
||||
protected crawler: CrawlerHost,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
) {
|
||||
super(...arguments);
|
||||
}
|
||||
@ -324,7 +326,7 @@ export class SearcherHost extends RPCHost {
|
||||
if (snapshotMap.has(x)) {
|
||||
return snapshotMap.get(x);
|
||||
}
|
||||
return this.crawler.formatSnapshot(mode, x, urls[i]).then((r) => {
|
||||
return this.snapshotFormatter.formatSnapshot(mode, x, urls[i]).then((r) => {
|
||||
r.title ??= upstreamSearchResult.title;
|
||||
r.description = upstreamSearchResult.description;
|
||||
snapshotMap.set(x, r);
|
||||
|
@ -2,18 +2,19 @@ import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, marshalErrorLike } from 'civkit';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
||||
import { JSDOM, VirtualConsole } from 'jsdom';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import TurndownService from 'turndown';
|
||||
import { Threaded } from '../shared/services/threaded';
|
||||
|
||||
const virtualConsole = new VirtualConsole();
|
||||
virtualConsole.on('error', () => void 0);
|
||||
const pLinkedom = import('linkedom');
|
||||
|
||||
@singleton()
|
||||
export class JSDomControl extends AsyncService {
|
||||
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
linkedom!: Awaited<typeof pLinkedom>;
|
||||
|
||||
constructor(
|
||||
protected globalLogger: Logger,
|
||||
) {
|
||||
@ -22,22 +23,34 @@ export class JSDomControl extends AsyncService {
|
||||
|
||||
override async init() {
|
||||
await this.dependencyReady();
|
||||
this.linkedom = await pLinkedom;
|
||||
this.emit('ready');
|
||||
}
|
||||
|
||||
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
||||
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
||||
targetSelector?: string | string[];
|
||||
removeSelector?: string | string[];
|
||||
withIframe?: boolean;
|
||||
}): PageSnapshot | undefined {
|
||||
}) {
|
||||
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
|
||||
return snapshot;
|
||||
}
|
||||
if (!snapshot?.html) {
|
||||
return snapshot;
|
||||
}
|
||||
|
||||
return this.actualNarrowSnapshot(snapshot, options);
|
||||
}
|
||||
|
||||
@Threaded()
|
||||
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
|
||||
targetSelector?: string | string[];
|
||||
removeSelector?: string | string[];
|
||||
withIframe?: boolean;
|
||||
}): Promise<PageSnapshot | undefined> {
|
||||
|
||||
const t0 = Date.now();
|
||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||
const jsdom = this.linkedom.parseHTML(snapshot.html);
|
||||
const allNodes: Node[] = [];
|
||||
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
||||
if (options?.withIframe) {
|
||||
@ -90,16 +103,16 @@ export class JSDomControl extends AsyncService {
|
||||
let rootDoc: Document;
|
||||
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
||||
rootDoc = allNodes[0] as any;
|
||||
if (rootDoc.body.textContent) {
|
||||
textChunks.push(rootDoc.body.textContent);
|
||||
if (rootDoc.body.innerText) {
|
||||
textChunks.push(rootDoc.body.innerText);
|
||||
}
|
||||
} else {
|
||||
rootDoc = new JSDOM('', { url: snapshot.href, virtualConsole }).window.document;
|
||||
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
|
||||
for (const n of allNodes) {
|
||||
rootDoc.body.appendChild(n);
|
||||
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
||||
if (n.textContent) {
|
||||
textChunks.push(n.textContent);
|
||||
if ((n as HTMLElement).innerText) {
|
||||
textChunks.push((n as HTMLElement).innerText);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -111,11 +124,6 @@ export class JSDomControl extends AsyncService {
|
||||
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
||||
}
|
||||
|
||||
// No innerText in jsdom
|
||||
// https://github.com/jsdom/jsdom/issues/1245
|
||||
const textContent = textChunks.join('\n\n');
|
||||
const cleanedText = textContent?.split('\n').map((x: any) => x.trimEnd()).join('\n').replace(/\n{3,}/g, '\n\n');
|
||||
|
||||
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
||||
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
||||
.flat()
|
||||
@ -135,7 +143,7 @@ export class JSDomControl extends AsyncService {
|
||||
title: snapshot.title || jsdom.window.document.title,
|
||||
parsed,
|
||||
html: rootDoc.documentElement.outerHTML,
|
||||
text: cleanedText,
|
||||
text: textChunks.join('\n'),
|
||||
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
||||
} as PageSnapshot;
|
||||
|
||||
@ -147,11 +155,13 @@ export class JSDomControl extends AsyncService {
|
||||
return r;
|
||||
}
|
||||
|
||||
@Threaded()
|
||||
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
|
||||
const t0 = Date.now();
|
||||
const extendedSnapshot = { ...snapshot } as ExtendedSnapshot;
|
||||
try {
|
||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||
const jsdom = this.linkedom.parseHTML(snapshot.html);
|
||||
|
||||
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
||||
const links = Array.from(jsdom.window.document.querySelectorAll('a[href]'))
|
||||
.map((x: any) => [x.getAttribute('href'), x.textContent.replace(/\s+/g, ' ').trim()])
|
||||
@ -207,9 +217,8 @@ export class JSDomControl extends AsyncService {
|
||||
|
||||
return extendedSnapshot;
|
||||
}
|
||||
|
||||
snippetToElement(snippet?: string, url?: string) {
|
||||
const parsed = new JSDOM(snippet || '', { url, virtualConsole });
|
||||
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
||||
|
||||
return parsed.window.document.documentElement;
|
||||
}
|
||||
|
@ -1,7 +1,7 @@
|
||||
import os from 'os';
|
||||
import fs from 'fs';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency, Deferred, perNextTick } from 'civkit';
|
||||
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, Deferred, perNextTick } from 'civkit';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
|
||||
import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
|
||||
@ -207,7 +207,6 @@ export class PuppeteerControl extends AsyncService {
|
||||
browser!: Browser;
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
private __healthCheckInterval?: NodeJS.Timeout;
|
||||
private __reqCapInterval?: NodeJS.Timeout;
|
||||
|
||||
__loadedPage: Page[] = [];
|
||||
@ -217,7 +216,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
livePages = new Set<Page>();
|
||||
lastPageCratedAt: number = 0;
|
||||
|
||||
rpsCap: number = 300;
|
||||
rpsCap: number = 500;
|
||||
lastReqSentAt: number = 0;
|
||||
requestDeferredQueue: Deferred<boolean>[] = [];
|
||||
|
||||
@ -235,15 +234,7 @@ export class PuppeteerControl extends AsyncService {
|
||||
});
|
||||
}
|
||||
|
||||
briefPages() {
|
||||
this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`);
|
||||
}
|
||||
|
||||
override async init() {
|
||||
if (this.__healthCheckInterval) {
|
||||
clearInterval(this.__healthCheckInterval);
|
||||
this.__healthCheckInterval = undefined;
|
||||
}
|
||||
if (this.__reqCapInterval) {
|
||||
clearInterval(this.__reqCapInterval);
|
||||
this.__reqCapInterval = undefined;
|
||||
@ -276,40 +267,9 @@ export class PuppeteerControl extends AsyncService {
|
||||
|
||||
this.emit('ready');
|
||||
|
||||
this.__healthCheckInterval = setInterval(() => this.healthCheck(), 30_000).unref();
|
||||
this.newPage().then((r) => this.__loadedPage.push(r));
|
||||
}
|
||||
|
||||
@maxConcurrency(1)
|
||||
async healthCheck() {
|
||||
if (Date.now() - this.lastPageCratedAt <= 10_000) {
|
||||
this.briefPages();
|
||||
return;
|
||||
}
|
||||
const healthyPage = await this.newPage().catch((err) => {
|
||||
this.logger.warn(`Health check failed`, { err: marshalErrorLike(err) });
|
||||
return null;
|
||||
});
|
||||
|
||||
if (healthyPage) {
|
||||
this.__loadedPage.push(healthyPage);
|
||||
|
||||
if (this.__loadedPage.length > 3) {
|
||||
this.ditchPage(this.__loadedPage.shift()!);
|
||||
}
|
||||
|
||||
this.briefPages();
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
this.logger.warn(`Trying to clean up...`);
|
||||
this.browser.process()?.kill('SIGKILL');
|
||||
Reflect.deleteProperty(this, 'browser');
|
||||
this.emit('crippled');
|
||||
this.logger.warn(`Browser killed`);
|
||||
}
|
||||
|
||||
@perNextTick()
|
||||
reqCapRoutine() {
|
||||
const now = Date.now();
|
||||
@ -620,7 +580,7 @@ document.addEventListener('load', handlePageLoad);
|
||||
try {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = await page.screenshot();
|
||||
screenshot = Buffer.from(await page.screenshot());
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
@ -643,8 +603,8 @@ document.addEventListener('load', handlePageLoad);
|
||||
if (salvaged) {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = await page.screenshot();
|
||||
pageshot = await page.screenshot({ fullPage: true });
|
||||
screenshot = Buffer.from(await page.screenshot());
|
||||
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
@ -678,8 +638,8 @@ document.addEventListener('load', handlePageLoad);
|
||||
.then(async () => {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = await page.screenshot();
|
||||
pageshot = await page.screenshot({ fullPage: true });
|
||||
screenshot = Buffer.from(await page.screenshot());
|
||||
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
@ -716,8 +676,8 @@ document.addEventListener('load', handlePageLoad);
|
||||
break;
|
||||
}
|
||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||
screenshot = await page.screenshot();
|
||||
pageshot = await page.screenshot({ fullPage: true });
|
||||
screenshot = Buffer.from(await page.screenshot());
|
||||
pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
|
||||
lastHTML = snapshot.html;
|
||||
}
|
||||
if (snapshot || screenshot) {
|
||||
|
539
backend/functions/src/services/snapshot-formatter.ts
Normal file
539
backend/functions/src/services/snapshot-formatter.ts
Normal file
@ -0,0 +1,539 @@
|
||||
import { randomUUID } from 'crypto';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, HashManager, marshalErrorLike } from 'civkit';
|
||||
import TurndownService from 'turndown';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { PageSnapshot } from './puppeteer';
|
||||
import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket';
|
||||
import { AsyncContext } from '../shared/services/async-context';
|
||||
import { Threaded } from '../shared/services/threaded';
|
||||
import { JSDomControl } from './jsdom';
|
||||
import { AltTextService } from './alt-text';
|
||||
import { PDFExtractor } from './pdf-extract';
|
||||
import { cleanAttribute } from '../utils/misc';
|
||||
import _ from 'lodash';
|
||||
|
||||
|
||||
/**
 * Response payload produced for a formatted page.
 *
 * Which fields are populated depends on the requested mode; everything
 * except the disposer is optional.
 */
export interface FormattedPage {
    // Page identity / metadata.
    title?: string;
    description?: string;
    url?: string;
    publishedTime?: string;

    // Alternative renderings of the page body.
    content?: string;
    html?: string;
    text?: string;

    // Viewport screenshot and full-page screenshot, by URL or raw bytes.
    screenshotUrl?: string;
    screenshot?: Buffer;
    pageshotUrl?: string;
    pageshot?: Buffer;

    // Optional summaries, keyed by a human-readable label.
    links?: Record<string, string>;
    images?: Record<string, string>;

    // Token accounting; several spellings are accepted.
    usage?: {
        total_tokens?: number;
        totalTokens?: number;
        tokens?: number;
    };

    // Plain-text rendering of the whole payload.
    textRepresentation?: string;

    // Explicit-resource-management hook.
    [Symbol.dispose]: () => void;
}
|
||||
|
||||
// Shared hasher configured with 'md5' / 'hex'; used below to derive pseudo blob: URLs from data: image sources.
export const md5Hasher = new HashManager('md5', 'hex');
|
||||
|
||||
/**
 * Formats a crawled page snapshot into the payload returned to the caller
 * (markdown, html, text, screenshot or pageshot), including optional
 * image/link summaries controlled by per-request flags in `threadLocal`.
 */
@singleton()
export class SnapshotFormatter extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    // Turndown plugins applied on top of the custom rules from getTurndown().
    turnDownPlugins = [require('turndown-plugin-gfm').tables];

    constructor(
        protected globalLogger: Logger,
        protected jsdomControl: JSDomControl,
        protected altTextService: AltTextService,
        protected pdfExtractor: PDFExtractor,
        protected threadLocal: AsyncContext,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Uploads a PNG buffer under `instant-screenshots/` and returns a signed
     * download URL that expires `urlValidMs` milliseconds from now.
     */
    protected async storeInstantShot(image: Buffer, urlValidMs: number) {
        const fid = `instant-screenshots/${randomUUID()}`;
        await this.firebaseObjectStorage.saveFile(fid, image, {
            metadata: {
                contentType: 'image/png',
            }
        });

        return this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + urlValidMs);
    }

    /**
     * Converts a `url -> alt` map into a `label -> url` map, where the label is
     * `Image <serials>[: <alt>]` and `<serials>` are the positions at which the
     * image was encountered (falling back to the entry's 1-based position).
     */
    protected labelImageSummary(imageSummary: { [k: string]: string; }, imageIdxTrack: Map<string, number[]>) {
        return _(imageSummary)
            .toPairs()
            .map(
                ([url, alt], i) => {
                    return [`Image ${(imageIdxTrack.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
                }
            ).fromPairs()
            .value();
    }

    /**
     * Formats `snapshot` according to `mode`.
     *
     * - `screenshot` / `pageshot`: uploads the image if no URL exists yet and returns its URL.
     * - `html` / `text`: returns the raw html / text of the snapshot.
     * - anything else: converts the page (or its Readability extract) to markdown.
     *   `markdown` returns the bare markdown as text representation; other modes
     *   wrap it with title, source URL and optional summaries.
     *
     * @param mode         Requested output mode.
     * @param snapshot     Page snapshot; `screenshotUrl` / `pageshotUrl` / pdf-derived fields may be written back to it.
     * @param nominalUrl   URL reported to the user and used as base for relative links when `snapshot.rebase` is absent.
     * @param urlValidMs   Lifetime of signed screenshot URLs, default 4 hours.
     * @returns The formatted page; `textRepresentation` is defined as a non-enumerable property.
     */
    @Threaded()
    async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
        screenshotUrl?: string;
        pageshotUrl?: string;
    }, nominalUrl?: URL, urlValidMs = 3600 * 1000 * 4) {
        const t0 = Date.now();
        const logElapsed = () => {
            const dt = Date.now() - t0;
            this.logger.info(`Formatting took ${dt}ms`, { mode, url: nominalUrl?.toString(), dt });
        };

        if (mode === 'screenshot') {
            if (snapshot.screenshot && !snapshot.screenshotUrl) {
                snapshot.screenshotUrl = await this.storeInstantShot(snapshot.screenshot, urlValidMs);
            }

            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                // html: snapshot.html,
                screenshotUrl: snapshot.screenshotUrl,
            };

            Object.defineProperty(f, 'textRepresentation', { value: `${f.screenshotUrl}\n`, enumerable: false });
            logElapsed();

            return f as FormattedPage;
        }
        if (mode === 'pageshot') {
            if (snapshot.pageshot && !snapshot.pageshotUrl) {
                snapshot.pageshotUrl = await this.storeInstantShot(snapshot.pageshot, urlValidMs);
            }

            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                html: snapshot.html,
                pageshotUrl: snapshot.pageshotUrl,
            } as FormattedPage;

            Object.defineProperty(f, 'textRepresentation', { value: `${f.pageshotUrl}\n`, enumerable: false });
            logElapsed();

            return f;
        }
        if (mode === 'html') {
            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                html: snapshot.html,
            } as FormattedPage;

            Object.defineProperty(f, 'textRepresentation', { value: snapshot.html, enumerable: false });
            logElapsed();

            return f;
        }

        // A snapshot with PDFs but no title is treated as a PDF document:
        // the extracted PDF replaces title/text/parsed on the snapshot.
        let pdfMode = false;
        if (snapshot.pdfs?.length && !snapshot.title) {
            const pdf = await this.pdfExtractor.cachedExtract(snapshot.pdfs[0],
                this.threadLocal.get('cacheTolerance')
            );
            if (pdf) {
                pdfMode = true;
                snapshot.title = pdf.meta?.Title;
                snapshot.text = pdf.text || snapshot.text;
                snapshot.parsed = {
                    content: pdf.content,
                    textContent: pdf.content,
                    length: pdf.content?.length,
                    byline: pdf.meta?.Author,
                    lang: pdf.meta?.Language || undefined,
                    title: pdf.meta?.Title,
                    publishedTime: this.pdfExtractor.parsePdfDate(pdf.meta?.ModDate || pdf.meta?.CreationDate)?.toISOString(),
                };
            }
        }

        if (mode === 'text') {
            const f = {
                ...this.getGeneralSnapshotMixins(snapshot),
                text: snapshot.text,
            } as FormattedPage;

            Object.defineProperty(f, 'textRepresentation', { value: snapshot.text, enumerable: false });
            logElapsed();

            return f;
        }
        const imgDataUrlToObjectUrl = !this.threadLocal.get('keepImgDataUrl');

        let contentText = '';
        const imageSummary = {} as { [k: string]: string; };
        const imageIdxTrack = new Map<string, number[]>();
        const uid = this.threadLocal.get('uid');
        // Single-pass block: `break` leaves early once contentText is decided.
        do {
            if (pdfMode) {
                contentText = snapshot.parsed?.content || snapshot.text;
                break;
            }

            if (
                snapshot.maxElemDepth! > 256 ||
                (!uid && snapshot.elemCount! > 10_000) ||
                snapshot.elemCount! > 70_000
            ) {
                this.logger.warn('Degrading to text to protect the server', { url: snapshot.href });
                contentText = snapshot.text;
                break;
            }

            const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
            let toBeTurnedToMd = jsDomElementOfHTML;
            let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
            if (mode !== 'markdown' && snapshot.parsed?.content) {
                const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
                const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
                const par2 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed);

                // If Readability did its job
                if (par2.length >= 0.3 * par1.length) {
                    turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                    toBeTurnedToMd = jsDomElementOfParsed;
                }
            }

            for (const plugin of this.turnDownPlugins) {
                turnDownService = turnDownService.use(plugin);
            }
            const urlToAltMap: { [k: string]: string | undefined; } = {};
            if (snapshot.imgs?.length && this.threadLocal.get('withGeneratedAlt')) {
                const tasks = _.uniqBy((snapshot.imgs || []), 'src').map(async (x) => {
                    const r = await this.altTextService.getAltText(x).catch((err: any) => {
                        this.logger.warn(`Failed to get alt text for ${x.src}`, { err: marshalErrorLike(err) });
                        return undefined;
                    });
                    if (r && x.src) {
                        urlToAltMap[x.src.trim()] = r;
                    }
                });

                await Promise.all(tasks);
            }
            let imgIdx = 0;
            turnDownService.addRule('img-generated-alt', {
                filter: 'img',
                replacement: (_content, node: any) => {
                    let linkPreferredSrc = (node.getAttribute('src') || '').trim();
                    if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
                        const dataSrc = (node.getAttribute('data-src') || '').trim();
                        if (dataSrc && !dataSrc.startsWith('data:')) {
                            linkPreferredSrc = dataSrc;
                        }
                    }
                    if (!linkPreferredSrc) {
                        // An empty reference would resolve to the page URL itself; that is not an image.
                        return '';
                    }

                    let src;
                    try {
                        src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
                    } catch (_err) {
                        void 0;
                    }
                    if (!src) {
                        return '';
                    }
                    const alt = cleanAttribute(node.getAttribute('alt'));
                    const mapped = urlToAltMap[src];
                    const imgSerial = ++imgIdx;
                    const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
                    idxArr.push(imgSerial);
                    imageIdxTrack.set(src, idxArr);

                    // Generated alt text wins over the alt attribute.
                    const label = mapped || alt;
                    imageSummary[src] = label || '';

                    let target = src;
                    if (src.startsWith('data:') && imgDataUrlToObjectUrl) {
                        // Replace the (potentially huge) data URL with a stable pseudo blob: URL.
                        const mappedUrl = new URL(`blob:${nominalUrl?.origin || ''}/${md5Hasher.hash(src)}`);
                        mappedUrl.protocol = 'blob:';
                        target = mappedUrl.toString();
                    }

                    return label ? `![Image ${imgSerial}: ${label}](${target})` : `![Image ${imgSerial}](${target})`;
                }
            });

            if (toBeTurnedToMd) {
                try {
                    contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
                } catch (err) {
                    this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                    const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                    try {
                        contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
                    } catch (err2) {
                        this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                    }
                }
            }

            // Retry from the raw HTML string when nothing was produced, or when the
            // Readability extract came back looking like untouched markup.
            const looksLikeMarkup = contentText.startsWith('<') && contentText.endsWith('>');
            if (!contentText || (looksLikeMarkup && toBeTurnedToMd !== jsDomElementOfHTML)) {
                try {
                    contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
                } catch (err) {
                    this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
                    const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
                    try {
                        contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
                    } catch (err2) {
                        this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
                    }
                }
            }
            // Last resort: plain text.
            // NOTE(review): this check uses OR where the one above uses AND — confirm that is intended.
            if (!contentText || (contentText.startsWith('<') || contentText.endsWith('>'))) {
                contentText = snapshot.text;
            }
        } while (false);

        const cleanText = (contentText || '').trim();

        const formatted: FormattedPage = {
            title: (snapshot.parsed?.title || snapshot.title || '').trim(),
            url: nominalUrl?.toString() || snapshot.href?.trim(),
            content: cleanText,
            publishedTime: snapshot.parsed?.publishedTime || undefined,
            [Symbol.dispose]: () => { },
        };

        if (this.threadLocal.get('withImagesSummary')) {
            formatted.images = this.labelImageSummary(imageSummary, imageIdxTrack);
        }
        if (this.threadLocal.get('withLinksSummary')) {
            formatted.links = _.invert(this.jsdomControl.inferSnapshot(snapshot).links || {});
        }

        const renderTextRepresentation = (): string => {
            if (mode === 'markdown') {
                return formatted.content as string;
            }

            const mixins = [];
            if (formatted.publishedTime) {
                mixins.push(`Published Time: ${formatted.publishedTime}`);
            }
            const suffixMixins = [];
            if (formatted.images) {
                const imageSummaryChunks = ['Images:'];
                for (const [k, v] of Object.entries(formatted.images)) {
                    imageSummaryChunks.push(`- ![${k}](${v})`);
                }
                if (imageSummaryChunks.length === 1) {
                    imageSummaryChunks.push('This page does not seem to contain any images.');
                }
                suffixMixins.push(imageSummaryChunks.join('\n'));
            }
            if (formatted.links) {
                const linkSummaryChunks = ['Links/Buttons:'];
                for (const [k, v] of Object.entries(formatted.links)) {
                    linkSummaryChunks.push(`- [${k}](${v})`);
                }
                if (linkSummaryChunks.length === 1) {
                    linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
                }
                suffixMixins.push(linkSummaryChunks.join('\n'));
            }

            return `Title: ${formatted.title}

URL Source: ${formatted.url}
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
Markdown Content:
${formatted.content}
${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
        };

        Object.defineProperty(formatted, 'textRepresentation', { value: renderTextRepresentation(), enumerable: false });
        logElapsed();

        return formatted as FormattedPage;
    }

    /**
     * Builds the optional `images` / `links` summaries shared by the
     * non-markdown modes. The snapshot is inferred at most once, and only
     * when one of the summary flags is set in `threadLocal`.
     */
    getGeneralSnapshotMixins(snapshot: PageSnapshot) {
        let inferred;
        const mixin: any = {};
        if (this.threadLocal.get('withImagesSummary')) {
            inferred ??= this.jsdomControl.inferSnapshot(snapshot);
            const imageSummary = {} as { [k: string]: string; };
            const imageIdxTrack = new Map<string, number[]>();

            let imgIdx = 0;

            for (const img of inferred.imgs ?? []) {
                const imgSerial = ++imgIdx;
                const idxArr = imageIdxTrack.has(img.src) ? imageIdxTrack.get(img.src)! : [];
                idxArr.push(imgSerial);
                imageIdxTrack.set(img.src, idxArr);
                imageSummary[img.src] = img.alt || '';
            }

            mixin.images = this.labelImageSummary(imageSummary, imageIdxTrack);
        }
        if (this.threadLocal.get('withLinksSummary')) {
            inferred ??= this.jsdomControl.inferSnapshot(snapshot);
            mixin.links = _.invert(inferred.links || {});
        }

        return mixin;
    }

    /**
     * Creates a Turndown service with this project's custom rules.
     *
     * @param options.noRules                When truthy, skips the rules that drop
     *                                       irrelevant tags / svg and render `<title>` as a heading.
     * @param options.url                    Base URL used to absolutize link targets and to
     *                                       derive pseudo blob: URLs.
     * @param options.imgDataUrlToObjectUrl  When set, `data:` image sources are replaced by
     *                                       `blob:` URLs keyed by the md5 of the source.
     */
    getTurndown(options?: {
        noRules?: boolean | string,
        url?: string | URL;
        imgDataUrlToObjectUrl?: boolean;
    }) {
        const turnDownService = new TurndownService({
            codeBlockStyle: 'fenced',
            preformattedCode: true,
        } as any);
        if (!options?.noRules) {
            turnDownService.addRule('remove-irrelevant', {
                filter: ['meta', 'style', 'script', 'noscript', 'link', 'textarea', 'select'],
                replacement: () => ''
            });
            turnDownService.addRule('truncate-svg', {
                filter: 'svg' as any,
                replacement: () => ''
            });
            turnDownService.addRule('title-as-h1', {
                filter: ['title'],
                replacement: (innerText) => `${innerText}\n===============\n`
            });
        }

        if (options?.imgDataUrlToObjectUrl) {
            turnDownService.addRule('data-url-to-pseudo-object-url', {
                filter: (node) => Boolean(node.tagName === 'IMG' && node.getAttribute('src')?.startsWith('data:')),
                replacement: (_content, node: any) => {
                    const src = (node.getAttribute('src') || '').trim();
                    const alt = cleanAttribute(node.getAttribute('alt')) || '';

                    if (options.url) {
                        try {
                            const refUrl = new URL(options.url);
                            const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(src)}`);

                            return `![${alt || 'image'}](${mappedUrl})`;
                        } catch (_err) {
                            // Unparseable base URL: fall through to the origin-less form below.
                            void 0;
                        }
                    }

                    return `![${alt || 'image'}](blob:${md5Hasher.hash(src)})`;
                }
            });
        }

        turnDownService.addRule('improved-paragraph', {
            filter: 'p',
            replacement: (innerText) => {
                const trimmed = innerText.trim();
                if (!trimmed) {
                    return '';
                }

                return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
            }
        });
        turnDownService.addRule('improved-inline-link', {
            filter: function (node, tdOptions) {
                return Boolean(
                    tdOptions.linkStyle === 'inlined' &&
                    node.nodeName === 'A' &&
                    node.getAttribute('href')
                );
            },

            replacement: function (content, node: any) {
                let title = cleanAttribute(node.getAttribute('title'));
                if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';

                const fixedContent = content.replace(/\s+/g, ' ').trim();
                let fixedHref = (node.getAttribute('href') || '').replace(/\s+/g, '').trim();
                if (options?.url) {
                    try {
                        fixedHref = new URL(fixedHref, options.url).toString();
                    } catch (_err) {
                        void 0;
                    }
                }
                // Escape parentheses only AFTER resolving: the URL parser rewrites
                // backslashes to slashes for http(s) URLs, which would corrupt the path.
                fixedHref = fixedHref.replace(/([()])/g, '\\$1');

                return `[${fixedContent}](${fixedHref}${title || ''})`;
            }
        });
        turnDownService.addRule('improved-code', {
            filter: function (node: any) {
                let hasSiblings = node.previousSibling || node.nextSibling;
                let isCodeBlock = node.parentNode?.nodeName === 'PRE' && !hasSiblings;

                return node.nodeName === 'CODE' && !isCodeBlock;
            },

            replacement: function (inputContent: any) {
                if (!inputContent) return '';
                const content: string = inputContent;

                // Multi-line content is fenced; everything else stays an inline code span.
                const isBlock = content.includes('\n');
                const backtickRuns: string[] = content.match(/`+/gm) || [];
                // Pick a delimiter that does not equal any backtick run inside the content.
                let delimiter = isBlock ? '```' : '`';
                while (backtickRuns.includes(delimiter)) delimiter = delimiter + '`';

                if (isBlock) {
                    return delimiter + '\n' + content + (content.endsWith('\n') ? '' : '\n') + delimiter;
                }

                // Pad both sides so a leading/trailing backtick or space survives rendering.
                const padding = /^`|^ .*?[^ ].* $|`$/.test(content) ? ' ' : '';

                return delimiter + padding + content + padding + delimiter;
            }
        });

        return turnDownService;
    }

}
|
||||
|
||||
// Resolve the SnapshotFormatter instance from the tsyringe container (the class is registered with @singleton()).
const snapshotFormatter = container.resolve(SnapshotFormatter);
|
||||
|
||||
// The resolved instance is this module's default export.
export default snapshotFormatter;
|
Loading…
x
Reference in New Issue
Block a user