feat: links and images summary (#63)

* wip: dedicated link and image summary

* fix

* fix

* fix

* fix: docs

* fix

* fix

* fix
This commit is contained in:
Yanlong Wang 2024-05-21 17:34:19 +08:00 committed by GitHub
parent df71c9a534
commit a8e0628460
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 239 additions and 33 deletions

View File

@ -38,6 +38,8 @@ export interface FormattedPage {
text?: string; text?: string;
screenshotUrl?: string; screenshotUrl?: string;
screenshot?: Buffer; screenshot?: Buffer;
links?: { [k: string]: string; };
images?: { [k: string]: string; };
toString: () => string; toString: () => string;
} }
@ -135,9 +137,43 @@ export class CrawlerHost extends RPCHost {
return turnDownService; return turnDownService;
} }
/**
 * Builds the optional `images` / `links` summary fields that get spread into a formatted response.
 *
 * Each summary is produced only when the matching flag (`withImagesSummary` /
 * `withLinksSummary`) read from `this.threadLocal` is truthy.
 *
 * @param snapshot Page snapshot passed to `puppeteerControl.inferSnapshot`.
 * @returns An object carrying `images` and/or `links` when enabled; an empty object otherwise.
 */
getGeneralSnapshotMixins(snapshot: PageSnapshot) {
    const inferred = this.puppeteerControl.inferSnapshot(snapshot);
    const mixin: any = {};

    if (this.threadLocal.get('withImagesSummary')) {
        // src -> alt text; a later image with the same src overwrites the earlier alt.
        const altBySrc = {} as { [k: string]: string; };
        // src -> every 1-based position at which that src occurs in `inferred.imgs`.
        const serialsBySrc = new Map<string, number[]>();

        inferred.imgs.forEach((img, position) => {
            const serials = serialsBySrc.get(img.src) ?? [];
            serials.push(position + 1);
            serialsBySrc.set(img.src, serials);
            altBySrc[img.src] = img.alt || '';
        });

        // Re-key as "Image <serials>[: <alt>]" -> src.
        const labelledPairs = _.toPairs(altBySrc).map(([url, alt], i) => {
            const serials = serialsBySrc.get(url) || [i + 1];
            const caption = alt ? `: ${alt}` : '';

            return [`Image ${serials.join(',')}${caption}`, url];
        });
        mixin.images = _.fromPairs(labelledPairs);
    }

    if (this.threadLocal.get('withLinksSummary')) {
        // Swap keys and values of the inferred link map.
        mixin.links = _.invert(inferred.links || {});
    }

    return mixin;
}
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & { async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
screenshotUrl?: string; screenshotUrl?: string;
}, nominalUrl?: URL){ }, nominalUrl?: URL) {
if (mode === 'screenshot') { if (mode === 'screenshot') {
if (snapshot.screenshot && !snapshot.screenshotUrl) { if (snapshot.screenshot && !snapshot.screenshotUrl) {
const fid = `instant-screenshots/${randomUUID()}`; const fid = `instant-screenshots/${randomUUID()}`;
@ -150,6 +186,7 @@ export class CrawlerHost extends RPCHost {
} }
return { return {
...this.getGeneralSnapshotMixins(snapshot),
screenshotUrl: snapshot.screenshotUrl, screenshotUrl: snapshot.screenshotUrl,
toString() { toString() {
return this.screenshotUrl; return this.screenshotUrl;
@ -158,6 +195,7 @@ export class CrawlerHost extends RPCHost {
} }
if (mode === 'html') { if (mode === 'html') {
return { return {
...this.getGeneralSnapshotMixins(snapshot),
html: snapshot.html, html: snapshot.html,
toString() { toString() {
return this.html; return this.html;
@ -166,6 +204,7 @@ export class CrawlerHost extends RPCHost {
} }
if (mode === 'text') { if (mode === 'text') {
return { return {
...this.getGeneralSnapshotMixins(snapshot),
text: snapshot.text, text: snapshot.text,
toString() { toString() {
return this.text; return this.text;
@ -193,6 +232,8 @@ export class CrawlerHost extends RPCHost {
await Promise.all(tasks); await Promise.all(tasks);
} }
let imgIdx = 0; let imgIdx = 0;
const imageSummary = {} as { [k: string]: string; };
const imageIdxTrack = new Map<string, number[]>();
turnDownService.addRule('img-generated-alt', { turnDownService.addRule('img-generated-alt', {
filter: 'img', filter: 'img',
replacement: (_content, node) => { replacement: (_content, node) => {
@ -215,10 +256,19 @@ export class CrawlerHost extends RPCHost {
return ''; return '';
} }
const mapped = urlToAltMap[src]; const mapped = urlToAltMap[src];
imgIdx++; const imgSerial = ++imgIdx;
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
idxArr.push(imgSerial);
imageIdxTrack.set(src, idxArr);
if (mapped) { if (mapped) {
imageSummary[src] = mapped || alt;
return `![Image ${imgIdx}: ${mapped || alt}](${src})`; return `![Image ${imgIdx}: ${mapped || alt}](${src})`;
} }
imageSummary[src] = alt || '';
return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`; return alt ? `![Image ${imgIdx}: ${alt}](${src})` : `![Image ${imgIdx}](${src})`;
} }
}); });
@ -260,20 +310,41 @@ export class CrawlerHost extends RPCHost {
const cleanText = (contentText || '').trim(); const cleanText = (contentText || '').trim();
const formatted = { const formatted: FormattedPage = {
title: (snapshot.parsed?.title || snapshot.title || '').trim(), title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: nominalUrl?.toString() || snapshot.href?.trim(), url: nominalUrl?.toString() || snapshot.href?.trim(),
content: cleanText, content: cleanText,
publishedTime: snapshot.parsed?.publishedTime || undefined, publishedTime: snapshot.parsed?.publishedTime || undefined,
toString() { toString() {
if (mode === 'markdown') {
return this.content as string;
}
const mixins = []; const mixins = [];
if (this.publishedTime) { if (this.publishedTime) {
mixins.push(`Published Time: ${this.publishedTime}`); mixins.push(`Published Time: ${this.publishedTime}`);
} }
const suffixMixins = [];
if (mode === 'markdown') { if (this.images) {
return this.content; const imageSummaryChunks = ['Images:'];
for (const [k, v] of Object.entries(this.images)) {
imageSummaryChunks.push(`- ![${k}](${v})`);
}
if (imageSummaryChunks.length === 1) {
imageSummaryChunks.push('This page does not seem to contain any images.');
}
suffixMixins.push(imageSummaryChunks.join('\n'));
}
if (this.links) {
const linkSummaryChunks = ['Links/Buttons:'];
for (const [k, v] of Object.entries(this.links)) {
linkSummaryChunks.push(`- [${k}](${v})`);
}
if (linkSummaryChunks.length === 1) {
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
}
suffixMixins.push(linkSummaryChunks.join('\n'));
} }
return `Title: ${this.title} return `Title: ${this.title}
@ -282,10 +353,25 @@ URL Source: ${this.url}
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''} ${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
Markdown Content: Markdown Content:
${this.content} ${this.content}
`; ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
} }
}; };
if (this.threadLocal.get('withImagesSummary')) {
formatted.images =
_(imageSummary)
.toPairs()
.map(
([url, alt], i) => {
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
}
).fromPairs()
.value();
}
if (this.threadLocal.get('withLinksSummary')) {
formatted.links = _.invert(this.puppeteerControl.inferSnapshot(snapshot).links || {});
}
return formatted as FormattedPage; return formatted as FormattedPage;
} }
@ -313,9 +399,9 @@ ${this.content}
operation: { operation: {
parameters: { parameters: {
'Accept': { 'Accept': {
description: `Specifies your preference for the response format. \n\n` + description: `Specifies your preference for the response format.\n\n` +
`Supported formats:\n` + `Supported formats: \n` +
`- text/event-stream\n` + `- text/event - stream\n` +
`- application/json or text/json\n` + `- application/json or text/json\n` +
`- text/plain` `- text/plain`
, ,
@ -333,8 +419,8 @@ ${this.content}
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-Respond-With': { 'X-Respond-With': {
description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` + description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
`Supported formats:\n` + `Supported formats: \n` +
`- markdown\n` + `- markdown\n` +
`- html\n` + `- html\n` +
`- text\n` + `- text\n` +
@ -344,22 +430,22 @@ ${this.content}
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-Wait-For-Selector': { 'X-Wait-For-Selector': {
description: `Specifies a CSS selector to wait for the appearance of such an element before returning. \n\n` + description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
'Example: `X-Wait-For-Selector: .content-block`\n' 'Example: `X-Wait-For-Selector: .content-block`\n'
, ,
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-Target-Selector': { 'X-Target-Selector': {
description: `Specifies a CSS selector for return target instead of the full html. \n\n` + description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
'Implies `X-Wait-For-Selector: (same selector)`' 'Implies `X-Wait-For-Selector: (same selector)`'
, ,
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-Proxy-Url': { 'X-Proxy-Url': {
description: `Specifies your custom proxy if you prefer to use one. \n\n` + description: `Specifies your custom proxy if you prefer to use one.\n\n` +
`Supported protocols:\n` + `Supported protocols: \n` +
`- http\n` + `- http\n` +
`- https\n` + `- https\n` +
`- socks4\n` + `- socks4\n` +
@ -375,7 +461,18 @@ ${this.content}
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-With-Generated-Alt': { 'X-With-Generated-Alt': {
description: `Enable automatic alt-text generating for images without an meaningful alt-text.`, description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
`Note: Does not work when \`X-Respond-With\` is specified`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Images-Summary': {
description: `Enable dedicated summary section for images on the page.`,
in: 'header',
schema: { type: 'string' }
},
'X-With-links-Summary': {
description: `Enable dedicated summary section for hyper links on the page.`,
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
@ -465,6 +562,8 @@ ${this.content}
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default'; const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt')); const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
const noCache = Boolean(ctx.req.get('x-no-cache')); const noCache = Boolean(ctx.req.get('x-no-cache'));
let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000; let cacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
if (isNaN(cacheTolerance)) { if (isNaN(cacheTolerance)) {
@ -491,6 +590,8 @@ ${this.content}
}); });
} }
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt); this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
this.threadLocal.set('withLinksSummary', withLinksSummary);
this.threadLocal.set('withImagesSummary', withImagesSummary);
const crawlOpts: ExtraScrappingOptions = { const crawlOpts: ExtraScrappingOptions = {
proxyUrl: ctx.req.get('x-proxy-url'), proxyUrl: ctx.req.get('x-proxy-url'),

View File

@ -116,7 +116,18 @@ export class SearcherHost extends RPCHost {
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-With-Generated-Alt': { 'X-With-Generated-Alt': {
description: `Enable automatic alt-text generating for images without an meaningful alt-text.`, description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
`Note: Does not work when \`X-Respond-With\` is specified`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Images-Summary': {
description: `Enable dedicated summary section for images on the page.`,
in: 'header',
schema: { type: 'string' }
},
'X-With-links-Summary': {
description: `Enable dedicated summary section for hyper links on the page.`,
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
@ -189,6 +200,8 @@ export class SearcherHost extends RPCHost {
const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default'; const customMode = ctx.req.get('x-respond-with') || ctx.req.get('x-return-format') || 'default';
const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt')); const withGeneratedAlt = Boolean(ctx.req.get('x-with-generated-alt'));
const withLinksSummary = Boolean(ctx.req.get('x-with-links-summary'));
const withImagesSummary = Boolean(ctx.req.get('x-with-images-summary'));
const noCache = Boolean(ctx.req.get('x-no-cache')); const noCache = Boolean(ctx.req.get('x-no-cache'));
let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000; let pageCacheTolerance = parseInt(ctx.req.get('x-cache-tolerance') || '') * 1000;
if (isNaN(pageCacheTolerance)) { if (isNaN(pageCacheTolerance)) {
@ -211,6 +224,9 @@ export class SearcherHost extends RPCHost {
}); });
} }
this.threadLocal.set('withGeneratedAlt', withGeneratedAlt); this.threadLocal.set('withGeneratedAlt', withGeneratedAlt);
this.threadLocal.set('withLinksSummary', withLinksSummary);
this.threadLocal.set('withImagesSummary', withImagesSummary);
const crawlOpts: ScrappingOptions = { const crawlOpts: ScrappingOptions = {
proxyUrl: ctx.req.get('x-proxy-url'), proxyUrl: ctx.req.get('x-proxy-url'),
cookies, cookies,
@ -395,11 +411,33 @@ export class SearcherHost extends RPCHost {
mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`); mixins.push(`[${i + 1}] Published Time: ${this.publishedTime}`);
} }
const suffixMixins = [];
if (this.images) {
const imageSummaryChunks = [`[${i + 1}] Images:`];
for (const [k, v] of Object.entries(this.images)) {
imageSummaryChunks.push(`- ![${k}](${v})`);
}
if (imageSummaryChunks.length === 1) {
imageSummaryChunks.push('This page does not seem to contain any images.');
}
suffixMixins.push(imageSummaryChunks.join('\n'));
}
if (this.links) {
const linkSummaryChunks = [`[${i + 1}] Links/Buttons:`];
for (const [k, v] of Object.entries(this.links)) {
linkSummaryChunks.push(`- [${k}](${v})`);
}
if (linkSummaryChunks.length === 1) {
linkSummaryChunks.push('This page does not seem to contain any buttons/links.');
}
suffixMixins.push(linkSummaryChunks.join('\n'));
}
return `[${i + 1}] Title: ${this.title} return `[${i + 1}] Title: ${this.title}
[${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''} [${i + 1}] URL Source: ${this.url}${mixins.length ? `\n${mixins.join('\n')}` : ''}
[${i + 1}] Markdown Content: [${i + 1}] Markdown Content:
${this.content} ${this.content}
`; ${suffixMixins.length ? `\n${suffixMixins.join('\n')}\n` : ''}`;
} }
}; };
}); });

View File

@ -3,7 +3,7 @@ import fs from 'fs';
import { container, singleton } from 'tsyringe'; import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
import { Logger } from '../shared/services/logger'; import { Logger } from '../shared/services/logger';
import { JSDOM } from 'jsdom'; import { JSDOM, VirtualConsole } from 'jsdom';
import type { Browser, CookieParam, Page } from 'puppeteer'; import type { Browser, CookieParam, Page } from 'puppeteer';
import puppeteer from 'puppeteer-extra'; import puppeteer from 'puppeteer-extra';
@ -15,13 +15,17 @@ import { Readability } from '@mozilla/readability';
const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8'); const READABILITY_JS = fs.readFileSync(require.resolve('@mozilla/readability/Readability.js'), 'utf-8');
const virtualConsole = new VirtualConsole();
virtualConsole.on('error', () => void 0);
export interface ImgBrief { export interface ImgBrief {
src: string; src: string;
loaded: boolean; loaded?: boolean;
width: number; width?: number;
height: number; height?: number;
naturalWidth: number; naturalWidth?: number;
naturalHeight: number; naturalHeight?: number;
alt?: string; alt?: string;
} }
@ -48,6 +52,11 @@ export interface PageSnapshot {
imgs?: ImgBrief[]; imgs?: ImgBrief[];
} }
/**
 * A `PageSnapshot` on which both `imgs` and `links` are required.
 */
export interface ExtendedSnapshot extends PageSnapshot {
    /** Image descriptors for the page. */
    imgs: ImgBrief[];
    /** String values keyed by URL. */
    links: Record<string, string>;
}
export interface ScrappingOptions { export interface ScrappingOptions {
proxyUrl?: string; proxyUrl?: string;
cookies?: CookieParam[]; cookies?: CookieParam[];
@ -100,7 +109,6 @@ export class PuppeteerControl extends AsyncService {
briefPages() { briefPages() {
this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`); this.logger.info(`Status: ${this.livePages.size} pages alive: ${Array.from(this.livePages).map((x) => this.snMap.get(x)).sort().join(', ')}; ${this.__loadedPage.length} idle pages: ${this.__loadedPage.map((x) => this.snMap.get(x)).sort().join(', ')}`);
this.logger.info(``);
} }
override async init() { override async init() {
@ -304,7 +312,7 @@ document.addEventListener('load', handlePageLoad);
} }
async getNextPage() { async getNextPage() {
let thePage; let thePage: Page | undefined;
if (this.__loadedPage.length) { if (this.__loadedPage.length) {
thePage = this.__loadedPage.shift(); thePage = this.__loadedPage.shift();
if (this.__loadedPage.length <= 1) { if (this.__loadedPage.length <= 1) {
@ -321,8 +329,8 @@ document.addEventListener('load', handlePageLoad);
} }
const timer = setTimeout(() => { const timer = setTimeout(() => {
this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage)}...`); this.logger.warn(`Page is not allowed to live past 5 minutes, ditching page ${this.snMap.get(thePage!)}...`);
this.ditchPage(thePage); this.ditchPage(thePage!);
}, 300 * 1000); }, 300 * 1000);
this.finalizerMap.set(thePage, timer); this.finalizerMap.set(thePage, timer);
@ -487,14 +495,14 @@ document.addEventListener('load', handlePageLoad);
return snapshot; return snapshot;
} }
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href }); const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const elem = jsdom.window.document.querySelector(targetSelect); const elem = jsdom.window.document.querySelector(targetSelect);
if (!elem) { if (!elem) {
return snapshot; return snapshot;
} }
const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href }); const selectedJsDom = new JSDOM(elem.outerHTML, { url: snapshot.href, virtualConsole });
let parsed; let parsed;
try { try {
parsed = new Readability(selectedJsDom.window.document).parse(); parsed = new Readability(selectedJsDom.window.document).parse();
@ -531,6 +539,60 @@ document.addEventListener('load', handlePageLoad);
return r; return r;
} }
/**
 * Derives link and image information from the snapshot's HTML.
 *
 * The HTML is parsed with JSDOM; `snapshot.href` is used as the base for relative URLs.
 *
 * - `links` maps absolute URL -> whitespace-collapsed anchor text. Anchors with no text,
 *   an unparsable `href`, or a `file:` / `javascript:` protocol are left out. When several
 *   anchors resolve to the same URL, the last one wins.
 * - `imgs` holds one entry per `<img>` that has a usable source. `data-src` is used when
 *   `src` is missing/empty or is a `data:` URI (and `data-src` itself is not a `data:` URI).
 *   An image whose source is empty or cannot be resolved is skipped on its own, without
 *   discarding the remaining images.
 *
 * If parsing fails as a whole, a warning is logged and the result falls back to
 * `links: {}` and the snapshot's own `imgs` (or `[]`), so both fields are always defined.
 *
 * @param snapshot Snapshot whose `html` and `href` are inspected; it is not mutated.
 * @returns A shallow copy of `snapshot` with `links` and `imgs` populated.
 */
inferSnapshot(snapshot: PageSnapshot): ExtendedSnapshot {
    const extendedSnapshot: ExtendedSnapshot = {
        ...snapshot,
        links: {},
        imgs: snapshot.imgs ?? [],
    };

    try {
        const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
        const { document } = jsdom.window;

        const links: { [url: string]: string; } = {};
        for (const anchor of Array.from(document.querySelectorAll('a[href]'))) {
            const text = (anchor.textContent ?? '').replace(/\s+/g, ' ').trim();
            if (!text) {
                continue;
            }
            let parsed: URL;
            try {
                parsed = new URL(anchor.getAttribute('href') ?? '', snapshot.href);
            } catch (_err) {
                // Unparsable href: drop this anchor only.
                continue;
            }
            if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
                continue;
            }
            links[parsed.toString()] = text;
        }
        extendedSnapshot.links = links;

        const imgs: ImgBrief[] = [];
        for (const img of Array.from(document.querySelectorAll('img[src],img[data-src]'))) {
            let preferredSrc = img.getAttribute('src') || '';
            // The selector also matches images that only carry `data-src`, so fall back to it
            // when `src` is absent as well as when `src` is an inline data URI.
            if (!preferredSrc || preferredSrc.startsWith('data:')) {
                const dataSrc = img.getAttribute('data-src') || '';
                if (dataSrc && !dataSrc.startsWith('data:')) {
                    preferredSrc = dataSrc;
                }
            }
            if (!preferredSrc) {
                // An empty reference would resolve to the page URL itself, which is not an image.
                continue;
            }
            let src: string;
            try {
                src = new URL(preferredSrc, snapshot.href).toString();
            } catch (_err) {
                // Unresolvable source: drop this image only.
                continue;
            }
            imgs.push({
                src,
                // Non-numeric attribute values (e.g. "auto") are normalized to 0 instead of NaN.
                width: Number.parseInt(img.getAttribute('width') || '0', 10) || 0,
                height: Number.parseInt(img.getAttribute('height') || '0', 10) || 0,
                alt: img.getAttribute('alt') || img.getAttribute('title') || undefined,
            });
        }
        extendedSnapshot.imgs = imgs;
    } catch (err) {
        // Best effort: keep whatever was collected so far plus the defaults set above.
        this.logger.warn(`Failed to infer links/images from snapshot of ${snapshot.href}: ${String(err)}`);
    }

    return extendedSnapshot;
}
} }
const puppeteerControl = container.resolve(PuppeteerControl); const puppeteerControl = container.resolve(PuppeteerControl);

View File

@ -9,8 +9,13 @@ declare module 'langdetect' {
} }
declare module 'jsdom' { declare module 'jsdom' {
import EventEmitter from 'events';
export class JSDOM { export class JSDOM {
constructor(html: string, options?: any); constructor(html: string, options?: any);
window: typeof window; window: typeof window;
} }
/**
 * Minimal ambient typing for the `VirtualConsole` class exported by `jsdom`.
 */
export class VirtualConsole extends EventEmitter {
    constructor();
    /**
     * Explicit return type avoids an implicit-`any` return under `noImplicitAny`.
     * Per the jsdom documentation the call returns the same virtual console instance.
     */
    sendTo(console: any, options?: any): this;
}
} }

@ -1 +1 @@
Subproject commit fc3545e3a7ae27968e69f351f109d3ffb535f963 Subproject commit 1b28100c71b3c7e37669fa98756affbac3095ced