fix: image in summary

This commit is contained in:
yanlong.wang 2025-02-17 17:41:39 +08:00
parent fc2824b115
commit 008dcbaf22
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 43 additions and 16 deletions

View File

@@ -1,7 +1,7 @@
import { container, singleton } from 'tsyringe';
import { AsyncService, marshalErrorLike } from 'civkit';
import { Logger } from '../shared/services/logger';
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
import { Threaded } from '../shared/services/threaded';
@@ -144,19 +144,33 @@ export class JSDomControl extends AsyncService {
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
}
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
.flat()
.map((x) => {
try {
return new URL(x, snapshot.rebase || snapshot.href).toString();
} catch (err) {
return null;
const imgSet = new Set<string>();
const rebuiltImgs: ImgBrief[] = [];
Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')])
.forEach(([u1, u2, alt]) => {
if (u1) {
try {
const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString();
imgSet.add(u1Txt);
} catch (err) {
// void 0;
}
}
})
.filter(Boolean);
if (u2) {
try {
const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString();
imgSet.add(u2Txt);
} catch (err) {
// void 0;
}
}
rebuiltImgs.push({
src: u1 || u2,
alt
});
});
const imageSet = new Set(imageTags);
const r = {
...snapshot,
title: snapshot.title || jsdom.window.document.title,
@@ -165,7 +179,7 @@ export class JSDomControl extends AsyncService {
parsed,
html: rootDoc.documentElement.outerHTML,
text: textChunks.join('\n'),
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
imgs: (snapshot.imgs || rebuiltImgs)?.filter((x) => imgSet.has(x.src)) || [],
} as PageSnapshot;
const dt = Date.now() - t0;
@@ -283,7 +297,7 @@ export class JSDomControl extends AsyncService {
currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
}
jsdom.window.document.querySelectorAll('*').forEach((x)=> {
jsdom.window.document.querySelectorAll('*').forEach((x) => {
const attrs = x.getAttributeNames();
for (const attr of attrs) {
if (attr.startsWith('data-') || attr.startsWith('aria-')) {

View File

@@ -231,7 +231,8 @@ export class SnapshotFormatter extends AsyncService {
if (imageRetention === 'alt') {
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
}
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
let originalSrc = (node.getAttribute('src') || '').trim();
let linkPreferredSrc = originalSrc;
const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
if (!linkPreferredSrc && maybeSrcSet) {
linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
@@ -252,7 +253,7 @@ export class SnapshotFormatter extends AsyncService {
if (!src) {
return '';
}
const mapped = urlToAltMap[src];
const mapped = urlToAltMap[originalSrc];
const imgSerial = ++imgIdx;
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
idxArr.push(imgSerial);
@@ -303,11 +304,13 @@ export class SnapshotFormatter extends AsyncService {
if (!mode.includes('markdown') && snapshot.parsed?.content) {
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
imgIdx = 0;
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
// If Readability did its job
if (par2.length >= 0.3 * par1.length) {
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
imgIdx = 0;
if (snapshot.parsed.content) {
toBeTurnedToMd = jsDomElementOfParsed;
}
@@ -336,11 +339,13 @@ export class SnapshotFormatter extends AsyncService {
if (toBeTurnedToMd) {
try {
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
imgIdx = 0;
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
imgIdx = 0;
} catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
}
@@ -354,11 +359,13 @@ export class SnapshotFormatter extends AsyncService {
toBeTurnedToMd = jsDomElementOfHTML;
try {
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
imgIdx = 0;
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
imgIdx = 0;
} catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
}
@@ -393,6 +400,12 @@ export class SnapshotFormatter extends AsyncService {
.toPairs()
.map(
([url, alt], i) => {
if (imgDataUrlToObjectUrl && url.startsWith('data:')) {
const refUrl = new URL(formatted.url!);
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(url)}`);
url = mappedUrl.toString();
}
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
}
).fromPairs()