mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 01:05:59 +08:00
fix: image in summary
This commit is contained in:
parent
fc2824b115
commit
008dcbaf22
@ -1,7 +1,7 @@
|
||||
import { container, singleton } from 'tsyringe';
|
||||
import { AsyncService, marshalErrorLike } from 'civkit';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
||||
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
||||
import { Readability } from '@mozilla/readability';
|
||||
import TurndownService from 'turndown';
|
||||
import { Threaded } from '../shared/services/threaded';
|
||||
@ -144,19 +144,33 @@ export class JSDomControl extends AsyncService {
|
||||
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
||||
}
|
||||
|
||||
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
||||
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
||||
.flat()
|
||||
.map((x) => {
|
||||
try {
|
||||
return new URL(x, snapshot.rebase || snapshot.href).toString();
|
||||
} catch (err) {
|
||||
return null;
|
||||
const imgSet = new Set<string>();
|
||||
const rebuiltImgs: ImgBrief[] = [];
|
||||
Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
||||
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')])
|
||||
.forEach(([u1, u2, alt]) => {
|
||||
if (u1) {
|
||||
try {
|
||||
const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString();
|
||||
imgSet.add(u1Txt);
|
||||
} catch (err) {
|
||||
// void 0;
|
||||
}
|
||||
}
|
||||
})
|
||||
.filter(Boolean);
|
||||
if (u2) {
|
||||
try {
|
||||
const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString();
|
||||
imgSet.add(u2Txt);
|
||||
} catch (err) {
|
||||
// void 0;
|
||||
}
|
||||
}
|
||||
rebuiltImgs.push({
|
||||
src: u1 || u2,
|
||||
alt
|
||||
});
|
||||
});
|
||||
|
||||
const imageSet = new Set(imageTags);
|
||||
const r = {
|
||||
...snapshot,
|
||||
title: snapshot.title || jsdom.window.document.title,
|
||||
@ -165,7 +179,7 @@ export class JSDomControl extends AsyncService {
|
||||
parsed,
|
||||
html: rootDoc.documentElement.outerHTML,
|
||||
text: textChunks.join('\n'),
|
||||
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
||||
imgs: (snapshot.imgs || rebuiltImgs)?.filter((x) => imgSet.has(x.src)) || [],
|
||||
} as PageSnapshot;
|
||||
|
||||
const dt = Date.now() - t0;
|
||||
@ -283,7 +297,7 @@ export class JSDomControl extends AsyncService {
|
||||
currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
|
||||
}
|
||||
|
||||
jsdom.window.document.querySelectorAll('*').forEach((x)=> {
|
||||
jsdom.window.document.querySelectorAll('*').forEach((x) => {
|
||||
const attrs = x.getAttributeNames();
|
||||
for (const attr of attrs) {
|
||||
if (attr.startsWith('data-') || attr.startsWith('aria-')) {
|
||||
|
@ -231,7 +231,8 @@ export class SnapshotFormatter extends AsyncService {
|
||||
if (imageRetention === 'alt') {
|
||||
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
|
||||
}
|
||||
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
||||
let originalSrc = (node.getAttribute('src') || '').trim();
|
||||
let linkPreferredSrc = originalSrc;
|
||||
const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
|
||||
if (!linkPreferredSrc && maybeSrcSet) {
|
||||
linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
|
||||
@ -252,7 +253,7 @@ export class SnapshotFormatter extends AsyncService {
|
||||
if (!src) {
|
||||
return '';
|
||||
}
|
||||
const mapped = urlToAltMap[src];
|
||||
const mapped = urlToAltMap[originalSrc];
|
||||
const imgSerial = ++imgIdx;
|
||||
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
||||
idxArr.push(imgSerial);
|
||||
@ -303,11 +304,13 @@ export class SnapshotFormatter extends AsyncService {
|
||||
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
||||
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
||||
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||
imgIdx = 0;
|
||||
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
||||
|
||||
// If Readability did its job
|
||||
if (par2.length >= 0.3 * par1.length) {
|
||||
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
|
||||
imgIdx = 0;
|
||||
if (snapshot.parsed.content) {
|
||||
toBeTurnedToMd = jsDomElementOfParsed;
|
||||
}
|
||||
@ -336,11 +339,13 @@ export class SnapshotFormatter extends AsyncService {
|
||||
if (toBeTurnedToMd) {
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
||||
imgIdx = 0;
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
||||
imgIdx = 0;
|
||||
} catch (err2) {
|
||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||
}
|
||||
@ -354,11 +359,13 @@ export class SnapshotFormatter extends AsyncService {
|
||||
toBeTurnedToMd = jsDomElementOfHTML;
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
||||
imgIdx = 0;
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
||||
imgIdx = 0;
|
||||
} catch (err2) {
|
||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||
}
|
||||
@ -393,6 +400,12 @@ export class SnapshotFormatter extends AsyncService {
|
||||
.toPairs()
|
||||
.map(
|
||||
([url, alt], i) => {
|
||||
if (imgDataUrlToObjectUrl && url.startsWith('data:')) {
|
||||
const refUrl = new URL(formatted.url!);
|
||||
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(url)}`);
|
||||
|
||||
url = mappedUrl.toString();
|
||||
}
|
||||
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
||||
}
|
||||
).fromPairs()
|
||||
|
Loading…
x
Reference in New Issue
Block a user