mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 08:45:58 +08:00
fix: image in summary
This commit is contained in:
parent
fc2824b115
commit
008dcbaf22
@ -1,7 +1,7 @@
|
|||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import { AsyncService, marshalErrorLike } from 'civkit';
|
import { AsyncService, marshalErrorLike } from 'civkit';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer';
|
||||||
import { Readability } from '@mozilla/readability';
|
import { Readability } from '@mozilla/readability';
|
||||||
import TurndownService from 'turndown';
|
import TurndownService from 'turndown';
|
||||||
import { Threaded } from '../shared/services/threaded';
|
import { Threaded } from '../shared/services/threaded';
|
||||||
@ -144,19 +144,33 @@ export class JSDomControl extends AsyncService {
|
|||||||
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) });
|
||||||
}
|
}
|
||||||
|
|
||||||
const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
const imgSet = new Set<string>();
|
||||||
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')])
|
const rebuiltImgs: ImgBrief[] = [];
|
||||||
.flat()
|
Array.from(rootDoc.querySelectorAll('img[src],img[data-src]'))
|
||||||
.map((x) => {
|
.map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')])
|
||||||
try {
|
.forEach(([u1, u2, alt]) => {
|
||||||
return new URL(x, snapshot.rebase || snapshot.href).toString();
|
if (u1) {
|
||||||
} catch (err) {
|
try {
|
||||||
return null;
|
const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString();
|
||||||
|
imgSet.add(u1Txt);
|
||||||
|
} catch (err) {
|
||||||
|
// void 0;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
})
|
if (u2) {
|
||||||
.filter(Boolean);
|
try {
|
||||||
|
const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString();
|
||||||
|
imgSet.add(u2Txt);
|
||||||
|
} catch (err) {
|
||||||
|
// void 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rebuiltImgs.push({
|
||||||
|
src: u1 || u2,
|
||||||
|
alt
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
const imageSet = new Set(imageTags);
|
|
||||||
const r = {
|
const r = {
|
||||||
...snapshot,
|
...snapshot,
|
||||||
title: snapshot.title || jsdom.window.document.title,
|
title: snapshot.title || jsdom.window.document.title,
|
||||||
@ -165,7 +179,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
parsed,
|
parsed,
|
||||||
html: rootDoc.documentElement.outerHTML,
|
html: rootDoc.documentElement.outerHTML,
|
||||||
text: textChunks.join('\n'),
|
text: textChunks.join('\n'),
|
||||||
imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [],
|
imgs: (snapshot.imgs || rebuiltImgs)?.filter((x) => imgSet.has(x.src)) || [],
|
||||||
} as PageSnapshot;
|
} as PageSnapshot;
|
||||||
|
|
||||||
const dt = Date.now() - t0;
|
const dt = Date.now() - t0;
|
||||||
@ -283,7 +297,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
|
currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
|
||||||
}
|
}
|
||||||
|
|
||||||
jsdom.window.document.querySelectorAll('*').forEach((x)=> {
|
jsdom.window.document.querySelectorAll('*').forEach((x) => {
|
||||||
const attrs = x.getAttributeNames();
|
const attrs = x.getAttributeNames();
|
||||||
for (const attr of attrs) {
|
for (const attr of attrs) {
|
||||||
if (attr.startsWith('data-') || attr.startsWith('aria-')) {
|
if (attr.startsWith('data-') || attr.startsWith('aria-')) {
|
||||||
|
@ -231,7 +231,8 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
if (imageRetention === 'alt') {
|
if (imageRetention === 'alt') {
|
||||||
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
|
return alt ? `(Image ${++imgIdx}: ${alt})` : '';
|
||||||
}
|
}
|
||||||
let linkPreferredSrc = (node.getAttribute('src') || '').trim();
|
let originalSrc = (node.getAttribute('src') || '').trim();
|
||||||
|
let linkPreferredSrc = originalSrc;
|
||||||
const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
|
const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim();
|
||||||
if (!linkPreferredSrc && maybeSrcSet) {
|
if (!linkPreferredSrc && maybeSrcSet) {
|
||||||
linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
|
linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0];
|
||||||
@ -252,7 +253,7 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
if (!src) {
|
if (!src) {
|
||||||
return '';
|
return '';
|
||||||
}
|
}
|
||||||
const mapped = urlToAltMap[src];
|
const mapped = urlToAltMap[originalSrc];
|
||||||
const imgSerial = ++imgIdx;
|
const imgSerial = ++imgIdx;
|
||||||
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : [];
|
||||||
idxArr.push(imgSerial);
|
idxArr.push(imgSerial);
|
||||||
@ -303,11 +304,13 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
if (!mode.includes('markdown') && snapshot.parsed?.content) {
|
||||||
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
||||||
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||||
|
imgIdx = 0;
|
||||||
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : '';
|
||||||
|
|
||||||
// If Readability did its job
|
// If Readability did its job
|
||||||
if (par2.length >= 0.3 * par1.length) {
|
if (par2.length >= 0.3 * par1.length) {
|
||||||
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
|
turnDownService = this.getTurndown({ noRules: true, ...optsMixin });
|
||||||
|
imgIdx = 0;
|
||||||
if (snapshot.parsed.content) {
|
if (snapshot.parsed.content) {
|
||||||
toBeTurnedToMd = jsDomElementOfParsed;
|
toBeTurnedToMd = jsDomElementOfParsed;
|
||||||
}
|
}
|
||||||
@ -336,11 +339,13 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
if (toBeTurnedToMd) {
|
if (toBeTurnedToMd) {
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
||||||
|
imgIdx = 0;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
||||||
|
imgIdx = 0;
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
}
|
}
|
||||||
@ -354,11 +359,13 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
toBeTurnedToMd = jsDomElementOfHTML;
|
toBeTurnedToMd = jsDomElementOfHTML;
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim();
|
||||||
|
imgIdx = 0;
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
const vanillaTurnDownService = this.getTurndown({ ...optsMixin });
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim();
|
||||||
|
imgIdx = 0;
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
}
|
}
|
||||||
@ -393,6 +400,12 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
.toPairs()
|
.toPairs()
|
||||||
.map(
|
.map(
|
||||||
([url, alt], i) => {
|
([url, alt], i) => {
|
||||||
|
if (imgDataUrlToObjectUrl && url.startsWith('data:')) {
|
||||||
|
const refUrl = new URL(formatted.url!);
|
||||||
|
const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(url)}`);
|
||||||
|
|
||||||
|
url = mappedUrl.toString();
|
||||||
|
}
|
||||||
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url];
|
||||||
}
|
}
|
||||||
).fromPairs()
|
).fromPairs()
|
||||||
|
Loading…
x
Reference in New Issue
Block a user