diff --git a/backend/functions/src/services/jsdom.ts b/backend/functions/src/services/jsdom.ts index 88cadac..fb6d172 100644 --- a/backend/functions/src/services/jsdom.ts +++ b/backend/functions/src/services/jsdom.ts @@ -1,7 +1,7 @@ import { container, singleton } from 'tsyringe'; import { AsyncService, marshalErrorLike } from 'civkit'; import { Logger } from '../shared/services/logger'; -import { ExtendedSnapshot, PageSnapshot } from './puppeteer'; +import { ExtendedSnapshot, ImgBrief, PageSnapshot } from './puppeteer'; import { Readability } from '@mozilla/readability'; import TurndownService from 'turndown'; import { Threaded } from '../shared/services/threaded'; @@ -144,19 +144,33 @@ export class JSDomControl extends AsyncService { this.logger.warn(`Failed to parse selected element`, { err: marshalErrorLike(err) }); } - const imageTags = Array.from(rootDoc.querySelectorAll('img[src],img[data-src]')) - .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src')]) - .flat() - .map((x) => { - try { - return new URL(x, snapshot.rebase || snapshot.href).toString(); - } catch (err) { - return null; + const imgSet = new Set(); + const rebuiltImgs: ImgBrief[] = []; + Array.from(rootDoc.querySelectorAll('img[src],img[data-src]')) + .map((x: any) => [x.getAttribute('src'), x.getAttribute('data-src'), x.getAttribute('alt')]) + .forEach(([u1, u2, alt]) => { + if (u1) { + try { + const u1Txt = new URL(u1, snapshot.rebase || snapshot.href).toString(); + imgSet.add(u1Txt); + } catch (err) { + // void 0; + } } - }) - .filter(Boolean); + if (u2) { + try { + const u2Txt = new URL(u2, snapshot.rebase || snapshot.href).toString(); + imgSet.add(u2Txt); + } catch (err) { + // void 0; + } + } + rebuiltImgs.push({ + src: u1 || u2, + alt + }); + }); - const imageSet = new Set(imageTags); const r = { ...snapshot, title: snapshot.title || jsdom.window.document.title, @@ -165,7 +179,7 @@ export class JSDomControl extends AsyncService { parsed, html: rootDoc.documentElement.outerHTML, text: textChunks.join('\n'), - imgs: snapshot.imgs?.filter((x) => imageSet.has(x.src)) || [], + imgs: (snapshot.imgs || rebuiltImgs)?.filter((x) => imgSet.has(x.src)) || [], } as PageSnapshot; const dt = Date.now() - t0; @@ -283,7 +297,7 @@ export class JSDomControl extends AsyncService { currentNode.parentNode?.removeChild(currentNode); // Remove each comment node } - jsdom.window.document.querySelectorAll('*').forEach((x)=> { + jsdom.window.document.querySelectorAll('*').forEach((x) => { const attrs = x.getAttributeNames(); for (const attr of attrs) { if (attr.startsWith('data-') || attr.startsWith('aria-')) { diff --git a/backend/functions/src/services/snapshot-formatter.ts b/backend/functions/src/services/snapshot-formatter.ts index 0601ac9..d48dcd3 100644 --- a/backend/functions/src/services/snapshot-formatter.ts +++ b/backend/functions/src/services/snapshot-formatter.ts @@ -231,7 +231,8 @@ export class SnapshotFormatter extends AsyncService { if (imageRetention === 'alt') { return alt ? `(Image ${++imgIdx}: ${alt})` : ''; } - let linkPreferredSrc = (node.getAttribute('src') || '').trim(); + let originalSrc = (node.getAttribute('src') || '').trim(); + let linkPreferredSrc = originalSrc; const maybeSrcSet: string = (node.getAttribute('srcset') || '').trim(); if (!linkPreferredSrc && maybeSrcSet) { linkPreferredSrc = maybeSrcSet.split(',').map((x) => x.trim()).filter(Boolean)[0]; @@ -252,7 +253,7 @@ export class SnapshotFormatter extends AsyncService { if (!src) { return ''; } - const mapped = urlToAltMap[src]; + const mapped = urlToAltMap[originalSrc]; const imgSerial = ++imgIdx; const idxArr = imageIdxTrack.has(src) ? imageIdxTrack.get(src)! : []; idxArr.push(imgSerial); @@ -303,11 +304,13 @@ export class SnapshotFormatter extends AsyncService { if (!mode.includes('markdown') && snapshot.parsed?.content) { const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); + imgIdx = 0; const par2 = snapshot.parsed.content ? this.jsdomControl.runTurndown(turnDownService, jsDomElementOfParsed) : ''; // If Readability did its job if (par2.length >= 0.3 * par1.length) { turnDownService = this.getTurndown({ noRules: true, ...optsMixin }); + imgIdx = 0; if (snapshot.parsed.content) { toBeTurnedToMd = jsDomElementOfParsed; } @@ -336,11 +339,13 @@ export class SnapshotFormatter extends AsyncService { if (toBeTurnedToMd) { try { contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); + imgIdx = 0; } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); const vanillaTurnDownService = this.getTurndown({ ...optsMixin }); try { contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); + imgIdx = 0; } catch (err2) { this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); } @@ -354,11 +359,13 @@ export class SnapshotFormatter extends AsyncService { toBeTurnedToMd = jsDomElementOfHTML; try { contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML).trim(); + imgIdx = 0; } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); const vanillaTurnDownService = this.getTurndown({ ...optsMixin }); try { contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML).trim(); + imgIdx = 0; } catch (err2) { this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); } @@ -393,6 +400,12 @@ export class SnapshotFormatter extends AsyncService { .toPairs() .map( ([url, alt], i) => { + if (imgDataUrlToObjectUrl && url.startsWith('data:')) { + const refUrl = new URL(formatted.url!); + const mappedUrl = new URL(`blob:${refUrl.origin}/${md5Hasher.hash(url)}`); + + url = mappedUrl.toString(); + } return [`Image ${(imageIdxTrack?.get(url) || [i + 1]).join(',')}${alt ? `: ${alt}` : ''}`, url]; } ).fromPairs()