diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 7b03143..5ea7509 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -394,7 +394,7 @@ export class CrawlerHost extends RPCHost { const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href); let toBeTurnedToMd = jsDomElementOfHTML; - let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl }); + let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); if (mode !== 'markdown' && snapshot.parsed?.content) { const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); @@ -402,7 +402,7 @@ export class CrawlerHost extends RPCHost { // If Readability did its job if (par2.length >= 0.3 * par1.length) { - turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl }); + turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); if (snapshot.parsed.content) { toBeTurnedToMd = jsDomElementOfParsed; } @@ -440,7 +440,7 @@ export class CrawlerHost extends RPCHost { let src; try { - src = new URL(linkPreferredSrc, nominalUrl).toString(); + src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString(); } catch (_err) { void 0; } @@ -485,7 +485,7 @@ export class CrawlerHost extends RPCHost { contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); - const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); + const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); 
try { contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); } catch (err2) { @@ -502,7 +502,7 @@ export class CrawlerHost extends RPCHost { contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); - const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); + const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); try { contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); } catch (err2) { diff --git a/backend/functions/src/services/jsdom.ts b/backend/functions/src/services/jsdom.ts index 42b9de9..84adcaf 100644 --- a/backend/functions/src/services/jsdom.ts +++ b/backend/functions/src/services/jsdom.ts @@ -121,7 +121,7 @@ export class JSDomControl extends AsyncService { .flat() .map((x) => { try { - return new URL(x, snapshot.href).toString(); + return new URL(x, snapshot.rebase || snapshot.href).toString(); } catch (err) { return null; } @@ -160,7 +160,7 @@ export class JSDomControl extends AsyncService { return undefined; } try { - const parsed = new URL(href, snapshot.href); + const parsed = new URL(href, snapshot.rebase || snapshot.href); if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') { return undefined; } @@ -188,7 +188,7 @@ export class JSDomControl extends AsyncService { } return { - src: new URL(linkPreferredSrc, snapshot.href).toString(), + src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(), width: parseInt(x.getAttribute('width') || '0'), height: parseInt(x.getAttribute('height') || '0'), alt: x.getAttribute('alt') || x.getAttribute('title'), diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index cbffb60..b43a92d 100644 --- a/backend/functions/src/services/puppeteer.ts +++ 
b/backend/functions/src/services/puppeteer.ts @@ -42,6 +42,7 @@ export interface ReadabilityParsed { export interface PageSnapshot { title: string; href: string; + rebase?: string; html: string; text: string; parsed?: Partial<ReadabilityParsed> | null; @@ -101,7 +102,7 @@ function briefImgs(elem) { } return { - src: new URL(linkPreferredSrc, document.location.href).toString(), + src: new URL(linkPreferredSrc, document.baseURI).toString(), loaded: x.complete, width: x.width, height: x.height, @@ -179,6 +180,9 @@ function giveSnapshot(stopActiveSnapshot) { maxElemDepth: domAnalysis.maxDepth, elemCount: domAnalysis.elementCount, }; + if (document.baseURI !== r.href) { + r.rebase = document.baseURI; + } if (parsed && parsed.content) { const elem = document.createElement('div'); elem.innerHTML = parsed.content;