fix: html rebasing with <base> tag

This commit is contained in:
Yanlong Wang 2024-08-06 13:15:10 +08:00
parent 40e91853e2
commit 7af2bde01f
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 13 additions and 9 deletions

View File

@@ -394,7 +394,7 @@ export class CrawlerHost extends RPCHost {
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
let toBeTurnedToMd = jsDomElementOfHTML;
-let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
+let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
if (mode !== 'markdown' && snapshot.parsed?.content) {
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
@@ -402,7 +402,7 @@ export class CrawlerHost extends RPCHost {
// If Readability did its job
if (par2.length >= 0.3 * par1.length) {
-turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
+turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
if (snapshot.parsed.content) {
toBeTurnedToMd = jsDomElementOfParsed;
}
@@ -440,7 +440,7 @@ export class CrawlerHost extends RPCHost {
let src;
try {
-src = new URL(linkPreferredSrc, nominalUrl).toString();
+src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
} catch (_err) {
void 0;
}
@@ -485,7 +485,7 @@ export class CrawlerHost extends RPCHost {
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
-const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
+const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
} catch (err2) {
@@ -502,7 +502,7 @@ export class CrawlerHost extends RPCHost {
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
-const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
+const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
} catch (err2) {

View File

@@ -121,7 +121,7 @@ export class JSDomControl extends AsyncService {
.flat()
.map((x) => {
try {
-return new URL(x, snapshot.href).toString();
+return new URL(x, snapshot.rebase || snapshot.href).toString();
} catch (err) {
return null;
}
@@ -160,7 +160,7 @@ export class JSDomControl extends AsyncService {
return undefined;
}
try {
-const parsed = new URL(href, snapshot.href);
+const parsed = new URL(href, snapshot.rebase || snapshot.href);
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
return undefined;
}
@@ -188,7 +188,7 @@ export class JSDomControl extends AsyncService {
}
return {
-src: new URL(linkPreferredSrc, snapshot.href).toString(),
+src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(),
width: parseInt(x.getAttribute('width') || '0'),
height: parseInt(x.getAttribute('height') || '0'),
alt: x.getAttribute('alt') || x.getAttribute('title'),

View File

@@ -42,6 +42,7 @@ export interface ReadabilityParsed {
export interface PageSnapshot {
title: string;
href: string;
+rebase?: string;
html: string;
text: string;
parsed?: Partial<ReadabilityParsed> | null;
@@ -101,7 +102,7 @@ function briefImgs(elem) {
}
return {
-src: new URL(linkPreferredSrc, document.location.href).toString(),
+src: new URL(linkPreferredSrc, document.baseURI).toString(),
loaded: x.complete,
width: x.width,
height: x.height,
@@ -179,6 +180,9 @@ function giveSnapshot(stopActiveSnapshot) {
maxElemDepth: domAnalysis.maxDepth,
elemCount: domAnalysis.elementCount,
};
+if (document.baseURI !== r.href) {
+r.rebase = document.baseURI;
+}
if (parsed && parsed.content) {
const elem = document.createElement('div');
elem.innerHTML = parsed.content;