fix: resolve relative URLs against the page's <base> tag (document.baseURI) when rebasing HTML

This commit is contained in:
Yanlong Wang 2024-08-06 13:15:10 +08:00
parent 40e91853e2
commit 7af2bde01f
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 13 additions and 9 deletions

View File

@ -394,7 +394,7 @@ export class CrawlerHost extends RPCHost {
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href); const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
let toBeTurnedToMd = jsDomElementOfHTML; let toBeTurnedToMd = jsDomElementOfHTML;
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl }); let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
if (mode !== 'markdown' && snapshot.parsed?.content) { if (mode !== 'markdown' && snapshot.parsed?.content) {
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href); const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
@ -402,7 +402,7 @@ export class CrawlerHost extends RPCHost {
// If Readability did its job // If Readability did its job
if (par2.length >= 0.3 * par1.length) { if (par2.length >= 0.3 * par1.length) {
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl }); turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
if (snapshot.parsed.content) { if (snapshot.parsed.content) {
toBeTurnedToMd = jsDomElementOfParsed; toBeTurnedToMd = jsDomElementOfParsed;
} }
@ -440,7 +440,7 @@ export class CrawlerHost extends RPCHost {
let src; let src;
try { try {
src = new URL(linkPreferredSrc, nominalUrl).toString(); src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
} catch (_err) { } catch (_err) {
void 0; void 0;
} }
@ -485,7 +485,7 @@ export class CrawlerHost extends RPCHost {
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim(); contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
} catch (err) { } catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try { try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim(); contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
} catch (err2) { } catch (err2) {
@ -502,7 +502,7 @@ export class CrawlerHost extends RPCHost {
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
} catch (err) { } catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try { try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
} catch (err2) { } catch (err2) {

View File

@ -121,7 +121,7 @@ export class JSDomControl extends AsyncService {
.flat() .flat()
.map((x) => { .map((x) => {
try { try {
return new URL(x, snapshot.href).toString(); return new URL(x, snapshot.rebase || snapshot.href).toString();
} catch (err) { } catch (err) {
return null; return null;
} }
@ -160,7 +160,7 @@ export class JSDomControl extends AsyncService {
return undefined; return undefined;
} }
try { try {
const parsed = new URL(href, snapshot.href); const parsed = new URL(href, snapshot.rebase || snapshot.href);
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') { if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
return undefined; return undefined;
} }
@ -188,7 +188,7 @@ export class JSDomControl extends AsyncService {
} }
return { return {
src: new URL(linkPreferredSrc, snapshot.href).toString(), src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(),
width: parseInt(x.getAttribute('width') || '0'), width: parseInt(x.getAttribute('width') || '0'),
height: parseInt(x.getAttribute('height') || '0'), height: parseInt(x.getAttribute('height') || '0'),
alt: x.getAttribute('alt') || x.getAttribute('title'), alt: x.getAttribute('alt') || x.getAttribute('title'),

View File

@ -42,6 +42,7 @@ export interface ReadabilityParsed {
export interface PageSnapshot { export interface PageSnapshot {
title: string; title: string;
href: string; href: string;
rebase?: string;
html: string; html: string;
text: string; text: string;
parsed?: Partial<ReadabilityParsed> | null; parsed?: Partial<ReadabilityParsed> | null;
@ -101,7 +102,7 @@ function briefImgs(elem) {
} }
return { return {
src: new URL(linkPreferredSrc, document.location.href).toString(), src: new URL(linkPreferredSrc, document.baseURI).toString(),
loaded: x.complete, loaded: x.complete,
width: x.width, width: x.width,
height: x.height, height: x.height,
@ -179,6 +180,9 @@ function giveSnapshot(stopActiveSnapshot) {
maxElemDepth: domAnalysis.maxDepth, maxElemDepth: domAnalysis.maxDepth,
elemCount: domAnalysis.elementCount, elemCount: domAnalysis.elementCount,
}; };
if (document.baseURI !== r.href) {
r.rebase = document.baseURI;
}
if (parsed && parsed.content) { if (parsed && parsed.content) {
const elem = document.createElement('div'); const elem = document.createElement('div');
elem.innerHTML = parsed.content; elem.innerHTML = parsed.content;