mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 11:06:02 +08:00
fix: html rebasing with <base> tag
This commit is contained in:
parent
40e91853e2
commit
7af2bde01f
@ -394,7 +394,7 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
||||
let toBeTurnedToMd = jsDomElementOfHTML;
|
||||
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
||||
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
||||
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
||||
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||
@ -402,7 +402,7 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
// If Readability did its job
|
||||
if (par2.length >= 0.3 * par1.length) {
|
||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||
if (snapshot.parsed.content) {
|
||||
toBeTurnedToMd = jsDomElementOfParsed;
|
||||
}
|
||||
@ -440,7 +440,7 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
let src;
|
||||
try {
|
||||
src = new URL(linkPreferredSrc, nominalUrl).toString();
|
||||
src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
|
||||
} catch (_err) {
|
||||
void 0;
|
||||
}
|
||||
@ -485,7 +485,7 @@ export class CrawlerHost extends RPCHost {
|
||||
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
||||
} catch (err2) {
|
||||
@ -502,7 +502,7 @@ export class CrawlerHost extends RPCHost {
|
||||
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
||||
} catch (err) {
|
||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||
try {
|
||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
||||
} catch (err2) {
|
||||
|
@ -121,7 +121,7 @@ export class JSDomControl extends AsyncService {
|
||||
.flat()
|
||||
.map((x) => {
|
||||
try {
|
||||
return new URL(x, snapshot.href).toString();
|
||||
return new URL(x, snapshot.rebase || snapshot.href).toString();
|
||||
} catch (err) {
|
||||
return null;
|
||||
}
|
||||
@ -160,7 +160,7 @@ export class JSDomControl extends AsyncService {
|
||||
return undefined;
|
||||
}
|
||||
try {
|
||||
const parsed = new URL(href, snapshot.href);
|
||||
const parsed = new URL(href, snapshot.rebase || snapshot.href);
|
||||
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
|
||||
return undefined;
|
||||
}
|
||||
@ -188,7 +188,7 @@ export class JSDomControl extends AsyncService {
|
||||
}
|
||||
|
||||
return {
|
||||
src: new URL(linkPreferredSrc, snapshot.href).toString(),
|
||||
src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(),
|
||||
width: parseInt(x.getAttribute('width') || '0'),
|
||||
height: parseInt(x.getAttribute('height') || '0'),
|
||||
alt: x.getAttribute('alt') || x.getAttribute('title'),
|
||||
|
@ -42,6 +42,7 @@ export interface ReadabilityParsed {
|
||||
export interface PageSnapshot {
|
||||
title: string;
|
||||
href: string;
|
||||
rebase?: string;
|
||||
html: string;
|
||||
text: string;
|
||||
parsed?: Partial<ReadabilityParsed> | null;
|
||||
@ -101,7 +102,7 @@ function briefImgs(elem) {
|
||||
}
|
||||
|
||||
return {
|
||||
src: new URL(linkPreferredSrc, document.location.href).toString(),
|
||||
src: new URL(linkPreferredSrc, document.baseURI).toString(),
|
||||
loaded: x.complete,
|
||||
width: x.width,
|
||||
height: x.height,
|
||||
@ -179,6 +180,9 @@ function giveSnapshot(stopActiveSnapshot) {
|
||||
maxElemDepth: domAnalysis.maxDepth,
|
||||
elemCount: domAnalysis.elementCount,
|
||||
};
|
||||
if (document.baseURI !== r.href) {
|
||||
r.rebase = document.baseURI;
|
||||
}
|
||||
if (parsed && parsed.content) {
|
||||
const elem = document.createElement('div');
|
||||
elem.innerHTML = parsed.content;
|
||||
|
Loading…
x
Reference in New Issue
Block a user