mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 21:55:56 +08:00
fix: html rebasing with <base> tag
This commit is contained in:
parent
40e91853e2
commit
7af2bde01f
@ -394,7 +394,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
const jsDomElementOfHTML = this.jsdomControl.snippetToElement(snapshot.html, snapshot.href);
|
||||||
let toBeTurnedToMd = jsDomElementOfHTML;
|
let toBeTurnedToMd = jsDomElementOfHTML;
|
||||||
let turnDownService = this.getTurndown({ url: nominalUrl, imgDataUrlToObjectUrl });
|
let turnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
if (mode !== 'markdown' && snapshot.parsed?.content) {
|
||||||
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
const jsDomElementOfParsed = this.jsdomControl.snippetToElement(snapshot.parsed.content, snapshot.href);
|
||||||
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
const par1 = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||||
@ -402,7 +402,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
// If Readability did its job
|
// If Readability did its job
|
||||||
if (par2.length >= 0.3 * par1.length) {
|
if (par2.length >= 0.3 * par1.length) {
|
||||||
turnDownService = this.getTurndown({ noRules: true, url: snapshot.href, imgDataUrlToObjectUrl });
|
turnDownService = this.getTurndown({ noRules: true, url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
if (snapshot.parsed.content) {
|
if (snapshot.parsed.content) {
|
||||||
toBeTurnedToMd = jsDomElementOfParsed;
|
toBeTurnedToMd = jsDomElementOfParsed;
|
||||||
}
|
}
|
||||||
@ -440,7 +440,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
let src;
|
let src;
|
||||||
try {
|
try {
|
||||||
src = new URL(linkPreferredSrc, nominalUrl).toString();
|
src = new URL(linkPreferredSrc, snapshot.rebase || nominalUrl).toString();
|
||||||
} catch (_err) {
|
} catch (_err) {
|
||||||
void 0;
|
void 0;
|
||||||
}
|
}
|
||||||
@ -485,7 +485,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
contentText = this.jsdomControl.runTurndown(turnDownService, toBeTurnedToMd).trim();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, toBeTurnedToMd).trim();
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
@ -502,7 +502,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.href, imgDataUrlToObjectUrl });
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
|
@ -121,7 +121,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
.flat()
|
.flat()
|
||||||
.map((x) => {
|
.map((x) => {
|
||||||
try {
|
try {
|
||||||
return new URL(x, snapshot.href).toString();
|
return new URL(x, snapshot.rebase || snapshot.href).toString();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@ -160,7 +160,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
const parsed = new URL(href, snapshot.href);
|
const parsed = new URL(href, snapshot.rebase || snapshot.href);
|
||||||
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
|
if (parsed.protocol === 'file:' || parsed.protocol === 'javascript:') {
|
||||||
return undefined;
|
return undefined;
|
||||||
}
|
}
|
||||||
@ -188,7 +188,7 @@ export class JSDomControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
src: new URL(linkPreferredSrc, snapshot.href).toString(),
|
src: new URL(linkPreferredSrc, snapshot.rebase || snapshot.href).toString(),
|
||||||
width: parseInt(x.getAttribute('width') || '0'),
|
width: parseInt(x.getAttribute('width') || '0'),
|
||||||
height: parseInt(x.getAttribute('height') || '0'),
|
height: parseInt(x.getAttribute('height') || '0'),
|
||||||
alt: x.getAttribute('alt') || x.getAttribute('title'),
|
alt: x.getAttribute('alt') || x.getAttribute('title'),
|
||||||
|
@ -42,6 +42,7 @@ export interface ReadabilityParsed {
|
|||||||
export interface PageSnapshot {
|
export interface PageSnapshot {
|
||||||
title: string;
|
title: string;
|
||||||
href: string;
|
href: string;
|
||||||
|
rebase?: string;
|
||||||
html: string;
|
html: string;
|
||||||
text: string;
|
text: string;
|
||||||
parsed?: Partial<ReadabilityParsed> | null;
|
parsed?: Partial<ReadabilityParsed> | null;
|
||||||
@ -101,7 +102,7 @@ function briefImgs(elem) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
src: new URL(linkPreferredSrc, document.location.href).toString(),
|
src: new URL(linkPreferredSrc, document.baseURI).toString(),
|
||||||
loaded: x.complete,
|
loaded: x.complete,
|
||||||
width: x.width,
|
width: x.width,
|
||||||
height: x.height,
|
height: x.height,
|
||||||
@ -179,6 +180,9 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
maxElemDepth: domAnalysis.maxDepth,
|
maxElemDepth: domAnalysis.maxDepth,
|
||||||
elemCount: domAnalysis.elementCount,
|
elemCount: domAnalysis.elementCount,
|
||||||
};
|
};
|
||||||
|
if (document.baseURI !== r.href) {
|
||||||
|
r.rebase = document.baseURI;
|
||||||
|
}
|
||||||
if (parsed && parsed.content) {
|
if (parsed && parsed.content) {
|
||||||
const elem = document.createElement('div');
|
const elem = document.createElement('div');
|
||||||
elem.innerHTML = parsed.content;
|
elem.innerHTML = parsed.content;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user