mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 01:15:55 +08:00
feat: expand shadow dom
This commit is contained in:
parent
00a1278385
commit
102a1686b0
42
backend/functions/package-lock.json
generated
42
backend/functions/package-lock.json
generated
@ -15,13 +15,13 @@
|
|||||||
"archiver": "^6.0.1",
|
"archiver": "^6.0.1",
|
||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"civkit": "^0.8.0-8592519",
|
"civkit": "^0.8.1-1f42c5a",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
"express": "^4.19.2",
|
"express": "^4.19.2",
|
||||||
"firebase-admin": "^12.1.0",
|
"firebase-admin": "^12.1.0",
|
||||||
"firebase-functions": "^6.0.1",
|
"firebase-functions": "^6.1.0",
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
"langdetect": "^0.2.1",
|
"langdetect": "^0.2.1",
|
||||||
@ -2176,12 +2176,14 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/@types/express": {
|
"node_modules/@types/express": {
|
||||||
"version": "4.17.3",
|
"version": "4.17.21",
|
||||||
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.3.tgz",
|
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
|
||||||
"integrity": "sha512-I8cGRJj3pyOLs/HndoP+25vOqhqWkAZsWMEmq1qXy/b/M3ppufecUwaK2/TVDVxcV61/iSdhykUjQQ2DLSrTdg==",
|
"integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
|
||||||
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@types/body-parser": "*",
|
"@types/body-parser": "*",
|
||||||
"@types/express-serve-static-core": "*",
|
"@types/express-serve-static-core": "^4.17.33",
|
||||||
|
"@types/qs": "*",
|
||||||
"@types/serve-static": "*"
|
"@types/serve-static": "*"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
@ -3727,9 +3729,10 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/civkit": {
|
"node_modules/civkit": {
|
||||||
"version": "0.8.0-8592519",
|
"version": "0.8.1-1f42c5a",
|
||||||
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.0-8592519.tgz",
|
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-1f42c5a.tgz",
|
||||||
"integrity": "sha512-CFd6RLjYyKkNNlzE/kBqWqiYQJOzMXL2uuMiDYGy+IY4WnO5U9wzQ1VQDEWSPWDZl+czybyVGTp0Uz5s9NyA5A==",
|
"integrity": "sha512-+cXywfdiu9+QbnNmJXKCjiAdEUdGRiiZ8zg/YKRqsr4vaX6lFNEI3P0J1FOj1x3vRL9cESGucXN6rh0AfmHHTQ==",
|
||||||
|
"license": "AGPL",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"lodash": "^4.17.21",
|
"lodash": "^4.17.21",
|
||||||
"tslib": "^2.5.0"
|
"tslib": "^2.5.0"
|
||||||
@ -5510,15 +5513,15 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/firebase-functions": {
|
"node_modules/firebase-functions": {
|
||||||
"version": "6.0.1",
|
"version": "6.1.0",
|
||||||
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz",
|
||||||
"integrity": "sha512-0rIpTU6dnLRvP3IK+okn1FDjoqjzShm0/S+i4OMY7JFu/HJoyJ1JNkrT4KjECy1/mCHK49KsmH8iYE0rzrglHg==",
|
"integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==",
|
||||||
"license": "MIT",
|
"license": "MIT",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@types/cors": "^2.8.5",
|
"@types/cors": "^2.8.5",
|
||||||
"@types/express": "4.17.3",
|
"@types/express": "^4.17.21",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"express": "^4.17.1",
|
"express": "^4.21.0",
|
||||||
"protobufjs": "^7.2.2"
|
"protobufjs": "^7.2.2"
|
||||||
},
|
},
|
||||||
"bin": {
|
"bin": {
|
||||||
@ -7848,17 +7851,6 @@
|
|||||||
"node": ">=14"
|
"node": ">=14"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/jwks-rsa/node_modules/@types/express": {
|
|
||||||
"version": "4.17.21",
|
|
||||||
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
|
|
||||||
"integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
|
|
||||||
"dependencies": {
|
|
||||||
"@types/body-parser": "*",
|
|
||||||
"@types/express-serve-static-core": "^4.17.33",
|
|
||||||
"@types/qs": "*",
|
|
||||||
"@types/serve-static": "*"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/jwks-rsa/node_modules/jose": {
|
"node_modules/jwks-rsa/node_modules/jose": {
|
||||||
"version": "4.15.5",
|
"version": "4.15.5",
|
||||||
"resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz",
|
"resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz",
|
||||||
|
@ -35,13 +35,13 @@
|
|||||||
"archiver": "^6.0.1",
|
"archiver": "^6.0.1",
|
||||||
"axios": "^1.3.3",
|
"axios": "^1.3.3",
|
||||||
"bcrypt": "^5.1.0",
|
"bcrypt": "^5.1.0",
|
||||||
"civkit": "^0.8.0-8592519",
|
"civkit": "^0.8.1-1f42c5a",
|
||||||
"core-js": "^3.37.1",
|
"core-js": "^3.37.1",
|
||||||
"cors": "^2.8.5",
|
"cors": "^2.8.5",
|
||||||
"dayjs": "^1.11.9",
|
"dayjs": "^1.11.9",
|
||||||
"express": "^4.19.2",
|
"express": "^4.19.2",
|
||||||
"firebase-admin": "^12.1.0",
|
"firebase-admin": "^12.1.0",
|
||||||
"firebase-functions": "^6.0.1",
|
"firebase-functions": "^6.1.0",
|
||||||
"htmlparser2": "^9.0.0",
|
"htmlparser2": "^9.0.0",
|
||||||
"jose": "^5.1.0",
|
"jose": "^5.1.0",
|
||||||
"langdetect": "^0.2.1",
|
"langdetect": "^0.2.1",
|
||||||
|
@ -24,6 +24,7 @@ import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapsho
|
|||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean;
|
withIframe?: boolean;
|
||||||
|
withShadowDom?: boolean;
|
||||||
targetSelector?: string | string[];
|
targetSelector?: string | string[];
|
||||||
removeSelector?: string | string[];
|
removeSelector?: string | string[];
|
||||||
keepImgDataUrl?: boolean;
|
keepImgDataUrl?: boolean;
|
||||||
@ -571,7 +572,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
|
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
|
||||||
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
||||||
yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
|
yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
|
||||||
}
|
}
|
||||||
@ -686,6 +687,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
overrideUserAgent: opts.userAgent,
|
overrideUserAgent: opts.userAgent,
|
||||||
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
||||||
withIframe: opts.withIframe,
|
withIframe: opts.withIframe,
|
||||||
|
withShadowDom: opts.withShadowDom,
|
||||||
locale: opts.locale,
|
locale: opts.locale,
|
||||||
referer: opts.referer,
|
referer: opts.referer,
|
||||||
};
|
};
|
||||||
|
@ -101,6 +101,16 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
|
'X-With-Iframe': {
|
||||||
|
description: `Enable filling iframe contents into main. (violates standards)`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-With-Shadow-Dom': {
|
||||||
|
description: `Enable filling shadow dom contents into main. (violates standards)`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
'X-User-Agent': {
|
'X-User-Agent': {
|
||||||
description: `Override User-Agent.`,
|
description: `Override User-Agent.`,
|
||||||
in: 'header',
|
in: 'header',
|
||||||
@ -185,6 +195,11 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
})
|
})
|
||||||
withIframe!: boolean;
|
withIframe!: boolean;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: false,
|
||||||
|
})
|
||||||
|
withShadowDom!: boolean;
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
arrayOf: String,
|
arrayOf: String,
|
||||||
})
|
})
|
||||||
@ -283,6 +298,13 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (instance.withIframe) {
|
if (instance.withIframe) {
|
||||||
instance.timeout ??= null;
|
instance.timeout ??= null;
|
||||||
}
|
}
|
||||||
|
const withShadowDom = ctx?.req.get('x-with-shadow-dom');
|
||||||
|
if (withShadowDom) {
|
||||||
|
instance.withShadowDom = Boolean(withShadowDom);
|
||||||
|
}
|
||||||
|
if (instance.withShadowDom) {
|
||||||
|
instance.timeout ??= null;
|
||||||
|
}
|
||||||
|
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
||||||
|
@ -1,5 +1,5 @@
|
|||||||
import 'reflect-metadata';
|
import 'reflect-metadata';
|
||||||
import './shared/lib/doom-domain';
|
// import './shared/lib/doom-domain';
|
||||||
import { initializeApp } from 'firebase-admin/app';
|
import { initializeApp } from 'firebase-admin/app';
|
||||||
initializeApp();
|
initializeApp();
|
||||||
|
|
||||||
|
@ -5,6 +5,7 @@ import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
|
|||||||
import { Readability } from '@mozilla/readability';
|
import { Readability } from '@mozilla/readability';
|
||||||
import TurndownService from 'turndown';
|
import TurndownService from 'turndown';
|
||||||
import { Threaded } from '../shared/services/threaded';
|
import { Threaded } from '../shared/services/threaded';
|
||||||
|
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
|
||||||
|
|
||||||
const pLinkedom = import('linkedom');
|
const pLinkedom = import('linkedom');
|
||||||
|
|
||||||
@ -27,12 +28,8 @@ export class JSDomControl extends AsyncService {
|
|||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: ExtraScrappingOptions) {
|
||||||
targetSelector?: string | string[];
|
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe && !options?.withShadowDom) {
|
||||||
removeSelector?: string | string[];
|
|
||||||
withIframe?: boolean;
|
|
||||||
}) {
|
|
||||||
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
|
|
||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
if (!snapshot?.html) {
|
if (!snapshot?.html) {
|
||||||
@ -43,14 +40,13 @@ export class JSDomControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@Threaded()
|
@Threaded()
|
||||||
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
|
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: ExtraScrappingOptions): Promise<PageSnapshot | undefined> {
|
||||||
targetSelector?: string | string[];
|
|
||||||
removeSelector?: string | string[];
|
|
||||||
withIframe?: boolean;
|
|
||||||
}): Promise<PageSnapshot | undefined> {
|
|
||||||
|
|
||||||
const t0 = Date.now();
|
const t0 = Date.now();
|
||||||
const jsdom = this.linkedom.parseHTML(snapshot.html);
|
let sourceHTML = snapshot.html;
|
||||||
|
if (options?.withShadowDom && snapshot.shadowExpanded) {
|
||||||
|
sourceHTML = snapshot.shadowExpanded;
|
||||||
|
}
|
||||||
|
const jsdom = this.linkedom.parseHTML(sourceHTML);
|
||||||
const allNodes: Node[] = [];
|
const allNodes: Node[] = [];
|
||||||
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
|
||||||
if (options?.withIframe) {
|
if (options?.withIframe) {
|
||||||
@ -107,12 +103,12 @@ export class JSDomControl extends AsyncService {
|
|||||||
|
|
||||||
return snapshot;
|
return snapshot;
|
||||||
}
|
}
|
||||||
const textChunks: string[] = [];
|
const textNodes: HTMLElement[] = [];
|
||||||
let rootDoc: Document;
|
let rootDoc: Document;
|
||||||
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
|
||||||
rootDoc = allNodes[0] as any;
|
rootDoc = allNodes[0] as any;
|
||||||
if (rootDoc.body.innerText) {
|
if (rootDoc.body.innerText) {
|
||||||
textChunks.push(rootDoc.body.innerText);
|
textNodes.push(rootDoc.body);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
|
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
|
||||||
@ -120,10 +116,16 @@ export class JSDomControl extends AsyncService {
|
|||||||
rootDoc.body.appendChild(n);
|
rootDoc.body.appendChild(n);
|
||||||
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
|
||||||
if ((n as HTMLElement).innerText) {
|
if ((n as HTMLElement).innerText) {
|
||||||
textChunks.push((n as HTMLElement).innerText);
|
textNodes.push(n as HTMLElement);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
const textChunks = textNodes.map((x) => {
|
||||||
|
const clone = x.cloneNode(true) as HTMLElement;
|
||||||
|
clone.querySelectorAll('script,style,link,svg').forEach((s) => s.remove());
|
||||||
|
|
||||||
|
return clone.innerText;
|
||||||
|
});
|
||||||
|
|
||||||
let parsed;
|
let parsed;
|
||||||
try {
|
try {
|
||||||
@ -229,6 +231,14 @@ export class JSDomControl extends AsyncService {
|
|||||||
snippetToElement(snippet?: string, url?: string) {
|
snippetToElement(snippet?: string, url?: string) {
|
||||||
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
|
||||||
|
|
||||||
|
// Hack for turndown gfm table plugin.
|
||||||
|
parsed.window.document.querySelectorAll('table').forEach((x) => {
|
||||||
|
Object.defineProperty(x, 'rows', { value: Array.from(x.querySelectorAll('tr')), enumerable: true });
|
||||||
|
});
|
||||||
|
Object.defineProperty(parsed.window.document.documentElement, 'cloneNode', {
|
||||||
|
value: function () { return this; },
|
||||||
|
});
|
||||||
|
|
||||||
return parsed.window.document.documentElement;
|
return parsed.window.document.documentElement;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -46,6 +46,7 @@ export interface PageSnapshot {
|
|||||||
href: string;
|
href: string;
|
||||||
rebase?: string;
|
rebase?: string;
|
||||||
html: string;
|
html: string;
|
||||||
|
shadowExpanded?: string
|
||||||
text: string;
|
text: string;
|
||||||
status?: number;
|
status?: number;
|
||||||
statusText?: string;
|
statusText?: string;
|
||||||
@ -157,6 +158,79 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function cloneAndExpandShadowRoots(rootElement = document.documentElement) {
|
||||||
|
// Create a shallow clone of the root element
|
||||||
|
const clone = rootElement.cloneNode(false);
|
||||||
|
// Function to process an element and its shadow root
|
||||||
|
function processShadowRoot(original, cloned) {
|
||||||
|
if (original.shadowRoot && original.shadowRoot.mode === 'open') {
|
||||||
|
shadowDomPresents = true;
|
||||||
|
const shadowContent = document.createDocumentFragment();
|
||||||
|
|
||||||
|
// Clone shadow root content normally
|
||||||
|
original.shadowRoot.childNodes.forEach(childNode => {
|
||||||
|
const clonedNode = childNode.cloneNode(true);
|
||||||
|
shadowContent.appendChild(clonedNode);
|
||||||
|
});
|
||||||
|
|
||||||
|
// Handle slots
|
||||||
|
const slots = shadowContent.querySelectorAll('slot');
|
||||||
|
slots.forEach(slot => {
|
||||||
|
const slotName = slot.getAttribute('name') || '';
|
||||||
|
const assignedElements = original.querySelectorAll(
|
||||||
|
slotName ? \`[slot="\${slotName}"]\` : ':not([slot])'
|
||||||
|
);
|
||||||
|
|
||||||
|
if (assignedElements.length > 0) {
|
||||||
|
const slotContent = document.createDocumentFragment();
|
||||||
|
assignedElements.forEach(el => {
|
||||||
|
const clonedEl = el.cloneNode(true);
|
||||||
|
slotContent.appendChild(clonedEl);
|
||||||
|
});
|
||||||
|
slot.parentNode.replaceChild(slotContent, slot);
|
||||||
|
} else if (!slotName) {
|
||||||
|
// Keep default slot content
|
||||||
|
// No need to do anything as it's already cloned
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
cloned.appendChild(shadowContent);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Use a TreeWalker on the original root to clone the entire structure
|
||||||
|
const treeWalker = document.createTreeWalker(
|
||||||
|
rootElement,
|
||||||
|
NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
|
||||||
|
);
|
||||||
|
|
||||||
|
const elementMap = new Map([[rootElement, clone]]);
|
||||||
|
|
||||||
|
let currentNode;
|
||||||
|
while (currentNode = treeWalker.nextNode()) {
|
||||||
|
const parentClone = elementMap.get(currentNode.parentNode);
|
||||||
|
const clonedNode = currentNode.cloneNode(false);
|
||||||
|
parentClone.appendChild(clonedNode);
|
||||||
|
|
||||||
|
if (currentNode.nodeType === Node.ELEMENT_NODE) {
|
||||||
|
elementMap.set(currentNode, clonedNode);
|
||||||
|
processShadowRoot(currentNode, clonedNode);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return clone;
|
||||||
|
}
|
||||||
|
|
||||||
|
function shadowDomPresent(rootElement = document.documentElement) {
|
||||||
|
const elems = rootElement.querySelectorAll('*');
|
||||||
|
for (const x of elems) {
|
||||||
|
if (x.shadowRoot && x.shadowRoot.mode === 'open') {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
function giveSnapshot(stopActiveSnapshot) {
|
function giveSnapshot(stopActiveSnapshot) {
|
||||||
if (stopActiveSnapshot) {
|
if (stopActiveSnapshot) {
|
||||||
window.haltSnapshot = true;
|
window.haltSnapshot = true;
|
||||||
@ -174,6 +248,7 @@ function giveSnapshot(stopActiveSnapshot) {
|
|||||||
href: document.location.href,
|
href: document.location.href,
|
||||||
html: document.documentElement?.outerHTML,
|
html: document.documentElement?.outerHTML,
|
||||||
text: document.body?.innerText,
|
text: document.body?.innerText,
|
||||||
|
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
|
||||||
parsed: parsed,
|
parsed: parsed,
|
||||||
imgs: [],
|
imgs: [],
|
||||||
maxElemDepth: domAnalysis.maxDepth,
|
maxElemDepth: domAnalysis.maxDepth,
|
||||||
|
@ -299,12 +299,12 @@ export class SnapshotFormatter extends AsyncService {
|
|||||||
&& toBeTurnedToMd !== jsDomElementOfHTML
|
&& toBeTurnedToMd !== jsDomElementOfHTML
|
||||||
) {
|
) {
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
|
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
|
||||||
try {
|
try {
|
||||||
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
|
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML);
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
|
||||||
}
|
}
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 09a88ebec8ba6154df6cb0b5a3caab07fe7cd150
|
Subproject commit fecbdd92230de5ebd0de168b43b0358d8221769f
|
Loading…
x
Reference in New Issue
Block a user