feat: expand shadow dom

This commit is contained in:
yanlong.wang 2024-10-22 19:04:49 +08:00
parent 00a1278385
commit 102a1686b0
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
9 changed files with 149 additions and 48 deletions

View File

@ -15,13 +15,13 @@
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"civkit": "^0.8.0-8592519",
"civkit": "^0.8.1-1f42c5a",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.0.1",
"firebase-functions": "^6.1.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"langdetect": "^0.2.1",
@ -2176,12 +2176,14 @@
}
},
"node_modules/@types/express": {
"version": "4.17.3",
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.3.tgz",
"integrity": "sha512-I8cGRJj3pyOLs/HndoP+25vOqhqWkAZsWMEmq1qXy/b/M3ppufecUwaK2/TVDVxcV61/iSdhykUjQQ2DLSrTdg==",
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
"integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
"license": "MIT",
"dependencies": {
"@types/body-parser": "*",
"@types/express-serve-static-core": "*",
"@types/express-serve-static-core": "^4.17.33",
"@types/qs": "*",
"@types/serve-static": "*"
}
},
@ -3727,9 +3729,10 @@
}
},
"node_modules/civkit": {
"version": "0.8.0-8592519",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.0-8592519.tgz",
"integrity": "sha512-CFd6RLjYyKkNNlzE/kBqWqiYQJOzMXL2uuMiDYGy+IY4WnO5U9wzQ1VQDEWSPWDZl+czybyVGTp0Uz5s9NyA5A==",
"version": "0.8.1-1f42c5a",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-1f42c5a.tgz",
"integrity": "sha512-+cXywfdiu9+QbnNmJXKCjiAdEUdGRiiZ8zg/YKRqsr4vaX6lFNEI3P0J1FOj1x3vRL9cESGucXN6rh0AfmHHTQ==",
"license": "AGPL",
"dependencies": {
"lodash": "^4.17.21",
"tslib": "^2.5.0"
@ -5510,15 +5513,15 @@
}
},
"node_modules/firebase-functions": {
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.0.1.tgz",
"integrity": "sha512-0rIpTU6dnLRvP3IK+okn1FDjoqjzShm0/S+i4OMY7JFu/HJoyJ1JNkrT4KjECy1/mCHK49KsmH8iYE0rzrglHg==",
"version": "6.1.0",
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz",
"integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==",
"license": "MIT",
"dependencies": {
"@types/cors": "^2.8.5",
"@types/express": "4.17.3",
"@types/express": "^4.17.21",
"cors": "^2.8.5",
"express": "^4.17.1",
"express": "^4.21.0",
"protobufjs": "^7.2.2"
},
"bin": {
@ -7848,17 +7851,6 @@
"node": ">=14"
}
},
"node_modules/jwks-rsa/node_modules/@types/express": {
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
"integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
"dependencies": {
"@types/body-parser": "*",
"@types/express-serve-static-core": "^4.17.33",
"@types/qs": "*",
"@types/serve-static": "*"
}
},
"node_modules/jwks-rsa/node_modules/jose": {
"version": "4.15.5",
"resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz",

View File

@ -35,13 +35,13 @@
"archiver": "^6.0.1",
"axios": "^1.3.3",
"bcrypt": "^5.1.0",
"civkit": "^0.8.0-8592519",
"civkit": "^0.8.1-1f42c5a",
"core-js": "^3.37.1",
"cors": "^2.8.5",
"dayjs": "^1.11.9",
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.0.1",
"firebase-functions": "^6.1.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"langdetect": "^0.2.1",

View File

@ -24,6 +24,7 @@ import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapsho
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean;
withShadowDom?: boolean;
targetSelector?: string | string[];
removeSelector?: string | string[];
keepImgDataUrl?: boolean;
@ -571,7 +572,7 @@ export class CrawlerHost extends RPCHost {
}
try {
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
}
@ -686,6 +687,7 @@ export class CrawlerHost extends RPCHost {
overrideUserAgent: opts.userAgent,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
withIframe: opts.withIframe,
withShadowDom: opts.withShadowDom,
locale: opts.locale,
referer: opts.referer,
};

View File

@ -101,6 +101,16 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
in: 'header',
schema: { type: 'string' }
},
'X-With-Iframe': {
description: `Enable filling iframe contents into main. (violates standards)`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Shadow-Dom': {
description: `Enable filling shadow dom contents into main. (violates standards)`,
in: 'header',
schema: { type: 'string' }
},
'X-User-Agent': {
description: `Override User-Agent.`,
in: 'header',
@ -185,6 +195,11 @@ export class CrawlerOptions extends AutoCastable {
})
withIframe!: boolean;
@Prop({
default: false,
})
withShadowDom!: boolean;
@Prop({
arrayOf: String,
})
@ -283,6 +298,13 @@ export class CrawlerOptions extends AutoCastable {
if (instance.withIframe) {
instance.timeout ??= null;
}
const withShadowDom = ctx?.req.get('x-with-shadow-dom');
if (withShadowDom) {
instance.withShadowDom = Boolean(withShadowDom);
}
if (instance.withShadowDom) {
instance.timeout ??= null;
}
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);

View File

@ -1,5 +1,5 @@
import 'reflect-metadata';
import './shared/lib/doom-domain';
// import './shared/lib/doom-domain';
import { initializeApp } from 'firebase-admin/app';
initializeApp();

View File

@ -5,6 +5,7 @@ import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown';
import { Threaded } from '../shared/services/threaded';
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
const pLinkedom = import('linkedom');
@ -27,12 +28,8 @@ export class JSDomControl extends AsyncService {
this.emit('ready');
}
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
targetSelector?: string | string[];
removeSelector?: string | string[];
withIframe?: boolean;
}) {
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: ExtraScrappingOptions) {
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe && !options?.withShadowDom) {
return snapshot;
}
if (!snapshot?.html) {
@ -43,14 +40,13 @@ export class JSDomControl extends AsyncService {
}
@Threaded()
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: {
targetSelector?: string | string[];
removeSelector?: string | string[];
withIframe?: boolean;
}): Promise<PageSnapshot | undefined> {
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: ExtraScrappingOptions): Promise<PageSnapshot | undefined> {
const t0 = Date.now();
const jsdom = this.linkedom.parseHTML(snapshot.html);
let sourceHTML = snapshot.html;
if (options?.withShadowDom && snapshot.shadowExpanded) {
sourceHTML = snapshot.shadowExpanded;
}
const jsdom = this.linkedom.parseHTML(sourceHTML);
const allNodes: Node[] = [];
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
if (options?.withIframe) {
@ -107,12 +103,12 @@ export class JSDomControl extends AsyncService {
return snapshot;
}
const textChunks: string[] = [];
const textNodes: HTMLElement[] = [];
let rootDoc: Document;
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
rootDoc = allNodes[0] as any;
if (rootDoc.body.innerText) {
textChunks.push(rootDoc.body.innerText);
textNodes.push(rootDoc.body);
}
} else {
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
@ -120,10 +116,16 @@ export class JSDomControl extends AsyncService {
rootDoc.body.appendChild(n);
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
if ((n as HTMLElement).innerText) {
textChunks.push((n as HTMLElement).innerText);
textNodes.push(n as HTMLElement);
}
}
}
const textChunks = textNodes.map((x) => {
const clone = x.cloneNode(true) as HTMLElement;
clone.querySelectorAll('script,style,link,svg').forEach((s) => s.remove());
return clone.innerText;
});
let parsed;
try {
@ -229,6 +231,14 @@ export class JSDomControl extends AsyncService {
/**
 * Parses an HTML snippet with linkedom and returns the resulting document's
 * root element, patched so it can be handed to turndown.
 *
 * @param snippet HTML to parse; an empty html/body skeleton is used when falsy.
 * @param url Accepted for interface compatibility; not read by this method.
 * @returns The documentElement of the parsed document.
 */
snippetToElement(snippet?: string, url?: string) {
    const { document: doc } = this.linkedom.parseHTML(snippet || '<html><body></body></html>').window;
    const rootElement = doc.documentElement;

    // Hack for turndown gfm table plugin.
    // Every table gets an own, enumerable `rows` property holding a snapshot
    // array of all of its <tr> descendants.
    doc.querySelectorAll('table').forEach((table) => {
        const rowsDescriptor = {
            value: Array.from(table.querySelectorAll('tr')),
            enumerable: true,
        };
        Object.defineProperty(table, 'rows', rowsDescriptor);
    });

    // cloneNode on the root is replaced with an identity function, so any
    // consumer that clones the root receives this very element (and with it the
    // patched tables). NOTE(review): presumably aimed at turndown cloning its
    // input -- confirm against the turndown version in use.
    const identityCloneDescriptor = {
        value: function () { return this; },
    };
    Object.defineProperty(rootElement, 'cloneNode', identityCloneDescriptor);

    return rootElement;
}

View File

@ -46,6 +46,7 @@ export interface PageSnapshot {
href: string;
rebase?: string;
html: string;
shadowExpanded?: string
text: string;
status?: number;
statusText?: string;
@ -157,6 +158,79 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
};
}
/**
 * Builds a detached copy of rootElement in which the content of every open
 * shadow root is inlined as regular children of the clone of its host.
 *
 * For each host, the cloned shadow content is appended first; the host's
 * light-DOM children are then cloned after it by the tree walk, so slotted
 * elements show up both inside the expanded slot and as light-DOM children.
 * Shadow roots nested inside another shadow root are not expanded, because
 * cloneNode(true) does not copy shadow roots.
 *
 * NOTE: this function lives inside a template literal that is injected into
 * pages, so it must not contain unescaped backticks or dollar-brace sequences.
 *
 * @param rootElement Element to clone; defaults to document.documentElement.
 * @returns The root of the detached, shadow-expanded clone.
 */
function cloneAndExpandShadowRoots(rootElement = document.documentElement) {
    // Shallow clone of the root; descendants are attached one by one below.
    const clone = rootElement.cloneNode(false);

    // Inlines the open shadow root of original (if any) into cloned.
    function processShadowRoot(original, cloned) {
        if (!original.shadowRoot || original.shadowRoot.mode !== 'open') {
            return;
        }
        const shadowContent = document.createDocumentFragment();

        // Deep-clone the shadow tree as it is.
        original.shadowRoot.childNodes.forEach((childNode) => {
            shadowContent.appendChild(childNode.cloneNode(true));
        });

        // Replace each slot with clones of the light-DOM elements assigned to it.
        shadowContent.querySelectorAll('slot').forEach((slot) => {
            const slotName = slot.getAttribute('name') || '';
            // Only direct children of the host can be slotted, hence ":scope >".
            // Matching arbitrary descendants would clone nested elements again
            // on top of the deep clone of their ancestor. The slot name is
            // escaped so that unusual names cannot break the selector.
            const selector = slotName
                ? ':scope > [slot="' + CSS.escape(slotName) + '"]'
                : ':scope > :not([slot])';
            const assignedElements = original.querySelectorAll(selector);
            if (assignedElements.length === 0) {
                // Nothing assigned: keep the slot with its already-cloned fallback content.
                return;
            }
            const slotContent = document.createDocumentFragment();
            assignedElements.forEach((el) => {
                slotContent.appendChild(el.cloneNode(true));
            });
            slot.parentNode.replaceChild(slotContent, slot);
        });

        cloned.appendChild(shadowContent);
    }

    // The walker below never yields the root itself, so handle its shadow root here.
    processShadowRoot(rootElement, clone);

    // Walk elements and text nodes of the original tree in document order and
    // mirror each of them under the clone of its parent.
    const treeWalker = document.createTreeWalker(
        rootElement,
        NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
    );
    const elementMap = new Map([[rootElement, clone]]);

    let currentNode = treeWalker.nextNode();
    while (currentNode) {
        // Parents are always visited before their children, so the lookup succeeds.
        const parentClone = elementMap.get(currentNode.parentNode);
        const clonedNode = currentNode.cloneNode(false);
        parentClone.appendChild(clonedNode);

        if (currentNode.nodeType === Node.ELEMENT_NODE) {
            elementMap.set(currentNode, clonedNode);
            processShadowRoot(currentNode, clonedNode);
        }
        currentNode = treeWalker.nextNode();
    }

    return clone;
}
function shadowDomPresent(rootElement = document.documentElement) {
const elems = rootElement.querySelectorAll('*');
for (const x of elems) {
if (x.shadowRoot && x.shadowRoot.mode === 'open') {
return true;
}
}
return false;
}
function giveSnapshot(stopActiveSnapshot) {
if (stopActiveSnapshot) {
window.haltSnapshot = true;
@ -174,6 +248,7 @@ function giveSnapshot(stopActiveSnapshot) {
href: document.location.href,
html: document.documentElement?.outerHTML,
text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed,
imgs: [],
maxElemDepth: domAnalysis.maxDepth,

View File

@ -299,12 +299,12 @@ export class SnapshotFormatter extends AsyncService {
&& toBeTurnedToMd !== jsDomElementOfHTML
) {
try {
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html);
contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
} catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html);
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML);
} catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
}

@ -1 +1 @@
Subproject commit 09a88ebec8ba6154df6cb0b5a3caab07fe7cd150
Subproject commit fecbdd92230de5ebd0de168b43b0358d8221769f