feat: expand shadow dom

This commit is contained in:
yanlong.wang 2024-10-22 19:04:49 +08:00
parent 00a1278385
commit 102a1686b0
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
9 changed files with 149 additions and 48 deletions

View File

@ -15,13 +15,13 @@
"archiver": "^6.0.1", "archiver": "^6.0.1",
"axios": "^1.3.3", "axios": "^1.3.3",
"bcrypt": "^5.1.0", "bcrypt": "^5.1.0",
"civkit": "^0.8.0-8592519", "civkit": "^0.8.1-1f42c5a",
"core-js": "^3.37.1", "core-js": "^3.37.1",
"cors": "^2.8.5", "cors": "^2.8.5",
"dayjs": "^1.11.9", "dayjs": "^1.11.9",
"express": "^4.19.2", "express": "^4.19.2",
"firebase-admin": "^12.1.0", "firebase-admin": "^12.1.0",
"firebase-functions": "^6.0.1", "firebase-functions": "^6.1.0",
"htmlparser2": "^9.0.0", "htmlparser2": "^9.0.0",
"jose": "^5.1.0", "jose": "^5.1.0",
"langdetect": "^0.2.1", "langdetect": "^0.2.1",
@ -2176,12 +2176,14 @@
} }
}, },
"node_modules/@types/express": { "node_modules/@types/express": {
"version": "4.17.3", "version": "4.17.21",
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.3.tgz", "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
"integrity": "sha512-I8cGRJj3pyOLs/HndoP+25vOqhqWkAZsWMEmq1qXy/b/M3ppufecUwaK2/TVDVxcV61/iSdhykUjQQ2DLSrTdg==", "integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
"license": "MIT",
"dependencies": { "dependencies": {
"@types/body-parser": "*", "@types/body-parser": "*",
"@types/express-serve-static-core": "*", "@types/express-serve-static-core": "^4.17.33",
"@types/qs": "*",
"@types/serve-static": "*" "@types/serve-static": "*"
} }
}, },
@ -3727,9 +3729,10 @@
} }
}, },
"node_modules/civkit": { "node_modules/civkit": {
"version": "0.8.0-8592519", "version": "0.8.1-1f42c5a",
"resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.0-8592519.tgz", "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-1f42c5a.tgz",
"integrity": "sha512-CFd6RLjYyKkNNlzE/kBqWqiYQJOzMXL2uuMiDYGy+IY4WnO5U9wzQ1VQDEWSPWDZl+czybyVGTp0Uz5s9NyA5A==", "integrity": "sha512-+cXywfdiu9+QbnNmJXKCjiAdEUdGRiiZ8zg/YKRqsr4vaX6lFNEI3P0J1FOj1x3vRL9cESGucXN6rh0AfmHHTQ==",
"license": "AGPL",
"dependencies": { "dependencies": {
"lodash": "^4.17.21", "lodash": "^4.17.21",
"tslib": "^2.5.0" "tslib": "^2.5.0"
@ -5510,15 +5513,15 @@
} }
}, },
"node_modules/firebase-functions": { "node_modules/firebase-functions": {
"version": "6.0.1", "version": "6.1.0",
"resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.0.1.tgz", "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz",
"integrity": "sha512-0rIpTU6dnLRvP3IK+okn1FDjoqjzShm0/S+i4OMY7JFu/HJoyJ1JNkrT4KjECy1/mCHK49KsmH8iYE0rzrglHg==", "integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==",
"license": "MIT", "license": "MIT",
"dependencies": { "dependencies": {
"@types/cors": "^2.8.5", "@types/cors": "^2.8.5",
"@types/express": "4.17.3", "@types/express": "^4.17.21",
"cors": "^2.8.5", "cors": "^2.8.5",
"express": "^4.17.1", "express": "^4.21.0",
"protobufjs": "^7.2.2" "protobufjs": "^7.2.2"
}, },
"bin": { "bin": {
@ -7848,17 +7851,6 @@
"node": ">=14" "node": ">=14"
} }
}, },
"node_modules/jwks-rsa/node_modules/@types/express": {
"version": "4.17.21",
"resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz",
"integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==",
"dependencies": {
"@types/body-parser": "*",
"@types/express-serve-static-core": "^4.17.33",
"@types/qs": "*",
"@types/serve-static": "*"
}
},
"node_modules/jwks-rsa/node_modules/jose": { "node_modules/jwks-rsa/node_modules/jose": {
"version": "4.15.5", "version": "4.15.5",
"resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz", "resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz",

View File

@ -35,13 +35,13 @@
"archiver": "^6.0.1", "archiver": "^6.0.1",
"axios": "^1.3.3", "axios": "^1.3.3",
"bcrypt": "^5.1.0", "bcrypt": "^5.1.0",
"civkit": "^0.8.0-8592519", "civkit": "^0.8.1-1f42c5a",
"core-js": "^3.37.1", "core-js": "^3.37.1",
"cors": "^2.8.5", "cors": "^2.8.5",
"dayjs": "^1.11.9", "dayjs": "^1.11.9",
"express": "^4.19.2", "express": "^4.19.2",
"firebase-admin": "^12.1.0", "firebase-admin": "^12.1.0",
"firebase-functions": "^6.0.1", "firebase-functions": "^6.1.0",
"htmlparser2": "^9.0.0", "htmlparser2": "^9.0.0",
"jose": "^5.1.0", "jose": "^5.1.0",
"langdetect": "^0.2.1", "langdetect": "^0.2.1",

View File

@ -24,6 +24,7 @@ import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapsho
export interface ExtraScrappingOptions extends ScrappingOptions { export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean; withIframe?: boolean;
withShadowDom?: boolean;
targetSelector?: string | string[]; targetSelector?: string | string[];
removeSelector?: string | string[]; removeSelector?: string | string[];
keepImgDataUrl?: boolean; keepImgDataUrl?: boolean;
@ -571,7 +572,7 @@ export class CrawlerHost extends RPCHost {
} }
try { try {
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) { if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) {
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
yield this.jsdomControl.narrowSnapshot(x, crawlOpts); yield this.jsdomControl.narrowSnapshot(x, crawlOpts);
} }
@ -686,6 +687,7 @@ export class CrawlerHost extends RPCHost {
overrideUserAgent: opts.userAgent, overrideUserAgent: opts.userAgent,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
withIframe: opts.withIframe, withIframe: opts.withIframe,
withShadowDom: opts.withShadowDom,
locale: opts.locale, locale: opts.locale,
referer: opts.referer, referer: opts.referer,
}; };

View File

@ -101,6 +101,16 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-With-Iframe': {
description: `Enable filling iframe contents into main. (violates standards)`,
in: 'header',
schema: { type: 'string' }
},
'X-With-Shadow-Dom': {
description: `Enable filling shadow dom contents into main. (violates standards)`,
in: 'header',
schema: { type: 'string' }
},
'X-User-Agent': { 'X-User-Agent': {
description: `Override User-Agent.`, description: `Override User-Agent.`,
in: 'header', in: 'header',
@ -185,6 +195,11 @@ export class CrawlerOptions extends AutoCastable {
}) })
withIframe!: boolean; withIframe!: boolean;
@Prop({
default: false,
})
withShadowDom!: boolean;
@Prop({ @Prop({
arrayOf: String, arrayOf: String,
}) })
@ -283,6 +298,13 @@ export class CrawlerOptions extends AutoCastable {
if (instance.withIframe) { if (instance.withIframe) {
instance.timeout ??= null; instance.timeout ??= null;
} }
const withShadowDom = ctx?.req.get('x-with-shadow-dom');
if (withShadowDom) {
instance.withShadowDom = Boolean(withShadowDom);
}
if (instance.withShadowDom) {
instance.timeout ??= null;
}
const cookies: CookieParam[] = []; const cookies: CookieParam[] = [];
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]); const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);

View File

@ -1,5 +1,5 @@
import 'reflect-metadata'; import 'reflect-metadata';
import './shared/lib/doom-domain'; // import './shared/lib/doom-domain';
import { initializeApp } from 'firebase-admin/app'; import { initializeApp } from 'firebase-admin/app';
initializeApp(); initializeApp();

View File

@ -5,6 +5,7 @@ import { ExtendedSnapshot, PageSnapshot } from './puppeteer';
import { Readability } from '@mozilla/readability'; import { Readability } from '@mozilla/readability';
import TurndownService from 'turndown'; import TurndownService from 'turndown';
import { Threaded } from '../shared/services/threaded'; import { Threaded } from '../shared/services/threaded';
import type { ExtraScrappingOptions } from '../cloud-functions/crawler';
const pLinkedom = import('linkedom'); const pLinkedom = import('linkedom');
@ -27,12 +28,8 @@ export class JSDomControl extends AsyncService {
this.emit('ready'); this.emit('ready');
} }
async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: { async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: ExtraScrappingOptions) {
targetSelector?: string | string[]; if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe && !options?.withShadowDom) {
removeSelector?: string | string[];
withIframe?: boolean;
}) {
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) {
return snapshot; return snapshot;
} }
if (!snapshot?.html) { if (!snapshot?.html) {
@ -43,14 +40,13 @@ export class JSDomControl extends AsyncService {
} }
@Threaded() @Threaded()
async actualNarrowSnapshot(snapshot: PageSnapshot, options?: { async actualNarrowSnapshot(snapshot: PageSnapshot, options?: ExtraScrappingOptions): Promise<PageSnapshot | undefined> {
targetSelector?: string | string[];
removeSelector?: string | string[];
withIframe?: boolean;
}): Promise<PageSnapshot | undefined> {
const t0 = Date.now(); const t0 = Date.now();
const jsdom = this.linkedom.parseHTML(snapshot.html); let sourceHTML = snapshot.html;
if (options?.withShadowDom && snapshot.shadowExpanded) {
sourceHTML = snapshot.shadowExpanded;
}
const jsdom = this.linkedom.parseHTML(sourceHTML);
const allNodes: Node[] = []; const allNodes: Node[] = [];
jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = '');
if (options?.withIframe) { if (options?.withIframe) {
@ -107,12 +103,12 @@ export class JSDomControl extends AsyncService {
return snapshot; return snapshot;
} }
const textChunks: string[] = []; const textNodes: HTMLElement[] = [];
let rootDoc: Document; let rootDoc: Document;
if (allNodes.length === 1 && allNodes[0].nodeName === '#document') { if (allNodes.length === 1 && allNodes[0].nodeName === '#document') {
rootDoc = allNodes[0] as any; rootDoc = allNodes[0] as any;
if (rootDoc.body.innerText) { if (rootDoc.body.innerText) {
textChunks.push(rootDoc.body.innerText); textNodes.push(rootDoc.body);
} }
} else { } else {
rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document; rootDoc = this.linkedom.parseHTML('<html><body></body></html>').window.document;
@ -120,10 +116,16 @@ export class JSDomControl extends AsyncService {
rootDoc.body.appendChild(n); rootDoc.body.appendChild(n);
rootDoc.body.appendChild(rootDoc.createTextNode('\n\n')); rootDoc.body.appendChild(rootDoc.createTextNode('\n\n'));
if ((n as HTMLElement).innerText) { if ((n as HTMLElement).innerText) {
textChunks.push((n as HTMLElement).innerText); textNodes.push(n as HTMLElement);
} }
} }
} }
const textChunks = textNodes.map((x) => {
const clone = x.cloneNode(true) as HTMLElement;
clone.querySelectorAll('script,style,link,svg').forEach((s) => s.remove());
return clone.innerText;
});
let parsed; let parsed;
try { try {
@ -229,6 +231,14 @@ export class JSDomControl extends AsyncService {
snippetToElement(snippet?: string, url?: string) { snippetToElement(snippet?: string, url?: string) {
const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>'); const parsed = this.linkedom.parseHTML(snippet || '<html><body></body></html>');
// Hack for turndown gfm table plugin.
parsed.window.document.querySelectorAll('table').forEach((x) => {
Object.defineProperty(x, 'rows', { value: Array.from(x.querySelectorAll('tr')), enumerable: true });
});
Object.defineProperty(parsed.window.document.documentElement, 'cloneNode', {
value: function () { return this; },
});
return parsed.window.document.documentElement; return parsed.window.document.documentElement;
} }

View File

@ -46,6 +46,7 @@ export interface PageSnapshot {
href: string; href: string;
rebase?: string; rebase?: string;
html: string; html: string;
shadowExpanded?: string
text: string; text: string;
status?: number; status?: number;
statusText?: string; statusText?: string;
@ -157,6 +158,79 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
}; };
} }
/**
 * Build a detached copy of `rootElement` in which the content of every open
 * shadow root found on a descendant is inlined as ordinary child nodes of
 * that descendant's clone, so that serializing the copy also captures
 * shadow DOM markup.
 *
 * What is copied:
 * - element and text nodes below `rootElement` (other node types, such as
 *   comments, are not visited by the walker and therefore not copied);
 * - for each descendant with an open shadow root, a deep clone of the
 *   shadow root's children, placed before the host's own children.
 *
 * Slots inside the inlined shadow content are replaced by deep clones of
 * the host's direct element children assigned to them (matching `slot`
 * attribute for named slots, no or empty `slot` attribute for the default
 * slot). A slot with no assigned element keeps its fallback content.
 *
 * Known limitations kept from the original behavior:
 * - the host's light-DOM children are still cloned by the tree walk, so a
 *   slotted element appears both inside the inlined shadow content and as a
 *   regular child of the host clone;
 * - hosts nested inside an inlined shadow tree are not processed again.
 *
 * NOTE(review): this function appears to be embedded in a template literal
 * (the original escaped its backtick string), so the body deliberately
 * avoids backticks, dollar-brace interpolation and backslashes.
 *
 * @param {Element} [rootElement=document.documentElement] Root of the
 *   subtree to copy. The root's own shadow root, if any, is not expanded.
 * @returns {Element} A new, detached element tree; the live DOM is not
 *   modified.
 */
function cloneAndExpandShadowRoots(rootElement = document.documentElement) {
    // Shallow clone only: descendants are re-created one by one by the walk
    // below so that each host can receive its shadow content.
    const clone = rootElement.cloneNode(false);

    // Inline the open shadow root of `original` (if any) into `cloned`.
    function processShadowRoot(original, cloned) {
        if (!original.shadowRoot || original.shadowRoot.mode !== 'open') {
            return;
        }

        const shadowContent = document.createDocumentFragment();
        original.shadowRoot.childNodes.forEach((childNode) => {
            shadowContent.appendChild(childNode.cloneNode(true));
        });

        // Only direct children of the host are slottable. Matching on the
        // attribute value (instead of building a selector from the slot
        // name) also avoids any selector-escaping problem.
        const hostChildren = Array.from(original.children);
        shadowContent.querySelectorAll('slot').forEach((slot) => {
            const slotName = slot.getAttribute('name') || '';
            const assignedElements = hostChildren.filter(
                (el) => (el.getAttribute('slot') || '') === slotName
            );
            if (assignedElements.length === 0) {
                // Nothing assigned: the cloned fallback content stays.
                return;
            }
            const slotContent = document.createDocumentFragment();
            assignedElements.forEach((el) => {
                slotContent.appendChild(el.cloneNode(true));
            });
            slot.parentNode.replaceChild(slotContent, slot);
        });

        cloned.appendChild(shadowContent);
    }

    // Walk the original tree in document order; parents are always visited
    // before their children, so the parent's clone is already in the map.
    const treeWalker = document.createTreeWalker(
        rootElement,
        NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
    );
    const elementMap = new Map([[rootElement, clone]]);

    for (let node = treeWalker.nextNode(); node; node = treeWalker.nextNode()) {
        const clonedNode = node.cloneNode(false);
        elementMap.get(node.parentNode).appendChild(clonedNode);

        if (node.nodeType === Node.ELEMENT_NODE) {
            elementMap.set(node, clonedNode);
            processShadowRoot(node, clonedNode);
        }
    }

    return clone;
}
/**
 * Report whether any descendant element of `rootElement` hosts an open
 * shadow root.
 *
 * @param {Element} [rootElement=document.documentElement] Root of the
 *   subtree to inspect; only its descendants are checked.
 * @returns {boolean} true as soon as one descendant exposes a shadow root
 *   whose mode is 'open', false otherwise.
 */
function shadowDomPresent(rootElement = document.documentElement) {
    const descendants = Array.from(rootElement.querySelectorAll('*'));

    return descendants.some((candidate) => candidate.shadowRoot?.mode === 'open');
}
function giveSnapshot(stopActiveSnapshot) { function giveSnapshot(stopActiveSnapshot) {
if (stopActiveSnapshot) { if (stopActiveSnapshot) {
window.haltSnapshot = true; window.haltSnapshot = true;
@ -174,6 +248,7 @@ function giveSnapshot(stopActiveSnapshot) {
href: document.location.href, href: document.location.href,
html: document.documentElement?.outerHTML, html: document.documentElement?.outerHTML,
text: document.body?.innerText, text: document.body?.innerText,
shadowExpanded: shadowDomPresent() ? cloneAndExpandShadowRoots()?.outerHTML : undefined,
parsed: parsed, parsed: parsed,
imgs: [], imgs: [],
maxElemDepth: domAnalysis.maxDepth, maxElemDepth: domAnalysis.maxDepth,

View File

@ -299,12 +299,12 @@ export class SnapshotFormatter extends AsyncService {
&& toBeTurnedToMd !== jsDomElementOfHTML && toBeTurnedToMd !== jsDomElementOfHTML
) { ) {
try { try {
contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML);
} catch (err) { } catch (err) {
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl });
try { try {
contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML);
} catch (err2) { } catch (err2) {
this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); this.logger.warn(`Turndown failed to run, giving up`, { err: err2 });
} }

@ -1 +1 @@
Subproject commit 09a88ebec8ba6154df6cb0b5a3caab07fe7cd150 Subproject commit fecbdd92230de5ebd0de168b43b0358d8221769f