diff --git a/backend/functions/package-lock.json b/backend/functions/package-lock.json index 83009d5..a1cbe12 100644 --- a/backend/functions/package-lock.json +++ b/backend/functions/package-lock.json @@ -15,13 +15,13 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", - "civkit": "^0.8.0-8592519", + "civkit": "^0.8.1-1f42c5a", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", "express": "^4.19.2", "firebase-admin": "^12.1.0", - "firebase-functions": "^6.0.1", + "firebase-functions": "^6.1.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", "langdetect": "^0.2.1", @@ -2176,12 +2176,14 @@ } }, "node_modules/@types/express": { - "version": "4.17.3", - "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.3.tgz", - "integrity": "sha512-I8cGRJj3pyOLs/HndoP+25vOqhqWkAZsWMEmq1qXy/b/M3ppufecUwaK2/TVDVxcV61/iSdhykUjQQ2DLSrTdg==", + "version": "4.17.21", + "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz", + "integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==", + "license": "MIT", "dependencies": { "@types/body-parser": "*", - "@types/express-serve-static-core": "*", + "@types/express-serve-static-core": "^4.17.33", + "@types/qs": "*", "@types/serve-static": "*" } }, @@ -3727,9 +3729,10 @@ } }, "node_modules/civkit": { - "version": "0.8.0-8592519", - "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.0-8592519.tgz", - "integrity": "sha512-CFd6RLjYyKkNNlzE/kBqWqiYQJOzMXL2uuMiDYGy+IY4WnO5U9wzQ1VQDEWSPWDZl+czybyVGTp0Uz5s9NyA5A==", + "version": "0.8.1-1f42c5a", + "resolved": "https://registry.npmjs.org/civkit/-/civkit-0.8.1-1f42c5a.tgz", + "integrity": "sha512-+cXywfdiu9+QbnNmJXKCjiAdEUdGRiiZ8zg/YKRqsr4vaX6lFNEI3P0J1FOj1x3vRL9cESGucXN6rh0AfmHHTQ==", + "license": "AGPL", "dependencies": { "lodash": "^4.17.21", "tslib": "^2.5.0" @@ -5510,15 +5513,15 @@ } }, "node_modules/firebase-functions": { - "version": "6.0.1", - "resolved": 
"https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.0.1.tgz", - "integrity": "sha512-0rIpTU6dnLRvP3IK+okn1FDjoqjzShm0/S+i4OMY7JFu/HJoyJ1JNkrT4KjECy1/mCHK49KsmH8iYE0rzrglHg==", + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/firebase-functions/-/firebase-functions-6.1.0.tgz", + "integrity": "sha512-7Gq7XpIA2qo9wKhYA9Ksb0v2bHfXD70zQwBJO6//Q624A7D9KAb449K6DM0swrCoPO7NGExbPf2eC7j7e+4+xA==", "license": "MIT", "dependencies": { "@types/cors": "^2.8.5", - "@types/express": "4.17.3", + "@types/express": "^4.17.21", "cors": "^2.8.5", - "express": "^4.17.1", + "express": "^4.21.0", "protobufjs": "^7.2.2" }, "bin": { @@ -7848,17 +7851,6 @@ "node": ">=14" } }, - "node_modules/jwks-rsa/node_modules/@types/express": { - "version": "4.17.21", - "resolved": "https://registry.npmjs.org/@types/express/-/express-4.17.21.tgz", - "integrity": "sha512-ejlPM315qwLpaQlQDTjPdsUFSc6ZsP4AN6AlWnogPjQ7CVi7PYF3YVz+CY3jE2pwYf7E/7HlDAN0rV2GxTG0HQ==", - "dependencies": { - "@types/body-parser": "*", - "@types/express-serve-static-core": "^4.17.33", - "@types/qs": "*", - "@types/serve-static": "*" - } - }, "node_modules/jwks-rsa/node_modules/jose": { "version": "4.15.5", "resolved": "https://registry.npmjs.org/jose/-/jose-4.15.5.tgz", diff --git a/backend/functions/package.json b/backend/functions/package.json index eecb27f..3cb2d75 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -35,13 +35,13 @@ "archiver": "^6.0.1", "axios": "^1.3.3", "bcrypt": "^5.1.0", - "civkit": "^0.8.0-8592519", + "civkit": "^0.8.1-1f42c5a", "core-js": "^3.37.1", "cors": "^2.8.5", "dayjs": "^1.11.9", "express": "^4.19.2", "firebase-admin": "^12.1.0", - "firebase-functions": "^6.0.1", + "firebase-functions": "^6.1.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", "langdetect": "^0.2.1", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index e473782..872bcd7 100644 --- 
a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -24,6 +24,7 @@ import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapsho export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean; + withShadowDom?: boolean; targetSelector?: string | string[]; removeSelector?: string | string[]; keepImgDataUrl?: boolean; @@ -571,7 +572,7 @@ export class CrawlerHost extends RPCHost { } try { - if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) { + if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe || crawlOpts?.withShadowDom) { for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { yield this.jsdomControl.narrowSnapshot(x, crawlOpts); } @@ -686,6 +687,7 @@ export class CrawlerHost extends RPCHost { overrideUserAgent: opts.userAgent, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, withIframe: opts.withIframe, + withShadowDom: opts.withShadowDom, locale: opts.locale, referer: opts.referer, }; diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index e8a6de4..08a04a5 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -101,6 +101,16 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser'; in: 'header', schema: { type: 'string' } }, + 'X-With-Iframe': { + description: `Enable filling iframe contents into main. (violates standards)`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-Shadow-Dom': { + description: `Enable filling shadow dom contents into main. 
(violates standards)`, + in: 'header', + schema: { type: 'string' } + }, 'X-User-Agent': { description: `Override User-Agent.`, in: 'header', @@ -185,6 +195,11 @@ export class CrawlerOptions extends AutoCastable { }) withIframe!: boolean; + @Prop({ + default: false, + }) + withShadowDom!: boolean; + @Prop({ arrayOf: String, }) @@ -283,6 +298,13 @@ export class CrawlerOptions extends AutoCastable { if (instance.withIframe) { instance.timeout ??= null; } + const withShadowDom = ctx?.req.get('x-with-shadow-dom'); + if (withShadowDom) { + instance.withShadowDom = Boolean(withShadowDom); + } + if (instance.withShadowDom) { + instance.timeout ??= null; + } const cookies: CookieParam[] = []; const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]); diff --git a/backend/functions/src/index.ts b/backend/functions/src/index.ts index 45215ca..3e03ff5 100644 --- a/backend/functions/src/index.ts +++ b/backend/functions/src/index.ts @@ -1,5 +1,5 @@ import 'reflect-metadata'; -import './shared/lib/doom-domain'; +// import './shared/lib/doom-domain'; import { initializeApp } from 'firebase-admin/app'; initializeApp(); diff --git a/backend/functions/src/services/jsdom.ts b/backend/functions/src/services/jsdom.ts index 4f97da4..b68ecc4 100644 --- a/backend/functions/src/services/jsdom.ts +++ b/backend/functions/src/services/jsdom.ts @@ -5,6 +5,7 @@ import { ExtendedSnapshot, PageSnapshot } from './puppeteer'; import { Readability } from '@mozilla/readability'; import TurndownService from 'turndown'; import { Threaded } from '../shared/services/threaded'; +import type { ExtraScrappingOptions } from '../cloud-functions/crawler'; const pLinkedom = import('linkedom'); @@ -27,12 +28,8 @@ export class JSDomControl extends AsyncService { this.emit('ready'); } - async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: { - targetSelector?: string | string[]; - removeSelector?: string | string[]; - withIframe?: boolean; - }) { - 
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe) { + async narrowSnapshot(snapshot: PageSnapshot | undefined, options?: ExtraScrappingOptions) { + if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector && !options?.withIframe && !options?.withShadowDom) { return snapshot; } if (!snapshot?.html) { @@ -43,14 +40,13 @@ export class JSDomControl extends AsyncService { } @Threaded() - async actualNarrowSnapshot(snapshot: PageSnapshot, options?: { - targetSelector?: string | string[]; - removeSelector?: string | string[]; - withIframe?: boolean; - }): Promise { - + async actualNarrowSnapshot(snapshot: PageSnapshot, options?: ExtraScrappingOptions): Promise { const t0 = Date.now(); - const jsdom = this.linkedom.parseHTML(snapshot.html); + let sourceHTML = snapshot.html; + if (options?.withShadowDom && snapshot.shadowExpanded) { + sourceHTML = snapshot.shadowExpanded; + } + const jsdom = this.linkedom.parseHTML(sourceHTML); const allNodes: Node[] = []; jsdom.window.document.querySelectorAll('svg').forEach((x) => x.innerHTML = ''); if (options?.withIframe) { @@ -107,12 +103,12 @@ export class JSDomControl extends AsyncService { return snapshot; } - const textChunks: string[] = []; + const textNodes: HTMLElement[] = []; let rootDoc: Document; if (allNodes.length === 1 && allNodes[0].nodeName === '#document') { rootDoc = allNodes[0] as any; if (rootDoc.body.innerText) { - textChunks.push(rootDoc.body.innerText); + textNodes.push(rootDoc.body); } } else { rootDoc = this.linkedom.parseHTML('').window.document; @@ -120,10 +116,16 @@ export class JSDomControl extends AsyncService { rootDoc.body.appendChild(n); rootDoc.body.appendChild(rootDoc.createTextNode('\n\n')); if ((n as HTMLElement).innerText) { - textChunks.push((n as HTMLElement).innerText); + textNodes.push(n as HTMLElement); } } } + const textChunks = textNodes.map((x) => { + const clone = x.cloneNode(true) as HTMLElement; + 
clone.querySelectorAll('script,style,link,svg').forEach((s) => s.remove());
+
+            return clone.innerText;
+        });
 
         let parsed;
         try {
@@ -229,6 +231,14 @@ export class JSDomControl extends AsyncService {
     snippetToElement(snippet?: string, url?: string) {
         const parsed = this.linkedom.parseHTML(snippet || '');
 
+        // Hack for turndown gfm table plugin.
+        parsed.window.document.querySelectorAll('table').forEach((x) => {
+            Object.defineProperty(x, 'rows', { value: Array.from(x.querySelectorAll('tr')), enumerable: true });
+        });
+        Object.defineProperty(parsed.window.document.documentElement, 'cloneNode', {
+            value: function () { return this; },
+        });
+
         return parsed.window.document.documentElement;
     }
 
diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts
index be4576b..1f3a7f5 100644
--- a/backend/functions/src/services/puppeteer.ts
+++ b/backend/functions/src/services/puppeteer.ts
@@ -46,6 +46,7 @@ export interface PageSnapshot {
     href: string;
     rebase?: string;
     html: string;
+    shadowExpanded?: string;
     text: string;
     status?: number;
     statusText?: string;
@@ -157,6 +158,89 @@ function getMaxDepthAndCountUsingTreeWalker(root) {
   };
 }
 
+/**
+ * Clone the tree under rootElement, appending the content of every open
+ * shadow root to the clone of its host so that outerHTML includes it.
+ * Slots are replaced by clones of the matching direct children of the host.
+ * The light DOM children are still cloned by the tree walk as well, so
+ * slotted content appears twice in the result.
+ */
+function cloneAndExpandShadowRoots(rootElement = document.documentElement) {
+  // Create a shallow clone of the root element
+  const clone = rootElement.cloneNode(false);
+  // Function to process an element and its shadow root
+  function processShadowRoot(original, cloned) {
+    if (original.shadowRoot && original.shadowRoot.mode === 'open') {
+      const shadowContent = document.createDocumentFragment();
+
+      // Clone shadow root content normally
+      original.shadowRoot.childNodes.forEach(childNode => {
+        const clonedNode = childNode.cloneNode(true);
+        shadowContent.appendChild(clonedNode);
+      });
+
+      // Handle slots
+      const slots = shadowContent.querySelectorAll('slot');
+      slots.forEach(slot => {
+        const slotName = slot.getAttribute('name') || '';
+        // Only direct children of the host can be slotted
+        const assignedElements = original.querySelectorAll(
+          slotName ? \`:scope > [slot="\${slotName}"]\` : ':scope > :not([slot])'
+        );
+
+        if (assignedElements.length > 0) {
+          const slotContent = document.createDocumentFragment();
+          assignedElements.forEach(el => {
+            const clonedEl = el.cloneNode(true);
+            slotContent.appendChild(clonedEl);
+          });
+          slot.parentNode.replaceChild(slotContent, slot);
+        } else if (!slotName) {
+          // Keep default slot content
+          // No need to do anything as it's already cloned
+        }
+      });
+
+      cloned.appendChild(shadowContent);
+    }
+  }
+
+  // Use a TreeWalker on the original root to clone the entire structure
+  const treeWalker = document.createTreeWalker(
+    rootElement,
+    NodeFilter.SHOW_ELEMENT | NodeFilter.SHOW_TEXT
+  );
+
+  const elementMap = new Map([[rootElement, clone]]);
+
+  let currentNode;
+  while (currentNode = treeWalker.nextNode()) {
+    const parentClone = elementMap.get(currentNode.parentNode);
+    const clonedNode = currentNode.cloneNode(false);
+    parentClone.appendChild(clonedNode);
+
+    if (currentNode.nodeType === Node.ELEMENT_NODE) {
+      elementMap.set(currentNode, clonedNode);
+      processShadowRoot(currentNode, clonedNode);
+    }
+  }
+
+  return clone;
+}
+
+/**
+ * Tell whether any element under rootElement hosts an open shadow root.
+ */
+function shadowDomPresent(rootElement = document.documentElement) {
+  const elems = rootElement.querySelectorAll('*');
+  for (const x of elems) {
+    if (x.shadowRoot && x.shadowRoot.mode === 'open') {
+      return true;
+    }
+  }
+  return false;
+}
+
 function giveSnapshot(stopActiveSnapshot) {
     if (stopActiveSnapshot) {
         window.haltSnapshot = true;
@@ -174,6 +258,7 @@ function giveSnapshot(stopActiveSnapshot) {
         href: document.location.href,
         html: document.documentElement?.outerHTML,
         text: document.body?.innerText,
+        shadowExpanded: shadowDomPresent() ?
cloneAndExpandShadowRoots()?.outerHTML : undefined, parsed: parsed, imgs: [], maxElemDepth: domAnalysis.maxDepth, diff --git a/backend/functions/src/services/snapshot-formatter.ts b/backend/functions/src/services/snapshot-formatter.ts index 0c4b28b..fdce1d5 100644 --- a/backend/functions/src/services/snapshot-formatter.ts +++ b/backend/functions/src/services/snapshot-formatter.ts @@ -299,12 +299,12 @@ export class SnapshotFormatter extends AsyncService { && toBeTurnedToMd !== jsDomElementOfHTML ) { try { - contentText = this.jsdomControl.runTurndown(turnDownService, snapshot.html); + contentText = this.jsdomControl.runTurndown(turnDownService, jsDomElementOfHTML); } catch (err) { this.logger.warn(`Turndown failed to run, retrying without plugins`, { err }); const vanillaTurnDownService = this.getTurndown({ url: snapshot.rebase || nominalUrl, imgDataUrlToObjectUrl }); try { - contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, snapshot.html); + contentText = this.jsdomControl.runTurndown(vanillaTurnDownService, jsDomElementOfHTML); } catch (err2) { this.logger.warn(`Turndown failed to run, giving up`, { err: err2 }); } diff --git a/thinapps-shared b/thinapps-shared index 09a88eb..fecbdd9 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 09a88ebec8ba6154df6cb0b5a3caab07fe7cd150 +Subproject commit fecbdd92230de5ebd0de168b43b0358d8221769f