From 77c8480ca6ac004947b8d0e00f1c0ef2c47306c0 Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Tue, 30 Jul 2024 15:08:09 +0800 Subject: [PATCH] feat: with-iframe and full-page screenshot --- .../functions/src/cloud-functions/crawler.ts | 4 +- .../functions/src/dto/scrapping-options.ts | 12 + backend/functions/src/services/puppeteer.ts | 249 +++++++++++------- 3 files changed, 166 insertions(+), 99 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 76ad7d5..b7a50a2 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -28,6 +28,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip- const md5Hasher = new HashManager('md5', 'hex'); export interface ExtraScrappingOptions extends ScrappingOptions { + withIframe?: boolean; targetSelector?: string | string[]; removeSelector?: string | string[]; keepImgDataUrl?: boolean; @@ -907,7 +908,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; } try { - if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) { + if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) { for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) { yield this.puppeteerControl.narrowSnapshot(x, crawlOpts); } @@ -1011,6 +1012,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; waitForSelector: opts.waitForSelector, overrideUserAgent: opts.userAgent, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, + withIframe: opts.withIframe, }; return crawlOpts; diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 464ebc2..4177dc3 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -164,6 +164,11 @@ export class CrawlerOptions extends AutoCastable { }) keepImgDataUrl!: boolean; + @Prop({ + default: false, + }) + withIframe!: boolean; + @Prop({ arrayOf: String, }) @@ -238,6 +243,13 @@ export class CrawlerOptions extends AutoCastable { if (keepImgDataUrl !== undefined) { instance.keepImgDataUrl = Boolean(keepImgDataUrl); } + const withIframe = ctx?.req.get('x-with-iframe'); + if (withIframe !== undefined) { + instance.withIframe = Boolean(withIframe); + } + if (instance.withIframe) { + instance.timeout ??= null; + } const cookies: CookieParam[] = []; const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 94279ef..7e0de78 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -54,6 +54,7 @@ export interface PageSnapshot { imgs?: ImgBrief[]; pdfs?: string[]; maxElemDepth?: number; + childFrames?: PageSnapshot[]; } export interface ExtendedSnapshot extends PageSnapshot { @@ -88,6 +89,100 @@ puppeteer.use(puppeteerPageProxy({ interceptResolutionPriority: 1, })); +const SCRIPT_TO_INJECT_INTO_FRAME = ` +${READABILITY_JS} + +function briefImgs(elem) { + const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]')); + + return imageTags.map((x)=> { + let linkPreferredSrc = x.src; + if (linkPreferredSrc.startsWith('data:')) { + if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) { + linkPreferredSrc = x.dataset.src; + } + } + + return { + src: new URL(linkPreferredSrc, document.location.href).toString(), + loaded: x.complete, + width: x.width, + height: x.height, + naturalWidth: x.naturalWidth, + naturalHeight: x.naturalHeight, + alt: x.alt || x.title, + }; + }); +} +function briefPDFs() { + const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]')); + + return pdfTags.map((x)=> { + return x.src === 'about:blank' ? document.location.href : x.src; + }); +} +function getMaxDepthUsingTreeWalker(root) { + let maxDepth = 0; + let currentDepth = 0; + + const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false); + + while (true) { + maxDepth = Math.max(maxDepth, currentDepth); + + if (treeWalker.firstChild()) { + currentDepth++; + } else { + while (!treeWalker.nextSibling() && currentDepth > 0) { + treeWalker.parentNode(); + currentDepth--; + } + + if (currentDepth <= 0) { + break; + } + } + } + + return maxDepth + 1; +} + +function giveSnapshot(stopActiveSnapshot) { + if (stopActiveSnapshot) { + window.haltSnapshot = true; + } + let parsed; + try { + parsed = new Readability(document.cloneNode(true)).parse(); + } catch (err) { + void 0; + } + + const r = { + title: document.title, + href: document.location.href, + html: document.documentElement?.outerHTML, + text: document.body?.innerText, + parsed: parsed, + imgs: [], + pdfs: briefPDFs(), + maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement) + }; + if (parsed && parsed.content) { + const elem = document.createElement('div'); + elem.innerHTML = parsed.content; + r.imgs = briefImgs(elem); + } else { + const allImgs = briefImgs(); + if (allImgs.length === 1) { + r.imgs = allImgs; + } + } + + return r; +} +`; + @singleton() export class PuppeteerControl extends AsyncService { @@ -206,98 +301,7 @@ export class PuppeteerControl extends AsyncService { } page.emit('snapshot', snapshot); })); - preparations.push(page.evaluateOnNewDocument(READABILITY_JS)); - preparations.push(page.evaluateOnNewDocument(` -function briefImgs(elem) { - const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]')); - - return imageTags.map((x)=> { - let linkPreferredSrc = x.src; - if (linkPreferredSrc.startsWith('data:')) { - if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) { - linkPreferredSrc = x.dataset.src; - } - } - - return { - src: new URL(linkPreferredSrc, document.location.href).toString(), - loaded: x.complete, - width: x.width, - height: x.height, - naturalWidth: x.naturalWidth, - naturalHeight: x.naturalHeight, - alt: x.alt || x.title, - }; - }); -} -function briefPDFs() { - const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]')); - - return pdfTags.map((x)=> { - return x.src === 'about:blank' ? document.location.href : x.src; - }); -} -function getMaxDepthUsingTreeWalker(root) { - let maxDepth = 0; - let currentDepth = 0; - - const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false); - - while (true) { - maxDepth = Math.max(maxDepth, currentDepth); - - if (treeWalker.firstChild()) { - currentDepth++; - } else { - while (!treeWalker.nextSibling() && currentDepth > 0) { - treeWalker.parentNode(); - currentDepth--; - } - - if (currentDepth <= 0) { - break; - } - } - } - - return maxDepth + 1; -} - -function giveSnapshot(stopActiveSnapshot) { - if (stopActiveSnapshot) { - window.haltSnapshot = true; - } - let parsed; - try { - parsed = new Readability(document.cloneNode(true)).parse(); - } catch (err) { - void 0; - } - - const r = { - title: document.title, - href: document.location.href, - html: document.documentElement?.outerHTML, - text: document.body?.innerText, - parsed: parsed, - imgs: [], - pdfs: briefPDFs(), - maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement) - }; - if (parsed && parsed.content) { - const elem = document.createElement('div'); - elem.innerHTML = parsed.content; - r.imgs = briefImgs(elem); - } else { - const allImgs = briefImgs(); - if (allImgs.length === 1) { - r.imgs = allImgs; - } - } - - return r; -} -`)); + preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME)); preparations.push(page.setRequestInterception(true)); await Promise.all(preparations); @@ -523,8 +527,12 @@ document.addEventListener('load', handlePageLoad); } } try { + const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot(); + screenshot = await page.screenshot({ fullPage: true }); + if (snapshot) { + snapshot.childFrames = await pSubFrameSnapshots; + } } catch (err: any) { this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) }); if (stuff instanceof Error) { @@ -542,8 +550,12 @@ document.addEventListener('load', handlePageLoad); if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) { const salvaged = await this.salvage(url, page); if (salvaged) { + const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot(); + screenshot = await page.screenshot({ fullPage: true }); + if (snapshot) { + snapshot.childFrames = await pSubFrameSnapshots; + } } } } catch (err: any) { @@ -572,8 +584,12 @@ document.addEventListener('load', handlePageLoad); Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) : page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout })) .then(async () => { + const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot(); + screenshot = await page.screenshot({ fullPage: true }); + if (snapshot) { + snapshot.childFrames = await pSubFrameSnapshots; + } finalized = true; }) .catch((err) => { @@ -607,7 +623,7 @@ document.addEventListener('load', handlePageLoad); break; } if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { - screenshot = await page.screenshot(); + screenshot = await page.screenshot({ fullPage: true }); lastHTML = snapshot.html; } if (snapshot || screenshot) { @@ -649,9 +665,30 @@ document.addEventListener('load', handlePageLoad); return true; } + async snapshotChildFrames(page: Page): Promise { + const childFrames = page.mainFrame().childFrames(); + const r = await Promise.all(childFrames.map(async (x) => { + const thisUrl = x.url(); + if (!thisUrl || thisUrl === 'about:blank') { + return undefined; + } + try { + await x.evaluate(SCRIPT_TO_INJECT_INTO_FRAME); + + return await x.evaluate(`giveSnapshot()`); + } catch (err) { + this.logger.warn(`Failed to snapshot child frame ${thisUrl}`, { err }); + return undefined; + } + })) as PageSnapshot[]; + + return r.filter(Boolean); + } + narrowSnapshot(snapshot: PageSnapshot | undefined, options?: { targetSelector?: string | string[]; removeSelector?: string | string[]; + withIframe?: boolean; }): PageSnapshot | undefined { if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) { return snapshot; @@ -662,9 +699,25 @@ document.addEventListener('load', handlePageLoad); const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole }); const allNodes: Node[] = []; + if (options?.withIframe) { + jsdom.window.document.querySelectorAll('iframe[src]').forEach((x) => { + const src = x.getAttribute('src'); + const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src); + if (thisSnapshot?.html) { + x.innerHTML = thisSnapshot.html; + x.querySelectorAll('script, style').forEach((s) => s.remove()); + x.querySelectorAll('[src]').forEach((el) => { + el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString()); + }); + x.querySelectorAll('[href]').forEach((el) => { + el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString()); + }); + } + }); + } if (Array.isArray(options?.removeSelector)) { - for (const rl of options.removeSelector) { + for (const rl of options!.removeSelector) { jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove()); } } else if (options?.removeSelector) { @@ -672,7 +725,7 @@ document.addEventListener('load', handlePageLoad); } if (Array.isArray(options?.targetSelector)) { - for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) { + for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) { x.forEach((el) => { if (!allNodes.includes(el)) { allNodes.push(el);