feat: with-iframe and full-page screenshot

This commit is contained in:
yanlong.wang 2024-07-30 15:08:09 +08:00
parent e4d46e7acb
commit 77c8480ca6
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 166 additions and 99 deletions

View File

@ -28,6 +28,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
const md5Hasher = new HashManager('md5', 'hex');
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean;
targetSelector?: string | string[];
removeSelector?: string | string[];
keepImgDataUrl?: boolean;
@ -907,7 +908,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}
try {
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
}
@ -1011,6 +1012,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
waitForSelector: opts.waitForSelector,
overrideUserAgent: opts.userAgent,
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
withIframe: opts.withIframe,
};
return crawlOpts;

View File

@ -164,6 +164,11 @@ export class CrawlerOptions extends AutoCastable {
})
keepImgDataUrl!: boolean;
@Prop({
default: false,
})
withIframe!: boolean;
@Prop({
arrayOf: String,
})
@ -238,6 +243,13 @@ export class CrawlerOptions extends AutoCastable {
if (keepImgDataUrl !== undefined) {
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
}
const withIframe = ctx?.req.get('x-with-iframe');
if (withIframe !== undefined) {
instance.withIframe = Boolean(withIframe);
}
if (instance.withIframe) {
instance.timeout ??= null;
}
const cookies: CookieParam[] = [];
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);

View File

@ -54,6 +54,7 @@ export interface PageSnapshot {
imgs?: ImgBrief[];
pdfs?: string[];
maxElemDepth?: number;
childFrames?: PageSnapshot[];
}
export interface ExtendedSnapshot extends PageSnapshot {
@ -88,6 +89,100 @@ puppeteer.use(puppeteerPageProxy({
interceptResolutionPriority: 1,
}));
const SCRIPT_TO_INJECT_INTO_FRAME = `
${READABILITY_JS}
function briefImgs(elem) {
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
return imageTags.map((x)=> {
let linkPreferredSrc = x.src;
if (linkPreferredSrc.startsWith('data:')) {
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
linkPreferredSrc = x.dataset.src;
}
}
return {
src: new URL(linkPreferredSrc, document.location.href).toString(),
loaded: x.complete,
width: x.width,
height: x.height,
naturalWidth: x.naturalWidth,
naturalHeight: x.naturalHeight,
alt: x.alt || x.title,
};
});
}
function briefPDFs() {
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
return pdfTags.map((x)=> {
return x.src === 'about:blank' ? document.location.href : x.src;
});
}
function getMaxDepthUsingTreeWalker(root) {
let maxDepth = 0;
let currentDepth = 0;
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
while (true) {
maxDepth = Math.max(maxDepth, currentDepth);
if (treeWalker.firstChild()) {
currentDepth++;
} else {
while (!treeWalker.nextSibling() && currentDepth > 0) {
treeWalker.parentNode();
currentDepth--;
}
if (currentDepth <= 0) {
break;
}
}
}
return maxDepth + 1;
}
function giveSnapshot(stopActiveSnapshot) {
if (stopActiveSnapshot) {
window.haltSnapshot = true;
}
let parsed;
try {
parsed = new Readability(document.cloneNode(true)).parse();
} catch (err) {
void 0;
}
const r = {
title: document.title,
href: document.location.href,
html: document.documentElement?.outerHTML,
text: document.body?.innerText,
parsed: parsed,
imgs: [],
pdfs: briefPDFs(),
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
};
if (parsed && parsed.content) {
const elem = document.createElement('div');
elem.innerHTML = parsed.content;
r.imgs = briefImgs(elem);
} else {
const allImgs = briefImgs();
if (allImgs.length === 1) {
r.imgs = allImgs;
}
}
return r;
}
`;
@singleton()
export class PuppeteerControl extends AsyncService {
@ -206,98 +301,7 @@ export class PuppeteerControl extends AsyncService {
}
page.emit('snapshot', snapshot);
}));
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
preparations.push(page.evaluateOnNewDocument(`
function briefImgs(elem) {
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
return imageTags.map((x)=> {
let linkPreferredSrc = x.src;
if (linkPreferredSrc.startsWith('data:')) {
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
linkPreferredSrc = x.dataset.src;
}
}
return {
src: new URL(linkPreferredSrc, document.location.href).toString(),
loaded: x.complete,
width: x.width,
height: x.height,
naturalWidth: x.naturalWidth,
naturalHeight: x.naturalHeight,
alt: x.alt || x.title,
};
});
}
function briefPDFs() {
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
return pdfTags.map((x)=> {
return x.src === 'about:blank' ? document.location.href : x.src;
});
}
function getMaxDepthUsingTreeWalker(root) {
let maxDepth = 0;
let currentDepth = 0;
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
while (true) {
maxDepth = Math.max(maxDepth, currentDepth);
if (treeWalker.firstChild()) {
currentDepth++;
} else {
while (!treeWalker.nextSibling() && currentDepth > 0) {
treeWalker.parentNode();
currentDepth--;
}
if (currentDepth <= 0) {
break;
}
}
}
return maxDepth + 1;
}
function giveSnapshot(stopActiveSnapshot) {
if (stopActiveSnapshot) {
window.haltSnapshot = true;
}
let parsed;
try {
parsed = new Readability(document.cloneNode(true)).parse();
} catch (err) {
void 0;
}
const r = {
title: document.title,
href: document.location.href,
html: document.documentElement?.outerHTML,
text: document.body?.innerText,
parsed: parsed,
imgs: [],
pdfs: briefPDFs(),
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
};
if (parsed && parsed.content) {
const elem = document.createElement('div');
elem.innerHTML = parsed.content;
r.imgs = briefImgs(elem);
} else {
const allImgs = briefImgs();
if (allImgs.length === 1) {
r.imgs = allImgs;
}
}
return r;
}
`));
preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME));
preparations.push(page.setRequestInterception(true));
await Promise.all(preparations);
@ -523,8 +527,12 @@ document.addEventListener('load', handlePageLoad);
}
}
try {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
screenshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
} catch (err: any) {
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
if (stuff instanceof Error) {
@ -542,8 +550,12 @@ document.addEventListener('load', handlePageLoad);
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
const salvaged = await this.salvage(url, page);
if (salvaged) {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
screenshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
}
}
} catch (err: any) {
@ -572,8 +584,12 @@ document.addEventListener('load', handlePageLoad);
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
.then(async () => {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot();
screenshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
finalized = true;
})
.catch((err) => {
@ -607,7 +623,7 @@ document.addEventListener('load', handlePageLoad);
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot();
screenshot = await page.screenshot({ fullPage: true });
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
@ -649,9 +665,30 @@ document.addEventListener('load', handlePageLoad);
return true;
}
async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
const childFrames = page.mainFrame().childFrames();
const r = await Promise.all(childFrames.map(async (x) => {
const thisUrl = x.url();
if (!thisUrl || thisUrl === 'about:blank') {
return undefined;
}
try {
await x.evaluate(SCRIPT_TO_INJECT_INTO_FRAME);
return await x.evaluate(`giveSnapshot()`);
} catch (err) {
this.logger.warn(`Failed to snapshot child frame ${thisUrl}`, { err });
return undefined;
}
})) as PageSnapshot[];
return r.filter(Boolean);
}
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
targetSelector?: string | string[];
removeSelector?: string | string[];
withIframe?: boolean;
}): PageSnapshot | undefined {
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
return snapshot;
@ -662,9 +699,25 @@ document.addEventListener('load', handlePageLoad);
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
const allNodes: Node[] = [];
if (options?.withIframe) {
jsdom.window.document.querySelectorAll('iframe[src]').forEach((x) => {
const src = x.getAttribute('src');
const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
if (thisSnapshot?.html) {
x.innerHTML = thisSnapshot.html;
x.querySelectorAll('script, style').forEach((s) => s.remove());
x.querySelectorAll('[src]').forEach((el) => {
el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString());
});
x.querySelectorAll('[href]').forEach((el) => {
el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString());
});
}
});
}
if (Array.isArray(options?.removeSelector)) {
for (const rl of options.removeSelector) {
for (const rl of options!.removeSelector) {
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
}
} else if (options?.removeSelector) {
@ -672,7 +725,7 @@ document.addEventListener('load', handlePageLoad);
}
if (Array.isArray(options?.targetSelector)) {
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
x.forEach((el) => {
if (!allNodes.includes(el)) {
allNodes.push(el);