mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 11:25:54 +08:00
feat: with-iframe and full-page screenshot
This commit is contained in:
parent
e4d46e7acb
commit
77c8480ca6
@ -28,6 +28,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
|
|||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
|
withIframe?: boolean;
|
||||||
targetSelector?: string | string[];
|
targetSelector?: string | string[];
|
||||||
removeSelector?: string | string[];
|
removeSelector?: string | string[];
|
||||||
keepImgDataUrl?: boolean;
|
keepImgDataUrl?: boolean;
|
||||||
@ -907,7 +908,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector) {
|
if (crawlOpts?.targetSelector || crawlOpts?.removeSelector || crawlOpts?.withIframe) {
|
||||||
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
for await (const x of this.puppeteerControl.scrap(urlToCrawl, crawlOpts)) {
|
||||||
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
|
yield this.puppeteerControl.narrowSnapshot(x, crawlOpts);
|
||||||
}
|
}
|
||||||
@ -1011,6 +1012,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
waitForSelector: opts.waitForSelector,
|
waitForSelector: opts.waitForSelector,
|
||||||
overrideUserAgent: opts.userAgent,
|
overrideUserAgent: opts.userAgent,
|
||||||
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
|
||||||
|
withIframe: opts.withIframe,
|
||||||
};
|
};
|
||||||
|
|
||||||
return crawlOpts;
|
return crawlOpts;
|
||||||
|
@ -164,6 +164,11 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
})
|
})
|
||||||
keepImgDataUrl!: boolean;
|
keepImgDataUrl!: boolean;
|
||||||
|
|
||||||
|
@Prop({
|
||||||
|
default: false,
|
||||||
|
})
|
||||||
|
withIframe!: boolean;
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
arrayOf: String,
|
arrayOf: String,
|
||||||
})
|
})
|
||||||
@ -238,6 +243,13 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (keepImgDataUrl !== undefined) {
|
if (keepImgDataUrl !== undefined) {
|
||||||
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
instance.keepImgDataUrl = Boolean(keepImgDataUrl);
|
||||||
}
|
}
|
||||||
|
const withIframe = ctx?.req.get('x-with-iframe');
|
||||||
|
if (withIframe !== undefined) {
|
||||||
|
instance.withIframe = Boolean(withIframe);
|
||||||
|
}
|
||||||
|
if (instance.withIframe) {
|
||||||
|
instance.timeout ??= null;
|
||||||
|
}
|
||||||
|
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
const setCookieHeaders = ctx?.req.get('x-set-cookie')?.split(', ') || (instance.setCookies as any as string[]);
|
||||||
|
@ -54,6 +54,7 @@ export interface PageSnapshot {
|
|||||||
imgs?: ImgBrief[];
|
imgs?: ImgBrief[];
|
||||||
pdfs?: string[];
|
pdfs?: string[];
|
||||||
maxElemDepth?: number;
|
maxElemDepth?: number;
|
||||||
|
childFrames?: PageSnapshot[];
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ExtendedSnapshot extends PageSnapshot {
|
export interface ExtendedSnapshot extends PageSnapshot {
|
||||||
@ -88,6 +89,100 @@ puppeteer.use(puppeteerPageProxy({
|
|||||||
interceptResolutionPriority: 1,
|
interceptResolutionPriority: 1,
|
||||||
}));
|
}));
|
||||||
|
|
||||||
|
/**
 * Browser-side script, kept as a string because it is evaluated inside a page or
 * frame rather than in Node. It consists of the contents of `READABILITY_JS`
 * (expected to define `Readability`, which `giveSnapshot` instantiates) followed by:
 *
 * - `briefImgs(elem?)`: describes every `img[src]` / `img[data-src]` under `elem`
 *   (or the whole document). A non-`data:` `data-src` is preferred when `src` is
 *   empty or a `data:` URL, so lazy-loaded images do not resolve to the page URL.
 *   A `src` that cannot be parsed as a URL is reported verbatim instead of throwing.
 * - `briefPDFs()`: lists the sources of `embed[type="application/pdf"]` elements,
 *   substituting the document URL for `about:blank`.
 * - `getMaxDepthUsingTreeWalker(root)`: element nesting depth below `root`
 *   (1 for a childless root, 0 when `root` is missing).
 * - `giveSnapshot(stopActiveSnapshot)`: builds the snapshot object (title, href,
 *   html, text, Readability result, imgs, pdfs, maxElemDepth). A truthy argument
 *   sets `window.haltSnapshot = true` first.
 */
const SCRIPT_TO_INJECT_INTO_FRAME = `
${READABILITY_JS}

function briefImgs(elem) {
    const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));

    return imageTags.map((x)=> {
        let linkPreferredSrc = x.src;
        if (!linkPreferredSrc || linkPreferredSrc.startsWith('data:')) {
            if (typeof x.dataset?.src === 'string' && x.dataset.src && !x.dataset.src.startsWith('data:')) {
                linkPreferredSrc = x.dataset.src;
            }
        }

        let resolvedSrc = linkPreferredSrc;
        try {
            resolvedSrc = new URL(linkPreferredSrc, document.location.href).toString();
        } catch (err) {
            void 0;
        }

        return {
            src: resolvedSrc,
            loaded: x.complete,
            width: x.width,
            height: x.height,
            naturalWidth: x.naturalWidth,
            naturalHeight: x.naturalHeight,
            alt: x.alt || x.title,
        };
    });
}
function briefPDFs() {
    const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));

    return pdfTags.map((x)=> {
        return x.src === 'about:blank' ? document.location.href : x.src;
    });
}
function getMaxDepthUsingTreeWalker(root) {
    if (!root) {
        return 0;
    }
    let maxDepth = 0;
    let currentDepth = 0;

    const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);

    while (true) {
        maxDepth = Math.max(maxDepth, currentDepth);

        if (treeWalker.firstChild()) {
            currentDepth++;
        } else {
            while (!treeWalker.nextSibling() && currentDepth > 0) {
                treeWalker.parentNode();
                currentDepth--;
            }

            if (currentDepth <= 0) {
                break;
            }
        }
    }

    return maxDepth + 1;
}

function giveSnapshot(stopActiveSnapshot) {
    if (stopActiveSnapshot) {
        window.haltSnapshot = true;
    }
    let parsed;
    try {
        parsed = new Readability(document.cloneNode(true)).parse();
    } catch (err) {
        void 0;
    }

    const r = {
        title: document.title,
        href: document.location.href,
        html: document.documentElement?.outerHTML,
        text: document.body?.innerText,
        parsed: parsed,
        imgs: [],
        pdfs: briefPDFs(),
        maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
    };
    if (parsed && parsed.content) {
        const elem = document.createElement('div');
        elem.innerHTML = parsed.content;
        r.imgs = briefImgs(elem);
    } else {
        const allImgs = briefImgs();
        if (allImgs.length === 1) {
            r.imgs = allImgs;
        }
    }

    return r;
}
`;
|
||||||
|
|
||||||
@singleton()
|
@singleton()
|
||||||
export class PuppeteerControl extends AsyncService {
|
export class PuppeteerControl extends AsyncService {
|
||||||
|
|
||||||
@ -206,98 +301,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
page.emit('snapshot', snapshot);
|
page.emit('snapshot', snapshot);
|
||||||
}));
|
}));
|
||||||
preparations.push(page.evaluateOnNewDocument(READABILITY_JS));
|
preparations.push(page.evaluateOnNewDocument(SCRIPT_TO_INJECT_INTO_FRAME));
|
||||||
preparations.push(page.evaluateOnNewDocument(`
|
|
||||||
function briefImgs(elem) {
|
|
||||||
const imageTags = Array.from((elem || document).querySelectorAll('img[src],img[data-src]'));
|
|
||||||
|
|
||||||
return imageTags.map((x)=> {
|
|
||||||
let linkPreferredSrc = x.src;
|
|
||||||
if (linkPreferredSrc.startsWith('data:')) {
|
|
||||||
if (typeof x.dataset?.src === 'string' && !x.dataset.src.startsWith('data:')) {
|
|
||||||
linkPreferredSrc = x.dataset.src;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
src: new URL(linkPreferredSrc, document.location.href).toString(),
|
|
||||||
loaded: x.complete,
|
|
||||||
width: x.width,
|
|
||||||
height: x.height,
|
|
||||||
naturalWidth: x.naturalWidth,
|
|
||||||
naturalHeight: x.naturalHeight,
|
|
||||||
alt: x.alt || x.title,
|
|
||||||
};
|
|
||||||
});
|
|
||||||
}
|
|
||||||
function briefPDFs() {
|
|
||||||
const pdfTags = Array.from(document.querySelectorAll('embed[type="application/pdf"]'));
|
|
||||||
|
|
||||||
return pdfTags.map((x)=> {
|
|
||||||
return x.src === 'about:blank' ? document.location.href : x.src;
|
|
||||||
});
|
|
||||||
}
|
|
||||||
function getMaxDepthUsingTreeWalker(root) {
|
|
||||||
let maxDepth = 0;
|
|
||||||
let currentDepth = 0;
|
|
||||||
|
|
||||||
const treeWalker = document.createTreeWalker(root, NodeFilter.SHOW_ELEMENT, null, false);
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
maxDepth = Math.max(maxDepth, currentDepth);
|
|
||||||
|
|
||||||
if (treeWalker.firstChild()) {
|
|
||||||
currentDepth++;
|
|
||||||
} else {
|
|
||||||
while (!treeWalker.nextSibling() && currentDepth > 0) {
|
|
||||||
treeWalker.parentNode();
|
|
||||||
currentDepth--;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentDepth <= 0) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return maxDepth + 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
function giveSnapshot(stopActiveSnapshot) {
|
|
||||||
if (stopActiveSnapshot) {
|
|
||||||
window.haltSnapshot = true;
|
|
||||||
}
|
|
||||||
let parsed;
|
|
||||||
try {
|
|
||||||
parsed = new Readability(document.cloneNode(true)).parse();
|
|
||||||
} catch (err) {
|
|
||||||
void 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
const r = {
|
|
||||||
title: document.title,
|
|
||||||
href: document.location.href,
|
|
||||||
html: document.documentElement?.outerHTML,
|
|
||||||
text: document.body?.innerText,
|
|
||||||
parsed: parsed,
|
|
||||||
imgs: [],
|
|
||||||
pdfs: briefPDFs(),
|
|
||||||
maxElemDepth: getMaxDepthUsingTreeWalker(document.documentElement)
|
|
||||||
};
|
|
||||||
if (parsed && parsed.content) {
|
|
||||||
const elem = document.createElement('div');
|
|
||||||
elem.innerHTML = parsed.content;
|
|
||||||
r.imgs = briefImgs(elem);
|
|
||||||
} else {
|
|
||||||
const allImgs = briefImgs();
|
|
||||||
if (allImgs.length === 1) {
|
|
||||||
r.imgs = allImgs;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
`));
|
|
||||||
preparations.push(page.setRequestInterception(true));
|
preparations.push(page.setRequestInterception(true));
|
||||||
|
|
||||||
await Promise.all(preparations);
|
await Promise.all(preparations);
|
||||||
@ -523,8 +527,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot({ fullPage: true });
|
||||||
|
if (snapshot) {
|
||||||
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
|
}
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Page ${sn}: Failed to finalize ${url}`, { err: marshalErrorLike(err) });
|
||||||
if (stuff instanceof Error) {
|
if (stuff instanceof Error) {
|
||||||
@ -542,8 +550,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
|
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
|
||||||
const salvaged = await this.salvage(url, page);
|
const salvaged = await this.salvage(url, page);
|
||||||
if (salvaged) {
|
if (salvaged) {
|
||||||
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot({ fullPage: true });
|
||||||
|
if (snapshot) {
|
||||||
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
@ -572,8 +584,12 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
Promise.all(options.waitForSelector.map((x) => page.waitForSelector(x, { timeout: thisTimeout }))) :
|
||||||
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
page.waitForSelector(options.waitForSelector!, { timeout: thisTimeout }))
|
||||||
.then(async () => {
|
.then(async () => {
|
||||||
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot({ fullPage: true });
|
||||||
|
if (snapshot) {
|
||||||
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
|
}
|
||||||
finalized = true;
|
finalized = true;
|
||||||
})
|
})
|
||||||
.catch((err) => {
|
.catch((err) => {
|
||||||
@ -607,7 +623,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||||
screenshot = await page.screenshot();
|
screenshot = await page.screenshot({ fullPage: true });
|
||||||
lastHTML = snapshot.html;
|
lastHTML = snapshot.html;
|
||||||
}
|
}
|
||||||
if (snapshot || screenshot) {
|
if (snapshot || screenshot) {
|
||||||
@ -649,9 +665,30 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Snapshots the child frames returned by `page.mainFrame().childFrames()`.
 *
 * Frames whose URL is empty or `about:blank` are skipped. For every other frame,
 * `SCRIPT_TO_INJECT_INTO_FRAME` is evaluated in the frame and then `giveSnapshot()`
 * is called there. A frame that throws (for example because it was detached or
 * navigated mid-evaluation) is logged as a warning and left out, so a single bad
 * iframe never rejects the whole batch.
 *
 * @param page - Page whose main frame's child frames are snapshotted.
 * @returns The snapshots that could be captured; may be empty.
 */
async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
    const frames = page.mainFrame().childFrames();
    const results = await Promise.all(frames.map(async (frame): Promise<PageSnapshot | undefined> => {
        const frameUrl = frame.url();
        if (!frameUrl || frameUrl === 'about:blank') {
            return undefined;
        }
        try {
            await frame.evaluate(SCRIPT_TO_INJECT_INTO_FRAME);

            return await frame.evaluate(`giveSnapshot()`) as PageSnapshot;
        } catch (err: any) {
            // Marshal the error like the other warn() calls in this file, so the
            // logged payload is a plain serializable object rather than a raw Error.
            this.logger.warn(`Failed to snapshot child frame ${frameUrl}`, { err: marshalErrorLike(err) });

            return undefined;
        }
    }));

    // Type-guard filter: drops skipped/failed frames and narrows the element type
    // without asserting away the `undefined` entries.
    return results.filter((x): x is PageSnapshot => Boolean(x));
}
|
||||||
|
|
||||||
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
narrowSnapshot(snapshot: PageSnapshot | undefined, options?: {
|
||||||
targetSelector?: string | string[];
|
targetSelector?: string | string[];
|
||||||
removeSelector?: string | string[];
|
removeSelector?: string | string[];
|
||||||
|
withIframe?: boolean;
|
||||||
}): PageSnapshot | undefined {
|
}): PageSnapshot | undefined {
|
||||||
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
if (snapshot?.parsed && !options?.targetSelector && !options?.removeSelector) {
|
||||||
return snapshot;
|
return snapshot;
|
||||||
@ -662,9 +699,25 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
|
|
||||||
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
const jsdom = new JSDOM(snapshot.html, { url: snapshot.href, virtualConsole });
|
||||||
const allNodes: Node[] = [];
|
const allNodes: Node[] = [];
|
||||||
|
if (options?.withIframe) {
|
||||||
|
jsdom.window.document.querySelectorAll('iframe[src]').forEach((x) => {
|
||||||
|
const src = x.getAttribute('src');
|
||||||
|
const thisSnapshot = snapshot.childFrames?.find((f) => f.href === src);
|
||||||
|
if (thisSnapshot?.html) {
|
||||||
|
x.innerHTML = thisSnapshot.html;
|
||||||
|
x.querySelectorAll('script, style').forEach((s) => s.remove());
|
||||||
|
x.querySelectorAll('[src]').forEach((el) => {
|
||||||
|
el.setAttribute('src', new URL(el.getAttribute('src')!, src!).toString());
|
||||||
|
});
|
||||||
|
x.querySelectorAll('[href]').forEach((el) => {
|
||||||
|
el.setAttribute('href', new URL(el.getAttribute('href')!, src!).toString());
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
if (Array.isArray(options?.removeSelector)) {
|
if (Array.isArray(options?.removeSelector)) {
|
||||||
for (const rl of options.removeSelector) {
|
for (const rl of options!.removeSelector) {
|
||||||
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
jsdom.window.document.querySelectorAll(rl).forEach((x) => x.remove());
|
||||||
}
|
}
|
||||||
} else if (options?.removeSelector) {
|
} else if (options?.removeSelector) {
|
||||||
@ -672,7 +725,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (Array.isArray(options?.targetSelector)) {
|
if (Array.isArray(options?.targetSelector)) {
|
||||||
for (const x of options.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
for (const x of options!.targetSelector.map((x) => jsdom.window.document.querySelectorAll(x))) {
|
||||||
x.forEach((el) => {
|
x.forEach((el) => {
|
||||||
if (!allNodes.includes(el)) {
|
if (!allNodes.includes(el)) {
|
||||||
allNodes.push(el);
|
allNodes.push(el);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user