From d3f3a8502ac22cccb0bf3a0b06a4152de5e11e09 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Tue, 30 Jul 2024 20:09:06 +0800 Subject: [PATCH] fix: revert screenshot behavior and introduce pageshot --- .../functions/src/cloud-functions/crawler.ts | 57 ++++++++++++++++++- backend/functions/src/db/crawled.ts | 5 +- .../functions/src/dto/scrapping-options.ts | 1 + backend/functions/src/services/puppeteer.ts | 19 ++++--- 4 files changed, 71 insertions(+), 11 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index d43b0e8..5b2adfc 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -45,6 +45,8 @@ export interface FormattedPage { text?: string; screenshotUrl?: string; screenshot?: Buffer; + pageshotUrl?: string; + pageshot?: Buffer; links?: { [k: string]: string; }; images?: { [k: string]: string; }; @@ -282,8 +284,9 @@ export class CrawlerHost extends RPCHost { return mixin; } - async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & { + async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & { screenshotUrl?: string; + pageshotUrl?: string; }, nominalUrl?: URL) { if (mode === 'screenshot') { if (snapshot.screenshot && !snapshot.screenshotUrl) { @@ -305,6 +308,26 @@ export class CrawlerHost extends RPCHost { } } as FormattedPage; } + if (mode === 'pageshot') { + if (snapshot.pageshot && !snapshot.pageshotUrl) { + const fid = `instant-screenshots/${randomUUID()}`; + await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, { + metadata: { + contentType: 'image/png', + } + }); + snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs); + } + + return { + ...this.getGeneralSnapshotMixins(snapshot), + html: snapshot.html, + pageshotUrl: snapshot.pageshotUrl, + toString() { + return this.pageshotUrl; + } + } as FormattedPage; + } if (mode === 'html') { return { ...this.getGeneralSnapshotMixins(snapshot), @@ -761,6 +784,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } ); } + if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { + + return assignTransferProtocolMeta(`${formatted}`, + { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } + ); + } return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); } @@ -778,6 +807,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } ); } + if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { + + return assignTransferProtocolMeta(`${formatted}`, + { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } + ); + } return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); } @@ -810,6 +845,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; let snapshot: PageSnapshot | undefined; let screenshotUrl: string | undefined; + let pageshotUrl: string | undefined; const preparations = [ this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => { snapshot = JSON.parse(r.toString('utf-8')); @@ -818,6 +854,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => { screenshotUrl = r; }) : + Promise.resolve(undefined), + cache.pageshotAvailable ? + this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => { + pageshotUrl = r; + }) : Promise.resolve(undefined) ]; try { @@ -833,8 +874,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; snapshot: { ...snapshot, screenshot: undefined, + pageshot: undefined, screenshotUrl, - } as PageSnapshot & { screenshotUrl?: string; } + pageshotUrl, + } as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; } }; } @@ -878,6 +921,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; }); cache.screenshotAvailable = true; } + if (snapshot.pageshot) { + await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, { + metadata: { + contentType: 'image/png', + } + }); + cache.pageshotAvailable = true; + } await savingOfSnapshot; const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); @@ -1013,7 +1064,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; const crawlOpts: ExtraScrappingOptions = { proxyUrl: opts.proxyUrl, cookies: opts.setCookies, - favorScreenshot: opts.respondWith === 'screenshot', + favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith), removeSelector: opts.removeSelector, targetSelector: opts.targetSelector, waitForSelector: opts.waitForSelector, diff --git a/backend/functions/src/db/crawled.ts b/backend/functions/src/db/crawled.ts index 212e8a1..4ec64f1 100644 --- a/backend/functions/src/db/crawled.ts +++ b/backend/functions/src/db/crawled.ts @@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord { urlPathDigest!: string; @Prop() - snapshot?: PageSnapshot & { screenshot: never; }; + snapshot?: PageSnapshot & { screenshot: never; pageshot: never; }; @Prop() screenshotAvailable?: boolean; + @Prop() + pageshotAvailable?: boolean; + @Prop() snapshotAvailable?: boolean; diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 4177dc3..41e2b20 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -34,6 +34,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser'; `- markdown\n` + `- html\n` + `- text\n` + + `- pageshot\n` + `- screenshot\n` , in: 'header', diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 6ead686..1286a62 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -46,6 +46,7 @@ export interface PageSnapshot { text: string; parsed?: Partial | null; screenshot?: Buffer; + pageshot?: Buffer; imgs?: ImgBrief[]; pdfs?: string[]; maxElemDepth?: number; @@ -448,6 +449,7 @@ document.addEventListener('load', handlePageLoad); let snapshot: PageSnapshot | undefined; let screenshot: Buffer | undefined; + let pageshot: Buffer | undefined; const page = await this.getNextPage(); const sn = this.snMap.get(page); this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); @@ -524,7 +526,7 @@ document.addEventListener('load', handlePageLoad); try { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot({ fullPage: true }); + screenshot = await page.screenshot(); if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -547,7 +549,8 @@ document.addEventListener('load', handlePageLoad); if (salvaged) { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot({ fullPage: true }); + screenshot = await page.screenshot(); + pageshot = await page.screenshot({ fullPage: true }); if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -562,7 +565,7 @@ document.addEventListener('load', handlePageLoad); this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); this.emit( 'crawled', - { ...snapshot, screenshot }, + { ...snapshot, screenshot, pageshot }, { ...options, url: parsedUrl } ); } @@ -581,7 +584,8 @@ document.addEventListener('load', handlePageLoad); .then(async () => { const pSubFrameSnapshots = this.snapshotChildFrames(page); snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; - screenshot = await page.screenshot({ fullPage: true }); + screenshot = await page.screenshot(); + pageshot = await page.screenshot({ fullPage: true }); if (snapshot) { snapshot.childFrames = await pSubFrameSnapshots; } @@ -614,15 +618,16 @@ document.addEventListener('load', handlePageLoad); } throw new AssertionFailureError(`Could not extract any meaningful content from the page`); } - yield { ...snapshot, screenshot } as PageSnapshot; + yield { ...snapshot, screenshot, pageshot } as PageSnapshot; break; } if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { - screenshot = await page.screenshot({ fullPage: true }); + screenshot = await page.screenshot(); + pageshot = await page.screenshot({ fullPage: true }); lastHTML = snapshot.html; } if (snapshot || screenshot) { - yield { ...snapshot, screenshot } as PageSnapshot; + yield { ...snapshot, screenshot, pageshot } as PageSnapshot; } if (error) { throw error;