fix: revert screenshot behavior and introduce pageshot

This commit is contained in:
Yanlong Wang 2024-07-30 20:09:06 +08:00
parent 57cbae864e
commit d3f3a8502a
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 71 additions and 11 deletions

View File

@ -45,6 +45,8 @@ export interface FormattedPage {
text?: string; text?: string;
screenshotUrl?: string; screenshotUrl?: string;
screenshot?: Buffer; screenshot?: Buffer;
pageshotUrl?: string;
pageshot?: Buffer;
links?: { [k: string]: string; }; links?: { [k: string]: string; };
images?: { [k: string]: string; }; images?: { [k: string]: string; };
@ -282,8 +284,9 @@ export class CrawlerHost extends RPCHost {
return mixin; return mixin;
} }
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & { async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
screenshotUrl?: string; screenshotUrl?: string;
pageshotUrl?: string;
}, nominalUrl?: URL) { }, nominalUrl?: URL) {
if (mode === 'screenshot') { if (mode === 'screenshot') {
if (snapshot.screenshot && !snapshot.screenshotUrl) { if (snapshot.screenshot && !snapshot.screenshotUrl) {
@ -305,6 +308,26 @@ export class CrawlerHost extends RPCHost {
} }
} as FormattedPage; } as FormattedPage;
} }
// Pageshot mode: build a FormattedPage carrying the page HTML and a pageshot URL,
// whose string form is that URL.
if (mode === 'pageshot') {
// Upload only when image bytes are present and no URL has been supplied yet.
// Note that this writes the resulting URL back onto the `snapshot` argument.
if (snapshot.pageshot && !snapshot.pageshotUrl) {
// NOTE(review): the object key uses the `instant-screenshots/` prefix even though
// the payload is a pageshot — confirm that sharing this prefix is intended.
const fid = `instant-screenshots/${randomUUID()}`;
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
metadata: {
contentType: 'image/png',
}
});
// Sign a download URL for the object just saved; the second argument is `Date.now() + this.urlValidMs`.
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
}
return {
...this.getGeneralSnapshotMixins(snapshot),
html: snapshot.html,
pageshotUrl: snapshot.pageshotUrl,
// Returns the `pageshotUrl` property of this result object; that value is
// undefined when the snapshot carried neither `pageshot` nor `pageshotUrl`.
toString() {
return this.pageshotUrl;
}
} as FormattedPage;
}
if (mode === 'html') { if (mode === 'html') {
return { return {
...this.getGeneralSnapshotMixins(snapshot), ...this.getGeneralSnapshotMixins(snapshot),
@ -761,6 +784,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
); );
} }
// When a pageshot was requested and `formatted` exposes a truthy `pageshotUrl`,
// answer with a 302 whose Location header is that URL; the body is the string
// form of `formatted`. Otherwise no return happens here and execution continues below.
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
} }
@ -778,6 +807,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
); );
} }
// When a pageshot was requested and `formatted` exposes a truthy `pageshotUrl`,
// answer with a 302 whose Location header is that URL; the body is the string
// form of `formatted`. Otherwise no return happens here and execution continues below.
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null }); return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
} }
@ -810,6 +845,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
let snapshot: PageSnapshot | undefined; let snapshot: PageSnapshot | undefined;
let screenshotUrl: string | undefined; let screenshotUrl: string | undefined;
let pageshotUrl: string | undefined;
const preparations = [ const preparations = [
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => { this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
snapshot = JSON.parse(r.toString('utf-8')); snapshot = JSON.parse(r.toString('utf-8'));
@ -818,6 +854,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => { this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
screenshotUrl = r; screenshotUrl = r;
}) : }) :
Promise.resolve(undefined),
cache.pageshotAvailable ?
this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
pageshotUrl = r;
}) :
Promise.resolve(undefined) Promise.resolve(undefined)
]; ];
try { try {
@ -833,8 +874,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
snapshot: { snapshot: {
...snapshot, ...snapshot,
screenshot: undefined, screenshot: undefined,
pageshot: undefined,
screenshotUrl, screenshotUrl,
} as PageSnapshot & { screenshotUrl?: string; } pageshotUrl,
} as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
}; };
} }
@ -878,6 +921,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
}); });
cache.screenshotAvailable = true; cache.screenshotAvailable = true;
} }
// Persist the pageshot image (if any) under `pageshots/<cache._id>`, the same key
// that is signed for download when `cache.pageshotAvailable` is set.
if (snapshot.pageshot) {
await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
metadata: {
contentType: 'image/png',
}
});
// Set only after the awaited upload has resolved; a rejected upload skips this line.
cache.pageshotAvailable = true;
}
await savingOfSnapshot; await savingOfSnapshot;
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => { const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) }); this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
@ -1013,7 +1064,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const crawlOpts: ExtraScrappingOptions = { const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl, proxyUrl: opts.proxyUrl,
cookies: opts.setCookies, cookies: opts.setCookies,
favorScreenshot: opts.respondWith === 'screenshot', favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
removeSelector: opts.removeSelector, removeSelector: opts.removeSelector,
targetSelector: opts.targetSelector, targetSelector: opts.targetSelector,
waitForSelector: opts.waitForSelector, waitForSelector: opts.waitForSelector,

View File

@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
urlPathDigest!: string; urlPathDigest!: string;
@Prop() @Prop()
snapshot?: PageSnapshot & { screenshot: never; }; snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
@Prop() @Prop()
screenshotAvailable?: boolean; screenshotAvailable?: boolean;
@Prop()
pageshotAvailable?: boolean;
@Prop() @Prop()
snapshotAvailable?: boolean; snapshotAvailable?: boolean;

View File

@ -34,6 +34,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
`- markdown\n` + `- markdown\n` +
`- html\n` + `- html\n` +
`- text\n` + `- text\n` +
`- pageshot\n` +
`- screenshot\n` `- screenshot\n`
, ,
in: 'header', in: 'header',

View File

@ -46,6 +46,7 @@ export interface PageSnapshot {
text: string; text: string;
parsed?: Partial<ReadabilityParsed> | null; parsed?: Partial<ReadabilityParsed> | null;
screenshot?: Buffer; screenshot?: Buffer;
pageshot?: Buffer;
imgs?: ImgBrief[]; imgs?: ImgBrief[];
pdfs?: string[]; pdfs?: string[];
maxElemDepth?: number; maxElemDepth?: number;
@ -448,6 +449,7 @@ document.addEventListener('load', handlePageLoad);
let snapshot: PageSnapshot | undefined; let snapshot: PageSnapshot | undefined;
let screenshot: Buffer | undefined; let screenshot: Buffer | undefined;
let pageshot: Buffer | undefined;
const page = await this.getNextPage(); const page = await this.getNextPage();
const sn = this.snMap.get(page); const sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url }); this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
@ -524,7 +526,7 @@ document.addEventListener('load', handlePageLoad);
try { try {
const pSubFrameSnapshots = this.snapshotChildFrames(page); const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot({ fullPage: true }); screenshot = await page.screenshot();
if (snapshot) { if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots; snapshot.childFrames = await pSubFrameSnapshots;
} }
@ -547,7 +549,8 @@ document.addEventListener('load', handlePageLoad);
if (salvaged) { if (salvaged) {
const pSubFrameSnapshots = this.snapshotChildFrames(page); const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot({ fullPage: true }); screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
if (snapshot) { if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots; snapshot.childFrames = await pSubFrameSnapshots;
} }
@ -562,7 +565,7 @@ document.addEventListener('load', handlePageLoad);
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href }); this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
this.emit( this.emit(
'crawled', 'crawled',
{ ...snapshot, screenshot }, { ...snapshot, screenshot, pageshot },
{ ...options, url: parsedUrl } { ...options, url: parsedUrl }
); );
} }
@ -581,7 +584,8 @@ document.addEventListener('load', handlePageLoad);
.then(async () => { .then(async () => {
const pSubFrameSnapshots = this.snapshotChildFrames(page); const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot({ fullPage: true }); screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
if (snapshot) { if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots; snapshot.childFrames = await pSubFrameSnapshots;
} }
@ -614,15 +618,16 @@ document.addEventListener('load', handlePageLoad);
} }
throw new AssertionFailureError(`Could not extract any meaningful content from the page`); throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
} }
yield { ...snapshot, screenshot } as PageSnapshot; yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
break; break;
} }
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) { if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot({ fullPage: true }); screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
lastHTML = snapshot.html; lastHTML = snapshot.html;
} }
if (snapshot || screenshot) { if (snapshot || screenshot) {
yield { ...snapshot, screenshot } as PageSnapshot; yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
} }
if (error) { if (error) {
throw error; throw error;