fix: revert screenshot behavior and introduce pageshot

This commit is contained in:
Yanlong Wang 2024-07-30 20:09:06 +08:00
parent 57cbae864e
commit d3f3a8502a
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
4 changed files with 71 additions and 11 deletions

View File

@ -45,6 +45,8 @@ export interface FormattedPage {
text?: string;
screenshotUrl?: string;
screenshot?: Buffer;
pageshotUrl?: string;
pageshot?: Buffer;
links?: { [k: string]: string; };
images?: { [k: string]: string; };
@ -282,8 +284,9 @@ export class CrawlerHost extends RPCHost {
return mixin;
}
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
screenshotUrl?: string;
pageshotUrl?: string;
}, nominalUrl?: URL) {
if (mode === 'screenshot') {
if (snapshot.screenshot && !snapshot.screenshotUrl) {
@ -305,6 +308,26 @@ export class CrawlerHost extends RPCHost {
}
} as FormattedPage;
}
// 'pageshot' mode: respond with a URL to the pageshot image instead of textual content.
if (mode === 'pageshot') {
// Upload only when we hold the raw image buffer and no URL has been attached yet.
if (snapshot.pageshot && !snapshot.pageshotUrl) {
// Stored under a fresh random key in the `instant-screenshots/` prefix.
const fid = `instant-screenshots/${randomUUID()}`;
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
metadata: {
contentType: 'image/png',
}
});
// Second argument is now + urlValidMs — presumably the expiry timestamp of the signed URL; confirm against signDownloadUrl.
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
}
return {
...this.getGeneralSnapshotMixins(snapshot),
html: snapshot.html,
pageshotUrl: snapshot.pageshotUrl,
// String form of the result is the pageshot URL (undefined when no pageshot was available).
toString() {
return this.pageshotUrl;
}
} as FormattedPage;
}
if (mode === 'html') {
return {
...this.getGeneralSnapshotMixins(snapshot),
@ -761,6 +784,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
// When a pageshot was requested and a pageshotUrl is present on the formatted result,
// answer with a 302 whose Location header is that URL; the body is the stringified result.
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
}
@ -778,6 +807,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
// When a pageshot was requested and a pageshotUrl is present on the formatted result,
// answer with a 302 whose Location header is that URL; the body is the stringified result.
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
}
@ -810,6 +845,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
let snapshot: PageSnapshot | undefined;
let screenshotUrl: string | undefined;
let pageshotUrl: string | undefined;
const preparations = [
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
snapshot = JSON.parse(r.toString('utf-8'));
@ -818,6 +854,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
screenshotUrl = r;
}) :
Promise.resolve(undefined),
cache.pageshotAvailable ?
this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
pageshotUrl = r;
}) :
Promise.resolve(undefined)
];
try {
@ -833,8 +874,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
snapshot: {
...snapshot,
screenshot: undefined,
pageshot: undefined,
screenshotUrl,
} as PageSnapshot & { screenshotUrl?: string; }
pageshotUrl,
} as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
};
}
@ -878,6 +921,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
});
cache.screenshotAvailable = true;
}
// Persist the pageshot image (if one was captured) under `pageshots/<cache id>` as PNG,
// and flag it on the cache record.
if (snapshot.pageshot) {
await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
metadata: {
contentType: 'image/png',
}
});
cache.pageshotAvailable = true;
}
await savingOfSnapshot;
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
@ -1013,7 +1064,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const crawlOpts: ExtraScrappingOptions = {
proxyUrl: opts.proxyUrl,
cookies: opts.setCookies,
favorScreenshot: opts.respondWith === 'screenshot',
favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
removeSelector: opts.removeSelector,
targetSelector: opts.targetSelector,
waitForSelector: opts.waitForSelector,

View File

@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
urlPathDigest!: string;
@Prop()
snapshot?: PageSnapshot & { screenshot: never; };
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
@Prop()
screenshotAvailable?: boolean;
@Prop()
pageshotAvailable?: boolean;
@Prop()
snapshotAvailable?: boolean;

View File

@ -34,6 +34,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
`- markdown\n` +
`- html\n` +
`- text\n` +
`- pageshot\n` +
`- screenshot\n`
,
in: 'header',

View File

@ -46,6 +46,7 @@ export interface PageSnapshot {
text: string;
parsed?: Partial<ReadabilityParsed> | null;
screenshot?: Buffer;
pageshot?: Buffer;
imgs?: ImgBrief[];
pdfs?: string[];
maxElemDepth?: number;
@ -448,6 +449,7 @@ document.addEventListener('load', handlePageLoad);
let snapshot: PageSnapshot | undefined;
let screenshot: Buffer | undefined;
let pageshot: Buffer | undefined;
const page = await this.getNextPage();
const sn = this.snMap.get(page);
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
@ -524,7 +526,7 @@ document.addEventListener('load', handlePageLoad);
try {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot({ fullPage: true });
screenshot = await page.screenshot();
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
@ -547,7 +549,8 @@ document.addEventListener('load', handlePageLoad);
if (salvaged) {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot({ fullPage: true });
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
@ -562,7 +565,7 @@ document.addEventListener('load', handlePageLoad);
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
this.emit(
'crawled',
{ ...snapshot, screenshot },
{ ...snapshot, screenshot, pageshot },
{ ...options, url: parsedUrl }
);
}
@ -581,7 +584,8 @@ document.addEventListener('load', handlePageLoad);
.then(async () => {
const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = await page.screenshot({ fullPage: true });
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots;
}
@ -614,15 +618,16 @@ document.addEventListener('load', handlePageLoad);
}
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
}
yield { ...snapshot, screenshot } as PageSnapshot;
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
break;
}
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
screenshot = await page.screenshot({ fullPage: true });
screenshot = await page.screenshot();
pageshot = await page.screenshot({ fullPage: true });
lastHTML = snapshot.html;
}
if (snapshot || screenshot) {
yield { ...snapshot, screenshot } as PageSnapshot;
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
}
if (error) {
throw error;