mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 12:15:58 +08:00
fix: revert screenshot behavior and introduce pageshot
This commit is contained in:
parent
57cbae864e
commit
d3f3a8502a
@ -45,6 +45,8 @@ export interface FormattedPage {
|
||||
text?: string;
|
||||
screenshotUrl?: string;
|
||||
screenshot?: Buffer;
|
||||
pageshotUrl?: string;
|
||||
pageshot?: Buffer;
|
||||
links?: { [k: string]: string; };
|
||||
images?: { [k: string]: string; };
|
||||
|
||||
@ -282,8 +284,9 @@ export class CrawlerHost extends RPCHost {
|
||||
return mixin;
|
||||
}
|
||||
|
||||
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
||||
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
|
||||
screenshotUrl?: string;
|
||||
pageshotUrl?: string;
|
||||
}, nominalUrl?: URL) {
|
||||
if (mode === 'screenshot') {
|
||||
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
||||
@ -305,6 +308,26 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
} as FormattedPage;
|
||||
}
|
||||
if (mode === 'pageshot') {
|
||||
if (snapshot.pageshot && !snapshot.pageshotUrl) {
|
||||
const fid = `instant-screenshots/${randomUUID()}`;
|
||||
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
|
||||
metadata: {
|
||||
contentType: 'image/png',
|
||||
}
|
||||
});
|
||||
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
|
||||
}
|
||||
|
||||
return {
|
||||
...this.getGeneralSnapshotMixins(snapshot),
|
||||
html: snapshot.html,
|
||||
pageshotUrl: snapshot.pageshotUrl,
|
||||
toString() {
|
||||
return this.pageshotUrl;
|
||||
}
|
||||
} as FormattedPage;
|
||||
}
|
||||
if (mode === 'html') {
|
||||
return {
|
||||
...this.getGeneralSnapshotMixins(snapshot),
|
||||
@ -761,6 +784,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||
);
|
||||
}
|
||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||
}
|
||||
@ -778,6 +807,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||
);
|
||||
}
|
||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||
}
|
||||
@ -810,6 +845,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
|
||||
let snapshot: PageSnapshot | undefined;
|
||||
let screenshotUrl: string | undefined;
|
||||
let pageshotUrl: string | undefined;
|
||||
const preparations = [
|
||||
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
|
||||
snapshot = JSON.parse(r.toString('utf-8'));
|
||||
@ -818,6 +854,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
||||
screenshotUrl = r;
|
||||
}) :
|
||||
Promise.resolve(undefined),
|
||||
cache.pageshotAvailable ?
|
||||
this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
||||
pageshotUrl = r;
|
||||
}) :
|
||||
Promise.resolve(undefined)
|
||||
];
|
||||
try {
|
||||
@ -833,8 +874,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
snapshot: {
|
||||
...snapshot,
|
||||
screenshot: undefined,
|
||||
pageshot: undefined,
|
||||
screenshotUrl,
|
||||
} as PageSnapshot & { screenshotUrl?: string; }
|
||||
pageshotUrl,
|
||||
} as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
|
||||
};
|
||||
}
|
||||
|
||||
@ -878,6 +921,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
});
|
||||
cache.screenshotAvailable = true;
|
||||
}
|
||||
if (snapshot.pageshot) {
|
||||
await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
|
||||
metadata: {
|
||||
contentType: 'image/png',
|
||||
}
|
||||
});
|
||||
cache.pageshotAvailable = true;
|
||||
}
|
||||
await savingOfSnapshot;
|
||||
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
||||
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
||||
@ -1013,7 +1064,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
const crawlOpts: ExtraScrappingOptions = {
|
||||
proxyUrl: opts.proxyUrl,
|
||||
cookies: opts.setCookies,
|
||||
favorScreenshot: opts.respondWith === 'screenshot',
|
||||
favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
|
||||
removeSelector: opts.removeSelector,
|
||||
targetSelector: opts.targetSelector,
|
||||
waitForSelector: opts.waitForSelector,
|
||||
|
@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
|
||||
urlPathDigest!: string;
|
||||
|
||||
@Prop()
|
||||
snapshot?: PageSnapshot & { screenshot: never; };
|
||||
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
||||
|
||||
@Prop()
|
||||
screenshotAvailable?: boolean;
|
||||
|
||||
@Prop()
|
||||
pageshotAvailable?: boolean;
|
||||
|
||||
@Prop()
|
||||
snapshotAvailable?: boolean;
|
||||
|
||||
|
@ -34,6 +34,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||
`- markdown\n` +
|
||||
`- html\n` +
|
||||
`- text\n` +
|
||||
`- pageshot\n` +
|
||||
`- screenshot\n`
|
||||
,
|
||||
in: 'header',
|
||||
|
@ -46,6 +46,7 @@ export interface PageSnapshot {
|
||||
text: string;
|
||||
parsed?: Partial<ReadabilityParsed> | null;
|
||||
screenshot?: Buffer;
|
||||
pageshot?: Buffer;
|
||||
imgs?: ImgBrief[];
|
||||
pdfs?: string[];
|
||||
maxElemDepth?: number;
|
||||
@ -448,6 +449,7 @@ document.addEventListener('load', handlePageLoad);
|
||||
|
||||
let snapshot: PageSnapshot | undefined;
|
||||
let screenshot: Buffer | undefined;
|
||||
let pageshot: Buffer | undefined;
|
||||
const page = await this.getNextPage();
|
||||
const sn = this.snMap.get(page);
|
||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||
@ -524,7 +526,7 @@ document.addEventListener('load', handlePageLoad);
|
||||
try {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = await page.screenshot({ fullPage: true });
|
||||
screenshot = await page.screenshot();
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
@ -547,7 +549,8 @@ document.addEventListener('load', handlePageLoad);
|
||||
if (salvaged) {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = await page.screenshot({ fullPage: true });
|
||||
screenshot = await page.screenshot();
|
||||
pageshot = await page.screenshot({ fullPage: true });
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
@ -562,7 +565,7 @@ document.addEventListener('load', handlePageLoad);
|
||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||
this.emit(
|
||||
'crawled',
|
||||
{ ...snapshot, screenshot },
|
||||
{ ...snapshot, screenshot, pageshot },
|
||||
{ ...options, url: parsedUrl }
|
||||
);
|
||||
}
|
||||
@ -581,7 +584,8 @@ document.addEventListener('load', handlePageLoad);
|
||||
.then(async () => {
|
||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||
screenshot = await page.screenshot({ fullPage: true });
|
||||
screenshot = await page.screenshot();
|
||||
pageshot = await page.screenshot({ fullPage: true });
|
||||
if (snapshot) {
|
||||
snapshot.childFrames = await pSubFrameSnapshots;
|
||||
}
|
||||
@ -614,15 +618,16 @@ document.addEventListener('load', handlePageLoad);
|
||||
}
|
||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||
}
|
||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
||||
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
||||
break;
|
||||
}
|
||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||
screenshot = await page.screenshot({ fullPage: true });
|
||||
screenshot = await page.screenshot();
|
||||
pageshot = await page.screenshot({ fullPage: true });
|
||||
lastHTML = snapshot.html;
|
||||
}
|
||||
if (snapshot || screenshot) {
|
||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
||||
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
||||
}
|
||||
if (error) {
|
||||
throw error;
|
||||
|
Loading…
x
Reference in New Issue
Block a user