mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 18:55:55 +08:00
fix: revert screenshot behavior and introduce pageshot
This commit is contained in:
parent
57cbae864e
commit
d3f3a8502a
@ -45,6 +45,8 @@ export interface FormattedPage {
|
|||||||
text?: string;
|
text?: string;
|
||||||
screenshotUrl?: string;
|
screenshotUrl?: string;
|
||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
|
pageshotUrl?: string;
|
||||||
|
pageshot?: Buffer;
|
||||||
links?: { [k: string]: string; };
|
links?: { [k: string]: string; };
|
||||||
images?: { [k: string]: string; };
|
images?: { [k: string]: string; };
|
||||||
|
|
||||||
@ -282,8 +284,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return mixin;
|
return mixin;
|
||||||
}
|
}
|
||||||
|
|
||||||
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot' | 'pageshot', snapshot: PageSnapshot & {
|
||||||
screenshotUrl?: string;
|
screenshotUrl?: string;
|
||||||
|
pageshotUrl?: string;
|
||||||
}, nominalUrl?: URL) {
|
}, nominalUrl?: URL) {
|
||||||
if (mode === 'screenshot') {
|
if (mode === 'screenshot') {
|
||||||
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
if (snapshot.screenshot && !snapshot.screenshotUrl) {
|
||||||
@ -305,6 +308,26 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
} as FormattedPage;
|
} as FormattedPage;
|
||||||
}
|
}
|
||||||
|
if (mode === 'pageshot') {
|
||||||
|
if (snapshot.pageshot && !snapshot.pageshotUrl) {
|
||||||
|
const fid = `instant-screenshots/${randomUUID()}`;
|
||||||
|
await this.firebaseObjectStorage.saveFile(fid, snapshot.pageshot, {
|
||||||
|
metadata: {
|
||||||
|
contentType: 'image/png',
|
||||||
|
}
|
||||||
|
});
|
||||||
|
snapshot.pageshotUrl = await this.firebaseObjectStorage.signDownloadUrl(fid, Date.now() + this.urlValidMs);
|
||||||
|
}
|
||||||
|
|
||||||
|
return {
|
||||||
|
...this.getGeneralSnapshotMixins(snapshot),
|
||||||
|
html: snapshot.html,
|
||||||
|
pageshotUrl: snapshot.pageshotUrl,
|
||||||
|
toString() {
|
||||||
|
return this.pageshotUrl;
|
||||||
|
}
|
||||||
|
} as FormattedPage;
|
||||||
|
}
|
||||||
if (mode === 'html') {
|
if (mode === 'html') {
|
||||||
return {
|
return {
|
||||||
...this.getGeneralSnapshotMixins(snapshot),
|
...this.getGeneralSnapshotMixins(snapshot),
|
||||||
@ -761,6 +784,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||||
|
|
||||||
|
return assignTransferProtocolMeta(`${formatted}`,
|
||||||
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
@ -778,6 +807,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||||
|
|
||||||
|
return assignTransferProtocolMeta(`${formatted}`,
|
||||||
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
return assignTransferProtocolMeta(`${formatted}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
@ -810,6 +845,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
|
|
||||||
let snapshot: PageSnapshot | undefined;
|
let snapshot: PageSnapshot | undefined;
|
||||||
let screenshotUrl: string | undefined;
|
let screenshotUrl: string | undefined;
|
||||||
|
let pageshotUrl: string | undefined;
|
||||||
const preparations = [
|
const preparations = [
|
||||||
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
|
this.firebaseObjectStorage.downloadFile(`snapshots/${cache._id}`).then((r) => {
|
||||||
snapshot = JSON.parse(r.toString('utf-8'));
|
snapshot = JSON.parse(r.toString('utf-8'));
|
||||||
@ -818,6 +854,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
this.firebaseObjectStorage.signDownloadUrl(`screenshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
||||||
screenshotUrl = r;
|
screenshotUrl = r;
|
||||||
}) :
|
}) :
|
||||||
|
Promise.resolve(undefined),
|
||||||
|
cache.pageshotAvailable ?
|
||||||
|
this.firebaseObjectStorage.signDownloadUrl(`pageshots/${cache._id}`, Date.now() + this.urlValidMs).then((r) => {
|
||||||
|
pageshotUrl = r;
|
||||||
|
}) :
|
||||||
Promise.resolve(undefined)
|
Promise.resolve(undefined)
|
||||||
];
|
];
|
||||||
try {
|
try {
|
||||||
@ -833,8 +874,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
snapshot: {
|
snapshot: {
|
||||||
...snapshot,
|
...snapshot,
|
||||||
screenshot: undefined,
|
screenshot: undefined,
|
||||||
|
pageshot: undefined,
|
||||||
screenshotUrl,
|
screenshotUrl,
|
||||||
} as PageSnapshot & { screenshotUrl?: string; }
|
pageshotUrl,
|
||||||
|
} as PageSnapshot & { screenshotUrl?: string; pageshotUrl?: string; }
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -878,6 +921,14 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
});
|
});
|
||||||
cache.screenshotAvailable = true;
|
cache.screenshotAvailable = true;
|
||||||
}
|
}
|
||||||
|
if (snapshot.pageshot) {
|
||||||
|
await this.firebaseObjectStorage.saveFile(`pageshots/${cache._id}`, snapshot.pageshot, {
|
||||||
|
metadata: {
|
||||||
|
contentType: 'image/png',
|
||||||
|
}
|
||||||
|
});
|
||||||
|
cache.pageshotAvailable = true;
|
||||||
|
}
|
||||||
await savingOfSnapshot;
|
await savingOfSnapshot;
|
||||||
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
const r = await Crawled.save(cache.degradeForFireStore()).catch((err) => {
|
||||||
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
this.logger.error(`Failed to save cache for ${urlToCrawl}`, { err: marshalErrorLike(err) });
|
||||||
@ -1013,7 +1064,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
const crawlOpts: ExtraScrappingOptions = {
|
const crawlOpts: ExtraScrappingOptions = {
|
||||||
proxyUrl: opts.proxyUrl,
|
proxyUrl: opts.proxyUrl,
|
||||||
cookies: opts.setCookies,
|
cookies: opts.setCookies,
|
||||||
favorScreenshot: opts.respondWith === 'screenshot',
|
favorScreenshot: ['screenshot', 'pageshot'].includes(opts.respondWith),
|
||||||
removeSelector: opts.removeSelector,
|
removeSelector: opts.removeSelector,
|
||||||
targetSelector: opts.targetSelector,
|
targetSelector: opts.targetSelector,
|
||||||
waitForSelector: opts.waitForSelector,
|
waitForSelector: opts.waitForSelector,
|
||||||
|
@ -22,11 +22,14 @@ export class Crawled extends FirestoreRecord {
|
|||||||
urlPathDigest!: string;
|
urlPathDigest!: string;
|
||||||
|
|
||||||
@Prop()
|
@Prop()
|
||||||
snapshot?: PageSnapshot & { screenshot: never; };
|
snapshot?: PageSnapshot & { screenshot: never; pageshot: never; };
|
||||||
|
|
||||||
@Prop()
|
@Prop()
|
||||||
screenshotAvailable?: boolean;
|
screenshotAvailable?: boolean;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
pageshotAvailable?: boolean;
|
||||||
|
|
||||||
@Prop()
|
@Prop()
|
||||||
snapshotAvailable?: boolean;
|
snapshotAvailable?: boolean;
|
||||||
|
|
||||||
|
@ -34,6 +34,7 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
|||||||
`- markdown\n` +
|
`- markdown\n` +
|
||||||
`- html\n` +
|
`- html\n` +
|
||||||
`- text\n` +
|
`- text\n` +
|
||||||
|
`- pageshot\n` +
|
||||||
`- screenshot\n`
|
`- screenshot\n`
|
||||||
,
|
,
|
||||||
in: 'header',
|
in: 'header',
|
||||||
|
@ -46,6 +46,7 @@ export interface PageSnapshot {
|
|||||||
text: string;
|
text: string;
|
||||||
parsed?: Partial<ReadabilityParsed> | null;
|
parsed?: Partial<ReadabilityParsed> | null;
|
||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
|
pageshot?: Buffer;
|
||||||
imgs?: ImgBrief[];
|
imgs?: ImgBrief[];
|
||||||
pdfs?: string[];
|
pdfs?: string[];
|
||||||
maxElemDepth?: number;
|
maxElemDepth?: number;
|
||||||
@ -448,6 +449,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
|
|
||||||
let snapshot: PageSnapshot | undefined;
|
let snapshot: PageSnapshot | undefined;
|
||||||
let screenshot: Buffer | undefined;
|
let screenshot: Buffer | undefined;
|
||||||
|
let pageshot: Buffer | undefined;
|
||||||
const page = await this.getNextPage();
|
const page = await this.getNextPage();
|
||||||
const sn = this.snMap.get(page);
|
const sn = this.snMap.get(page);
|
||||||
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
this.logger.info(`Page ${sn}: Scraping ${url}`, { url });
|
||||||
@ -524,7 +526,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
try {
|
try {
|
||||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot({ fullPage: true });
|
screenshot = await page.screenshot();
|
||||||
if (snapshot) {
|
if (snapshot) {
|
||||||
snapshot.childFrames = await pSubFrameSnapshots;
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
}
|
}
|
||||||
@ -547,7 +549,8 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
if (salvaged) {
|
if (salvaged) {
|
||||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot({ fullPage: true });
|
screenshot = await page.screenshot();
|
||||||
|
pageshot = await page.screenshot({ fullPage: true });
|
||||||
if (snapshot) {
|
if (snapshot) {
|
||||||
snapshot.childFrames = await pSubFrameSnapshots;
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
}
|
}
|
||||||
@ -562,7 +565,7 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
this.logger.info(`Page ${sn}: Snapshot of ${url} done`, { url, title: snapshot?.title, href: snapshot?.href });
|
||||||
this.emit(
|
this.emit(
|
||||||
'crawled',
|
'crawled',
|
||||||
{ ...snapshot, screenshot },
|
{ ...snapshot, screenshot, pageshot },
|
||||||
{ ...options, url: parsedUrl }
|
{ ...options, url: parsedUrl }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@ -581,7 +584,8 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
.then(async () => {
|
.then(async () => {
|
||||||
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
const pSubFrameSnapshots = this.snapshotChildFrames(page);
|
||||||
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
|
||||||
screenshot = await page.screenshot({ fullPage: true });
|
screenshot = await page.screenshot();
|
||||||
|
pageshot = await page.screenshot({ fullPage: true });
|
||||||
if (snapshot) {
|
if (snapshot) {
|
||||||
snapshot.childFrames = await pSubFrameSnapshots;
|
snapshot.childFrames = await pSubFrameSnapshots;
|
||||||
}
|
}
|
||||||
@ -614,15 +618,16 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
}
|
}
|
||||||
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
throw new AssertionFailureError(`Could not extract any meaningful content from the page`);
|
||||||
}
|
}
|
||||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
if (options?.favorScreenshot && snapshot?.title && snapshot?.html !== lastHTML) {
|
||||||
screenshot = await page.screenshot({ fullPage: true });
|
screenshot = await page.screenshot();
|
||||||
|
pageshot = await page.screenshot({ fullPage: true });
|
||||||
lastHTML = snapshot.html;
|
lastHTML = snapshot.html;
|
||||||
}
|
}
|
||||||
if (snapshot || screenshot) {
|
if (snapshot || screenshot) {
|
||||||
yield { ...snapshot, screenshot } as PageSnapshot;
|
yield { ...snapshot, screenshot, pageshot } as PageSnapshot;
|
||||||
}
|
}
|
||||||
if (error) {
|
if (error) {
|
||||||
throw error;
|
throw error;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user