fix: disable salvaging via Google Cache, which no longer works

This commit is contained in:
yanlong.wang 2024-10-29 15:09:50 +08:00
parent ebc09003d1
commit 3303763345
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37

View File

@ -739,22 +739,22 @@ if (window.self === window.top) {
throw stuff; throw stuff;
} }
} }
try { // try {
if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) { // if ((!snapshot?.title || !snapshot?.parsed?.content) && !(snapshot?.pdfs?.length)) {
const salvaged = await this.salvage(url, page); // const salvaged = await this.salvage(url, page);
if (salvaged) { // if (salvaged) {
const pSubFrameSnapshots = this.snapshotChildFrames(page); // const pSubFrameSnapshots = this.snapshotChildFrames(page);
snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot; // snapshot = await page.evaluate('giveSnapshot(true)') as PageSnapshot;
screenshot = Buffer.from(await page.screenshot()); // screenshot = Buffer.from(await page.screenshot());
pageshot = Buffer.from(await page.screenshot({ fullPage: true })); // pageshot = Buffer.from(await page.screenshot({ fullPage: true }));
if (snapshot) { // if (snapshot) {
snapshot.childFrames = await pSubFrameSnapshots; // snapshot.childFrames = await pSubFrameSnapshots;
} // }
} // }
} // }
} catch (err: any) { // } catch (err: any) {
this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) }); // this.logger.warn(`Page ${sn}: Failed to salvage ${url}`, { err: marshalErrorLike(err) });
} // }
finalized = true; finalized = true;
if (snapshot?.html) { if (snapshot?.html) {
@ -853,28 +853,28 @@ if (window.self === window.top) {
} }
} }
/**
 * Best-effort attempt to recover a page from Google's web cache.
 *
 * First probes the cache URL with a plain `fetch`; only when the probe
 * answers with an OK status is the supplied browser page navigated to the
 * cached copy.
 *
 * NOTE(review): this cache endpoint is reported to no longer work, in which
 * case the probe answers non-OK and this method returns `null` — confirm
 * before relying on it.
 *
 * @param url  The original URL whose cached copy should be looked up.
 * @param page The browser page that is navigated to the cached copy.
 * @returns `true` once navigation to the cached copy has been attempted,
 *          or `null` when the cache probe did not return an OK status.
 * @throws Whatever `fetch` rejects with (network failure, or an abort error
 *         when the probe exceeds its deadline). Navigation errors are
 *         logged and swallowed, not thrown.
 */
async salvage(url: string, page: Page) {
    // One deadline shared by the probe and the navigation, so the cheap
    // existence check can never stall longer than the page load it guards.
    const timeoutMs = 15_000;

    this.logger.info(`Salvaging ${url}`);
    const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;

    // Abort the probe if no response headers arrive within the deadline;
    // without this the request is bounded only by the runtime's defaults.
    const probeAbort = new AbortController();
    const probeTimer = setTimeout(() => probeAbort.abort(), timeoutMs);
    const resp = await fetch(googleArchiveUrl, {
        headers: {
            'User-Agent': `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
        },
        signal: probeAbort.signal,
    }).finally(() => clearTimeout(probeTimer));

    // Only the status code matters here; release the body without reading it.
    resp.body?.cancel().catch(() => void 0);
    if (!resp.ok) {
        this.logger.warn(`No salvation found for url: ${url}`, { status: resp.status, url });
        return null;
    }

    // Navigation is deliberately best-effort: a failure (e.g. the wait
    // conditions not being met in time) is logged and the method still
    // reports success so the caller can snapshot whatever did load.
    await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: timeoutMs }).catch((err) => {
        this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
    });
    this.logger.info(`Salvation completed.`);
    return true;
}
async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> { async snapshotChildFrames(page: Page): Promise<PageSnapshot[]> {
const childFrames = page.mainFrame().childFrames(); const childFrames = page.mainFrame().childFrames();