mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 08:25:53 +08:00
feat: fallback to google archive (#16)
* feat: fallback to google archive * fix
This commit is contained in:
parent
8a2b095bd7
commit
50ed9cc248
@ -1,6 +1,6 @@
|
|||||||
import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
|
import { AssertionFailureError, AsyncService, Defer, HashManager, marshalErrorLike } from 'civkit';
|
||||||
import { container, singleton } from 'tsyringe';
|
import { container, singleton } from 'tsyringe';
|
||||||
import type { Browser } from 'puppeteer';
|
import type { Browser, Page } from 'puppeteer';
|
||||||
import { Logger } from '../shared/services/logger';
|
import { Logger } from '../shared/services/logger';
|
||||||
import genericPool from 'generic-pool';
|
import genericPool from 'generic-pool';
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
@ -93,7 +93,6 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
this.browser = await puppeteer.launch({
|
this.browser = await puppeteer.launch({
|
||||||
headless: true,
|
|
||||||
timeout: 10_000
|
timeout: 10_000
|
||||||
}).catch((err: any) => {
|
}).catch((err: any) => {
|
||||||
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
this.logger.error(`Unknown firebase issue, just die fast.`, { err });
|
||||||
@ -266,6 +265,16 @@ function giveSnapshot() {
|
|||||||
quality: 85,
|
quality: 85,
|
||||||
});
|
});
|
||||||
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
||||||
|
if (!snapshot.title || !snapshot.parsed?.content) {
|
||||||
|
const salvaged = await this.salvage(url, page);
|
||||||
|
if (salvaged) {
|
||||||
|
screenshot = await page.screenshot({
|
||||||
|
type: 'jpeg',
|
||||||
|
quality: 85,
|
||||||
|
});
|
||||||
|
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
||||||
|
}
|
||||||
|
}
|
||||||
this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
|
this.logger.info(`Snapshot of ${url} done`, { url, digest, title: snapshot?.title, href: snapshot?.href });
|
||||||
const nowDate = new Date();
|
const nowDate = new Date();
|
||||||
Crawled.save(
|
Crawled.save(
|
||||||
@ -299,6 +308,27 @@ function giveSnapshot() {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async salvage(url: string, page: Page) {
|
||||||
|
this.logger.info(`Salvaging ${url}`);
|
||||||
|
const googleArchiveUrl = `https://webcache.googleusercontent.com/search?q=cache:${encodeURIComponent(url)}`;
|
||||||
|
const resp = await fetch(googleArchiveUrl, {
|
||||||
|
headers: {
|
||||||
|
'User-Agent': `Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`
|
||||||
|
}
|
||||||
|
});
|
||||||
|
resp.body?.cancel().catch(() => void 0);
|
||||||
|
if (!resp.ok) {
|
||||||
|
this.logger.warn(`No salvation found for url: ${url}`, { status: resp.status, url });
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
await page.goto(googleArchiveUrl, { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout: 15_000 }).catch((err) => {
|
||||||
|
this.logger.warn(`Page salvation did not fully succeed.`, { err: marshalErrorLike(err) });
|
||||||
|
});
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const puppeteerControl = container.resolve(PuppeteerControl);
|
const puppeteerControl = container.resolve(PuppeteerControl);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user