mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-18 01:55:56 +08:00
wip
This commit is contained in:
parent
89d6d49f06
commit
b46e859a30
@ -41,7 +41,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
try {
|
try {
|
||||||
for await (const scrapped of this.puppeteerControl.scrap(url)) {
|
for await (const scrapped of this.puppeteerControl.scrap(url)) {
|
||||||
this.logger.info(`Scrapped: ${scrapped.snapshot}`);
|
|
||||||
const content = typeof scrapped.snapshot === 'string' ? scrapped.snapshot : (scrapped.snapshot as any)?.content;
|
const content = typeof scrapped.snapshot === 'string' ? scrapped.snapshot : (scrapped.snapshot as any)?.content;
|
||||||
if (!content) {
|
if (!content) {
|
||||||
continue;
|
continue;
|
||||||
|
@ -42,7 +42,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
await this.browser.close();
|
await this.browser.close();
|
||||||
}
|
}
|
||||||
this.browser = await puppeteer.launch({
|
this.browser = await puppeteer.launch({
|
||||||
headless: false,
|
headless: true,
|
||||||
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
args: ['--no-sandbox', '--disable-setuid-sandbox'],
|
||||||
});
|
});
|
||||||
this.browser.once('disconnected', () => {
|
this.browser.once('disconnected', () => {
|
||||||
@ -67,8 +67,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
await page.evaluateOnNewDocument(READABILITY_JS);
|
await page.evaluateOnNewDocument(READABILITY_JS);
|
||||||
|
|
||||||
await page.evaluateOnNewDocument(() => {
|
await page.evaluateOnNewDocument(() => {
|
||||||
// @ts-expect-error
|
function giveSnapshot() {
|
||||||
window.giveSnapshot() = () => {
|
|
||||||
// @ts-expect-error
|
// @ts-expect-error
|
||||||
return new Readability(document.cloneNode(true)).parse();
|
return new Readability(document.cloneNode(true)).parse();
|
||||||
};
|
};
|
||||||
@ -79,9 +78,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
// @ts-expect-error
|
const parsed = giveSnapshot();
|
||||||
const parsed = window.giveSnapshot();
|
|
||||||
console.log(parsed);
|
|
||||||
if (parsed) {
|
if (parsed) {
|
||||||
// @ts-expect-error
|
// @ts-expect-error
|
||||||
window.reportSnapshot(parsed);
|
window.reportSnapshot(parsed);
|
||||||
@ -91,7 +88,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
aftershot = setTimeout(() => {
|
aftershot = setTimeout(() => {
|
||||||
// @ts-expect-error
|
// @ts-expect-error
|
||||||
window.reportSnapshot(window.giveSnapshot());
|
window.reportSnapshot(giveSnapshot());
|
||||||
}, 500);
|
}, 500);
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
@ -130,7 +127,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
const screenshot = await page.screenshot();
|
const screenshot = await page.screenshot();
|
||||||
if (finalized) {
|
if (finalized) {
|
||||||
await gotoPromise;
|
await gotoPromise;
|
||||||
snapshot = await page.evaluate('window.giveSnapshot()');
|
snapshot = await page.evaluate('new Readability(document.cloneNode(true)).parse()');
|
||||||
yield { snapshot, screenshot };
|
yield { snapshot, screenshot };
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user