fix: expose publishedTime if possible

This commit is contained in:
yanlong.wang 2024-04-17 12:36:36 +08:00
parent 6e36f0a447
commit a211366501
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 24 additions and 16 deletions

View File

@@ -123,12 +123,18 @@ export class CrawlerHost extends RPCHost {
title: (snapshot.parsed?.title || snapshot.title || '').trim(), title: (snapshot.parsed?.title || snapshot.title || '').trim(),
url: nominalUrl || snapshot.href?.trim(), url: nominalUrl || snapshot.href?.trim(),
content: cleanText, content: cleanText,
publishedTime: snapshot.parsed?.publishedTime || undefined,
toString() { toString() {
const mixins = [];
if (this.publishedTime) {
mixins.push(`Published Time: ${this.publishedTime}`);
}
return `Title: ${this.title} return `Title: ${this.title}
URL Source: ${this.url} URL Source: ${this.url}
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
Markdown Content: Markdown Content:
${this.content} ${this.content}
`; `;

View File

@@ -20,12 +20,7 @@ export interface ImgBrief {
alt?: string; alt?: string;
} }
export interface PageSnapshot { export interface ReadabilityParsed {
title: string;
href: string;
html: string;
text: string;
parsed?: {
title: string; title: string;
content: string; content: string;
textContent: string; textContent: string;
@@ -36,7 +31,14 @@ export interface PageSnapshot {
siteName: string; siteName: string;
lang: string; lang: string;
publishedTime: string; publishedTime: string;
} | null; }
export interface PageSnapshot {
title: string;
href: string;
html: string;
text: string;
parsed?: Partial<ReadabilityParsed> | null;
screenshot?: Buffer; screenshot?: Buffer;
imgs?: ImgBrief[]; imgs?: ImgBrief[];
} }
@@ -121,7 +123,7 @@ export class PuppeteerControl extends AsyncService {
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
preparations.push(page.setBypassCSP(true)); preparations.push(page.setBypassCSP(true));
preparations.push(page.setViewport({ width: 1920, height: 1080 })); preparations.push(page.setViewport({ width: 1024, height: 1024 }));
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => { preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
page.emit('snapshot', snapshot); page.emit('snapshot', snapshot);
})); }));
@@ -262,7 +264,7 @@ function giveSnapshot() {
} }
screenshot = await page.screenshot({ screenshot = await page.screenshot({
type: 'jpeg', type: 'jpeg',
quality: 85, quality: 75,
}); });
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot; snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
if (!snapshot.title || !snapshot.parsed?.content) { if (!snapshot.title || !snapshot.parsed?.content) {
@@ -270,7 +272,7 @@ function giveSnapshot() {
if (salvaged) { if (salvaged) {
screenshot = await page.screenshot({ screenshot = await page.screenshot({
type: 'jpeg', type: 'jpeg',
quality: 85, quality: 75,
}); });
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot; snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
} }