mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-15 04:16:01 +08:00
fix: expose publishedTime if possible
This commit is contained in:
parent
6e36f0a447
commit
a211366501
@ -123,12 +123,18 @@ export class CrawlerHost extends RPCHost {
|
|||||||
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
title: (snapshot.parsed?.title || snapshot.title || '').trim(),
|
||||||
url: nominalUrl || snapshot.href?.trim(),
|
url: nominalUrl || snapshot.href?.trim(),
|
||||||
content: cleanText,
|
content: cleanText,
|
||||||
|
publishedTime: snapshot.parsed?.publishedTime || undefined,
|
||||||
|
|
||||||
toString() {
|
toString() {
|
||||||
|
const mixins = [];
|
||||||
|
if (this.publishedTime) {
|
||||||
|
mixins.push(`Published Time: ${this.publishedTime}`);
|
||||||
|
}
|
||||||
|
|
||||||
return `Title: ${this.title}
|
return `Title: ${this.title}
|
||||||
|
|
||||||
URL Source: ${this.url}
|
URL Source: ${this.url}
|
||||||
|
${mixins.length ? `\n${mixins.join('\n\n')}\n` : ''}
|
||||||
Markdown Content:
|
Markdown Content:
|
||||||
${this.content}
|
${this.content}
|
||||||
`;
|
`;
|
||||||
|
@ -20,12 +20,7 @@ export interface ImgBrief {
|
|||||||
alt?: string;
|
alt?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface PageSnapshot {
|
export interface ReadabilityParsed {
|
||||||
title: string;
|
|
||||||
href: string;
|
|
||||||
html: string;
|
|
||||||
text: string;
|
|
||||||
parsed?: {
|
|
||||||
title: string;
|
title: string;
|
||||||
content: string;
|
content: string;
|
||||||
textContent: string;
|
textContent: string;
|
||||||
@ -36,7 +31,14 @@ export interface PageSnapshot {
|
|||||||
siteName: string;
|
siteName: string;
|
||||||
lang: string;
|
lang: string;
|
||||||
publishedTime: string;
|
publishedTime: string;
|
||||||
} | null;
|
}
|
||||||
|
|
||||||
|
export interface PageSnapshot {
|
||||||
|
title: string;
|
||||||
|
href: string;
|
||||||
|
html: string;
|
||||||
|
text: string;
|
||||||
|
parsed?: Partial<ReadabilityParsed> | null;
|
||||||
screenshot?: Buffer;
|
screenshot?: Buffer;
|
||||||
imgs?: ImgBrief[];
|
imgs?: ImgBrief[];
|
||||||
}
|
}
|
||||||
@ -121,7 +123,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
||||||
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
||||||
preparations.push(page.setBypassCSP(true));
|
preparations.push(page.setBypassCSP(true));
|
||||||
preparations.push(page.setViewport({ width: 1920, height: 1080 }));
|
preparations.push(page.setViewport({ width: 1024, height: 1024 }));
|
||||||
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
preparations.push(page.exposeFunction('reportSnapshot', (snapshot: any) => {
|
||||||
page.emit('snapshot', snapshot);
|
page.emit('snapshot', snapshot);
|
||||||
}));
|
}));
|
||||||
@ -262,7 +264,7 @@ function giveSnapshot() {
|
|||||||
}
|
}
|
||||||
screenshot = await page.screenshot({
|
screenshot = await page.screenshot({
|
||||||
type: 'jpeg',
|
type: 'jpeg',
|
||||||
quality: 85,
|
quality: 75,
|
||||||
});
|
});
|
||||||
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
||||||
if (!snapshot.title || !snapshot.parsed?.content) {
|
if (!snapshot.title || !snapshot.parsed?.content) {
|
||||||
@ -270,7 +272,7 @@ function giveSnapshot() {
|
|||||||
if (salvaged) {
|
if (salvaged) {
|
||||||
screenshot = await page.screenshot({
|
screenshot = await page.screenshot({
|
||||||
type: 'jpeg',
|
type: 'jpeg',
|
||||||
quality: 85,
|
quality: 75,
|
||||||
});
|
});
|
||||||
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
snapshot = await page.evaluate('giveSnapshot()') as PageSnapshot;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user