mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-20 05:09:09 +08:00
feat: full markdown mode
This commit is contained in:
parent
0f70723d19
commit
69231ad59e
@ -58,7 +58,23 @@ export class CrawlerHost extends RPCHost {
|
|||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
}
|
}
|
||||||
|
|
||||||
async formatSnapshot(mode: string | 'markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
/**
 * Creates a new TurndownService instance.
 *
 * When `noRules` is truthy (either `true` or any non-empty string), the
 * freshly constructed service is returned without any extra rules.
 * Otherwise two rules are registered before it is returned:
 *  - `remove-irrelevant`: replaces `meta`, `style`, `script`, `noscript`,
 *    `link` and `textarea` elements with an empty string;
 *  - `title-as-h1`: emits the text handed to the replacement callback for
 *    `title` elements, followed by a line of `=` characters.
 *
 * @param noRules - Any truthy value skips registration of the rules above.
 * @returns The newly created TurndownService.
 */
getTurndown(noRules?: boolean | string) {
    const service = new TurndownService();

    // Guard clause: callers asking for a bare service get it untouched.
    if (noRules) {
        return service;
    }

    const irrelevantTags = ['meta', 'style', 'script', 'noscript', 'link', 'textarea'];
    service.addRule('remove-irrelevant', {
        filter: irrelevantTags,
        replacement: () => ''
    });

    service.addRule('title-as-h1', {
        filter: ['title'],
        replacement: (titleText) => `${titleText}\n===============\n`
    });

    return service;
}
|
||||||
|
|
||||||
|
async formatSnapshot(mode: string | 'markdown' | 'full-markdown' | 'html' | 'text' | 'screenshot', snapshot: PageSnapshot & {
|
||||||
screenshotUrl?: string;
|
screenshotUrl?: string;
|
||||||
}, nominalUrl?: URL) {
|
}, nominalUrl?: URL) {
|
||||||
if (mode === 'screenshot') {
|
if (mode === 'screenshot') {
|
||||||
@ -96,8 +112,8 @@ export class CrawlerHost extends RPCHost {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const toBeTurnedToMd = snapshot.parsed?.content;
|
const toBeTurnedToMd = mode === 'full-markdown' ? snapshot.html : snapshot.parsed?.content;
|
||||||
let turnDownService = new TurndownService();
|
let turnDownService = mode === 'markdown' ? this.getTurndown('without any rule') : this.getTurndown();
|
||||||
for (const plugin of this.turnDownPlugins) {
|
for (const plugin of this.turnDownPlugins) {
|
||||||
turnDownService = turnDownService.use(plugin);
|
turnDownService = turnDownService.use(plugin);
|
||||||
}
|
}
|
||||||
@ -129,7 +145,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (mapped) {
|
if (mapped) {
|
||||||
return ``;
|
return ``;
|
||||||
}
|
}
|
||||||
return ``;
|
return alt ? `` : ``;
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -139,7 +155,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = turnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = new TurndownService();
|
const vanillaTurnDownService = this.getTurndown();
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
contentText = vanillaTurnDownService.turndown(toBeTurnedToMd).trim();
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
@ -148,12 +164,15 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))) {
|
if (
|
||||||
|
!contentText || (contentText.startsWith('<') && contentText.endsWith('>'))
|
||||||
|
&& toBeTurnedToMd !== snapshot.html
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
contentText = turnDownService.turndown(snapshot.html);
|
contentText = turnDownService.turndown(snapshot.html);
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
this.logger.warn(`Turndown failed to run, retrying without plugins`, { err });
|
||||||
const vanillaTurnDownService = new TurndownService();
|
const vanillaTurnDownService = this.getTurndown();
|
||||||
try {
|
try {
|
||||||
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
contentText = vanillaTurnDownService.turndown(snapshot.html);
|
||||||
} catch (err2) {
|
} catch (err2) {
|
||||||
@ -179,6 +198,10 @@ export class CrawlerHost extends RPCHost {
|
|||||||
mixins.push(`Published Time: ${this.publishedTime}`);
|
mixins.push(`Published Time: ${this.publishedTime}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (mode === 'full-markdown') {
|
||||||
|
return this.content;
|
||||||
|
}
|
||||||
|
|
||||||
return `Title: ${this.title}
|
return `Title: ${this.title}
|
||||||
|
|
||||||
URL Source: ${this.url}
|
URL Source: ${this.url}
|
||||||
@ -233,6 +256,7 @@ ${this.content}
|
|||||||
description: `Specifies the form factor of the crawled data you prefer. \n\n` +
|
description: `Specifies the form factor of the crawled data you prefer. \n\n` +
|
||||||
`Supported formats:\n` +
|
`Supported formats:\n` +
|
||||||
`- markdown\n` +
|
`- markdown\n` +
|
||||||
|
`- full-markdown\n` +
|
||||||
`- html\n` +
|
`- html\n` +
|
||||||
`- text\n` +
|
`- text\n` +
|
||||||
`- screenshot\n\n` +
|
`- screenshot\n\n` +
|
||||||
|
Loading…
x
Reference in New Issue
Block a user