mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-18 02:35:55 +08:00
fix: tidyMarkdown feature in turndown rules
This commit is contained in:
parent
36bf5d96b5
commit
e417cd8a53
@ -75,6 +75,38 @@ export class CrawlerHost extends RPCHost {
|
|||||||
replacement: (innerText) => `${innerText}\n===============\n`
|
replacement: (innerText) => `${innerText}\n===============\n`
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
turnDownService.addRule('improved-paragraph', {
|
||||||
|
filter: 'p',
|
||||||
|
replacement: (innerText) => {
|
||||||
|
const trimmed = innerText.trim();
|
||||||
|
if (!trimmed) {
|
||||||
|
return '';
|
||||||
|
}
|
||||||
|
|
||||||
|
return `${trimmed.replace(/\n{3,}/g, '\n\n')}\n\n`;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
turnDownService.addRule('improved-inline-link', {
|
||||||
|
filter: function (node, options) {
|
||||||
|
return (
|
||||||
|
options.linkStyle === 'inlined' &&
|
||||||
|
node.nodeName === 'A' &&
|
||||||
|
node.getAttribute('href')
|
||||||
|
);
|
||||||
|
},
|
||||||
|
|
||||||
|
replacement: function (content, node) {
|
||||||
|
let href = node.getAttribute('href');
|
||||||
|
if (href) href = href.replace(/([()])/g, '\\$1');
|
||||||
|
let title = cleanAttribute(node.getAttribute('title'));
|
||||||
|
if (title) title = ' "' + title.replace(/"/g, '\\"') + '"';
|
||||||
|
|
||||||
|
const fixedContent = content.replace(/\s+/g, ' ').trim();
|
||||||
|
const fixedHref = href.replace(/\s+/g, '').trim();
|
||||||
|
|
||||||
|
return `[${fixedContent}](${fixedHref}${title || ''})`;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
return turnDownService;
|
return turnDownService;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user