mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-06 08:36:02 +08:00
fix: poorly transformed detection
This commit is contained in:
parent
706de20e5c
commit
6fa8ce309e
@ -578,11 +578,19 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
||||
}
|
||||
|
||||
if (content.includes('<table') && content.includes('</table>')) {
|
||||
if (node?.textContent && content.length > node.textContent.length * 0.8) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const tableElms = node?.querySelectorAll('table') || [];
|
||||
const deepTableElms = node?.querySelectorAll('table table');
|
||||
if (node && tableElms.length) {
|
||||
const wrappingTables = _.without(tableElms, ...Array.from(deepTableElms || []));
|
||||
const tableTextsLength = _.sum(wrappingTables.map((x) => (x.innerHTML?.length || 0)));
|
||||
|
||||
if ((deepTableElms?.length || 0) / tableElms.length > 0.6) {
|
||||
return true;
|
||||
if (tableTextsLength / (content.length) > 0.6) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
const tbodyElms = node?.querySelectorAll('tbody') || [];
|
||||
|
Loading…
x
Reference in New Issue
Block a user