mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 01:45:56 +08:00
fix: poorly transformed detection
This commit is contained in:
parent
706de20e5c
commit
6fa8ce309e
@ -578,12 +578,20 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (content.includes('<table') && content.includes('</table>')) {
|
if (content.includes('<table') && content.includes('</table>')) {
|
||||||
|
if (node?.textContent && content.length > node.textContent.length * 0.8) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
const tableElms = node?.querySelectorAll('table') || [];
|
const tableElms = node?.querySelectorAll('table') || [];
|
||||||
const deepTableElms = node?.querySelectorAll('table table');
|
const deepTableElms = node?.querySelectorAll('table table');
|
||||||
|
if (node && tableElms.length) {
|
||||||
|
const wrappingTables = _.without(tableElms, ...Array.from(deepTableElms || []));
|
||||||
|
const tableTextsLength = _.sum(wrappingTables.map((x) => (x.innerHTML?.length || 0)));
|
||||||
|
|
||||||
if ((deepTableElms?.length || 0) / tableElms.length > 0.6) {
|
if (tableTextsLength / (content.length) > 0.6) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const tbodyElms = node?.querySelectorAll('tbody') || [];
|
const tbodyElms = node?.querySelectorAll('tbody') || [];
|
||||||
const deepTbodyElms = node?.querySelectorAll('tbody tbody');
|
const deepTbodyElms = node?.querySelectorAll('tbody tbody');
|
||||||
|
Loading…
x
Reference in New Issue
Block a user