mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-20 12:39:15 +08:00
Remove more attributes (data-*, aria-*) and HTML comment nodes in ReaderLM preprocessing
This commit is contained in:
parent
140a6f86ae
commit
234f61d066
@ -273,6 +273,24 @@ export class JSDomControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
x.removeAttribute('style');
|
x.removeAttribute('style');
|
||||||
});
|
});
|
||||||
|
const treeWalker = jsdom.window.document.createTreeWalker(
|
||||||
|
jsdom.window.document, // Start from the root document
|
||||||
|
0x80 // Only show comment nodes
|
||||||
|
);
|
||||||
|
|
||||||
|
let currentNode;
|
||||||
|
while ((currentNode = treeWalker.nextNode())) {
|
||||||
|
currentNode.parentNode?.removeChild(currentNode); // Remove each comment node
|
||||||
|
}
|
||||||
|
|
||||||
|
jsdom.window.document.querySelectorAll('*').forEach((x)=> {
|
||||||
|
const attrs = x.getAttributeNames();
|
||||||
|
for (const attr of attrs) {
|
||||||
|
if (attr.startsWith('data-') || attr.startsWith('aria-')) {
|
||||||
|
x.removeAttribute(attr);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
const dt = Date.now() - t0;
|
const dt = Date.now() - t0;
|
||||||
if (dt > 1000) {
|
if (dt > 1000) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user