From 234f61d066ca0a017dc2e99da141d13805b6c1aa Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Mon, 20 Jan 2025 11:54:31 +0800 Subject: [PATCH] remove more attrs in readerlm preprocessing --- backend/functions/src/services/jsdom.ts | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/backend/functions/src/services/jsdom.ts b/backend/functions/src/services/jsdom.ts index 65124ae..88cadac 100644 --- a/backend/functions/src/services/jsdom.ts +++ b/backend/functions/src/services/jsdom.ts @@ -273,6 +273,24 @@ export class JSDomControl extends AsyncService { } x.removeAttribute('style'); }); + const treeWalker = jsdom.window.document.createTreeWalker( + jsdom.window.document, // Start from the root document + 0x80 // Only show comment nodes + ); + + let currentNode; + while ((currentNode = treeWalker.nextNode())) { + currentNode.parentNode?.removeChild(currentNode); // Remove each comment node + } + + jsdom.window.document.querySelectorAll('*').forEach((x)=> { + const attrs = x.getAttributeNames(); + for (const attr of attrs) { + if (attr.startsWith('data-') || attr.startsWith('aria-')) { + x.removeAttribute(attr); + } + } + }); const dt = Date.now() - t0; if (dt > 1000) {