/**
 * Drops selector entries that contain a "match everything"-style selector.
 *
 * Each entry may itself be a comma-separated selector list. An entry is
 * discarded as a whole when ANY of its comma-separated parts, after trimming
 * surrounding whitespace:
 *   - starts with `*`  (universal selector, e.g. `*` or `*.foo`),
 *   - starts with `:`  (bare pseudo-class/element, e.g. `:root`), or
 *   - contains `*:`    (universal selector with a pseudo, e.g. `div > *:first-child`).
 *
 * Such selectors are ignored for performance reasons (per the change that
 * introduced this helper).
 *
 * @param s - A single selector string, a list of selector strings, or nothing.
 * @returns `s` unchanged when it is falsy (`undefined` or `''`); otherwise a
 *   new array holding the entries that survived the filter, in their original
 *   (untrimmed) form. A single string input is therefore returned as an array,
 *   and the result may be empty.
 */
function filterSelector(s?: string | string[]): string | string[] | undefined {
    if (!s) {
        return s;
    }
    // Normalise to an array so one code path handles both input shapes.
    const candidates = Array.isArray(s) ? s : [s];

    return candidates.filter((entry) => {
        // NOTE: a naive split on ',' — commas inside attribute values or
        // functional pseudo-classes are treated as separators too.
        const parts = entry.split(',').map((part) => part.trim());

        return !parts.some(
            (part) => part.startsWith('*') || part.startsWith(':') || part.includes('*:')
        );
    });
}