mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-18 02:25:55 +08:00
fix: ignore match all target selectors for performance
This commit is contained in:
parent
a08218506e
commit
8b7af6d076
@ -237,6 +237,7 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
instance.targetSelector ??= targetSelector;
|
instance.targetSelector ??= targetSelector;
|
||||||
const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
|
const waitForSelector = ctx?.req.get('x-wait-for-selector')?.split(', ');
|
||||||
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
||||||
|
instance.targetSelector = filterSelector(instance.targetSelector);
|
||||||
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||||
instance.userAgent ??= overrideUserAgent;
|
instance.userAgent ??= overrideUserAgent;
|
||||||
|
|
||||||
@ -286,3 +287,20 @@ export class CrawlerOptionsHeaderOnly extends CrawlerOptions {
|
|||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Drops "match-all"-style entries from a target selector value.
 *
 * Each entry may itself be a comma-separated selector list. An entry is
 * discarded as a whole when ANY of its comma-separated parts (after trimming
 * surrounding whitespace):
 *   - starts with `*`,
 *   - starts with `:`, or
 *   - contains the sequence `*:` anywhere.
 *
 * Surviving entries are returned unmodified (they are not trimmed or split).
 *
 * @param s - A single selector string, an array of selector strings, or
 *   nothing.
 * @returns The input itself when it is falsy (`undefined` or `''`);
 *   otherwise a new array holding only the entries that passed the check.
 *   A single string input is therefore returned wrapped in an array, and the
 *   result may be an empty array when every entry was rejected.
 */
function filterSelector(s?: string | string[]): string | string[] | undefined {
    if (!s) {
        // Nothing to filter; hand back the falsy value as-is.
        return s;
    }

    const entries = Array.isArray(s) ? s : [s];

    return entries.filter((entry) => {
        const parts = entry.split(',').map((part) => part.trim());

        // Keep the entry only if none of its parts looks like a match-all selector.
        return !parts.some(
            (part) => part.startsWith('*') || part.startsWith(':') || part.includes('*:')
        );
    });
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user