mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 02:55:54 +08:00
fix(scrapeURL/removeUnwantedElements): try to fix onlyMainContent for poorly structured sites
This commit is contained in:
parent
88a16b18a3
commit
6b1f30e0fb
@ -50,6 +50,10 @@ const excludeNonMainTags = [
|
||||
"#cookie"
|
||||
];
|
||||
|
||||
const forceIncludeMainTags = [
|
||||
"#main"
|
||||
];
|
||||
|
||||
export const removeUnwantedElements = (
|
||||
html: string,
|
||||
scrapeOptions: ScrapeOptions
|
||||
@ -101,7 +105,9 @@ export const removeUnwantedElements = (
|
||||
|
||||
if (scrapeOptions.onlyMainContent) {
|
||||
excludeNonMainTags.forEach((tag) => {
|
||||
const elementsToRemove = soup(tag);
|
||||
const elementsToRemove = soup(tag)
|
||||
.filter(forceIncludeMainTags.map(x => ":not(:has(" + x + "))").join(""));
|
||||
|
||||
elementsToRemove.remove();
|
||||
});
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user