mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 02:55:54 +08:00
fix(scrapeURL/removeUnwantedElements): try to fix onlyMainContent for poorly structured sites
This commit is contained in:
parent
88a16b18a3
commit
6b1f30e0fb
@ -50,6 +50,10 @@ const excludeNonMainTags = [
|
|||||||
"#cookie"
|
"#cookie"
|
||||||
];
|
];
|
||||||
|
|
||||||
|
const forceIncludeMainTags = [
|
||||||
|
"#main"
|
||||||
|
];
|
||||||
|
|
||||||
export const removeUnwantedElements = (
|
export const removeUnwantedElements = (
|
||||||
html: string,
|
html: string,
|
||||||
scrapeOptions: ScrapeOptions
|
scrapeOptions: ScrapeOptions
|
||||||
@ -101,7 +105,9 @@ export const removeUnwantedElements = (
|
|||||||
|
|
||||||
if (scrapeOptions.onlyMainContent) {
|
if (scrapeOptions.onlyMainContent) {
|
||||||
excludeNonMainTags.forEach((tag) => {
|
excludeNonMainTags.forEach((tag) => {
|
||||||
const elementsToRemove = soup(tag);
|
const elementsToRemove = soup(tag)
|
||||||
|
.filter(forceIncludeMainTags.map(x => ":not(:has(" + x + "))").join(""));
|
||||||
|
|
||||||
elementsToRemove.remove();
|
elementsToRemove.remove();
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user