mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 20:56:07 +08:00
bugfix: using onlyIncludeTags and removeTags together
This commit is contained in:
parent
a5322322f0
commit
26825fc8fb
File diff suppressed because one or more lines are too long
@ -1,30 +1,31 @@
|
||||
import cheerio, { AnyNode, Cheerio } from "cheerio";
|
||||
import { AnyNode, Cheerio, load } from "cheerio";
|
||||
import { PageOptions } from "../../../lib/entities";
|
||||
import { excludeNonMainTags } from "./excludeTags";
|
||||
|
||||
export const removeUnwantedElements = (
|
||||
html: string,
|
||||
pageOptions: PageOptions
|
||||
pageOptions: PageOptions,
|
||||
) => {
|
||||
const soup = cheerio.load(html);
|
||||
let soup = load(html);
|
||||
|
||||
if (
|
||||
pageOptions.onlyIncludeTags &&
|
||||
pageOptions.onlyIncludeTags.length > 0 &&
|
||||
pageOptions.onlyIncludeTags[0] !== ''
|
||||
pageOptions.onlyIncludeTags[0] !== ""
|
||||
) {
|
||||
if (typeof pageOptions.onlyIncludeTags === "string") {
|
||||
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
|
||||
}
|
||||
if (pageOptions.onlyIncludeTags.length !== 0) {
|
||||
// Create a new root element to hold the tags to keep
|
||||
const newRoot = cheerio.load("<div></div>")("div");
|
||||
const newRoot = load("<div></div>")("div");
|
||||
pageOptions.onlyIncludeTags.forEach((tag) => {
|
||||
soup(tag).each((index, element) => {
|
||||
newRoot.append(soup(element).clone());
|
||||
});
|
||||
});
|
||||
return newRoot.html();
|
||||
|
||||
soup = load(newRoot.html());
|
||||
}
|
||||
}
|
||||
|
||||
@ -33,7 +34,7 @@ export const removeUnwantedElements = (
|
||||
if (
|
||||
pageOptions.removeTags &&
|
||||
pageOptions.removeTags.length > 0 &&
|
||||
pageOptions.removeTags[0] !== ''
|
||||
pageOptions.removeTags[0] !== ""
|
||||
) {
|
||||
if (typeof pageOptions.removeTags === "string") {
|
||||
pageOptions.removeTags = [pageOptions.removeTags];
|
||||
@ -51,11 +52,11 @@ export const removeUnwantedElements = (
|
||||
const attributes = element.attribs;
|
||||
const tagNameMatches = regexPattern.test(element.name);
|
||||
const attributesMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
||||
regexPattern.test(`${attr}="${attributes[attr]}"`),
|
||||
);
|
||||
if (tag.startsWith("*.")) {
|
||||
classMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`class="${attributes[attr]}"`)
|
||||
regexPattern.test(`class="${attributes[attr]}"`),
|
||||
);
|
||||
}
|
||||
return tagNameMatches || attributesMatch || classMatch;
|
||||
|
Loading…
x
Reference in New Issue
Block a user