mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-10 00:09:02 +08:00
Nick: small changes
This commit is contained in:
parent
1d4907acc9
commit
388ce3cbce
@ -129,11 +129,11 @@ describe('removeUnwantedElements', () => {
|
||||
expect(result).not.toContain('<aside>');
|
||||
});
|
||||
|
||||
it('should handle no specified tags', () => {
|
||||
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
|
||||
it('should not handle no specified tags, return full content', () => {
|
||||
const html = `<html><body><div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div></body></html>`;
|
||||
const options: PageOptions = { onlyIncludeTags: [] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toBe('');
|
||||
expect(result).toBe(html);
|
||||
});
|
||||
|
||||
it('should handle specified tags as a string', () => {
|
||||
|
@ -2,46 +2,51 @@ import cheerio, { AnyNode, Cheerio } from "cheerio";
|
||||
import { PageOptions } from "../../../lib/entities";
|
||||
import { excludeNonMainTags } from "./excludeTags";
|
||||
|
||||
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||
export const removeUnwantedElements = (
|
||||
html: string,
|
||||
pageOptions: PageOptions
|
||||
) => {
|
||||
const soup = cheerio.load(html);
|
||||
|
||||
if (pageOptions.onlyIncludeTags) {
|
||||
if (typeof pageOptions.onlyIncludeTags === 'string') {
|
||||
if (typeof pageOptions.onlyIncludeTags === "string") {
|
||||
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
|
||||
}
|
||||
// Create a new root element to hold the tags to keep
|
||||
const newRoot = cheerio.load('<div></div>')('div');
|
||||
pageOptions.onlyIncludeTags.forEach(tag => {
|
||||
soup(tag).each((index, element) => {
|
||||
newRoot.append(soup(element).clone());
|
||||
if (pageOptions.onlyIncludeTags.length !== 0) {
|
||||
// Create a new root element to hold the tags to keep
|
||||
const newRoot = cheerio.load("<div></div>")("div");
|
||||
pageOptions.onlyIncludeTags.forEach((tag) => {
|
||||
soup(tag).each((index, element) => {
|
||||
newRoot.append(soup(element).clone());
|
||||
});
|
||||
});
|
||||
});
|
||||
return newRoot.html();
|
||||
return newRoot.html();
|
||||
}
|
||||
}
|
||||
|
||||
soup("script, style, iframe, noscript, meta, head").remove();
|
||||
|
||||
|
||||
if (pageOptions.removeTags) {
|
||||
if (typeof pageOptions.removeTags === 'string') {
|
||||
if (typeof pageOptions.removeTags === "string") {
|
||||
pageOptions.removeTags = [pageOptions.removeTags];
|
||||
}
|
||||
|
||||
|
||||
if (Array.isArray(pageOptions.removeTags)) {
|
||||
pageOptions.removeTags.forEach((tag) => {
|
||||
let elementsToRemove: Cheerio<AnyNode>;
|
||||
if (tag.startsWith("*") && tag.endsWith("*")) {
|
||||
let classMatch = false;
|
||||
|
||||
const regexPattern = new RegExp(tag.slice(1, -1), 'i');
|
||||
elementsToRemove = soup('*').filter((i, element) => {
|
||||
if (element.type === 'tag') {
|
||||
const regexPattern = new RegExp(tag.slice(1, -1), "i");
|
||||
elementsToRemove = soup("*").filter((i, element) => {
|
||||
if (element.type === "tag") {
|
||||
const attributes = element.attribs;
|
||||
const tagNameMatches = regexPattern.test(element.name);
|
||||
const attributesMatch = Object.keys(attributes).some(attr =>
|
||||
const attributesMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
||||
);
|
||||
if (tag.startsWith('*.')) {
|
||||
classMatch = Object.keys(attributes).some(attr =>
|
||||
if (tag.startsWith("*.")) {
|
||||
classMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`class="${attributes[attr]}"`)
|
||||
);
|
||||
}
|
||||
@ -56,7 +61,7 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (pageOptions.onlyMainContent) {
|
||||
excludeNonMainTags.forEach((tag) => {
|
||||
const elementsToRemove = soup(tag);
|
||||
@ -65,4 +70,4 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
|
||||
}
|
||||
const cleanedHtml = soup.html();
|
||||
return cleanedHtml;
|
||||
};
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user