mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-13 10:09:00 +08:00
Merge pull request #328 from mendableai/nsc/includeOnlyTags
pageOptions.onlyIncludeTags param
This commit is contained in:
commit
017b0b2556
@ -68,9 +68,21 @@
|
||||
},
|
||||
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"onlyIncludeTags": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
|
||||
},
|
||||
"headers": {
|
||||
"type": "object",
|
||||
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
|
||||
},
|
||||
"replaceAllPathsWithAbsolutePaths": {
|
||||
"type": "boolean",
|
||||
"description": "Replace all relative paths with absolute paths for images and links",
|
||||
"default": false
|
||||
}
|
||||
}
|
||||
},
|
||||
@ -184,7 +196,7 @@
|
||||
},
|
||||
"maxDepth": {
|
||||
"type": "integer",
|
||||
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
|
||||
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
|
||||
},
|
||||
"mode": {
|
||||
"type": "string",
|
||||
@ -511,7 +523,7 @@
|
||||
"html": {
|
||||
"type": "string",
|
||||
"nullable": true,
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
"description": "Raw HTML content of the page if `includeHtml` is true"
|
||||
},
|
||||
"metadata": {
|
||||
"type": "object",
|
||||
@ -526,118 +538,13 @@
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"keywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"robots": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogTitle": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogUrl": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"nullable": true
|
||||
},
|
||||
"ogImage": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogAudio": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDeterminer": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocale": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocaleAlternate": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"nullable": true
|
||||
},
|
||||
"ogSiteName": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogVideo": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDateCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDate": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsAudience": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsKeywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"modifiedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"publishedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleTag": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleSection": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
@ -647,6 +554,7 @@
|
||||
"nullable": true,
|
||||
"description": "The error message of the page"
|
||||
}
|
||||
|
||||
}
|
||||
},
|
||||
"llm_extraction": {
|
||||
@ -694,118 +602,13 @@
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"keywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"robots": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogTitle": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogUrl": {
|
||||
"type": "string",
|
||||
"format": "uri",
|
||||
"nullable": true
|
||||
},
|
||||
"ogImage": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogAudio": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogDeterminer": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocale": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogLocaleAlternate": {
|
||||
"type": "array",
|
||||
"items": {
|
||||
"type": "string"
|
||||
},
|
||||
"nullable": true
|
||||
},
|
||||
"ogSiteName": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"ogVideo": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDateCreated": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDate": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcType": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsAudience": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcSubject": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dcDescription": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"dctermsKeywords": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"modifiedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"publishedTime": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleTag": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"articleSection": {
|
||||
"type": "string",
|
||||
"nullable": true
|
||||
},
|
||||
"sourceURL": {
|
||||
"type": "string",
|
||||
"format": "uri"
|
||||
},
|
||||
"<any other metadata> ": {
|
||||
"type": "string"
|
||||
},
|
||||
"pageStatusCode": {
|
||||
"type": "integer",
|
||||
"description": "The status code of the page"
|
||||
@ -878,4 +681,4 @@
|
||||
"bearerAuth": []
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
@ -21,6 +21,7 @@ export type PageOptions = {
|
||||
replaceAllPathsWithAbsolutePaths?: boolean;
|
||||
parsePDF?: boolean;
|
||||
removeTags?: string | string[];
|
||||
onlyIncludeTags?: string | string[];
|
||||
};
|
||||
|
||||
export type ExtractorOptions = {
|
||||
|
@ -100,4 +100,76 @@ describe('removeUnwantedElements', () => {
|
||||
expect(result).not.toContain('id="remove-this"');
|
||||
expect(result).toContain('class="keep"');
|
||||
});
|
||||
|
||||
it('should only include specified tags', () => {
|
||||
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
|
||||
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toContain('<main>Main Content</main>');
|
||||
expect(result).toContain('<footer>Footer Content</footer>');
|
||||
expect(result).not.toContain('<aside>');
|
||||
});
|
||||
|
||||
it('should handle multiple specified tags', () => {
|
||||
const html = `<div><header>Header Content</header><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
|
||||
const options: PageOptions = { onlyIncludeTags: ['header', 'main', 'footer'] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toContain('<header>Header Content</header>');
|
||||
expect(result).toContain('<main>Main Content</main>');
|
||||
expect(result).toContain('<footer>Footer Content</footer>');
|
||||
expect(result).not.toContain('<aside>');
|
||||
});
|
||||
|
||||
it('should handle nested specified tags', () => {
|
||||
const html = `<div><main><section>Main Section</section></main><aside>Remove</aside><footer>Footer Content</footer></div>`;
|
||||
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toContain('<main><section>Main Section</section></main>');
|
||||
expect(result).toContain('<footer>Footer Content</footer>');
|
||||
expect(result).not.toContain('<aside>');
|
||||
});
|
||||
|
||||
it('should not handle no specified tags, return full content', () => {
|
||||
const html = `<html><body><div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div></body></html>`;
|
||||
const options: PageOptions = { onlyIncludeTags: [] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toBe(html);
|
||||
});
|
||||
|
||||
it('should handle specified tags as a string', () => {
|
||||
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
|
||||
const options: PageOptions = { onlyIncludeTags: 'main' };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toContain('<main>Main Content</main>');
|
||||
expect(result).not.toContain('<aside>');
|
||||
expect(result).not.toContain('<footer>');
|
||||
});
|
||||
it('should include specified tags with class', () => {
|
||||
const html = `<div><main class="main-content">Main Content</main><aside class="remove">Remove</aside><footer class="footer-content">Footer Content</footer></div>`;
|
||||
const options: PageOptions = { onlyIncludeTags: ['.main-content', '.footer-content'] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toContain('<main class="main-content">Main Content</main>');
|
||||
expect(result).toContain('<footer class="footer-content">Footer Content</footer>');
|
||||
expect(result).not.toContain('<aside class="remove">');
|
||||
});
|
||||
|
||||
it('should include specified tags with id', () => {
|
||||
const html = `<div><main id="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
|
||||
const options: PageOptions = { onlyIncludeTags: ['#main-content', '#footer-content'] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toContain('<main id="main-content">Main Content</main>');
|
||||
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
|
||||
expect(result).not.toContain('<aside id="remove">');
|
||||
});
|
||||
|
||||
it('should include specified tags with mixed class and id', () => {
|
||||
const html = `<div><main class="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
|
||||
const options: PageOptions = { onlyIncludeTags: ['.main-content', '#footer-content'] };
|
||||
const result = removeUnwantedElements(html, options);
|
||||
expect(result).toContain('<main class="main-content">Main Content</main>');
|
||||
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
|
||||
expect(result).not.toContain('<aside id="remove">');
|
||||
});
|
||||
|
||||
|
||||
});
|
||||
|
@ -2,31 +2,51 @@ import cheerio, { AnyNode, Cheerio } from "cheerio";
|
||||
import { PageOptions } from "../../../lib/entities";
|
||||
import { excludeNonMainTags } from "./excludeTags";
|
||||
|
||||
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
|
||||
export const removeUnwantedElements = (
|
||||
html: string,
|
||||
pageOptions: PageOptions
|
||||
) => {
|
||||
const soup = cheerio.load(html);
|
||||
|
||||
if (pageOptions.onlyIncludeTags) {
|
||||
if (typeof pageOptions.onlyIncludeTags === "string") {
|
||||
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
|
||||
}
|
||||
if (pageOptions.onlyIncludeTags.length !== 0) {
|
||||
// Create a new root element to hold the tags to keep
|
||||
const newRoot = cheerio.load("<div></div>")("div");
|
||||
pageOptions.onlyIncludeTags.forEach((tag) => {
|
||||
soup(tag).each((index, element) => {
|
||||
newRoot.append(soup(element).clone());
|
||||
});
|
||||
});
|
||||
return newRoot.html();
|
||||
}
|
||||
}
|
||||
|
||||
soup("script, style, iframe, noscript, meta, head").remove();
|
||||
|
||||
|
||||
if (pageOptions.removeTags) {
|
||||
if (typeof pageOptions.removeTags === 'string') {
|
||||
if (typeof pageOptions.removeTags === "string") {
|
||||
pageOptions.removeTags = [pageOptions.removeTags];
|
||||
}
|
||||
|
||||
|
||||
if (Array.isArray(pageOptions.removeTags)) {
|
||||
pageOptions.removeTags.forEach((tag) => {
|
||||
let elementsToRemove: Cheerio<AnyNode>;
|
||||
if (tag.startsWith("*") && tag.endsWith("*")) {
|
||||
let classMatch = false;
|
||||
|
||||
const regexPattern = new RegExp(tag.slice(1, -1), 'i');
|
||||
elementsToRemove = soup('*').filter((i, element) => {
|
||||
if (element.type === 'tag') {
|
||||
const regexPattern = new RegExp(tag.slice(1, -1), "i");
|
||||
elementsToRemove = soup("*").filter((i, element) => {
|
||||
if (element.type === "tag") {
|
||||
const attributes = element.attribs;
|
||||
const tagNameMatches = regexPattern.test(element.name);
|
||||
const attributesMatch = Object.keys(attributes).some(attr =>
|
||||
const attributesMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`${attr}="${attributes[attr]}"`)
|
||||
);
|
||||
if (tag.startsWith('*.')) {
|
||||
classMatch = Object.keys(attributes).some(attr =>
|
||||
if (tag.startsWith("*.")) {
|
||||
classMatch = Object.keys(attributes).some((attr) =>
|
||||
regexPattern.test(`class="${attributes[attr]}"`)
|
||||
);
|
||||
}
|
||||
@ -41,7 +61,7 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (pageOptions.onlyMainContent) {
|
||||
excludeNonMainTags.forEach((tag) => {
|
||||
const elementsToRemove = soup(tag);
|
||||
@ -50,4 +70,4 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
|
||||
}
|
||||
const cleanedHtml = soup.html();
|
||||
return cleanedHtml;
|
||||
};
|
||||
};
|
||||
|
Loading…
x
Reference in New Issue
Block a user