Merge pull request #328 from mendableai/nsc/includeOnlyTags

pageOptions.onlyIncludeTags param
This commit is contained in:
Nicolas 2024-06-26 21:33:10 -03:00 committed by GitHub
commit 017b0b2556
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 127 additions and 231 deletions

View File

@ -68,9 +68,21 @@
},
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
},
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include the specified tags, classes and ids from the page in the final output; all other content is left out. Use comma separated values. Example: 'main, .article, #content'"
},
"headers": {
"type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
}
}
},
@ -184,7 +196,7 @@
},
"maxDepth": {
"type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on."
"description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
},
"mode": {
"type": "string",
@ -511,7 +523,7 @@
"html": {
"type": "string",
"nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true"
"description": "Raw HTML content of the page if `includeHtml` is true"
},
"metadata": {
"type": "object",
@ -526,118 +538,13 @@
"type": "string",
"nullable": true
},
"keywords": {
"type": "string",
"nullable": true
},
"robots": {
"type": "string",
"nullable": true
},
"ogTitle": {
"type": "string",
"nullable": true
},
"ogDescription": {
"type": "string",
"nullable": true
},
"ogUrl": {
"type": "string",
"format": "uri",
"nullable": true
},
"ogImage": {
"type": "string",
"nullable": true
},
"ogAudio": {
"type": "string",
"nullable": true
},
"ogDeterminer": {
"type": "string",
"nullable": true
},
"ogLocale": {
"type": "string",
"nullable": true
},
"ogLocaleAlternate": {
"type": "array",
"items": {
"type": "string"
},
"nullable": true
},
"ogSiteName": {
"type": "string",
"nullable": true
},
"ogVideo": {
"type": "string",
"nullable": true
},
"dctermsCreated": {
"type": "string",
"nullable": true
},
"dcDateCreated": {
"type": "string",
"nullable": true
},
"dcDate": {
"type": "string",
"nullable": true
},
"dctermsType": {
"type": "string",
"nullable": true
},
"dcType": {
"type": "string",
"nullable": true
},
"dctermsAudience": {
"type": "string",
"nullable": true
},
"dctermsSubject": {
"type": "string",
"nullable": true
},
"dcSubject": {
"type": "string",
"nullable": true
},
"dcDescription": {
"type": "string",
"nullable": true
},
"dctermsKeywords": {
"type": "string",
"nullable": true
},
"modifiedTime": {
"type": "string",
"nullable": true
},
"publishedTime": {
"type": "string",
"nullable": true
},
"articleTag": {
"type": "string",
"nullable": true
},
"articleSection": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
@ -647,6 +554,7 @@
"nullable": true,
"description": "The error message of the page"
}
}
},
"llm_extraction": {
@ -694,118 +602,13 @@
"type": "string",
"nullable": true
},
"keywords": {
"type": "string",
"nullable": true
},
"robots": {
"type": "string",
"nullable": true
},
"ogTitle": {
"type": "string",
"nullable": true
},
"ogDescription": {
"type": "string",
"nullable": true
},
"ogUrl": {
"type": "string",
"format": "uri",
"nullable": true
},
"ogImage": {
"type": "string",
"nullable": true
},
"ogAudio": {
"type": "string",
"nullable": true
},
"ogDeterminer": {
"type": "string",
"nullable": true
},
"ogLocale": {
"type": "string",
"nullable": true
},
"ogLocaleAlternate": {
"type": "array",
"items": {
"type": "string"
},
"nullable": true
},
"ogSiteName": {
"type": "string",
"nullable": true
},
"ogVideo": {
"type": "string",
"nullable": true
},
"dctermsCreated": {
"type": "string",
"nullable": true
},
"dcDateCreated": {
"type": "string",
"nullable": true
},
"dcDate": {
"type": "string",
"nullable": true
},
"dctermsType": {
"type": "string",
"nullable": true
},
"dcType": {
"type": "string",
"nullable": true
},
"dctermsAudience": {
"type": "string",
"nullable": true
},
"dctermsSubject": {
"type": "string",
"nullable": true
},
"dcSubject": {
"type": "string",
"nullable": true
},
"dcDescription": {
"type": "string",
"nullable": true
},
"dctermsKeywords": {
"type": "string",
"nullable": true
},
"modifiedTime": {
"type": "string",
"nullable": true
},
"publishedTime": {
"type": "string",
"nullable": true
},
"articleTag": {
"type": "string",
"nullable": true
},
"articleSection": {
"type": "string",
"nullable": true
},
"sourceURL": {
"type": "string",
"format": "uri"
},
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": {
"type": "integer",
"description": "The status code of the page"
@ -878,4 +681,4 @@
"bearerAuth": []
}
]
}
}

View File

@ -21,6 +21,7 @@ export type PageOptions = {
replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean;
removeTags?: string | string[];
onlyIncludeTags?: string | string[];
};
export type ExtractorOptions = {

View File

@ -100,4 +100,76 @@ describe('removeUnwantedElements', () => {
expect(result).not.toContain('id="remove-this"');
expect(result).toContain('class="keep"');
});
it('should only include specified tags', () => {
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main>Main Content</main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should handle multiple specified tags', () => {
const html = `<div><header>Header Content</header><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['header', 'main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<header>Header Content</header>');
expect(result).toContain('<main>Main Content</main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should handle nested specified tags', () => {
const html = `<div><main><section>Main Section</section></main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main><section>Main Section</section></main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should not handle no specified tags, return full content', () => {
const html = `<html><body><div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div></body></html>`;
const options: PageOptions = { onlyIncludeTags: [] };
const result = removeUnwantedElements(html, options);
expect(result).toBe(html);
});
it('should handle specified tags as a string', () => {
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: 'main' };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main>Main Content</main>');
expect(result).not.toContain('<aside>');
expect(result).not.toContain('<footer>');
});
it('should include specified tags with class', () => {
const html = `<div><main class="main-content">Main Content</main><aside class="remove">Remove</aside><footer class="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['.main-content', '.footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main class="main-content">Main Content</main>');
expect(result).toContain('<footer class="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside class="remove">');
});
it('should include specified tags with id', () => {
const html = `<div><main id="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['#main-content', '#footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main id="main-content">Main Content</main>');
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside id="remove">');
});
it('should include specified tags with mixed class and id', () => {
const html = `<div><main class="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['.main-content', '#footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main class="main-content">Main Content</main>');
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside id="remove">');
});
});

View File

@ -2,31 +2,51 @@ import cheerio, { AnyNode, Cheerio } from "cheerio";
import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags";
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => {
export const removeUnwantedElements = (
html: string,
pageOptions: PageOptions
) => {
const soup = cheerio.load(html);
// Allow-list mode: when `onlyIncludeTags` is provided and non-empty, the result
// is assembled from the matching elements alone rather than by pruning `soup`.
if (pageOptions.onlyIncludeTags) {
// Normalise a single selector string to a one-element array. Note that the
// array is written back onto the caller's `pageOptions` object.
if (typeof pageOptions.onlyIncludeTags === "string") {
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
}
// An empty list skips this branch entirely, so execution continues with the
// statements that follow this block.
if (pageOptions.onlyIncludeTags.length !== 0) {
// Create a new root element to hold the tags to keep
const newRoot = cheerio.load("<div></div>")("div");
// Selectors are processed in the order given, and every element a selector
// matches in `soup` is cloned and appended to the new root. No
// de-duplication is done: an element matched by two selectors is appended
// twice.
pageOptions.onlyIncludeTags.forEach((tag) => {
soup(tag).each((index, element) => {
newRoot.append(soup(element).clone());
});
});
// Early return: none of the statements after this block run in allow-list
// mode. `.html()` is called on the wrapper selection — presumably yielding
// only its contents, not the wrapper <div> itself (cheerio semantics; confirm).
return newRoot.html();
}
}
soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') {
if (typeof pageOptions.removeTags === "string") {
pageOptions.removeTags = [pageOptions.removeTags];
}
if (Array.isArray(pageOptions.removeTags)) {
pageOptions.removeTags.forEach((tag) => {
let elementsToRemove: Cheerio<AnyNode>;
if (tag.startsWith("*") && tag.endsWith("*")) {
let classMatch = false;
const regexPattern = new RegExp(tag.slice(1, -1), 'i');
elementsToRemove = soup('*').filter((i, element) => {
if (element.type === 'tag') {
const regexPattern = new RegExp(tag.slice(1, -1), "i");
elementsToRemove = soup("*").filter((i, element) => {
if (element.type === "tag") {
const attributes = element.attribs;
const tagNameMatches = regexPattern.test(element.name);
const attributesMatch = Object.keys(attributes).some(attr =>
const attributesMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`${attr}="${attributes[attr]}"`)
);
if (tag.startsWith('*.')) {
classMatch = Object.keys(attributes).some(attr =>
if (tag.startsWith("*.")) {
classMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`class="${attributes[attr]}"`)
);
}
@ -41,7 +61,7 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
});
}
}
if (pageOptions.onlyMainContent) {
excludeNonMainTags.forEach((tag) => {
const elementsToRemove = soup(tag);
@ -50,4 +70,4 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
}
const cleanedHtml = soup.html();
return cleanedHtml;
};
};