Merge pull request #328 from mendableai/nsc/includeOnlyTags

pageOptions.onlyIncludeTags param
This commit is contained in:
Nicolas 2024-06-26 21:33:10 -03:00 committed by GitHub
commit 017b0b2556
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 127 additions and 231 deletions

View File

@ -68,9 +68,21 @@
}, },
"description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'"
}, },
"onlyIncludeTags": {
"type": "array",
"items": {
"type": "string"
},
"description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'"
},
"headers": { "headers": {
"type": "object", "type": "object",
"description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc."
},
"replaceAllPathsWithAbsolutePaths": {
"type": "boolean",
"description": "Replace all relative paths with absolute paths for images and links",
"default": false
} }
} }
}, },
@ -184,7 +196,7 @@
}, },
"maxDepth": { "maxDepth": {
"type": "integer", "type": "integer",
"description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on." "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern."
}, },
"mode": { "mode": {
"type": "string", "type": "string",
@ -511,7 +523,7 @@
"html": { "html": {
"type": "string", "type": "string",
"nullable": true, "nullable": true,
"description": "Raw HTML content of the page if `includeHtml` is true" "description": "Raw HTML content of the page if `includeHtml` is true"
}, },
"metadata": { "metadata": {
"type": "object", "type": "object",
@ -526,118 +538,13 @@
"type": "string", "type": "string",
"nullable": true "nullable": true
}, },
"keywords": {
"type": "string",
"nullable": true
},
"robots": {
"type": "string",
"nullable": true
},
"ogTitle": {
"type": "string",
"nullable": true
},
"ogDescription": {
"type": "string",
"nullable": true
},
"ogUrl": {
"type": "string",
"format": "uri",
"nullable": true
},
"ogImage": {
"type": "string",
"nullable": true
},
"ogAudio": {
"type": "string",
"nullable": true
},
"ogDeterminer": {
"type": "string",
"nullable": true
},
"ogLocale": {
"type": "string",
"nullable": true
},
"ogLocaleAlternate": {
"type": "array",
"items": {
"type": "string"
},
"nullable": true
},
"ogSiteName": {
"type": "string",
"nullable": true
},
"ogVideo": {
"type": "string",
"nullable": true
},
"dctermsCreated": {
"type": "string",
"nullable": true
},
"dcDateCreated": {
"type": "string",
"nullable": true
},
"dcDate": {
"type": "string",
"nullable": true
},
"dctermsType": {
"type": "string",
"nullable": true
},
"dcType": {
"type": "string",
"nullable": true
},
"dctermsAudience": {
"type": "string",
"nullable": true
},
"dctermsSubject": {
"type": "string",
"nullable": true
},
"dcSubject": {
"type": "string",
"nullable": true
},
"dcDescription": {
"type": "string",
"nullable": true
},
"dctermsKeywords": {
"type": "string",
"nullable": true
},
"modifiedTime": {
"type": "string",
"nullable": true
},
"publishedTime": {
"type": "string",
"nullable": true
},
"articleTag": {
"type": "string",
"nullable": true
},
"articleSection": {
"type": "string",
"nullable": true
},
"sourceURL": { "sourceURL": {
"type": "string", "type": "string",
"format": "uri" "format": "uri"
}, },
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": { "pageStatusCode": {
"type": "integer", "type": "integer",
"description": "The status code of the page" "description": "The status code of the page"
@ -647,6 +554,7 @@
"nullable": true, "nullable": true,
"description": "The error message of the page" "description": "The error message of the page"
} }
} }
}, },
"llm_extraction": { "llm_extraction": {
@ -694,118 +602,13 @@
"type": "string", "type": "string",
"nullable": true "nullable": true
}, },
"keywords": {
"type": "string",
"nullable": true
},
"robots": {
"type": "string",
"nullable": true
},
"ogTitle": {
"type": "string",
"nullable": true
},
"ogDescription": {
"type": "string",
"nullable": true
},
"ogUrl": {
"type": "string",
"format": "uri",
"nullable": true
},
"ogImage": {
"type": "string",
"nullable": true
},
"ogAudio": {
"type": "string",
"nullable": true
},
"ogDeterminer": {
"type": "string",
"nullable": true
},
"ogLocale": {
"type": "string",
"nullable": true
},
"ogLocaleAlternate": {
"type": "array",
"items": {
"type": "string"
},
"nullable": true
},
"ogSiteName": {
"type": "string",
"nullable": true
},
"ogVideo": {
"type": "string",
"nullable": true
},
"dctermsCreated": {
"type": "string",
"nullable": true
},
"dcDateCreated": {
"type": "string",
"nullable": true
},
"dcDate": {
"type": "string",
"nullable": true
},
"dctermsType": {
"type": "string",
"nullable": true
},
"dcType": {
"type": "string",
"nullable": true
},
"dctermsAudience": {
"type": "string",
"nullable": true
},
"dctermsSubject": {
"type": "string",
"nullable": true
},
"dcSubject": {
"type": "string",
"nullable": true
},
"dcDescription": {
"type": "string",
"nullable": true
},
"dctermsKeywords": {
"type": "string",
"nullable": true
},
"modifiedTime": {
"type": "string",
"nullable": true
},
"publishedTime": {
"type": "string",
"nullable": true
},
"articleTag": {
"type": "string",
"nullable": true
},
"articleSection": {
"type": "string",
"nullable": true
},
"sourceURL": { "sourceURL": {
"type": "string", "type": "string",
"format": "uri" "format": "uri"
}, },
"<any other metadata> ": {
"type": "string"
},
"pageStatusCode": { "pageStatusCode": {
"type": "integer", "type": "integer",
"description": "The status code of the page" "description": "The status code of the page"
@ -878,4 +681,4 @@
"bearerAuth": [] "bearerAuth": []
} }
] ]
} }

View File

@ -21,6 +21,7 @@ export type PageOptions = {
replaceAllPathsWithAbsolutePaths?: boolean; replaceAllPathsWithAbsolutePaths?: boolean;
parsePDF?: boolean; parsePDF?: boolean;
removeTags?: string | string[]; removeTags?: string | string[];
onlyIncludeTags?: string | string[];
}; };
export type ExtractorOptions = { export type ExtractorOptions = {

View File

@ -100,4 +100,76 @@ describe('removeUnwantedElements', () => {
expect(result).not.toContain('id="remove-this"'); expect(result).not.toContain('id="remove-this"');
expect(result).toContain('class="keep"'); expect(result).toContain('class="keep"');
}); });
it('should only include specified tags', () => {
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main>Main Content</main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should handle multiple specified tags', () => {
const html = `<div><header>Header Content</header><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['header', 'main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<header>Header Content</header>');
expect(result).toContain('<main>Main Content</main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should handle nested specified tags', () => {
const html = `<div><main><section>Main Section</section></main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main><section>Main Section</section></main>');
expect(result).toContain('<footer>Footer Content</footer>');
expect(result).not.toContain('<aside>');
});
it('should not handle no specified tags, return full content', () => {
const html = `<html><body><div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div></body></html>`;
const options: PageOptions = { onlyIncludeTags: [] };
const result = removeUnwantedElements(html, options);
expect(result).toBe(html);
});
it('should handle specified tags as a string', () => {
const html = `<div><main>Main Content</main><aside>Remove</aside><footer>Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: 'main' };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main>Main Content</main>');
expect(result).not.toContain('<aside>');
expect(result).not.toContain('<footer>');
});
it('should include specified tags with class', () => {
const html = `<div><main class="main-content">Main Content</main><aside class="remove">Remove</aside><footer class="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['.main-content', '.footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main class="main-content">Main Content</main>');
expect(result).toContain('<footer class="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside class="remove">');
});
it('should include specified tags with id', () => {
const html = `<div><main id="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['#main-content', '#footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main id="main-content">Main Content</main>');
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside id="remove">');
});
it('should include specified tags with mixed class and id', () => {
const html = `<div><main class="main-content">Main Content</main><aside id="remove">Remove</aside><footer id="footer-content">Footer Content</footer></div>`;
const options: PageOptions = { onlyIncludeTags: ['.main-content', '#footer-content'] };
const result = removeUnwantedElements(html, options);
expect(result).toContain('<main class="main-content">Main Content</main>');
expect(result).toContain('<footer id="footer-content">Footer Content</footer>');
expect(result).not.toContain('<aside id="remove">');
});
}); });

View File

@ -2,31 +2,51 @@ import cheerio, { AnyNode, Cheerio } from "cheerio";
import { PageOptions } from "../../../lib/entities"; import { PageOptions } from "../../../lib/entities";
import { excludeNonMainTags } from "./excludeTags"; import { excludeNonMainTags } from "./excludeTags";
export const removeUnwantedElements = (html: string, pageOptions: PageOptions) => { export const removeUnwantedElements = (
html: string,
pageOptions: PageOptions
) => {
const soup = cheerio.load(html); const soup = cheerio.load(html);
if (pageOptions.onlyIncludeTags) {
if (typeof pageOptions.onlyIncludeTags === "string") {
pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags];
}
if (pageOptions.onlyIncludeTags.length !== 0) {
// Create a new root element to hold the tags to keep
const newRoot = cheerio.load("<div></div>")("div");
pageOptions.onlyIncludeTags.forEach((tag) => {
soup(tag).each((index, element) => {
newRoot.append(soup(element).clone());
});
});
return newRoot.html();
}
}
soup("script, style, iframe, noscript, meta, head").remove(); soup("script, style, iframe, noscript, meta, head").remove();
if (pageOptions.removeTags) { if (pageOptions.removeTags) {
if (typeof pageOptions.removeTags === 'string') { if (typeof pageOptions.removeTags === "string") {
pageOptions.removeTags = [pageOptions.removeTags]; pageOptions.removeTags = [pageOptions.removeTags];
} }
if (Array.isArray(pageOptions.removeTags)) { if (Array.isArray(pageOptions.removeTags)) {
pageOptions.removeTags.forEach((tag) => { pageOptions.removeTags.forEach((tag) => {
let elementsToRemove: Cheerio<AnyNode>; let elementsToRemove: Cheerio<AnyNode>;
if (tag.startsWith("*") && tag.endsWith("*")) { if (tag.startsWith("*") && tag.endsWith("*")) {
let classMatch = false; let classMatch = false;
const regexPattern = new RegExp(tag.slice(1, -1), 'i'); const regexPattern = new RegExp(tag.slice(1, -1), "i");
elementsToRemove = soup('*').filter((i, element) => { elementsToRemove = soup("*").filter((i, element) => {
if (element.type === 'tag') { if (element.type === "tag") {
const attributes = element.attribs; const attributes = element.attribs;
const tagNameMatches = regexPattern.test(element.name); const tagNameMatches = regexPattern.test(element.name);
const attributesMatch = Object.keys(attributes).some(attr => const attributesMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`${attr}="${attributes[attr]}"`) regexPattern.test(`${attr}="${attributes[attr]}"`)
); );
if (tag.startsWith('*.')) { if (tag.startsWith("*.")) {
classMatch = Object.keys(attributes).some(attr => classMatch = Object.keys(attributes).some((attr) =>
regexPattern.test(`class="${attributes[attr]}"`) regexPattern.test(`class="${attributes[attr]}"`)
); );
} }
@ -41,7 +61,7 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
}); });
} }
} }
if (pageOptions.onlyMainContent) { if (pageOptions.onlyMainContent) {
excludeNonMainTags.forEach((tag) => { excludeNonMainTags.forEach((tag) => {
const elementsToRemove = soup(tag); const elementsToRemove = soup(tag);
@ -50,4 +70,4 @@ export const removeUnwantedElements = (html: string, pageOptions: PageOptions) =
} }
const cleanedHtml = soup.html(); const cleanedHtml = soup.html();
return cleanedHtml; return cleanedHtml;
}; };