diff --git a/apps/api/openapi.json b/apps/api/openapi.json index 17b36777..a466e50b 100644 --- a/apps/api/openapi.json +++ b/apps/api/openapi.json @@ -68,9 +68,21 @@ }, "description": "Tags, classes and ids to remove from the page. Use comma separated values. Example: 'script, .ad, #footer'" }, + "onlyIncludeTags": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Only include tags, classes and ids from the page in the final output. Use comma separated values. Example: 'script, .ad, #footer'" + }, "headers": { "type": "object", "description": "Headers to send with the request. Can be used to send cookies, user-agent, etc." + }, + "replaceAllPathsWithAbsolutePaths": { + "type": "boolean", + "description": "Replace all relative paths with absolute paths for images and links", + "default": false } } }, @@ -184,7 +196,7 @@ }, "maxDepth": { "type": "integer", - "description": "Maximum depth to crawl. Depth 1 is the base URL, depth 2 is the base URL and its direct children, and so on." + "description": "Maximum depth to crawl relative to the entered URL. A maxDepth of 0 scrapes only the entered URL. A maxDepth of 1 scrapes the entered URL and all pages one level deep. A maxDepth of 2 scrapes the entered URL and all pages up to two levels deep. Higher values follow the same pattern." 
}, "mode": { "type": "string", @@ -511,7 +523,7 @@ "html": { "type": "string", "nullable": true, - "description": "Raw HTML content of the page if `includeHtml` is true" + "description": "Raw HTML content of the page if `includeHtml` is true" }, "metadata": { "type": "object", @@ -526,118 +538,13 @@ "type": "string", "nullable": true }, - "keywords": { - "type": "string", - "nullable": true - }, - "robots": { - "type": "string", - "nullable": true - }, - "ogTitle": { - "type": "string", - "nullable": true - }, - "ogDescription": { - "type": "string", - "nullable": true - }, - "ogUrl": { - "type": "string", - "format": "uri", - "nullable": true - }, - "ogImage": { - "type": "string", - "nullable": true - }, - "ogAudio": { - "type": "string", - "nullable": true - }, - "ogDeterminer": { - "type": "string", - "nullable": true - }, - "ogLocale": { - "type": "string", - "nullable": true - }, - "ogLocaleAlternate": { - "type": "array", - "items": { - "type": "string" - }, - "nullable": true - }, - "ogSiteName": { - "type": "string", - "nullable": true - }, - "ogVideo": { - "type": "string", - "nullable": true - }, - "dctermsCreated": { - "type": "string", - "nullable": true - }, - "dcDateCreated": { - "type": "string", - "nullable": true - }, - "dcDate": { - "type": "string", - "nullable": true - }, - "dctermsType": { - "type": "string", - "nullable": true - }, - "dcType": { - "type": "string", - "nullable": true - }, - "dctermsAudience": { - "type": "string", - "nullable": true - }, - "dctermsSubject": { - "type": "string", - "nullable": true - }, - "dcSubject": { - "type": "string", - "nullable": true - }, - "dcDescription": { - "type": "string", - "nullable": true - }, - "dctermsKeywords": { - "type": "string", - "nullable": true - }, - "modifiedTime": { - "type": "string", - "nullable": true - }, - "publishedTime": { - "type": "string", - "nullable": true - }, - "articleTag": { - "type": "string", - "nullable": true - }, - "articleSection": { - "type": "string", - 
"nullable": true - }, "sourceURL": { "type": "string", "format": "uri" }, + " ": { + "type": "string" + }, "pageStatusCode": { "type": "integer", "description": "The status code of the page" @@ -647,6 +554,7 @@ "nullable": true, "description": "The error message of the page" } + } }, "llm_extraction": { @@ -694,118 +602,13 @@ "type": "string", "nullable": true }, - "keywords": { - "type": "string", - "nullable": true - }, - "robots": { - "type": "string", - "nullable": true - }, - "ogTitle": { - "type": "string", - "nullable": true - }, - "ogDescription": { - "type": "string", - "nullable": true - }, - "ogUrl": { - "type": "string", - "format": "uri", - "nullable": true - }, - "ogImage": { - "type": "string", - "nullable": true - }, - "ogAudio": { - "type": "string", - "nullable": true - }, - "ogDeterminer": { - "type": "string", - "nullable": true - }, - "ogLocale": { - "type": "string", - "nullable": true - }, - "ogLocaleAlternate": { - "type": "array", - "items": { - "type": "string" - }, - "nullable": true - }, - "ogSiteName": { - "type": "string", - "nullable": true - }, - "ogVideo": { - "type": "string", - "nullable": true - }, - "dctermsCreated": { - "type": "string", - "nullable": true - }, - "dcDateCreated": { - "type": "string", - "nullable": true - }, - "dcDate": { - "type": "string", - "nullable": true - }, - "dctermsType": { - "type": "string", - "nullable": true - }, - "dcType": { - "type": "string", - "nullable": true - }, - "dctermsAudience": { - "type": "string", - "nullable": true - }, - "dctermsSubject": { - "type": "string", - "nullable": true - }, - "dcSubject": { - "type": "string", - "nullable": true - }, - "dcDescription": { - "type": "string", - "nullable": true - }, - "dctermsKeywords": { - "type": "string", - "nullable": true - }, - "modifiedTime": { - "type": "string", - "nullable": true - }, - "publishedTime": { - "type": "string", - "nullable": true - }, - "articleTag": { - "type": "string", - "nullable": true - }, - "articleSection": 
{ - "type": "string", - "nullable": true - }, "sourceURL": { "type": "string", "format": "uri" }, + "<any other metadata> ": { + "type": "string" + }, "pageStatusCode": { "type": "integer", "description": "The status code of the page" @@ -878,4 +681,4 @@ "bearerAuth": [] } ] -} +} \ No newline at end of file diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 12d8c366..2f43b9a4 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -21,6 +21,7 @@ export type PageOptions = { replaceAllPathsWithAbsolutePaths?: boolean; parsePDF?: boolean; removeTags?: string | string[]; + onlyIncludeTags?: string | string[]; }; export type ExtractorOptions = { diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts index 0dc24c89..18438df2 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/removeUnwantedElements.test.ts @@ -100,4 +100,76 @@ describe('removeUnwantedElements', () => { expect(result).not.toContain('id="remove-this"'); expect(result).toContain('class="keep"'); }); + + it('should only include specified tags', () => { + const html = `
Main Content
Footer Content
`; + const options: PageOptions = { onlyIncludeTags: ['main', 'footer'] }; + const result = removeUnwantedElements(html, options); + expect(result).toContain('
Main Content
'); + expect(result).toContain('
Footer Content
'); + expect(result).not.toContain('