diff --git a/apps/api/package.json b/apps/api/package.json index 8c92746f..1c554728 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -121,6 +121,7 @@ "scrapingbee": "^1.7.4", "stripe": "^16.1.0", "systeminformation": "^5.22.11", + "tldts": "^6.1.75", "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", "typesense": "^1.5.4", @@ -142,4 +143,4 @@ "temp" ] } -} \ No newline at end of file +} diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index 7478cef9..f66dfe72 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -203,6 +203,9 @@ importers: systeminformation: specifier: ^5.22.11 version: 5.22.11 + tldts: + specifier: ^6.1.75 + version: 6.1.75 turndown: specifier: ^7.1.3 version: 7.2.0 @@ -4286,6 +4289,13 @@ packages: through@2.3.8: resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==} + tldts-core@6.1.75: + resolution: {integrity: sha512-AOvV5YYIAFFBfransBzSTyztkc3IMfz5Eq3YluaRiEu55nn43Fzaufx70UqEKYr8BoLCach4q8g/bg6e5+/aFw==} + + tldts@6.1.75: + resolution: {integrity: sha512-+lFzEXhpl7JXgWYaXcB6DqTYXbUArvrWAE/5ioq/X3CdWLbDjpPP4XTrQBmEJ91y3xbe4Fkw7Lxv4P3GWeJaNg==} + hasBin: true + tmpl@1.0.5: resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==} @@ -9612,6 +9622,12 @@ snapshots: through@2.3.8: {} + tldts-core@6.1.75: {} + + tldts@6.1.75: + dependencies: + tldts-core: 6.1.75 + tmpl@1.0.5: {} to-fast-properties@2.0.0: {} diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts index d3963685..8144f2c8 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts @@ -1,94 +1,53 @@ import { isUrlBlocked } from "../blocklist"; -describe("Blocklist Functionality", () => { - describe("isUrlBlocked", () => { - test.each([ - "https://facebook.com/fake-test", - "https://x.com/user-profile", - "https://twitter.com/home", - "https://instagram.com/explore", - "https://linkedin.com/in/johndoe", - "https://snapchat.com/add/johndoe", - "https://tiktok.com/@johndoe", - "https://reddit.com/r/funny", - "https://tumblr.com/dashboard", - "https://flickr.com/photos/johndoe", - "https://whatsapp.com/download", - "https://wechat.com/features", - "https://telegram.org/apps", - ])("should return true for blocklisted URL %s", (url) => { - expect(isUrlBlocked(url)).toBe(true); - }); - - test.each([ - "https://facebook.com/policy", - "https://twitter.com/tos", - "https://instagram.com/about/legal/terms", - "https://linkedin.com/legal/privacy-policy", - "https://pinterest.com/about/privacy", - "https://snapchat.com/legal/terms", - "https://tiktok.com/legal/privacy-policy", - "https://reddit.com/policies", - "https://tumblr.com/policy/en/privacy", - "https://flickr.com/help/terms", - "https://whatsapp.com/legal", - "https://wechat.com/en/privacy-policy", - "https://telegram.org/tos", - ])("should return false for allowed URLs with keywords %s", (url) => { - expect(isUrlBlocked(url)).toBe(false); - }); - - test("should return false for non-blocklisted domain", () => { - const url = "https://example.com"; - expect(isUrlBlocked(url)).toBe(false); - }); - - test("should handle invalid URLs gracefully", () => { - const url = "htp://invalid-url"; - expect(isUrlBlocked(url)).toBe(false); - }); +describe("isUrlBlocked function", () => { + test("Blocks exact domain facebook.com", () => { + expect(isUrlBlocked("facebook.com")).toBe(true); + expect(isUrlBlocked("http://facebook.com")).toBe(true); + expect(isUrlBlocked("https://facebook.com")).toBe(true); }); - test.each([ - "https://subdomain.facebook.com", - "https://facebook.com.someotherdomain.com", - "https://www.facebook.com/profile", - "https://api.twitter.com/info", - "https://instagram.com/accounts/login", - ])( - "should return true for URLs with blocklisted domains in subdomains or paths %s", - (url) => { - expect(isUrlBlocked(url)).toBe(true); - }, - ); - - test.each([ - "https://example.com/facebook.com", - "https://example.com/redirect?url=https://twitter.com", - "https://facebook.com.policy.example.com", - ])( - "should return false for URLs where blocklisted domain is part of another domain or path %s", - (url) => { - expect(isUrlBlocked(url)).toBe(false); - }, - ); - - test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])( - "should handle case variations %s", - (url) => { - expect(isUrlBlocked(url)).toBe(true); - }, - ); - - test.each([ - "https://facebook.com?redirect=https://example.com", - "https://twitter.com?query=something", - ])("should handle query parameters %s", (url) => { - expect(isUrlBlocked(url)).toBe(true); + test("Blocks subdomains of facebook.com", () => { + expect(isUrlBlocked("www.facebook.com")).toBe(true); + expect(isUrlBlocked("ads.facebook.com")).toBe(true); + expect(isUrlBlocked("business.facebook.com")).toBe(true); }); - test("should handle internationalized domain names", () => { - const url = "https://xn--d1acpjx3f.xn--p1ai"; - expect(isUrlBlocked(url)).toBe(false); + test("Blocks different TLDs (facebook.pt, facebook.io)", () => { + expect(isUrlBlocked("facebook.pt")).toBe(true); + expect(isUrlBlocked("facebook.io")).toBe(true); + expect(isUrlBlocked("facebook.co.uk")).toBe(true); + expect(isUrlBlocked("https://facebook.de")).toBe(true); + }); + + test("Allows unrelated domains like whateverfacebook.com", () => { + expect(isUrlBlocked("whateverfacebook.com")).toBe(false); + expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false); + }); + + test("Blocks other domains from the blocklist", () => { + expect(isUrlBlocked("tiktok.com")).toBe(true); + expect(isUrlBlocked("www.tiktok.com")).toBe(true); + expect(isUrlBlocked("reddit.com")).toBe(true); + expect(isUrlBlocked("youtube.com")).toBe(true); + }); + + test("Allows allowed keywords URLs", () => { + expect(isUrlBlocked("https://www.facebook.com/ads/library")).toBe(false); + expect(isUrlBlocked("https://developers.facebook.com")).toBe(false); + expect(isUrlBlocked("https://library.tiktok.com")).toBe(false); + }); + + test("Handles URLs with and without protocols", () => { + expect(isUrlBlocked("facebook.com")).toBe(true); + expect(isUrlBlocked("http://facebook.com")).toBe(true); + expect(isUrlBlocked("https://facebook.com")).toBe(true); + expect(isUrlBlocked("www.facebook.com")).toBe(true); + }); + + test("Should return false if the URL is invalid", () => { + expect(isUrlBlocked("randomstring")).toBe(false); + expect(isUrlBlocked("htp://bad.url")).toBe(false); + expect(isUrlBlocked("")).toBe(false); }); }); diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 691af596..f1e28892 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -1,6 +1,7 @@ -import { logger } from "../../../lib/logger"; -import crypto from "crypto"; import { configDotenv } from "dotenv"; +import crypto from "crypto"; +import { parse } from "tldts"; + configDotenv(); const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8"); @@ -66,7 +67,7 @@ const urlBlocklist = [ "KKttwRz4w+AMJrZcB828WQ==", "vMdzZ33BXoyWVZnAPOBcrg==", "l8GDVI8w/ueHnNzdN1ODuQ==", - "+yz9bnYYMnC0trJZGJwf6Q==" + "+yz9bnYYMnC0trJZGJwf6Q==", ]; const decryptedBlocklist = @@ -100,38 +101,57 @@ const allowedKeywords = [ ]; export function isUrlBlocked(url: string): boolean { - const lowerCaseUrl = url.toLowerCase(); + const lowerCaseUrl = url.trim().toLowerCase(); - // Check if the URL contains any allowed keywords as whole words + const decryptedUrl = + decryptedBlocklist.find((decrypted) => lowerCaseUrl === decrypted) || + lowerCaseUrl; + + // If the URL is empty or invalid, return false + let parsedUrl; + try { + parsedUrl = parse(decryptedUrl); + } catch { + console.log("Error parsing URL:", url); + return false; + } + + const domain = parsedUrl.domain; + const publicSuffix = parsedUrl.publicSuffix; + + if (!domain) { + return false; + } + + // Check if URL contains any allowed keyword if ( allowedKeywords.some((keyword) => - new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl), + lowerCaseUrl.includes(keyword.toLowerCase()), ) ) { return false; } - try { - if (!url.startsWith("http://") && !url.startsWith("https://")) { - url = "https://" + url; - } - - const urlObj = new URL(url); - const hostname = urlObj.hostname.toLowerCase(); - - // Check if the URL matches any domain in the blocklist - const isBlocked = decryptedBlocklist.some((domain) => { - const domainPattern = new RegExp( - `(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`, - "i", - ); - return domainPattern.test(hostname); - }); - - return isBlocked; - } catch (e) { - // If an error occurs (e.g., invalid URL), return false - logger.error(`Error parsing the following URL: ${url}`); - return false; + // Block exact matches + if (decryptedBlocklist.includes(domain)) { + return true; } + + // Block subdomains + if (decryptedBlocklist.some((blocked) => domain.endsWith(`.${blocked}`))) { + return true; + } + + // Block different TLDs of the same base domain + const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com") + if ( + publicSuffix && + decryptedBlocklist.some( + (blocked) => blocked.startsWith(baseDomain) && blocked !== domain, + ) + ) { + return true; + } + + return false; }