feat(blocklist): Improve URL blocking with tldts parsing

This commit is contained in:
Ademílson F. Tonato 2025-01-31 13:14:48 +00:00
parent 948f7866df
commit 24e8aaf6b5
No known key found for this signature in database
GPG Key ID: 169C7BE271C9FA3A
4 changed files with 111 additions and 115 deletions

View File

@ -121,6 +121,7 @@
"scrapingbee": "^1.7.4", "scrapingbee": "^1.7.4",
"stripe": "^16.1.0", "stripe": "^16.1.0",
"systeminformation": "^5.22.11", "systeminformation": "^5.22.11",
"tldts": "^6.1.75",
"turndown": "^7.1.3", "turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2", "turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4", "typesense": "^1.5.4",
@ -142,4 +143,4 @@
"temp" "temp"
] ]
} }
} }

View File

@ -203,6 +203,9 @@ importers:
systeminformation: systeminformation:
specifier: ^5.22.11 specifier: ^5.22.11
version: 5.22.11 version: 5.22.11
tldts:
specifier: ^6.1.75
version: 6.1.75
turndown: turndown:
specifier: ^7.1.3 specifier: ^7.1.3
version: 7.2.0 version: 7.2.0
@ -4286,6 +4289,13 @@ packages:
through@2.3.8: through@2.3.8:
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==} resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
tldts-core@6.1.75:
resolution: {integrity: sha512-AOvV5YYIAFFBfransBzSTyztkc3IMfz5Eq3YluaRiEu55nn43Fzaufx70UqEKYr8BoLCach4q8g/bg6e5+/aFw==}
tldts@6.1.75:
resolution: {integrity: sha512-+lFzEXhpl7JXgWYaXcB6DqTYXbUArvrWAE/5ioq/X3CdWLbDjpPP4XTrQBmEJ91y3xbe4Fkw7Lxv4P3GWeJaNg==}
hasBin: true
tmpl@1.0.5: tmpl@1.0.5:
resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==} resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==}
@ -9612,6 +9622,12 @@ snapshots:
through@2.3.8: {} through@2.3.8: {}
tldts-core@6.1.75: {}
tldts@6.1.75:
dependencies:
tldts-core: 6.1.75
tmpl@1.0.5: {} tmpl@1.0.5: {}
to-fast-properties@2.0.0: {} to-fast-properties@2.0.0: {}

View File

@ -1,94 +1,53 @@
import { isUrlBlocked } from "../blocklist"; import { isUrlBlocked } from "../blocklist";
// Spec for isUrlBlocked: each test feeds a group of URLs / hostnames to the
// function and checks that every one of them yields the same verdict.
describe("isUrlBlocked function", () => {
  // Asserts that every given input is reported as blocked.
  const expectBlocked = (...inputs: string[]): void => {
    for (const input of inputs) {
      expect(isUrlBlocked(input)).toBe(true);
    }
  };

  // Asserts that every given input is reported as not blocked.
  const expectAllowed = (...inputs: string[]): void => {
    for (const input of inputs) {
      expect(isUrlBlocked(input)).toBe(false);
    }
  };

  test("Blocks exact domain facebook.com", () => {
    expectBlocked("facebook.com", "http://facebook.com", "https://facebook.com");
  });

  test("Blocks subdomains of facebook.com", () => {
    expectBlocked("www.facebook.com", "ads.facebook.com", "business.facebook.com");
  });

  test("Blocks different TLDs (facebook.pt, facebook.io)", () => {
    expectBlocked(
      "facebook.pt",
      "facebook.io",
      "facebook.co.uk",
      "https://facebook.de",
    );
  });

  test("Allows unrelated domains like whateverfacebook.com", () => {
    expectAllowed("whateverfacebook.com", "https://whateverfacebook.com");
  });

  test("Blocks other domains from the blocklist", () => {
    expectBlocked("tiktok.com", "www.tiktok.com", "reddit.com", "youtube.com");
  });

  test("Allows allowed keywords URLs", () => {
    expectAllowed(
      "https://www.facebook.com/ads/library",
      "https://developers.facebook.com",
      "https://library.tiktok.com",
    );
  });

  test("Handles URLs with and without protocols", () => {
    expectBlocked(
      "facebook.com",
      "http://facebook.com",
      "https://facebook.com",
      "www.facebook.com",
    );
  });

  test("Should return false if the URL is invalid", () => {
    expectAllowed("randomstring", "htp://bad.url", "");
  });
});

View File

@ -1,6 +1,7 @@
import { logger } from "../../../lib/logger";
import crypto from "crypto";
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import crypto from "crypto";
import { parse } from "tldts";
configDotenv(); configDotenv();
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8"); const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
@ -66,7 +67,7 @@ const urlBlocklist = [
"KKttwRz4w+AMJrZcB828WQ==", "KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==", "vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==", "l8GDVI8w/ueHnNzdN1ODuQ==",
"+yz9bnYYMnC0trJZGJwf6Q==" "+yz9bnYYMnC0trJZGJwf6Q==",
]; ];
const decryptedBlocklist = const decryptedBlocklist =
@ -100,38 +101,57 @@ const allowedKeywords = [
]; ];
/**
 * Decides whether a URL (or bare hostname) belongs to a blocklisted site.
 *
 * The input is trimmed and lower-cased, then parsed with tldts. The decision
 * is made in this order:
 *  1. No domain could be extracted from the input -> not blocked.
 *  2. The input contains any entry of `allowedKeywords` -> not blocked.
 *  3. The extracted domain equals a blocklist entry, or ends with
 *     `.<entry>` -> blocked.
 *  4. The first label of the extracted domain is also the first label of a
 *     different blocklist entry (same name, other TLD) -> blocked.
 *  5. Otherwise -> not blocked.
 *
 * @param url - URL or hostname to check, with or without a protocol.
 * @returns `true` when the URL is blocked, `false` otherwise (including for
 *   empty or unparseable input).
 */
export function isUrlBlocked(url: string): boolean {
  const lowerCaseUrl = url.trim().toLowerCase();

  // Parsing is kept inside try/catch as a defensive measure so that a parser
  // failure is treated as "not blocked" rather than propagating to the caller.
  let parsedUrl: ReturnType<typeof parse>;
  try {
    parsedUrl = parse(lowerCaseUrl);
  } catch {
    console.log("Error parsing URL:", url);
    return false;
  }

  const { domain, publicSuffix } = parsedUrl;

  // Empty strings, bare words and similar inputs yield no domain.
  if (!domain) {
    return false;
  }

  // Allowed keywords win over the blocklist.
  // NOTE(review): this is a plain substring match against the whole input, so
  // a short keyword can match inside a longer word — confirm that the entries
  // of `allowedKeywords` are specific enough for that to be acceptable.
  const hasAllowedKeyword = allowedKeywords.some((keyword) =>
    lowerCaseUrl.includes(keyword.toLowerCase()),
  );
  if (hasAllowedKeyword) {
    return false;
  }

  // Exact blocklist hit, or the domain sits underneath a blocklist entry.
  const isListedDomain = decryptedBlocklist.some(
    (blocked) => domain === blocked || domain.endsWith(`.${blocked}`),
  );
  if (isListedDomain) {
    return true;
  }

  // Same name under a different TLD (e.g. "facebook.pt" when "facebook.com" is
  // listed). The comparison is anchored at the label boundary ("facebook.") so
  // that a domain whose first label is merely a prefix of a listed name
  // (e.g. "face.com") is no longer blocked by accident.
  const baseDomain = domain.split(".")[0];
  return (
    Boolean(publicSuffix) &&
    decryptedBlocklist.some(
      (blocked) => blocked !== domain && blocked.startsWith(`${baseDomain}.`),
    )
  );
}