Merge pull request #1117 from mendableai/feat/blocked-domains

feat(blocklist): Improve URL blocking with tldts parsing
This commit is contained in:
Ademílson Tonato 2025-02-06 15:23:58 +00:00 committed by GitHub
commit 996352c24c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 227 additions and 126 deletions

View File

@ -121,6 +121,7 @@
"scrapingbee": "^1.7.4",
"stripe": "^16.1.0",
"systeminformation": "^5.22.11",
"tldts": "^6.1.75",
"turndown": "^7.1.3",
"turndown-plugin-gfm": "^1.0.2",
"typesense": "^1.5.4",

View File

@ -203,6 +203,9 @@ importers:
systeminformation:
specifier: ^5.22.11
version: 5.22.11
tldts:
specifier: ^6.1.75
version: 6.1.75
turndown:
specifier: ^7.1.3
version: 7.2.0
@ -4286,6 +4289,13 @@ packages:
through@2.3.8:
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
tldts-core@6.1.75:
resolution: {integrity: sha512-AOvV5YYIAFFBfransBzSTyztkc3IMfz5Eq3YluaRiEu55nn43Fzaufx70UqEKYr8BoLCach4q8g/bg6e5+/aFw==}
tldts@6.1.75:
resolution: {integrity: sha512-+lFzEXhpl7JXgWYaXcB6DqTYXbUArvrWAE/5ioq/X3CdWLbDjpPP4XTrQBmEJ91y3xbe4Fkw7Lxv4P3GWeJaNg==}
hasBin: true
tmpl@1.0.5:
resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==}
@ -9612,6 +9622,12 @@ snapshots:
through@2.3.8: {}
tldts-core@6.1.75: {}
tldts@6.1.75:
dependencies:
tldts-core: 6.1.75
tmpl@1.0.5: {}
to-fast-properties@2.0.0: {}

View File

@ -40,7 +40,7 @@ to determine their relevance to the user's query and intent.
}
/**
 * Builds the user-side prompt for the link reranker.
 *
 * The prompt asks the model to score each candidate URL (0-1) against the
 * user's extraction request and to keep only links scoring 0.6 or higher.
 *
 * @param searchQuery - The user's extraction request; interpolated verbatim
 *   between double quotes in the prompt.
 * @returns The complete prompt text.
 */
export function buildRerankerUserPrompt(searchQuery: string): string {
  // Single return with the corrected wording ("relevancy"); the earlier
  // misspelled variant used to shadow this one and made it unreachable.
  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
}
// Multi entity schema analyzer

View File

@ -1,94 +1,157 @@
import { isUrlBlocked } from "../blocklist";
import { decryptAES, isUrlBlocked } from "../blocklist";
describe("Blocklist Functionality", () => {
describe("isUrlBlocked", () => {
test.each([
"https://facebook.com/fake-test",
"https://x.com/user-profile",
"https://twitter.com/home",
"https://instagram.com/explore",
"https://linkedin.com/in/johndoe",
"https://snapchat.com/add/johndoe",
"https://tiktok.com/@johndoe",
"https://reddit.com/r/funny",
"https://tumblr.com/dashboard",
"https://flickr.com/photos/johndoe",
"https://whatsapp.com/download",
"https://wechat.com/features",
"https://telegram.org/apps",
])("should return true for blocklisted URL %s", (url) => {
expect(isUrlBlocked(url)).toBe(true);
});
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
test.each([
"https://facebook.com/policy",
"https://twitter.com/tos",
"https://instagram.com/about/legal/terms",
"https://linkedin.com/legal/privacy-policy",
"https://pinterest.com/about/privacy",
"https://snapchat.com/legal/terms",
"https://tiktok.com/legal/privacy-policy",
"https://reddit.com/policies",
"https://tumblr.com/policy/en/privacy",
"https://flickr.com/help/terms",
"https://whatsapp.com/legal",
"https://wechat.com/en/privacy-policy",
"https://telegram.org/tos",
])("should return false for allowed URLs with keywords %s", (url) => {
expect(isUrlBlocked(url)).toBe(false);
});
test("should return false for non-blocklisted domain", () => {
const url = "https://example.com";
expect(isUrlBlocked(url)).toBe(false);
});
test("should handle invalid URLs gracefully", () => {
const url = "htp://invalid-url";
expect(isUrlBlocked(url)).toBe(false);
});
describe("isUrlBlocked function", () => {
beforeAll(() => {
// Mock the decryptedBlocklist function to return known values
jest
.spyOn(require("../blocklist"), "decryptedBlocklist")
.mockReturnValue([
"h8ngAFXUNLO3ZqQufJjGVA==",
"fEGiDm/TWDBkXUXejFVICg==",
"l6Mei7IGbEmTTFoSudUnqQ==",
"4OjallJzXRiZUAWDiC2Xww==",
"ReSvkSfx34TNEdecmmSDdQ==",
"X1E4WtdmXAv3SAX9xN925Q==",
"VTzBQfMtXZzM05mnNkWkjA==",
"m/q4Lb2Z8cxwU7/CoztOFg==",
"UbVnmRaeG+gKcyVDLAm0vg==",
"xNQhczYG22tTVc6lYE3qwg==",
"CQfGDydbg4l1swRCru6O6Q==",
"l86LQxm2NonTWMauXwEsPw==",
"6v4QDUcwjnID80G+uU+tgw==",
"pCF/6nrKZAxaYntzEGluZQ==",
"r0CRhAmQqSe7V2s3073T00sAh4WcS5779jwuGJ26ows==",
"aBOVqRFBM4UVg33usY10NdiF0HCnFH/ImtD0n+zIpc8==",
"QV436UZuQ6D0Dqrx9MwaGw==",
"OYVvrwILYbzA2mSSqOPPpw==",
"xW2i4C0Dzcnp+qu12u0SAw==",
"OLHba209l0dfl0MI4EnQonBITK9z8Qwgd/NsuaTkXmA=",
"X0VynmNjpL3PrYxpUIG7sFMBt8OlrmQWtxj8oXVu2QM=",
"ObdlM5NEkvBJ/sojRW5K/Q==",
"C8Th38X0SjsE1vL/OsD8bA==",
"PTbGg8PK/h0Seyw4HEpK4Q==",
"lZdQMknjHb7+4+sjF3qNTw==",
"LsgSq54q5oDysbva29JxnQ==",
"KZfBtpwjOpdSoqacRbz7og==",
"Indtl4yxJMHCKBGF4KABCQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"86ZDUI7vmp4MvNq3fvZrGQ==",
"sEGFoYZ6GEg4Zocd+TiyfQ==",
"6OOL72eXthgnJ1Hj4PfOQQ==",
"g/ME+Sh1CAFboKrwkVb+5Q==",
"Pw+xawUoX8xBYbX2yqqGWQ==",
"k6vBalxYFhAvkPsF19t9gQ==",
"b+asgLayXQ5Jq+se+q56jA==",
"KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==",
"+yz9bnYYMnC0trJZGJwf6Q==",
]);
});
test.each([
"https://subdomain.facebook.com",
"https://facebook.com.someotherdomain.com",
"https://www.facebook.com/profile",
"https://api.twitter.com/info",
"https://instagram.com/accounts/login",
])(
"should return true for URLs with blocklisted domains in subdomains or paths %s",
(url) => {
expect(isUrlBlocked(url)).toBe(true);
},
);
test.each([
"https://example.com/facebook.com",
"https://example.com/redirect?url=https://twitter.com",
"https://facebook.com.policy.example.com",
])(
"should return false for URLs where blocklisted domain is part of another domain or path %s",
(url) => {
expect(isUrlBlocked(url)).toBe(false);
},
);
test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])(
"should handle case variations %s",
(url) => {
expect(isUrlBlocked(url)).toBe(true);
},
);
test.each([
"https://facebook.com?redirect=https://example.com",
"https://twitter.com?query=something",
])("should handle query parameters %s", (url) => {
expect(isUrlBlocked(url)).toBe(true);
test("Blocks exact domain with and without protocol", () => {
expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
true,
);
expect(
isUrlBlocked(
decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
),
).toBe(true);
});
test("should handle internationalized domain names", () => {
const url = "https://xn--d1acpjx3f.xn--p1ai";
expect(isUrlBlocked(url)).toBe(false);
test("Blocks subdomains of a blocked domain", () => {
expect(
isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
),
).toBe(true);
});
test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
true,
);
expect(
isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
),
).toBe(true);
});
test("Allows unrelated domains like whateverfacebook.com", () => {
expect(isUrlBlocked("whateverfacebook.com")).toBe(false);
expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false);
});
test("Blocks other domains from the blocklist", () => {
expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
true,
);
});
test("Allows allowed keywords URLs [developers.*, library.*, ads.*]", () => {
expect(
isUrlBlocked(
decryptAES(
"4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
hashKey,
),
),
).toBe(false);
expect(
isUrlBlocked(
decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
),
).toBe(false);
expect(
isUrlBlocked(
decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
),
).toBe(false);
});
test("Should return false if the URL is invalid", () => {
expect(isUrlBlocked("randomstring")).toBe(false);
expect(isUrlBlocked("htp://bad.url")).toBe(false);
expect(isUrlBlocked("")).toBe(false);
});
});

View File

@ -1,12 +1,13 @@
import { logger } from "../../../lib/logger";
import crypto from "crypto";
import { configDotenv } from "dotenv";
import crypto from "crypto";
import { parse } from "tldts";
configDotenv();
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
const algorithm = "aes-256-ecb";
function encryptAES(plaintext: string, key: Buffer): string {
export function encryptAES(plaintext: string, key: Buffer): string {
const cipher = crypto.createCipheriv(algorithm, key, null);
const encrypted = Buffer.concat([
cipher.update(plaintext, "utf-8"),
@ -15,7 +16,7 @@ function encryptAES(plaintext: string, key: Buffer): string {
return encrypted.toString("base64");
}
function decryptAES(ciphertext: string, key: Buffer): string {
export function decryptAES(ciphertext: string, key: Buffer): string {
const decipher = crypto.createDecipheriv(algorithm, key, null);
const decrypted = Buffer.concat([
decipher.update(Buffer.from(ciphertext, "base64")),
@ -61,18 +62,12 @@ const urlBlocklist = [
"g/ME+Sh1CAFboKrwkVb+5Q==",
"Pw+xawUoX8xBYbX2yqqGWQ==",
"k6vBalxYFhAvkPsF19t9gQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==",
"+yz9bnYYMnC0trJZGJwf6Q=="
];
const decryptedBlocklist =
hashKey.length > 0
? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey))
: [];
"+yz9bnYYMnC0trJZGJwf6Q==",
]
const allowedKeywords = [
"pulse",
@ -99,39 +94,65 @@ const allowedKeywords = [
"://www.facebook.com/ads/library",
];
export function isUrlBlocked(url: string): boolean {
const lowerCaseUrl = url.toLowerCase();
/**
 * Decrypts every entry of an encrypted blocklist using the module-level
 * `hashKey`.
 *
 * @param list - Ciphertexts to decrypt, one per blocklist entry.
 * @returns The decrypted entries in input order, or an empty array when
 *   `hashKey` is empty, in which case nothing is decrypted.
 */
function decryptedBlocklist(list: string[]): string[] {
  // Without a key there is nothing to decrypt with; treat the list as empty.
  if (hashKey.length === 0) {
    return [];
  }

  const plaintexts: string[] = [];
  for (const entry of list) {
    plaintexts.push(decryptAES(entry, hashKey));
  }
  return plaintexts;
}
// Check if the URL contains any allowed keywords as whole words
export function isUrlBlocked(url: string): boolean {
const lowerCaseUrl = url.trim().toLowerCase();
const blockedlist = decryptedBlocklist(urlBlocklist);
const decryptedUrl =
blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
lowerCaseUrl;
// If the URL is empty or invalid, return false
let parsedUrl: any;
try {
parsedUrl = parse(decryptedUrl);
} catch {
console.log("Error parsing URL:", url);
return false;
}
const domain = parsedUrl.domain;
const publicSuffix = parsedUrl.publicSuffix;
if (!domain) {
return false;
}
// Check if URL contains any allowed keyword
if (
allowedKeywords.some((keyword) =>
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl),
lowerCaseUrl.includes(keyword.toLowerCase()),
)
) {
return false;
}
try {
if (!url.startsWith("http://") && !url.startsWith("https://")) {
url = "https://" + url;
}
const urlObj = new URL(url);
const hostname = urlObj.hostname.toLowerCase();
// Check if the URL matches any domain in the blocklist
const isBlocked = decryptedBlocklist.some((domain) => {
const domainPattern = new RegExp(
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
"i",
);
return domainPattern.test(hostname);
});
return isBlocked;
} catch (e) {
// If an error occurs (e.g., invalid URL), return false
logger.error(`Error parsing the following URL: ${url}`);
return false;
// Block exact matches
if (blockedlist.includes(domain)) {
return true;
}
// Block subdomains
if (blockedlist.some((blocked) => domain.endsWith(`.${blocked}`))) {
return true;
}
// Block different TLDs of the same base domain
const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com")
if (
publicSuffix &&
blockedlist.some(
(blocked) => blocked.startsWith(baseDomain) && blocked !== domain,
)
) {
return true;
}
return false;
}