test(blocklist): Enhance URL blocking test coverage with decrypted domain validation

This commit is contained in:
Ademílson F. Tonato 2025-01-31 18:23:38 +00:00
parent 24e8aaf6b5
commit bcd74498e3
No known key found for this signature in database
GPG Key ID: 169C7BE271C9FA3A
3 changed files with 150 additions and 45 deletions

View File

@ -40,7 +40,7 @@ to determine their relevance to the user's query and intent.
}
/**
 * Builds the user prompt that asks the reranker model to select the URLs
 * relevant to an extraction request.
 *
 * @param searchQuery - The user's extraction request; interpolated verbatim
 *   between double quotes in the prompt.
 * @returns The prompt text. It instructs the model to return links with
 *   relevance scores in the range 0-1 and to include only URLs scoring 0.6 or
 *   higher.
 */
export function buildRerankerUserPrompt(searchQuery: string): string {
  // Single return with the corrected spelling ("relevancy"); the earlier
  // duplicate return carrying the "relvancy" typo made this one unreachable.
  return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
}
// Multi entity schema analyzer

View File

@ -1,23 +1,111 @@
import { isUrlBlocked } from "../blocklist";
import { decryptAES, isUrlBlocked } from "../blocklist";
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
describe("isUrlBlocked function", () => {
test("Blocks exact domain facebook.com", () => {
expect(isUrlBlocked("facebook.com")).toBe(true);
expect(isUrlBlocked("http://facebook.com")).toBe(true);
expect(isUrlBlocked("https://facebook.com")).toBe(true);
beforeAll(() => {
// Mock the decryptedBlocklist function to return known values
jest
.spyOn(require("../blocklist"), "decryptedBlocklist")
.mockReturnValue([
"h8ngAFXUNLO3ZqQufJjGVA==",
"fEGiDm/TWDBkXUXejFVICg==",
"l6Mei7IGbEmTTFoSudUnqQ==",
"4OjallJzXRiZUAWDiC2Xww==",
"ReSvkSfx34TNEdecmmSDdQ==",
"X1E4WtdmXAv3SAX9xN925Q==",
"VTzBQfMtXZzM05mnNkWkjA==",
"m/q4Lb2Z8cxwU7/CoztOFg==",
"UbVnmRaeG+gKcyVDLAm0vg==",
"xNQhczYG22tTVc6lYE3qwg==",
"CQfGDydbg4l1swRCru6O6Q==",
"l86LQxm2NonTWMauXwEsPw==",
"6v4QDUcwjnID80G+uU+tgw==",
"pCF/6nrKZAxaYntzEGluZQ==",
"r0CRhAmQqSe7V2s3073T00sAh4WcS5779jwuGJ26ows==",
"aBOVqRFBM4UVg33usY10NdiF0HCnFH/ImtD0n+zIpc8==",
"QV436UZuQ6D0Dqrx9MwaGw==",
"OYVvrwILYbzA2mSSqOPPpw==",
"xW2i4C0Dzcnp+qu12u0SAw==",
"OLHba209l0dfl0MI4EnQonBITK9z8Qwgd/NsuaTkXmA=",
"X0VynmNjpL3PrYxpUIG7sFMBt8OlrmQWtxj8oXVu2QM=",
"ObdlM5NEkvBJ/sojRW5K/Q==",
"C8Th38X0SjsE1vL/OsD8bA==",
"PTbGg8PK/h0Seyw4HEpK4Q==",
"lZdQMknjHb7+4+sjF3qNTw==",
"LsgSq54q5oDysbva29JxnQ==",
"KZfBtpwjOpdSoqacRbz7og==",
"Indtl4yxJMHCKBGF4KABCQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"86ZDUI7vmp4MvNq3fvZrGQ==",
"sEGFoYZ6GEg4Zocd+TiyfQ==",
"6OOL72eXthgnJ1Hj4PfOQQ==",
"g/ME+Sh1CAFboKrwkVb+5Q==",
"Pw+xawUoX8xBYbX2yqqGWQ==",
"k6vBalxYFhAvkPsF19t9gQ==",
"b+asgLayXQ5Jq+se+q56jA==",
"KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==",
"+yz9bnYYMnC0trJZGJwf6Q==",
]);
});
test("Blocks subdomains of facebook.com", () => {
expect(isUrlBlocked("www.facebook.com")).toBe(true);
expect(isUrlBlocked("ads.facebook.com")).toBe(true);
expect(isUrlBlocked("business.facebook.com")).toBe(true);
test("Blocks exact domain with and without protocol", () => {
expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
true,
);
expect(
isUrlBlocked(
decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
),
).toBe(true);
});
test("Blocks different TLDs (facebook.pt, facebook.io)", () => {
expect(isUrlBlocked("facebook.pt")).toBe(true);
expect(isUrlBlocked("facebook.io")).toBe(true);
expect(isUrlBlocked("facebook.co.uk")).toBe(true);
expect(isUrlBlocked("https://facebook.de")).toBe(true);
test("Blocks subdomains of a blocked domain", () => {
expect(
isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
),
).toBe(true);
});
test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
true,
);
expect(
isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
),
).toBe(true);
});
test("Allows unrelated domains like whateverfacebook.com", () => {
@ -26,23 +114,39 @@ describe("isUrlBlocked function", () => {
});
test("Blocks other domains from the blocklist", () => {
expect(isUrlBlocked("tiktok.com")).toBe(true);
expect(isUrlBlocked("www.tiktok.com")).toBe(true);
expect(isUrlBlocked("reddit.com")).toBe(true);
expect(isUrlBlocked("youtube.com")).toBe(true);
expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
true,
);
expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
true,
);
});
test("Allows allowed keywords URLs", () => {
expect(isUrlBlocked("https://www.facebook.com/ads/library")).toBe(false);
expect(isUrlBlocked("https://developers.facebook.com")).toBe(false);
expect(isUrlBlocked("https://library.tiktok.com")).toBe(false);
});
test("Handles URLs with and without protocols", () => {
expect(isUrlBlocked("facebook.com")).toBe(true);
expect(isUrlBlocked("http://facebook.com")).toBe(true);
expect(isUrlBlocked("https://facebook.com")).toBe(true);
expect(isUrlBlocked("www.facebook.com")).toBe(true);
test("Allows allowed keywords URLs [developers.*, library.*, ads.*]", () => {
expect(
isUrlBlocked(
decryptAES(
"4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
hashKey,
),
),
).toBe(false);
expect(
isUrlBlocked(
decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
),
).toBe(false);
expect(
isUrlBlocked(
decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
),
).toBe(false);
});
test("Should return false if the URL is invalid", () => {

View File

@ -7,7 +7,7 @@ configDotenv();
// Key bytes read from the HASH_KEY environment variable (UTF-8 encoded);
// an empty buffer when the variable is unset or empty.
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
// Cipher name handed to crypto.createCipheriv / crypto.createDecipheriv,
// which are called with a null IV.
const algorithm = "aes-256-ecb";
function encryptAES(plaintext: string, key: Buffer): string {
export function encryptAES(plaintext: string, key: Buffer): string {
const cipher = crypto.createCipheriv(algorithm, key, null);
const encrypted = Buffer.concat([
cipher.update(plaintext, "utf-8"),
@ -16,7 +16,7 @@ function encryptAES(plaintext: string, key: Buffer): string {
return encrypted.toString("base64");
}
function decryptAES(ciphertext: string, key: Buffer): string {
export function decryptAES(ciphertext: string, key: Buffer): string {
const decipher = crypto.createDecipheriv(algorithm, key, null);
const decrypted = Buffer.concat([
decipher.update(Buffer.from(ciphertext, "base64")),
@ -62,18 +62,12 @@ const urlBlocklist = [
"g/ME+Sh1CAFboKrwkVb+5Q==",
"Pw+xawUoX8xBYbX2yqqGWQ==",
"k6vBalxYFhAvkPsF19t9gQ==",
"e3HFXLVgxhaVoadYpwb2BA==",
"b+asgLayXQ5Jq+se+q56jA==",
"KKttwRz4w+AMJrZcB828WQ==",
"vMdzZ33BXoyWVZnAPOBcrg==",
"l8GDVI8w/ueHnNzdN1ODuQ==",
"+yz9bnYYMnC0trJZGJwf6Q==",
];
const decryptedBlocklist =
hashKey.length > 0
? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey))
: [];
]
const allowedKeywords = [
"pulse",
@ -100,15 +94,22 @@ const allowedKeywords = [
"://www.facebook.com/ads/library",
];
/**
 * Decrypts every entry of an encrypted blocklist using the module-level
 * `hashKey`.
 *
 * Exported because the test suite references this function through the module
 * object (`jest.spyOn(require("../blocklist"), "decryptedBlocklist")`), which
 * requires the property to exist on the module's exports.
 *
 * @param list - Base64 ciphertext entries to decrypt.
 * @returns The decrypted entries in the same order, or an empty array when
 *   `hashKey` is empty (no key configured), in which case nothing is decrypted.
 */
export function decryptedBlocklist(list: string[]): string[] {
  return hashKey.length > 0
    ? list.map((ciphertext) => decryptAES(ciphertext, hashKey))
    : [];
}
export function isUrlBlocked(url: string): boolean {
const lowerCaseUrl = url.trim().toLowerCase();
const blockedlist = decryptedBlocklist(urlBlocklist);
const decryptedUrl =
decryptedBlocklist.find((decrypted) => lowerCaseUrl === decrypted) ||
blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
lowerCaseUrl;
// If the URL is empty or invalid, return false
let parsedUrl;
let parsedUrl: any;
try {
parsedUrl = parse(decryptedUrl);
} catch {
@ -133,12 +134,12 @@ export function isUrlBlocked(url: string): boolean {
}
// Block exact matches
if (decryptedBlocklist.includes(domain)) {
if (blockedlist.includes(domain)) {
return true;
}
// Block subdomains
if (decryptedBlocklist.some((blocked) => domain.endsWith(`.${blocked}`))) {
if (blockedlist.some((blocked) => domain.endsWith(`.${blocked}`))) {
return true;
}
@ -146,7 +147,7 @@ export function isUrlBlocked(url: string): boolean {
const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com")
if (
publicSuffix &&
decryptedBlocklist.some(
blockedlist.some(
(blocked) => blocked.startsWith(baseDomain) && blocked !== domain,
)
) {