mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:39:00 +08:00
test(blocklist): Enhance URL blocking test coverage with decrypted domain validation
This commit is contained in:
parent
24e8aaf6b5
commit
bcd74498e3
@ -40,7 +40,7 @@ to determine their relevance to the user's query and intent.
|
|||||||
}
|
}
|
||||||
|
|
||||||
export function buildRerankerUserPrompt(searchQuery: string): string {
|
export function buildRerankerUserPrompt(searchQuery: string): string {
|
||||||
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`;
|
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Multi entity schema analyzer
|
// Multi entity schema analyzer
|
||||||
|
@ -1,23 +1,111 @@
|
|||||||
import { isUrlBlocked } from "../blocklist";
|
import { decryptAES, isUrlBlocked } from "../blocklist";
|
||||||
|
|
||||||
|
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
|
||||||
|
|
||||||
describe("isUrlBlocked function", () => {
|
describe("isUrlBlocked function", () => {
|
||||||
test("Blocks exact domain facebook.com", () => {
|
beforeAll(() => {
|
||||||
expect(isUrlBlocked("facebook.com")).toBe(true);
|
// Mock the decryptedBlocklist function to return known values
|
||||||
expect(isUrlBlocked("http://facebook.com")).toBe(true);
|
jest
|
||||||
expect(isUrlBlocked("https://facebook.com")).toBe(true);
|
.spyOn(require("../blocklist"), "decryptedBlocklist")
|
||||||
|
.mockReturnValue([
|
||||||
|
"h8ngAFXUNLO3ZqQufJjGVA==",
|
||||||
|
"fEGiDm/TWDBkXUXejFVICg==",
|
||||||
|
"l6Mei7IGbEmTTFoSudUnqQ==",
|
||||||
|
"4OjallJzXRiZUAWDiC2Xww==",
|
||||||
|
"ReSvkSfx34TNEdecmmSDdQ==",
|
||||||
|
"X1E4WtdmXAv3SAX9xN925Q==",
|
||||||
|
"VTzBQfMtXZzM05mnNkWkjA==",
|
||||||
|
"m/q4Lb2Z8cxwU7/CoztOFg==",
|
||||||
|
"UbVnmRaeG+gKcyVDLAm0vg==",
|
||||||
|
"xNQhczYG22tTVc6lYE3qwg==",
|
||||||
|
"CQfGDydbg4l1swRCru6O6Q==",
|
||||||
|
"l86LQxm2NonTWMauXwEsPw==",
|
||||||
|
"6v4QDUcwjnID80G+uU+tgw==",
|
||||||
|
"pCF/6nrKZAxaYntzEGluZQ==",
|
||||||
|
"r0CRhAmQqSe7V2s3073T00sAh4WcS5779jwuGJ26ows==",
|
||||||
|
"aBOVqRFBM4UVg33usY10NdiF0HCnFH/ImtD0n+zIpc8==",
|
||||||
|
"QV436UZuQ6D0Dqrx9MwaGw==",
|
||||||
|
"OYVvrwILYbzA2mSSqOPPpw==",
|
||||||
|
"xW2i4C0Dzcnp+qu12u0SAw==",
|
||||||
|
"OLHba209l0dfl0MI4EnQonBITK9z8Qwgd/NsuaTkXmA=",
|
||||||
|
"X0VynmNjpL3PrYxpUIG7sFMBt8OlrmQWtxj8oXVu2QM=",
|
||||||
|
"ObdlM5NEkvBJ/sojRW5K/Q==",
|
||||||
|
"C8Th38X0SjsE1vL/OsD8bA==",
|
||||||
|
"PTbGg8PK/h0Seyw4HEpK4Q==",
|
||||||
|
"lZdQMknjHb7+4+sjF3qNTw==",
|
||||||
|
"LsgSq54q5oDysbva29JxnQ==",
|
||||||
|
"KZfBtpwjOpdSoqacRbz7og==",
|
||||||
|
"Indtl4yxJMHCKBGF4KABCQ==",
|
||||||
|
"e3HFXLVgxhaVoadYpwb2BA==",
|
||||||
|
"b+asgLayXQ5Jq+se+q56jA==",
|
||||||
|
"86ZDUI7vmp4MvNq3fvZrGQ==",
|
||||||
|
"sEGFoYZ6GEg4Zocd+TiyfQ==",
|
||||||
|
"6OOL72eXthgnJ1Hj4PfOQQ==",
|
||||||
|
"g/ME+Sh1CAFboKrwkVb+5Q==",
|
||||||
|
"Pw+xawUoX8xBYbX2yqqGWQ==",
|
||||||
|
"k6vBalxYFhAvkPsF19t9gQ==",
|
||||||
|
"b+asgLayXQ5Jq+se+q56jA==",
|
||||||
|
"KKttwRz4w+AMJrZcB828WQ==",
|
||||||
|
"vMdzZ33BXoyWVZnAPOBcrg==",
|
||||||
|
"l8GDVI8w/ueHnNzdN1ODuQ==",
|
||||||
|
"+yz9bnYYMnC0trJZGJwf6Q==",
|
||||||
|
]);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("Blocks subdomains of facebook.com", () => {
|
test("Blocks exact domain with and without protocol", () => {
|
||||||
expect(isUrlBlocked("www.facebook.com")).toBe(true);
|
expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
|
||||||
expect(isUrlBlocked("ads.facebook.com")).toBe(true);
|
true,
|
||||||
expect(isUrlBlocked("business.facebook.com")).toBe(true);
|
);
|
||||||
|
expect(
|
||||||
|
isUrlBlocked(
|
||||||
|
decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(true);
|
||||||
|
expect(
|
||||||
|
isUrlBlocked(
|
||||||
|
decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(true);
|
||||||
|
expect(
|
||||||
|
isUrlBlocked(
|
||||||
|
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("Blocks different TLDs (facebook.pt, facebook.io)", () => {
|
test("Blocks subdomains of a blocked domain", () => {
|
||||||
expect(isUrlBlocked("facebook.pt")).toBe(true);
|
expect(
|
||||||
expect(isUrlBlocked("facebook.io")).toBe(true);
|
isUrlBlocked(
|
||||||
expect(isUrlBlocked("facebook.co.uk")).toBe(true);
|
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
|
||||||
expect(isUrlBlocked("https://facebook.de")).toBe(true);
|
),
|
||||||
|
).toBe(true);
|
||||||
|
expect(
|
||||||
|
isUrlBlocked(
|
||||||
|
decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(true);
|
||||||
|
expect(
|
||||||
|
isUrlBlocked(
|
||||||
|
decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
|
||||||
|
expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
expect(
|
||||||
|
isUrlBlocked(
|
||||||
|
decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(true);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("Allows unrelated domains like whateverfacebook.com", () => {
|
test("Allows unrelated domains like whateverfacebook.com", () => {
|
||||||
@ -26,23 +114,39 @@ describe("isUrlBlocked function", () => {
|
|||||||
});
|
});
|
||||||
|
|
||||||
test("Blocks other domains from the blocklist", () => {
|
test("Blocks other domains from the blocklist", () => {
|
||||||
expect(isUrlBlocked("tiktok.com")).toBe(true);
|
expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
|
||||||
expect(isUrlBlocked("www.tiktok.com")).toBe(true);
|
true,
|
||||||
expect(isUrlBlocked("reddit.com")).toBe(true);
|
);
|
||||||
expect(isUrlBlocked("youtube.com")).toBe(true);
|
expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
|
||||||
|
true,
|
||||||
|
);
|
||||||
|
expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
|
||||||
|
true,
|
||||||
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("Allows allowed keywords URLs", () => {
|
test("Allows allowed keywords URLs [developers.*, library.*, ads.*]", () => {
|
||||||
expect(isUrlBlocked("https://www.facebook.com/ads/library")).toBe(false);
|
expect(
|
||||||
expect(isUrlBlocked("https://developers.facebook.com")).toBe(false);
|
isUrlBlocked(
|
||||||
expect(isUrlBlocked("https://library.tiktok.com")).toBe(false);
|
decryptAES(
|
||||||
});
|
"4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
|
||||||
|
hashKey,
|
||||||
test("Handles URLs with and without protocols", () => {
|
),
|
||||||
expect(isUrlBlocked("facebook.com")).toBe(true);
|
),
|
||||||
expect(isUrlBlocked("http://facebook.com")).toBe(true);
|
).toBe(false);
|
||||||
expect(isUrlBlocked("https://facebook.com")).toBe(true);
|
expect(
|
||||||
expect(isUrlBlocked("www.facebook.com")).toBe(true);
|
isUrlBlocked(
|
||||||
|
decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(false);
|
||||||
|
expect(
|
||||||
|
isUrlBlocked(
|
||||||
|
decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
|
||||||
|
),
|
||||||
|
).toBe(false);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("Should return false if the URL is invalid", () => {
|
test("Should return false if the URL is invalid", () => {
|
||||||
|
@ -7,7 +7,7 @@ configDotenv();
|
|||||||
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
|
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
|
||||||
const algorithm = "aes-256-ecb";
|
const algorithm = "aes-256-ecb";
|
||||||
|
|
||||||
function encryptAES(plaintext: string, key: Buffer): string {
|
export function encryptAES(plaintext: string, key: Buffer): string {
|
||||||
const cipher = crypto.createCipheriv(algorithm, key, null);
|
const cipher = crypto.createCipheriv(algorithm, key, null);
|
||||||
const encrypted = Buffer.concat([
|
const encrypted = Buffer.concat([
|
||||||
cipher.update(plaintext, "utf-8"),
|
cipher.update(plaintext, "utf-8"),
|
||||||
@ -16,7 +16,7 @@ function encryptAES(plaintext: string, key: Buffer): string {
|
|||||||
return encrypted.toString("base64");
|
return encrypted.toString("base64");
|
||||||
}
|
}
|
||||||
|
|
||||||
function decryptAES(ciphertext: string, key: Buffer): string {
|
export function decryptAES(ciphertext: string, key: Buffer): string {
|
||||||
const decipher = crypto.createDecipheriv(algorithm, key, null);
|
const decipher = crypto.createDecipheriv(algorithm, key, null);
|
||||||
const decrypted = Buffer.concat([
|
const decrypted = Buffer.concat([
|
||||||
decipher.update(Buffer.from(ciphertext, "base64")),
|
decipher.update(Buffer.from(ciphertext, "base64")),
|
||||||
@ -62,18 +62,12 @@ const urlBlocklist = [
|
|||||||
"g/ME+Sh1CAFboKrwkVb+5Q==",
|
"g/ME+Sh1CAFboKrwkVb+5Q==",
|
||||||
"Pw+xawUoX8xBYbX2yqqGWQ==",
|
"Pw+xawUoX8xBYbX2yqqGWQ==",
|
||||||
"k6vBalxYFhAvkPsF19t9gQ==",
|
"k6vBalxYFhAvkPsF19t9gQ==",
|
||||||
"e3HFXLVgxhaVoadYpwb2BA==",
|
|
||||||
"b+asgLayXQ5Jq+se+q56jA==",
|
"b+asgLayXQ5Jq+se+q56jA==",
|
||||||
"KKttwRz4w+AMJrZcB828WQ==",
|
"KKttwRz4w+AMJrZcB828WQ==",
|
||||||
"vMdzZ33BXoyWVZnAPOBcrg==",
|
"vMdzZ33BXoyWVZnAPOBcrg==",
|
||||||
"l8GDVI8w/ueHnNzdN1ODuQ==",
|
"l8GDVI8w/ueHnNzdN1ODuQ==",
|
||||||
"+yz9bnYYMnC0trJZGJwf6Q==",
|
"+yz9bnYYMnC0trJZGJwf6Q==",
|
||||||
];
|
]
|
||||||
|
|
||||||
const decryptedBlocklist =
|
|
||||||
hashKey.length > 0
|
|
||||||
? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey))
|
|
||||||
: [];
|
|
||||||
|
|
||||||
const allowedKeywords = [
|
const allowedKeywords = [
|
||||||
"pulse",
|
"pulse",
|
||||||
@ -100,15 +94,22 @@ const allowedKeywords = [
|
|||||||
"://www.facebook.com/ads/library",
|
"://www.facebook.com/ads/library",
|
||||||
];
|
];
|
||||||
|
|
||||||
|
function decryptedBlocklist(list: string[]): string[] {
|
||||||
|
return hashKey.length > 0
|
||||||
|
? list.map((ciphertext) => decryptAES(ciphertext, hashKey))
|
||||||
|
: [];
|
||||||
|
}
|
||||||
|
|
||||||
export function isUrlBlocked(url: string): boolean {
|
export function isUrlBlocked(url: string): boolean {
|
||||||
const lowerCaseUrl = url.trim().toLowerCase();
|
const lowerCaseUrl = url.trim().toLowerCase();
|
||||||
|
|
||||||
|
const blockedlist = decryptedBlocklist(urlBlocklist);
|
||||||
const decryptedUrl =
|
const decryptedUrl =
|
||||||
decryptedBlocklist.find((decrypted) => lowerCaseUrl === decrypted) ||
|
blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
|
||||||
lowerCaseUrl;
|
lowerCaseUrl;
|
||||||
|
|
||||||
// If the URL is empty or invalid, return false
|
// If the URL is empty or invalid, return false
|
||||||
let parsedUrl;
|
let parsedUrl: any;
|
||||||
try {
|
try {
|
||||||
parsedUrl = parse(decryptedUrl);
|
parsedUrl = parse(decryptedUrl);
|
||||||
} catch {
|
} catch {
|
||||||
@ -133,12 +134,12 @@ export function isUrlBlocked(url: string): boolean {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Block exact matches
|
// Block exact matches
|
||||||
if (decryptedBlocklist.includes(domain)) {
|
if (blockedlist.includes(domain)) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Block subdomains
|
// Block subdomains
|
||||||
if (decryptedBlocklist.some((blocked) => domain.endsWith(`.${blocked}`))) {
|
if (blockedlist.some((blocked) => domain.endsWith(`.${blocked}`))) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -146,7 +147,7 @@ export function isUrlBlocked(url: string): boolean {
|
|||||||
const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com")
|
const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com")
|
||||||
if (
|
if (
|
||||||
publicSuffix &&
|
publicSuffix &&
|
||||||
decryptedBlocklist.some(
|
blockedlist.some(
|
||||||
(blocked) => blocked.startsWith(baseDomain) && blocked !== domain,
|
(blocked) => blocked.startsWith(baseDomain) && blocked !== domain,
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user