mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 11:49:07 +08:00
Merge pull request #1117 from mendableai/feat/blocked-domains
feat(blocklist): Improve URL blocking with tldts parsing
This commit is contained in:
commit
996352c24c
@ -121,6 +121,7 @@
|
||||
"scrapingbee": "^1.7.4",
|
||||
"stripe": "^16.1.0",
|
||||
"systeminformation": "^5.22.11",
|
||||
"tldts": "^6.1.75",
|
||||
"turndown": "^7.1.3",
|
||||
"turndown-plugin-gfm": "^1.0.2",
|
||||
"typesense": "^1.5.4",
|
||||
|
16
apps/api/pnpm-lock.yaml
generated
16
apps/api/pnpm-lock.yaml
generated
@ -203,6 +203,9 @@ importers:
|
||||
systeminformation:
|
||||
specifier: ^5.22.11
|
||||
version: 5.22.11
|
||||
tldts:
|
||||
specifier: ^6.1.75
|
||||
version: 6.1.75
|
||||
turndown:
|
||||
specifier: ^7.1.3
|
||||
version: 7.2.0
|
||||
@ -4286,6 +4289,13 @@ packages:
|
||||
through@2.3.8:
|
||||
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
|
||||
|
||||
tldts-core@6.1.75:
|
||||
resolution: {integrity: sha512-AOvV5YYIAFFBfransBzSTyztkc3IMfz5Eq3YluaRiEu55nn43Fzaufx70UqEKYr8BoLCach4q8g/bg6e5+/aFw==}
|
||||
|
||||
tldts@6.1.75:
|
||||
resolution: {integrity: sha512-+lFzEXhpl7JXgWYaXcB6DqTYXbUArvrWAE/5ioq/X3CdWLbDjpPP4XTrQBmEJ91y3xbe4Fkw7Lxv4P3GWeJaNg==}
|
||||
hasBin: true
|
||||
|
||||
tmpl@1.0.5:
|
||||
resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==}
|
||||
|
||||
@ -9612,6 +9622,12 @@ snapshots:
|
||||
|
||||
through@2.3.8: {}
|
||||
|
||||
tldts-core@6.1.75: {}
|
||||
|
||||
tldts@6.1.75:
|
||||
dependencies:
|
||||
tldts-core: 6.1.75
|
||||
|
||||
tmpl@1.0.5: {}
|
||||
|
||||
to-fast-properties@2.0.0: {}
|
||||
|
@ -40,7 +40,7 @@ to determine their relevance to the user's query and intent.
|
||||
}
|
||||
|
||||
export function buildRerankerUserPrompt(searchQuery: string): string {
|
||||
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`;
|
||||
return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`;
|
||||
}
|
||||
|
||||
// Multi entity schema analyzer
|
||||
|
@ -1,94 +1,157 @@
|
||||
import { isUrlBlocked } from "../blocklist";
|
||||
import { decryptAES, isUrlBlocked } from "../blocklist";
|
||||
|
||||
describe("Blocklist Functionality", () => {
|
||||
describe("isUrlBlocked", () => {
|
||||
test.each([
|
||||
"https://facebook.com/fake-test",
|
||||
"https://x.com/user-profile",
|
||||
"https://twitter.com/home",
|
||||
"https://instagram.com/explore",
|
||||
"https://linkedin.com/in/johndoe",
|
||||
"https://snapchat.com/add/johndoe",
|
||||
"https://tiktok.com/@johndoe",
|
||||
"https://reddit.com/r/funny",
|
||||
"https://tumblr.com/dashboard",
|
||||
"https://flickr.com/photos/johndoe",
|
||||
"https://whatsapp.com/download",
|
||||
"https://wechat.com/features",
|
||||
"https://telegram.org/apps",
|
||||
])("should return true for blocklisted URL %s", (url) => {
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
});
|
||||
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
|
||||
|
||||
test.each([
|
||||
"https://facebook.com/policy",
|
||||
"https://twitter.com/tos",
|
||||
"https://instagram.com/about/legal/terms",
|
||||
"https://linkedin.com/legal/privacy-policy",
|
||||
"https://pinterest.com/about/privacy",
|
||||
"https://snapchat.com/legal/terms",
|
||||
"https://tiktok.com/legal/privacy-policy",
|
||||
"https://reddit.com/policies",
|
||||
"https://tumblr.com/policy/en/privacy",
|
||||
"https://flickr.com/help/terms",
|
||||
"https://whatsapp.com/legal",
|
||||
"https://wechat.com/en/privacy-policy",
|
||||
"https://telegram.org/tos",
|
||||
])("should return false for allowed URLs with keywords %s", (url) => {
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
});
|
||||
|
||||
test("should return false for non-blocklisted domain", () => {
|
||||
const url = "https://example.com";
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
});
|
||||
|
||||
test("should handle invalid URLs gracefully", () => {
|
||||
const url = "htp://invalid-url";
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
});
|
||||
describe("isUrlBlocked function", () => {
|
||||
beforeAll(() => {
|
||||
// Mock the decryptedBlocklist function to return known values
|
||||
jest
|
||||
.spyOn(require("../blocklist"), "decryptedBlocklist")
|
||||
.mockReturnValue([
|
||||
"h8ngAFXUNLO3ZqQufJjGVA==",
|
||||
"fEGiDm/TWDBkXUXejFVICg==",
|
||||
"l6Mei7IGbEmTTFoSudUnqQ==",
|
||||
"4OjallJzXRiZUAWDiC2Xww==",
|
||||
"ReSvkSfx34TNEdecmmSDdQ==",
|
||||
"X1E4WtdmXAv3SAX9xN925Q==",
|
||||
"VTzBQfMtXZzM05mnNkWkjA==",
|
||||
"m/q4Lb2Z8cxwU7/CoztOFg==",
|
||||
"UbVnmRaeG+gKcyVDLAm0vg==",
|
||||
"xNQhczYG22tTVc6lYE3qwg==",
|
||||
"CQfGDydbg4l1swRCru6O6Q==",
|
||||
"l86LQxm2NonTWMauXwEsPw==",
|
||||
"6v4QDUcwjnID80G+uU+tgw==",
|
||||
"pCF/6nrKZAxaYntzEGluZQ==",
|
||||
"r0CRhAmQqSe7V2s3073T00sAh4WcS5779jwuGJ26ows==",
|
||||
"aBOVqRFBM4UVg33usY10NdiF0HCnFH/ImtD0n+zIpc8==",
|
||||
"QV436UZuQ6D0Dqrx9MwaGw==",
|
||||
"OYVvrwILYbzA2mSSqOPPpw==",
|
||||
"xW2i4C0Dzcnp+qu12u0SAw==",
|
||||
"OLHba209l0dfl0MI4EnQonBITK9z8Qwgd/NsuaTkXmA=",
|
||||
"X0VynmNjpL3PrYxpUIG7sFMBt8OlrmQWtxj8oXVu2QM=",
|
||||
"ObdlM5NEkvBJ/sojRW5K/Q==",
|
||||
"C8Th38X0SjsE1vL/OsD8bA==",
|
||||
"PTbGg8PK/h0Seyw4HEpK4Q==",
|
||||
"lZdQMknjHb7+4+sjF3qNTw==",
|
||||
"LsgSq54q5oDysbva29JxnQ==",
|
||||
"KZfBtpwjOpdSoqacRbz7og==",
|
||||
"Indtl4yxJMHCKBGF4KABCQ==",
|
||||
"e3HFXLVgxhaVoadYpwb2BA==",
|
||||
"b+asgLayXQ5Jq+se+q56jA==",
|
||||
"86ZDUI7vmp4MvNq3fvZrGQ==",
|
||||
"sEGFoYZ6GEg4Zocd+TiyfQ==",
|
||||
"6OOL72eXthgnJ1Hj4PfOQQ==",
|
||||
"g/ME+Sh1CAFboKrwkVb+5Q==",
|
||||
"Pw+xawUoX8xBYbX2yqqGWQ==",
|
||||
"k6vBalxYFhAvkPsF19t9gQ==",
|
||||
"b+asgLayXQ5Jq+se+q56jA==",
|
||||
"KKttwRz4w+AMJrZcB828WQ==",
|
||||
"vMdzZ33BXoyWVZnAPOBcrg==",
|
||||
"l8GDVI8w/ueHnNzdN1ODuQ==",
|
||||
"+yz9bnYYMnC0trJZGJwf6Q==",
|
||||
]);
|
||||
});
|
||||
|
||||
test.each([
|
||||
"https://subdomain.facebook.com",
|
||||
"https://facebook.com.someotherdomain.com",
|
||||
"https://www.facebook.com/profile",
|
||||
"https://api.twitter.com/info",
|
||||
"https://instagram.com/accounts/login",
|
||||
])(
|
||||
"should return true for URLs with blocklisted domains in subdomains or paths %s",
|
||||
(url) => {
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
},
|
||||
);
|
||||
|
||||
test.each([
|
||||
"https://example.com/facebook.com",
|
||||
"https://example.com/redirect?url=https://twitter.com",
|
||||
"https://facebook.com.policy.example.com",
|
||||
])(
|
||||
"should return false for URLs where blocklisted domain is part of another domain or path %s",
|
||||
(url) => {
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
},
|
||||
);
|
||||
|
||||
test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])(
|
||||
"should handle case variations %s",
|
||||
(url) => {
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
},
|
||||
);
|
||||
|
||||
test.each([
|
||||
"https://facebook.com?redirect=https://example.com",
|
||||
"https://twitter.com?query=something",
|
||||
])("should handle query parameters %s", (url) => {
|
||||
expect(isUrlBlocked(url)).toBe(true);
|
||||
test("Blocks exact domain with and without protocol", () => {
|
||||
expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("should handle internationalized domain names", () => {
|
||||
const url = "https://xn--d1acpjx3f.xn--p1ai";
|
||||
expect(isUrlBlocked(url)).toBe(false);
|
||||
test("Blocks subdomains of a blocked domain", () => {
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
|
||||
expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("Allows unrelated domains like whateverfacebook.com", () => {
|
||||
expect(isUrlBlocked("whateverfacebook.com")).toBe(false);
|
||||
expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false);
|
||||
});
|
||||
|
||||
test("Blocks other domains from the blocklist", () => {
|
||||
expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("Allows allowed keywords URLs [developers.*, library.*, ads.*]", () => {
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES(
|
||||
"4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
|
||||
hashKey,
|
||||
),
|
||||
),
|
||||
).toBe(false);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
|
||||
),
|
||||
).toBe(false);
|
||||
expect(
|
||||
isUrlBlocked(
|
||||
decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
|
||||
),
|
||||
).toBe(false);
|
||||
});
|
||||
|
||||
test("Should return false if the URL is invalid", () => {
|
||||
expect(isUrlBlocked("randomstring")).toBe(false);
|
||||
expect(isUrlBlocked("htp://bad.url")).toBe(false);
|
||||
expect(isUrlBlocked("")).toBe(false);
|
||||
});
|
||||
});
|
||||
|
@ -1,12 +1,13 @@
|
||||
import { logger } from "../../../lib/logger";
|
||||
import crypto from "crypto";
|
||||
import { configDotenv } from "dotenv";
|
||||
import crypto from "crypto";
|
||||
import { parse } from "tldts";
|
||||
|
||||
configDotenv();
|
||||
|
||||
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
|
||||
const algorithm = "aes-256-ecb";
|
||||
|
||||
function encryptAES(plaintext: string, key: Buffer): string {
|
||||
export function encryptAES(plaintext: string, key: Buffer): string {
|
||||
const cipher = crypto.createCipheriv(algorithm, key, null);
|
||||
const encrypted = Buffer.concat([
|
||||
cipher.update(plaintext, "utf-8"),
|
||||
@ -15,7 +16,7 @@ function encryptAES(plaintext: string, key: Buffer): string {
|
||||
return encrypted.toString("base64");
|
||||
}
|
||||
|
||||
function decryptAES(ciphertext: string, key: Buffer): string {
|
||||
export function decryptAES(ciphertext: string, key: Buffer): string {
|
||||
const decipher = crypto.createDecipheriv(algorithm, key, null);
|
||||
const decrypted = Buffer.concat([
|
||||
decipher.update(Buffer.from(ciphertext, "base64")),
|
||||
@ -61,18 +62,12 @@ const urlBlocklist = [
|
||||
"g/ME+Sh1CAFboKrwkVb+5Q==",
|
||||
"Pw+xawUoX8xBYbX2yqqGWQ==",
|
||||
"k6vBalxYFhAvkPsF19t9gQ==",
|
||||
"e3HFXLVgxhaVoadYpwb2BA==",
|
||||
"b+asgLayXQ5Jq+se+q56jA==",
|
||||
"KKttwRz4w+AMJrZcB828WQ==",
|
||||
"vMdzZ33BXoyWVZnAPOBcrg==",
|
||||
"l8GDVI8w/ueHnNzdN1ODuQ==",
|
||||
"+yz9bnYYMnC0trJZGJwf6Q=="
|
||||
];
|
||||
|
||||
const decryptedBlocklist =
|
||||
hashKey.length > 0
|
||||
? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey))
|
||||
: [];
|
||||
"+yz9bnYYMnC0trJZGJwf6Q==",
|
||||
]
|
||||
|
||||
const allowedKeywords = [
|
||||
"pulse",
|
||||
@ -99,39 +94,65 @@ const allowedKeywords = [
|
||||
"://www.facebook.com/ads/library",
|
||||
];
|
||||
|
||||
export function isUrlBlocked(url: string): boolean {
|
||||
const lowerCaseUrl = url.toLowerCase();
|
||||
function decryptedBlocklist(list: string[]): string[] {
|
||||
return hashKey.length > 0
|
||||
? list.map((ciphertext) => decryptAES(ciphertext, hashKey))
|
||||
: [];
|
||||
}
|
||||
|
||||
// Check if the URL contains any allowed keywords as whole words
|
||||
export function isUrlBlocked(url: string): boolean {
|
||||
const lowerCaseUrl = url.trim().toLowerCase();
|
||||
|
||||
const blockedlist = decryptedBlocklist(urlBlocklist);
|
||||
const decryptedUrl =
|
||||
blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
|
||||
lowerCaseUrl;
|
||||
|
||||
// If the URL is empty or invalid, return false
|
||||
let parsedUrl: any;
|
||||
try {
|
||||
parsedUrl = parse(decryptedUrl);
|
||||
} catch {
|
||||
console.log("Error parsing URL:", url);
|
||||
return false;
|
||||
}
|
||||
|
||||
const domain = parsedUrl.domain;
|
||||
const publicSuffix = parsedUrl.publicSuffix;
|
||||
|
||||
if (!domain) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check if URL contains any allowed keyword
|
||||
if (
|
||||
allowedKeywords.some((keyword) =>
|
||||
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl),
|
||||
lowerCaseUrl.includes(keyword.toLowerCase()),
|
||||
)
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!url.startsWith("http://") && !url.startsWith("https://")) {
|
||||
url = "https://" + url;
|
||||
}
|
||||
|
||||
const urlObj = new URL(url);
|
||||
const hostname = urlObj.hostname.toLowerCase();
|
||||
|
||||
// Check if the URL matches any domain in the blocklist
|
||||
const isBlocked = decryptedBlocklist.some((domain) => {
|
||||
const domainPattern = new RegExp(
|
||||
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
|
||||
"i",
|
||||
);
|
||||
return domainPattern.test(hostname);
|
||||
});
|
||||
|
||||
return isBlocked;
|
||||
} catch (e) {
|
||||
// If an error occurs (e.g., invalid URL), return false
|
||||
logger.error(`Error parsing the following URL: ${url}`);
|
||||
return false;
|
||||
// Block exact matches
|
||||
if (blockedlist.includes(domain)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Block subdomains
|
||||
if (blockedlist.some((blocked) => domain.endsWith(`.${blocked}`))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Block different TLDs of the same base domain
|
||||
const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com")
|
||||
if (
|
||||
publicSuffix &&
|
||||
blockedlist.some(
|
||||
(blocked) => blocked.startsWith(baseDomain) && blocked !== domain,
|
||||
)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user