mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-20 17:25:12 +08:00
feat(blocklist): Improve URL blocking with tldts parsing
This commit is contained in:
parent
948f7866df
commit
24e8aaf6b5
@ -121,6 +121,7 @@
|
|||||||
"scrapingbee": "^1.7.4",
|
"scrapingbee": "^1.7.4",
|
||||||
"stripe": "^16.1.0",
|
"stripe": "^16.1.0",
|
||||||
"systeminformation": "^5.22.11",
|
"systeminformation": "^5.22.11",
|
||||||
|
"tldts": "^6.1.75",
|
||||||
"turndown": "^7.1.3",
|
"turndown": "^7.1.3",
|
||||||
"turndown-plugin-gfm": "^1.0.2",
|
"turndown-plugin-gfm": "^1.0.2",
|
||||||
"typesense": "^1.5.4",
|
"typesense": "^1.5.4",
|
||||||
@ -142,4 +143,4 @@
|
|||||||
"temp"
|
"temp"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
16
apps/api/pnpm-lock.yaml
generated
16
apps/api/pnpm-lock.yaml
generated
@ -203,6 +203,9 @@ importers:
|
|||||||
systeminformation:
|
systeminformation:
|
||||||
specifier: ^5.22.11
|
specifier: ^5.22.11
|
||||||
version: 5.22.11
|
version: 5.22.11
|
||||||
|
tldts:
|
||||||
|
specifier: ^6.1.75
|
||||||
|
version: 6.1.75
|
||||||
turndown:
|
turndown:
|
||||||
specifier: ^7.1.3
|
specifier: ^7.1.3
|
||||||
version: 7.2.0
|
version: 7.2.0
|
||||||
@ -4286,6 +4289,13 @@ packages:
|
|||||||
through@2.3.8:
|
through@2.3.8:
|
||||||
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
|
resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==}
|
||||||
|
|
||||||
|
tldts-core@6.1.75:
|
||||||
|
resolution: {integrity: sha512-AOvV5YYIAFFBfransBzSTyztkc3IMfz5Eq3YluaRiEu55nn43Fzaufx70UqEKYr8BoLCach4q8g/bg6e5+/aFw==}
|
||||||
|
|
||||||
|
tldts@6.1.75:
|
||||||
|
resolution: {integrity: sha512-+lFzEXhpl7JXgWYaXcB6DqTYXbUArvrWAE/5ioq/X3CdWLbDjpPP4XTrQBmEJ91y3xbe4Fkw7Lxv4P3GWeJaNg==}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
tmpl@1.0.5:
|
tmpl@1.0.5:
|
||||||
resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==}
|
resolution: {integrity: sha512-3f0uOEAQwIqGuWW2MVzYg8fV/QNnc/IpuJNG837rLuczAaLVHslWHZQj4IGiEl5Hs3kkbhwL9Ab7Hrsmuj+Smw==}
|
||||||
|
|
||||||
@ -9612,6 +9622,12 @@ snapshots:
|
|||||||
|
|
||||||
through@2.3.8: {}
|
through@2.3.8: {}
|
||||||
|
|
||||||
|
tldts-core@6.1.75: {}
|
||||||
|
|
||||||
|
tldts@6.1.75:
|
||||||
|
dependencies:
|
||||||
|
tldts-core: 6.1.75
|
||||||
|
|
||||||
tmpl@1.0.5: {}
|
tmpl@1.0.5: {}
|
||||||
|
|
||||||
to-fast-properties@2.0.0: {}
|
to-fast-properties@2.0.0: {}
|
||||||
|
@ -1,94 +1,53 @@
|
|||||||
import { isUrlBlocked } from "../blocklist";
|
import { isUrlBlocked } from "../blocklist";
|
||||||
|
|
||||||
describe("Blocklist Functionality", () => {
|
describe("isUrlBlocked function", () => {
|
||||||
describe("isUrlBlocked", () => {
|
test("Blocks exact domain facebook.com", () => {
|
||||||
test.each([
|
expect(isUrlBlocked("facebook.com")).toBe(true);
|
||||||
"https://facebook.com/fake-test",
|
expect(isUrlBlocked("http://facebook.com")).toBe(true);
|
||||||
"https://x.com/user-profile",
|
expect(isUrlBlocked("https://facebook.com")).toBe(true);
|
||||||
"https://twitter.com/home",
|
|
||||||
"https://instagram.com/explore",
|
|
||||||
"https://linkedin.com/in/johndoe",
|
|
||||||
"https://snapchat.com/add/johndoe",
|
|
||||||
"https://tiktok.com/@johndoe",
|
|
||||||
"https://reddit.com/r/funny",
|
|
||||||
"https://tumblr.com/dashboard",
|
|
||||||
"https://flickr.com/photos/johndoe",
|
|
||||||
"https://whatsapp.com/download",
|
|
||||||
"https://wechat.com/features",
|
|
||||||
"https://telegram.org/apps",
|
|
||||||
])("should return true for blocklisted URL %s", (url) => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
|
||||||
});
|
|
||||||
|
|
||||||
test.each([
|
|
||||||
"https://facebook.com/policy",
|
|
||||||
"https://twitter.com/tos",
|
|
||||||
"https://instagram.com/about/legal/terms",
|
|
||||||
"https://linkedin.com/legal/privacy-policy",
|
|
||||||
"https://pinterest.com/about/privacy",
|
|
||||||
"https://snapchat.com/legal/terms",
|
|
||||||
"https://tiktok.com/legal/privacy-policy",
|
|
||||||
"https://reddit.com/policies",
|
|
||||||
"https://tumblr.com/policy/en/privacy",
|
|
||||||
"https://flickr.com/help/terms",
|
|
||||||
"https://whatsapp.com/legal",
|
|
||||||
"https://wechat.com/en/privacy-policy",
|
|
||||||
"https://telegram.org/tos",
|
|
||||||
])("should return false for allowed URLs with keywords %s", (url) => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
|
||||||
});
|
|
||||||
|
|
||||||
test("should return false for non-blocklisted domain", () => {
|
|
||||||
const url = "https://example.com";
|
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
|
||||||
});
|
|
||||||
|
|
||||||
test("should handle invalid URLs gracefully", () => {
|
|
||||||
const url = "htp://invalid-url";
|
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
|
||||||
});
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test.each([
|
test("Blocks subdomains of facebook.com", () => {
|
||||||
"https://subdomain.facebook.com",
|
expect(isUrlBlocked("www.facebook.com")).toBe(true);
|
||||||
"https://facebook.com.someotherdomain.com",
|
expect(isUrlBlocked("ads.facebook.com")).toBe(true);
|
||||||
"https://www.facebook.com/profile",
|
expect(isUrlBlocked("business.facebook.com")).toBe(true);
|
||||||
"https://api.twitter.com/info",
|
|
||||||
"https://instagram.com/accounts/login",
|
|
||||||
])(
|
|
||||||
"should return true for URLs with blocklisted domains in subdomains or paths %s",
|
|
||||||
(url) => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
test.each([
|
|
||||||
"https://example.com/facebook.com",
|
|
||||||
"https://example.com/redirect?url=https://twitter.com",
|
|
||||||
"https://facebook.com.policy.example.com",
|
|
||||||
])(
|
|
||||||
"should return false for URLs where blocklisted domain is part of another domain or path %s",
|
|
||||||
(url) => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
test.each(["https://FACEBOOK.com", "https://INSTAGRAM.com/@something"])(
|
|
||||||
"should handle case variations %s",
|
|
||||||
(url) => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
|
||||||
},
|
|
||||||
);
|
|
||||||
|
|
||||||
test.each([
|
|
||||||
"https://facebook.com?redirect=https://example.com",
|
|
||||||
"https://twitter.com?query=something",
|
|
||||||
])("should handle query parameters %s", (url) => {
|
|
||||||
expect(isUrlBlocked(url)).toBe(true);
|
|
||||||
});
|
});
|
||||||
|
|
||||||
test("should handle internationalized domain names", () => {
|
test("Blocks different TLDs (facebook.pt, facebook.io)", () => {
|
||||||
const url = "https://xn--d1acpjx3f.xn--p1ai";
|
expect(isUrlBlocked("facebook.pt")).toBe(true);
|
||||||
expect(isUrlBlocked(url)).toBe(false);
|
expect(isUrlBlocked("facebook.io")).toBe(true);
|
||||||
|
expect(isUrlBlocked("facebook.co.uk")).toBe(true);
|
||||||
|
expect(isUrlBlocked("https://facebook.de")).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("Allows unrelated domains like whateverfacebook.com", () => {
|
||||||
|
expect(isUrlBlocked("whateverfacebook.com")).toBe(false);
|
||||||
|
expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("Blocks other domains from the blocklist", () => {
|
||||||
|
expect(isUrlBlocked("tiktok.com")).toBe(true);
|
||||||
|
expect(isUrlBlocked("www.tiktok.com")).toBe(true);
|
||||||
|
expect(isUrlBlocked("reddit.com")).toBe(true);
|
||||||
|
expect(isUrlBlocked("youtube.com")).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("Allows allowed keywords URLs", () => {
|
||||||
|
expect(isUrlBlocked("https://www.facebook.com/ads/library")).toBe(false);
|
||||||
|
expect(isUrlBlocked("https://developers.facebook.com")).toBe(false);
|
||||||
|
expect(isUrlBlocked("https://library.tiktok.com")).toBe(false);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("Handles URLs with and without protocols", () => {
|
||||||
|
expect(isUrlBlocked("facebook.com")).toBe(true);
|
||||||
|
expect(isUrlBlocked("http://facebook.com")).toBe(true);
|
||||||
|
expect(isUrlBlocked("https://facebook.com")).toBe(true);
|
||||||
|
expect(isUrlBlocked("www.facebook.com")).toBe(true);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("Should return false if the URL is invalid", () => {
|
||||||
|
expect(isUrlBlocked("randomstring")).toBe(false);
|
||||||
|
expect(isUrlBlocked("htp://bad.url")).toBe(false);
|
||||||
|
expect(isUrlBlocked("")).toBe(false);
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
|
@ -1,6 +1,7 @@
|
|||||||
import { logger } from "../../../lib/logger";
|
|
||||||
import crypto from "crypto";
|
|
||||||
import { configDotenv } from "dotenv";
|
import { configDotenv } from "dotenv";
|
||||||
|
import crypto from "crypto";
|
||||||
|
import { parse } from "tldts";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
|
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
|
||||||
@ -66,7 +67,7 @@ const urlBlocklist = [
|
|||||||
"KKttwRz4w+AMJrZcB828WQ==",
|
"KKttwRz4w+AMJrZcB828WQ==",
|
||||||
"vMdzZ33BXoyWVZnAPOBcrg==",
|
"vMdzZ33BXoyWVZnAPOBcrg==",
|
||||||
"l8GDVI8w/ueHnNzdN1ODuQ==",
|
"l8GDVI8w/ueHnNzdN1ODuQ==",
|
||||||
"+yz9bnYYMnC0trJZGJwf6Q=="
|
"+yz9bnYYMnC0trJZGJwf6Q==",
|
||||||
];
|
];
|
||||||
|
|
||||||
const decryptedBlocklist =
|
const decryptedBlocklist =
|
||||||
@ -100,38 +101,57 @@ const allowedKeywords = [
|
|||||||
];
|
];
|
||||||
|
|
||||||
export function isUrlBlocked(url: string): boolean {
|
export function isUrlBlocked(url: string): boolean {
|
||||||
const lowerCaseUrl = url.toLowerCase();
|
const lowerCaseUrl = url.trim().toLowerCase();
|
||||||
|
|
||||||
// Check if the URL contains any allowed keywords as whole words
|
const decryptedUrl =
|
||||||
|
decryptedBlocklist.find((decrypted) => lowerCaseUrl === decrypted) ||
|
||||||
|
lowerCaseUrl;
|
||||||
|
|
||||||
|
// If the URL is empty or invalid, return false
|
||||||
|
let parsedUrl;
|
||||||
|
try {
|
||||||
|
parsedUrl = parse(decryptedUrl);
|
||||||
|
} catch {
|
||||||
|
console.log("Error parsing URL:", url);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const domain = parsedUrl.domain;
|
||||||
|
const publicSuffix = parsedUrl.publicSuffix;
|
||||||
|
|
||||||
|
if (!domain) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if URL contains any allowed keyword
|
||||||
if (
|
if (
|
||||||
allowedKeywords.some((keyword) =>
|
allowedKeywords.some((keyword) =>
|
||||||
new RegExp(`\\b${keyword}\\b`, "i").test(lowerCaseUrl),
|
lowerCaseUrl.includes(keyword.toLowerCase()),
|
||||||
)
|
)
|
||||||
) {
|
) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
// Block exact matches
|
||||||
if (!url.startsWith("http://") && !url.startsWith("https://")) {
|
if (decryptedBlocklist.includes(domain)) {
|
||||||
url = "https://" + url;
|
return true;
|
||||||
}
|
|
||||||
|
|
||||||
const urlObj = new URL(url);
|
|
||||||
const hostname = urlObj.hostname.toLowerCase();
|
|
||||||
|
|
||||||
// Check if the URL matches any domain in the blocklist
|
|
||||||
const isBlocked = decryptedBlocklist.some((domain) => {
|
|
||||||
const domainPattern = new RegExp(
|
|
||||||
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
|
|
||||||
"i",
|
|
||||||
);
|
|
||||||
return domainPattern.test(hostname);
|
|
||||||
});
|
|
||||||
|
|
||||||
return isBlocked;
|
|
||||||
} catch (e) {
|
|
||||||
// If an error occurs (e.g., invalid URL), return false
|
|
||||||
logger.error(`Error parsing the following URL: ${url}`);
|
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Block subdomains
|
||||||
|
if (decryptedBlocklist.some((blocked) => domain.endsWith(`.${blocked}`))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Block different TLDs of the same base domain
|
||||||
|
const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com")
|
||||||
|
if (
|
||||||
|
publicSuffix &&
|
||||||
|
decryptedBlocklist.some(
|
||||||
|
(blocked) => blocked.startsWith(baseDomain) && blocked !== domain,
|
||||||
|
)
|
||||||
|
) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user