Merge pull request #971 from mendableai/Hash-Urls

Remove Block List
This commit is contained in:
Nicolas 2024-12-12 18:19:51 -03:00 committed by GitHub
commit 6b41916e1a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 46 additions and 21 deletions

View File

@ -123,7 +123,7 @@ function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
return res.status(403).json({
success: false,
error:
"URL is blocked intentionally. Firecrawl currently does not support social media scraping due to policy restrictions.",
"URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions.",
});
}
}

View File

@ -1,26 +1,51 @@
import { logger } from "../../../lib/logger";
import crypto from "crypto";
import { configDotenv } from "dotenv";
configDotenv();
const socialMediaBlocklist = [
"facebook.com",
"x.com",
"twitter.com",
"instagram.com",
"linkedin.com",
"snapchat.com",
"tiktok.com",
"reddit.com",
"tumblr.com",
"flickr.com",
"whatsapp.com",
"wechat.com",
"telegram.org",
"researchhub.com",
"youtube.com",
"corterix.com",
"southwest.com",
"ryanair.com",
// Bytes of the HASH_KEY environment variable (UTF-8 encoded). The buffer is
// empty when HASH_KEY is unset or an empty string.
const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8");
/** Cipher name handed to Node's crypto module by `decryptAES`. */
const algorithm = "aes-256-ecb";

/**
 * Decrypts a base64-encoded ciphertext and returns the plaintext as a UTF-8
 * string.
 *
 * @param ciphertext - Base64 text of the encrypted payload.
 * @param key - Raw key bytes passed straight to the decipher.
 * @returns The decrypted bytes decoded as UTF-8.
 *
 * Nothing is caught here: any error raised by Node's crypto module (for
 * example when the key does not fit the cipher) propagates to the caller.
 */
function decryptAES(ciphertext: string, key: Buffer): string {
  // The decipher is created with a null IV.
  const aes = crypto.createDecipheriv(algorithm, key, null);
  const encryptedBytes = Buffer.from(ciphertext, "base64");
  const head = aes.update(encryptedBytes);
  const tail = aes.final();
  const plainBytes = Buffer.concat([head, tail]);
  return plainBytes.toString("utf-8");
}
// Encrypted blocklist entries. Each string is a base64 ciphertext that is
// passed to decryptAES together with hashKey at module load; the plaintext
// values are not present in this file.
// NOTE(review): two entries below end in "==" after 43 data characters, where
// standard base64 padding would be a single "=" — presumably Node's decoder
// tolerates this; confirm they decrypt as intended.
const urlBlocklist = [
"h8ngAFXUNLO3ZqQufJjGVA==",
"fEGiDm/TWDBkXUXejFVICg==",
"l6Mei7IGbEmTTFoSudUnqQ==",
"4OjallJzXRiZUAWDiC2Xww==",
"ReSvkSfx34TNEdecmmSDdQ==",
"X1E4WtdmXAv3SAX9xN925Q==",
"VTzBQfMtXZzM05mnNkWkjA==",
"m/q4Lb2Z8cxwU7/CoztOFg==",
"UbVnmRaeG+gKcyVDLAm0vg==",
"xNQhczYG22tTVc6lYE3qwg==",
"CQfGDydbg4l1swRCru6O6Q==",
"l86LQxm2NonTWMauXwEsPw==",
"6v4QDUcwjnID80G+uU+tgw==",
"pCF/6nrKZAxaYntzEGluZQ==",
"r0CRhAmQqSe7V2s3073T00sAh4WcS5779jwuGJ26ows==",
"aBOVqRFBM4UVg33usY10NdiF0HCnFH/ImtD0n+zIpc8==",
"QV436UZuQ6D0Dqrx9MwaGw==",
"OYVvrwILYbzA2mSSqOPPpw==",
"xW2i4C0Dzcnp+qu12u0SAw==",
"OLHba209l0dfl0MI4EnQonBITK9z8Qwgd/NsuaTkXmA=",
"X0VynmNjpL3PrYxpUIG7sFMBt8OlrmQWtxj8oXVu2QM=",
"ObdlM5NEkvBJ/sojRW5K/Q==",
"C8Th38X0SjsE1vL/OsD8bA==",
"PTbGg8PK/h0Seyw4HEpK4Q==",
"lZdQMknjHb7+4+sjF3qNTw==",
"LsgSq54q5oDysbva29JxnQ==",
];
// Plaintext blocklist, built once at module load. When hashKey is empty no
// decryption is attempted and the list is empty; otherwise every entry of
// urlBlocklist is decrypted with hashKey.
const decryptedBlocklist =
  hashKey.length === 0
    ? []
    : urlBlocklist.map((entry) => decryptAES(entry, hashKey));
const allowedKeywords = [
"pulse",
"privacy",
@ -65,7 +90,7 @@ export function isUrlBlocked(url: string): boolean {
const hostname = urlObj.hostname.toLowerCase();
// Check if the URL matches any domain in the blocklist
const isBlocked = socialMediaBlocklist.some((domain) => {
const isBlocked = decryptedBlocklist.some((domain) => {
const domainPattern = new RegExp(
`(^|\\.)${domain.replace(".", "\\.")}(\\.|$)`,
"i",