diff --git a/apps/api/src/lib/extract/build-prompts.ts b/apps/api/src/lib/extract/build-prompts.ts index 6d89eb2f..24710660 100644 --- a/apps/api/src/lib/extract/build-prompts.ts +++ b/apps/api/src/lib/extract/build-prompts.ts @@ -40,7 +40,7 @@ to determine their relevance to the user's query and intent. } export function buildRerankerUserPrompt(searchQuery: string): string { - return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relvancy score of 0.6+.`; + return `Given these URLs and their content, identify which ones are relevant to the user's extraction request: "${searchQuery}". Return an array of relevant links with their relevance scores (0-1). Higher scores should be given to URLs that directly address the user's extraction request. Be very mindful with the links you select, as if they are not that relevant it may affect the quality of the extraction. Only include URLs that have a relevancy score of 0.6+.`; } // Multi entity schema anlayzer diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts index 8144f2c8..cbba98e1 100644 --- a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts +++ b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts @@ -1,23 +1,111 @@ -import { isUrlBlocked } from "../blocklist"; +import { decryptAES, isUrlBlocked } from "../blocklist"; + +const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8"); describe("isUrlBlocked function", () => { - test("Blocks exact domain facebook.com", () => { - expect(isUrlBlocked("facebook.com")).toBe(true); - expect(isUrlBlocked("http://facebook.com")).toBe(true); - expect(isUrlBlocked("https://facebook.com")).toBe(true); + beforeAll(() => { + // Mock the decryptedBlocklist function to return known values + jest + .spyOn(require("../blocklist"), "decryptedBlocklist") + .mockReturnValue([ + "h8ngAFXUNLO3ZqQufJjGVA==", + "fEGiDm/TWDBkXUXejFVICg==", + "l6Mei7IGbEmTTFoSudUnqQ==", + "4OjallJzXRiZUAWDiC2Xww==", + "ReSvkSfx34TNEdecmmSDdQ==", + "X1E4WtdmXAv3SAX9xN925Q==", + "VTzBQfMtXZzM05mnNkWkjA==", + "m/q4Lb2Z8cxwU7/CoztOFg==", + "UbVnmRaeG+gKcyVDLAm0vg==", + "xNQhczYG22tTVc6lYE3qwg==", + "CQfGDydbg4l1swRCru6O6Q==", + "l86LQxm2NonTWMauXwEsPw==", + "6v4QDUcwjnID80G+uU+tgw==", + "pCF/6nrKZAxaYntzEGluZQ==", + "r0CRhAmQqSe7V2s3073T00sAh4WcS5779jwuGJ26ows==", + "aBOVqRFBM4UVg33usY10NdiF0HCnFH/ImtD0n+zIpc8==", + "QV436UZuQ6D0Dqrx9MwaGw==", + "OYVvrwILYbzA2mSSqOPPpw==", + "xW2i4C0Dzcnp+qu12u0SAw==", + "OLHba209l0dfl0MI4EnQonBITK9z8Qwgd/NsuaTkXmA=", + "X0VynmNjpL3PrYxpUIG7sFMBt8OlrmQWtxj8oXVu2QM=", + "ObdlM5NEkvBJ/sojRW5K/Q==", + "C8Th38X0SjsE1vL/OsD8bA==", + "PTbGg8PK/h0Seyw4HEpK4Q==", + "lZdQMknjHb7+4+sjF3qNTw==", + "LsgSq54q5oDysbva29JxnQ==", + "KZfBtpwjOpdSoqacRbz7og==", + "Indtl4yxJMHCKBGF4KABCQ==", + "e3HFXLVgxhaVoadYpwb2BA==", + "b+asgLayXQ5Jq+se+q56jA==", + "86ZDUI7vmp4MvNq3fvZrGQ==", + "sEGFoYZ6GEg4Zocd+TiyfQ==", + "6OOL72eXthgnJ1Hj4PfOQQ==", + "g/ME+Sh1CAFboKrwkVb+5Q==", + "Pw+xawUoX8xBYbX2yqqGWQ==", + "k6vBalxYFhAvkPsF19t9gQ==", + "b+asgLayXQ5Jq+se+q56jA==", + "KKttwRz4w+AMJrZcB828WQ==", + "vMdzZ33BXoyWVZnAPOBcrg==", + "l8GDVI8w/ueHnNzdN1ODuQ==", + "+yz9bnYYMnC0trJZGJwf6Q==", + ]); }); - test("Blocks subdomains of facebook.com", () => { - expect(isUrlBlocked("www.facebook.com")).toBe(true); - expect(isUrlBlocked("ads.facebook.com")).toBe(true); - expect(isUrlBlocked("business.facebook.com")).toBe(true); + test("Blocks exact domain with and without protocol", () => { + expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe( + true, + ); + expect( + isUrlBlocked( + decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey), + ), + ).toBe(true); + expect( + isUrlBlocked( + decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey), + ), + ).toBe(true); + expect( + isUrlBlocked( + decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey), + ), + ).toBe(true); }); - test("Blocks different TLDs (facebook.pt, facebook.io)", () => { - expect(isUrlBlocked("facebook.pt")).toBe(true); - expect(isUrlBlocked("facebook.io")).toBe(true); - expect(isUrlBlocked("facebook.co.uk")).toBe(true); - expect(isUrlBlocked("https://facebook.de")).toBe(true); + test("Blocks subdomains of a blocked domain", () => { + expect( + isUrlBlocked( + decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey), + ), + ).toBe(true); + expect( + isUrlBlocked( + decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey), + ), + ).toBe(true); + expect( + isUrlBlocked( + decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey), + ), + ).toBe(true); + }); + + test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => { + expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe( + true, + ); + expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe( + true, + ); + expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe( + true, + ); + expect( + isUrlBlocked( + decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey), + ), + ).toBe(true); }); test("Allows unrelated domains like whateverfacebook.com", () => { @@ -26,23 +114,39 @@ describe("isUrlBlocked function", () => { }); test("Blocks other domains from the blocklist", () => { - expect(isUrlBlocked("tiktok.com")).toBe(true); - expect(isUrlBlocked("www.tiktok.com")).toBe(true); - expect(isUrlBlocked("reddit.com")).toBe(true); - expect(isUrlBlocked("youtube.com")).toBe(true); + expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe( + true, + ); + expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe( + true, + ); + expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe( + true, + ); + expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe( + true, + ); }); - test("Allows allowed keywords URLs", () => { - expect(isUrlBlocked("https://www.facebook.com/ads/library")).toBe(false); - expect(isUrlBlocked("https://developers.facebook.com")).toBe(false); - expect(isUrlBlocked("https://library.tiktok.com")).toBe(false); - }); - - test("Handles URLs with and without protocols", () => { - expect(isUrlBlocked("facebook.com")).toBe(true); - expect(isUrlBlocked("http://facebook.com")).toBe(true); - expect(isUrlBlocked("https://facebook.com")).toBe(true); - expect(isUrlBlocked("www.facebook.com")).toBe(true); + test("Allows allowed keywords URLs [developers.*, library.*, ads.*]", () => { + expect( + isUrlBlocked( + decryptAES( + "4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB", + hashKey, + ), + ), + ).toBe(false); + expect( + isUrlBlocked( + decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey), + ), + ).toBe(false); + expect( + isUrlBlocked( + decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey), + ), + ).toBe(false); }); test("Should return false if the URL is invalid", () => { diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index f1e28892..71d1f454 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -7,7 +7,7 @@ configDotenv(); const hashKey = Buffer.from(process.env.HASH_KEY || "", "utf-8"); const algorithm = "aes-256-ecb"; -function encryptAES(plaintext: string, key: Buffer): string { +export function encryptAES(plaintext: string, key: Buffer): string { const cipher = crypto.createCipheriv(algorithm, key, null); const encrypted = Buffer.concat([ cipher.update(plaintext, "utf-8"), @@ -16,7 +16,7 @@ function encryptAES(plaintext: string, key: Buffer): string { return encrypted.toString("base64"); } -function decryptAES(ciphertext: string, key: Buffer): string { +export function decryptAES(ciphertext: string, key: Buffer): string { const decipher = crypto.createDecipheriv(algorithm, key, null); const decrypted = Buffer.concat([ decipher.update(Buffer.from(ciphertext, "base64")), @@ -62,18 +62,12 @@ const urlBlocklist = [ "g/ME+Sh1CAFboKrwkVb+5Q==", "Pw+xawUoX8xBYbX2yqqGWQ==", "k6vBalxYFhAvkPsF19t9gQ==", - "e3HFXLVgxhaVoadYpwb2BA==", "b+asgLayXQ5Jq+se+q56jA==", "KKttwRz4w+AMJrZcB828WQ==", "vMdzZ33BXoyWVZnAPOBcrg==", "l8GDVI8w/ueHnNzdN1ODuQ==", "+yz9bnYYMnC0trJZGJwf6Q==", -]; - -const decryptedBlocklist = - hashKey.length > 0 - ? urlBlocklist.map((ciphertext) => decryptAES(ciphertext, hashKey)) - : []; +] const allowedKeywords = [ "pulse", @@ -100,15 +94,22 @@ const allowedKeywords = [ "://www.facebook.com/ads/library", ]; +function decryptedBlocklist(list: string[]): string[] { + return hashKey.length > 0 + ? list.map((ciphertext) => decryptAES(ciphertext, hashKey)) + : []; +} + export function isUrlBlocked(url: string): boolean { const lowerCaseUrl = url.trim().toLowerCase(); - + + const blockedlist = decryptedBlocklist(urlBlocklist); const decryptedUrl = - decryptedBlocklist.find((decrypted) => lowerCaseUrl === decrypted) || + blockedlist.find((decrypted) => lowerCaseUrl === decrypted) || lowerCaseUrl; // If the URL is empty or invalid, return false - let parsedUrl; + let parsedUrl: any; try { parsedUrl = parse(decryptedUrl); } catch { @@ -133,12 +134,12 @@ export function isUrlBlocked(url: string): boolean { } // Block exact matches - if (decryptedBlocklist.includes(domain)) { + if (blockedlist.includes(domain)) { return true; } // Block subdomains - if (decryptedBlocklist.some((blocked) => domain.endsWith(`.${blocked}`))) { + if (blockedlist.some((blocked) => domain.endsWith(`.${blocked}`))) { return true; } @@ -146,7 +147,7 @@ export function isUrlBlocked(url: string): boolean { const baseDomain = domain.split(".")[0]; // Extract the base domain (e.g., "facebook" from "facebook.com") if ( publicSuffix && - decryptedBlocklist.some( + blockedlist.some( (blocked) => blocked.startsWith(baseDomain) && blocked !== domain, ) ) {