From fa581995e6702ac068cc841ceb3526b88d5f791f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
Date: Thu, 8 May 2025 20:23:35 +0200
Subject: [PATCH] feat(acuc): propagate team flags (FIR-1879) (#1522)

* feat(acuc): propagate team flags

* feat(flags): further functionality
---
 apps/api/src/controllers/auth.ts              |  6 ++-
 apps/api/src/controllers/v0/crawl.ts          |  4 +-
 apps/api/src/controllers/v0/crawlPreview.ts   |  4 +-
 apps/api/src/controllers/v0/scrape.ts         |  5 ++-
 apps/api/src/controllers/v0/search.ts         |  5 ++-
 apps/api/src/controllers/v1/batch-scrape.ts   | 17 ++++++-
 apps/api/src/controllers/v1/crawl.ts          |  2 +-
 apps/api/src/controllers/v1/extract.ts        | 11 +++++
 apps/api/src/controllers/v1/map.ts            |  6 ++-
 apps/api/src/controllers/v1/search.ts         | 10 +++--
 apps/api/src/controllers/v1/types.ts          | 10 +++--
 apps/api/src/lib/crawl-redis.ts               |  5 ++-
 .../deep-research/deep-research-service.ts    |  6 ++-
 .../api/src/lib/extract/extraction-service.ts |  4 +-
 .../extract/fire-0/extraction-service-f0.ts   |  4 ++
 .../api/src/lib/extract/fire-0/reranker-f0.ts | 10 +++--
 .../lib/extract/fire-0/url-processor-f0.ts    |  8 +++-
 apps/api/src/lib/extract/reranker.ts          | 10 +++--
 apps/api/src/lib/extract/url-processor.ts     |  7 ++-
 .../generate-llmstxt-service.ts               |  3 ++
 apps/api/src/routes/v1.ts                     |  5 ++-
 .../utils/__tests__/blocklist.test.ts         | 44 +++++++++++++------
 .../src/scraper/WebScraper/utils/blocklist.ts | 10 ++++-
 apps/api/src/services/queue-worker.ts         |  9 ++--
 24 files changed, 153 insertions(+), 52 deletions(-)

diff --git a/apps/api/src/controllers/auth.ts b/apps/api/src/controllers/auth.ts
index c9c0e4b2..60d5d73b 100644
--- a/apps/api/src/controllers/auth.ts
+++ b/apps/api/src/controllers/auth.ts
@@ -103,6 +103,7 @@ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsage
     planModifier: 0.1,
   },
   concurrency: is_extract ? 200 : 2,
+  flags: null,
   is_extract,
 });
 
@@ -137,6 +138,7 @@ const mockACUC: () => AuthCreditUsageChunk = () => ({
     planModifier: 0.1,
   },
   concurrency: 99999999,
+  flags: null,
   is_extract: false,
 });
 
@@ -181,7 +183,7 @@ export async function getACUC(
       const client =
         Math.random() > (2/3) ? supabase_rr_service : supabase_service;
       ({ data, error } = await client.rpc(
-        "auth_credit_usage_chunk_30",
+        "auth_credit_usage_chunk_32",
         { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true },
         { get: true },
       ));
@@ -298,7 +300,7 @@ export async function getACUCTeam(
       const client =
         Math.random() > (2/3) ? supabase_rr_service : supabase_service;
       ({ data, error } = await client.rpc(
-        "auth_credit_usage_chunk_30_from_team",
+        "auth_credit_usage_chunk_32_from_team",
         { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true },
         { get: true },
       ));
diff --git a/apps/api/src/controllers/v0/crawl.ts b/apps/api/src/controllers/v0/crawl.ts
index b54a14df..5755b3bb 100644
--- a/apps/api/src/controllers/v0/crawl.ts
+++ b/apps/api/src/controllers/v0/crawl.ts
@@ -115,7 +115,7 @@ export async function crawlController(req: Request, res: Response) {
         .json({ error: e.message ?? e });
     }
 
-    if (isUrlBlocked(url)) {
+    if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
       return res.status(403).json({
         error: BLOCKLISTED_URL_MESSAGE,
       });
@@ -173,7 +173,7 @@ export async function crawlController(req: Request, res: Response) {
       createdAt: Date.now(),
     };
 
-    const crawler = crawlToCrawler(id, sc);
+    const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
 
     try {
       sc.robots = await crawler.getRobotsTxt();
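Note: the two RPC bumps above (`auth_credit_usage_chunk_30` to `_32`) are what start returning `flags` on the chunk; everything below is plumbing that value outward. A minimal sketch of the pattern every controller call site follows (the `TeamFlags` shape is the one added to `types.ts` later in this patch; this snippet is illustrative, not part of the diff):

```ts
// Sketch: the flag shape and the null-hedged read used at call sites.
type TeamFlags = {
  ignoreRobots?: boolean;      // team may bypass robots.txt
  unblockedDomains?: string[]; // domains exempted from the blocklist for this team
} | null;

// Controllers never assume a chunk exists (preview/mock auth has none),
// so every call site degrades to "no special flags":
const flags: TeamFlags = auth.chunk?.flags ?? null;
if (isUrlBlocked(url, flags)) {
  // respond 403 with BLOCKLISTED_URL_MESSAGE
}
```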
diff --git a/apps/api/src/controllers/v0/crawlPreview.ts b/apps/api/src/controllers/v0/crawlPreview.ts
index abdbdda9..3f9940a2 100644
--- a/apps/api/src/controllers/v0/crawlPreview.ts
+++ b/apps/api/src/controllers/v0/crawlPreview.ts
@@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
       .json({ error: e.message ?? e });
   }
 
-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
     return res.status(403).json({
       error: BLOCKLISTED_URL_MESSAGE,
     });
@@ -112,7 +112,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
 
   await saveCrawl(id, sc);
 
-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
 
   await finishCrawlKickoff(id);
diff --git a/apps/api/src/controllers/v0/scrape.ts b/apps/api/src/controllers/v0/scrape.ts
index 55bbc691..98a35654 100644
--- a/apps/api/src/controllers/v0/scrape.ts
+++ b/apps/api/src/controllers/v0/scrape.ts
@@ -9,6 +9,7 @@ import { RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import {
   fromLegacyCombo,
+  TeamFlags,
   toLegacyDocument,
   url as urlSchema,
 } from "../v1/types";
@@ -40,6 +41,7 @@ export async function scrapeHelper(
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions,
   timeout: number,
+  flags: TeamFlags,
 ): Promise<{
   success: boolean;
   error?: string;
@@ -51,7 +53,7 @@ export async function scrapeHelper(
     return { success: false, error: "Url is required", returnCode: 400 };
   }
 
-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, flags)) {
     return {
       success: false,
       error: BLOCKLISTED_URL_MESSAGE,
@@ -241,6 +243,7 @@ export async function scrapeController(req: Request, res: Response) {
       pageOptions,
       extractorOptions,
       timeout,
+      chunk?.flags ?? null,
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
diff --git a/apps/api/src/controllers/v0/search.ts b/apps/api/src/controllers/v0/search.ts
index e216db4b..8d88bce8 100644
--- a/apps/api/src/controllers/v0/search.ts
+++ b/apps/api/src/controllers/v0/search.ts
@@ -20,6 +20,7 @@ import {
   Document,
   fromLegacyCombo,
   fromLegacyScrapeOptions,
+  TeamFlags,
   toLegacyDocument,
 } from "../v1/types";
 import { getJobFromGCS } from "../../lib/gcs-jobs";
@@ -32,6 +33,7 @@ export async function searchHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   searchOptions: SearchOptions,
+  flags: TeamFlags,
 ): Promise<{
   success: boolean;
   error?: string;
@@ -85,7 +87,7 @@ export async function searchHelper(
     return { success: true, data: res, returnCode: 200 };
   }
 
-  res = res.filter((r) => !isUrlBlocked(r.url));
+  res = res.filter((r) => !isUrlBlocked(r.url, flags));
   if (res.length > num_results) {
     res = res.slice(0, num_results);
   }
@@ -202,6 +204,7 @@ export async function searchController(req: Request, res: Response) {
       crawlerOptions,
       pageOptions,
       searchOptions,
+      chunk?.flags ?? null,
     );
     const endTime = new Date().getTime();
     const timeTakenInSeconds = (endTime - startTime) / 1000;
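Note: `scrapeHelper` and `searchHelper` take `flags` as an explicit parameter instead of re-fetching the chunk, so the v0 helpers stay free of auth lookups. A small usage sketch of the search-side filter (the domain literal here is hypothetical):

```ts
// Hypothetical team that has "blocked.example" exempted via flags.
const flags: TeamFlags = { unblockedDomains: ["blocked.example"] };

// Same line as the hunk above: blocked results are dropped per-team,
// and filtering happens before the num_results cap is applied.
res = res.filter((r) => !isUrlBlocked(r.url, flags));
// For this team, results on blocked.example survive; for a team with
// flags = null they are filtered out.
```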
diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts
index 962cd333..326aba5f 100644
--- a/apps/api/src/controllers/v1/batch-scrape.ts
+++ b/apps/api/src/controllers/v1/batch-scrape.ts
@@ -23,6 +23,8 @@ import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
 import { CostTracking } from "../../lib/extract/extraction-service";
+import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 
 export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
@@ -54,11 +56,24 @@ export async function batchScrapeController(
     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
-        urls.push(nu);
+        if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
+          urls.push(nu);
+        } else {
+          invalidURLs.push(u);
+        }
       } catch (_) {
         invalidURLs.push(u);
       }
     }
+  } else {
+    if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+      if (!res.headersSent) {
+        return res.status(403).json({
+          success: false,
+          error: BLOCKLISTED_URL_MESSAGE,
+        });
+      }
+    }
   }
 
   logger.debug("Batch scrape " + id + " starting", {
diff --git a/apps/api/src/controllers/v1/crawl.ts b/apps/api/src/controllers/v1/crawl.ts
index b60661a2..3948566f 100644
--- a/apps/api/src/controllers/v1/crawl.ts
+++ b/apps/api/src/controllers/v1/crawl.ts
@@ -89,7 +89,7 @@ export async function crawlController(
     createdAt: Date.now(),
   };
 
-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, req.acuc.flags ?? null);
 
   try {
     sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
diff --git a/apps/api/src/controllers/v1/extract.ts b/apps/api/src/controllers/v1/extract.ts
index 6782b4ed..ea842629 100644
--- a/apps/api/src/controllers/v1/extract.ts
+++ b/apps/api/src/controllers/v1/extract.ts
@@ -11,6 +11,8 @@ import { saveExtract } from "../../lib/extract/extract-redis";
 import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
 import { performExtraction } from "../../lib/extract/extraction-service";
 import { performExtraction_F0 } from "../../lib/extract/fire-0/extraction-service-f0";
+import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 
 export async function oldExtract(
   req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
@@ -58,6 +60,15 @@ export async function extractController(
   const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
   req.body = extractRequestSchema.parse(req.body);
 
+  if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+    if (!res.headersSent) {
+      return res.status(403).json({
+        success: false,
+        error: BLOCKLISTED_URL_MESSAGE,
+      });
+    }
+  }
+
   const extractId = crypto.randomUUID();
   const jobData = {
     request: req.body,
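Note: there is a behavioral asymmetry in batch-scrape worth flagging. On the ignore-invalid-URLs path (the branch guarding the loop above; the guard itself is outside the hunk), a blocked URL is folded into `invalidURLs` and the batch proceeds; on the other path, a single blocked URL 403s the whole request, same as `extractController`. A simplified sketch of the partitioning:

```ts
// Sketch: blocked URLs are treated exactly like unparseable ones
// when the caller opted into ignoring invalid URLs.
for (const u of pendingURLs) {
  try {
    const nu = urlSchema.parse(u);
    if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
      urls.push(nu);       // valid and allowed: schedule it
    } else {
      invalidURLs.push(u); // blocked: report it, don't fail the batch
    }
  } catch {
    invalidURLs.push(u);   // unparseable: same bucket
  }
}
```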
diff --git a/apps/api/src/controllers/v1/map.ts b/apps/api/src/controllers/v1/map.ts
index 455f1ed9..33e23fa1 100644
--- a/apps/api/src/controllers/v1/map.ts
+++ b/apps/api/src/controllers/v1/map.ts
@@ -5,6 +5,7 @@ import {
   mapRequestSchema,
   RequestWithAuth,
   scrapeOptions,
+  TeamFlags,
   TimeoutSignal,
 } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
@@ -56,6 +57,7 @@ export async function getMapResults({
   abort = new AbortController().signal, // noop
   mock,
   filterByPath = true,
+  flags,
 }: {
   url: string;
   search?: string;
@@ -70,6 +72,7 @@ export async function getMapResults({
   abort?: AbortSignal;
   mock?: string;
   filterByPath?: boolean;
+  flags: TeamFlags;
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
@@ -88,7 +91,7 @@ export async function getMapResults({
     createdAt: Date.now(),
   };
 
-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, flags);
 
   try {
     sc.robots = await crawler.getRobotsTxt(false, abort);
@@ -322,6 +325,7 @@ export async function mapController(
       abort: abort.signal,
       mock: req.body.useMock,
       filterByPath: req.body.filterByPath !== false,
+      flags: req.acuc.flags ?? null,
     }),
     ...(req.body.timeout !== undefined ? [
       new Promise((resolve, reject) => setTimeout(() => {
diff --git a/apps/api/src/controllers/v1/search.ts b/apps/api/src/controllers/v1/search.ts
index 9ac0104c..5e90f272 100644
--- a/apps/api/src/controllers/v1/search.ts
+++ b/apps/api/src/controllers/v1/search.ts
@@ -6,6 +6,7 @@ import {
   SearchResponse,
   searchRequestSchema,
   ScrapeOptions,
+  TeamFlags,
 } from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from "uuid";
@@ -34,6 +35,7 @@ export async function searchAndScrapeSearchResult(
   },
   logger: Logger,
   costTracking: CostTracking,
+  flags: TeamFlags,
 ): Promise<Document[]> {
   try {
     const searchResults = await search({
@@ -51,7 +53,8 @@ export async function searchAndScrapeSearchResult(
         },
         options,
         logger,
-        costTracking
+        costTracking,
+        flags
       )
     )
   );
@@ -72,6 +75,7 @@ async function scrapeSearchResult(
   },
   logger: Logger,
   costTracking: CostTracking,
+  flags: TeamFlags,
 ): Promise<Document> {
   const jobId = uuidv4();
   const jobPriority = await getJobPriority({
     team_id,
   });
@@ -80,7 +84,7 @@ async function scrapeSearchResult(
   try {
-    if (isUrlBlocked(searchResult.url)) {
+    if (isUrlBlocked(searchResult.url, flags)) {
       throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
     }
     logger.info("Adding scrape job", {
@@ -220,7 +224,7 @@ export async function searchController(
           origin: req.body.origin,
           timeout: req.body.timeout,
          scrapeOptions: req.body.scrapeOptions,
-        }, logger, costTracking),
+        }, logger, costTracking, req.acuc?.flags ?? null),
     );
 
     const docs = await Promise.all(scrapePromises);
diff --git a/apps/api/src/controllers/v1/types.ts b/apps/api/src/controllers/v1/types.ts
index 7a291b90..6e4c3f46 100644
--- a/apps/api/src/controllers/v1/types.ts
+++ b/apps/api/src/controllers/v1/types.ts
@@ -1,6 +1,5 @@
 import { Request, Response } from "express";
 import { z } from "zod";
-import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 import { countries } from "../../lib/validate-country";
 import {
@@ -10,7 +9,6 @@ import {
   Document as V0Document,
 } from "../../lib/entities";
 import { InternalOptions } from "../../scraper/scrapeURL";
-import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 
 export type Format =
   | "markdown"
@@ -49,7 +47,7 @@ export const url = z.preprocess(
         return false;
       }
     }, "Invalid URL")
-    .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
+    // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
 );
 
 const strictMessage =
@@ -914,11 +912,17 @@ export type AuthCreditUsageChunk = {
     scrapeAgentPreview?: number;
   };
   concurrency: number;
+  flags: TeamFlags;
 
   // appended on JS-side
   is_extract?: boolean;
 };
 
+export type TeamFlags = {
+  ignoreRobots?: boolean;
+  unblockedDomains?: string[];
+} | null;
+
 export type AuthCreditUsageChunkFromTeam = Omit<AuthCreditUsageChunk, "api_key">;
 
 export interface RequestWithMaybeACUC<
diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts
index a9a57239..caa95429 100644
--- a/apps/api/src/lib/crawl-redis.ts
+++ b/apps/api/src/lib/crawl-redis.ts
@@ -1,5 +1,5 @@
 import { InternalOptions } from "../scraper/scrapeURL";
-import { ScrapeOptions } from "../controllers/v1/types";
+import { ScrapeOptions, TeamFlags } from "../controllers/v1/types";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import { redisConnection } from "../services/queue-service";
 import { logger as _logger } from "./logger";
@@ -383,6 +383,7 @@ export async function lockURLsIndividually(
 export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
+  teamFlags: TeamFlags,
   newBase?: string,
   crawlerOptions?: any,
 ): WebCrawler {
@@ -403,7 +404,7 @@ export function crawlToCrawler(
     allowExternalContentLinks:
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
-    ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    ignoreRobotsTxt: teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
     maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
     currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
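Note: the `ignoreRobotsTxt` line above is the one place a team flag overrides a per-crawl option, and `??` makes that override symmetric in both directions. A worked sketch:

```ts
// Team flag wins; then the crawl's own option; then false.
const ignoreRobotsTxt =
  teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false;

// { ignoreRobots: true }  + any crawl option            -> true
// null                    + { ignoreRobotsTxt: true }   -> true
// null                    + no crawl option             -> false
// { ignoreRobots: false } + { ignoreRobotsTxt: true }   -> false,
//   because ?? only falls through on null/undefined, not on false.
```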
diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts
index 296661c9..ce50823f 100644
--- a/apps/api/src/lib/deep-research/deep-research-service.ts
+++ b/apps/api/src/lib/deep-research/deep-research-service.ts
@@ -6,7 +6,7 @@ import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
 import { ExtractOptions } from "../../controllers/v1/types";
 import { CostTracking } from "../extract/extraction-service";
-
+import { getACUCTeam } from "../../controllers/auth";
 interface DeepResearchServiceOptions {
   researchId: string;
   teamId: string;
@@ -45,6 +45,8 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
   );
   const llmService = new ResearchLLMService(logger);
 
+  const acuc = await getACUCTeam(teamId);
+
   try {
     while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) {
       logger.debug("[Deep Research] Current depth:", state.getCurrentDepth());
@@ -112,7 +114,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
             fastMode: false,
             blockAds: false,
           },
-        }, logger, costTracking);
+        }, logger, costTracking, acuc?.flags ?? null);
 
         return response.length > 0 ? response : [];
       });
diff --git a/apps/api/src/lib/extract/extraction-service.ts b/apps/api/src/lib/extract/extraction-service.ts
index fce96584..8c883a6f 100644
--- a/apps/api/src/lib/extract/extraction-service.ts
+++ b/apps/api/src/lib/extract/extraction-service.ts
@@ -36,7 +36,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
 import { normalizeUrl } from "../canonical-url";
 import { search } from "../../search";
 import { buildRephraseToSerpPrompt } from "./build-prompts";
-
+import { getACUCTeam } from "../../controllers/auth";
 interface ExtractServiceOptions {
   request: ExtractRequest;
   teamId: string;
@@ -134,6 +134,7 @@ export async function performExtraction(
   let sources: Record<string, string[]> = {};
 
   let costTracking = new CostTracking(subId ? null : 1.5);
+  const acuc = await getACUCTeam(teamId);
 
   let log = {
     extractId,
@@ -323,6 +324,7 @@ export async function performExtraction(
           },
           logger.child({ module: "extract", method: "processUrl", url }),
           costTracking,
+          acuc?.flags ?? null,
         ),
       );
diff --git a/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts b/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts
index d3ab1589..fcf133e5 100644
--- a/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts
+++ b/apps/api/src/lib/extract/fire-0/extraction-service-f0.ts
@@ -32,6 +32,7 @@ import { mixSchemaObjects_F0 } from "./helpers/mix-schema-objs-f0";
 import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0";
 import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0";
 import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
+import { getACUCTeam } from "../../../controllers/auth";
 
 interface ExtractServiceOptions {
@@ -77,6 +78,8 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
   let singleAnswerResult: any = {};
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};
+
+  const acuc = await getACUCTeam(teamId);
 
   const logger = _logger.child({
@@ -174,6 +177,7 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
           });
         },
         logger.child({ module: "extract", method: "processUrl", url }),
+        acuc?.flags ?? null,
       ),
     );
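Note: the three services above show the recurring pattern for code with no request attached: resolve the chunk by team id, then hedge. Sketched once rather than per hunk:

```ts
// Sketch: background/service-side flag resolution (deep research, extract,
// llmstxt generation, queue worker).
const acuc = await getACUCTeam(teamId); // may be null (unknown team, DB auth off)
const flags = acuc?.flags ?? null;      // degrade to "no flags", never throw
// ...then thread `flags` into processUrl / getMapResults / isUrlBlocked as usual.
```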
diff --git a/apps/api/src/lib/extract/fire-0/reranker-f0.ts b/apps/api/src/lib/extract/fire-0/reranker-f0.ts
index 155df0c6..16ad1840 100644
--- a/apps/api/src/lib/extract/fire-0/reranker-f0.ts
+++ b/apps/api/src/lib/extract/fire-0/reranker-f0.ts
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
 import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../../logger";
 import { CohereClient } from "cohere-ai";
@@ -48,6 +48,7 @@ export async function rerankLinks_F0(
   mappedLinks: MapDocument[],
   searchQuery: string,
   urlTraces: URLTrace[],
+  flags: TeamFlags,
 ): Promise<MapDocument[]> {
   // console.log("Going to rerank links");
   const mappedLinksRerank = mappedLinks.map(
@@ -65,6 +66,7 @@ export async function rerankLinks_F0(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );
 
   // If we don't have enough high-quality links, try with lower threshold
@@ -76,6 +78,7 @@ export async function rerankLinks_F0(
       mappedLinks,
       linksAndScores,
       extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+      flags,
     );
 
     if (filteredLinks.length === 0) {
@@ -89,7 +92,7 @@ export async function rerankLinks_F0(
         .map((x) => mappedLinks.find((link) => link.url === x.link))
         .filter(
           (x): x is MapDocument =>
-            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
         );
     }
   }
@@ -145,13 +148,14 @@ function filterAndProcessLinks_F0(
     originalIndex: number;
   }[],
   threshold: number,
+  flags: TeamFlags,
 ): MapDocument[] {
   return linksAndScores
     .filter((x) => x.score > threshold)
     .map((x) => mappedLinks.find((link) => link.url === x.link))
     .filter(
       (x): x is MapDocument =>
-        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
     );
 }
diff --git a/apps/api/src/lib/extract/fire-0/url-processor-f0.ts b/apps/api/src/lib/extract/fire-0/url-processor-f0.ts
index dc410167..8fe5348d 100644
--- a/apps/api/src/lib/extract/fire-0/url-processor-f0.ts
+++ b/apps/api/src/lib/extract/fire-0/url-processor-f0.ts
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
 import { getMapResults } from "../../../controllers/v1/map";
 import { removeDuplicateUrls } from "../../validateUrl";
 import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
@@ -9,6 +9,7 @@ import type { Logger } from "winston";
 import { generateText } from "ai";
 import { getModel } from "../../generic-ai";
 import { CostTracking } from "../extraction-service";
+import { getACUCTeam } from "../../../controllers/auth";
 
 export async function generateBasicCompletion_FO(prompt: string) {
   const { text } = await generateText({
@@ -34,6 +35,7 @@ export async function processUrl_F0(
   urlTraces: URLTrace[],
   updateExtractCallback: (links: string[]) => void,
   logger: Logger,
+  teamFlags: TeamFlags,
 ): Promise<string[]> {
   const trace: URLTrace = {
     url: options.url,
@@ -45,7 +47,7 @@ export async function processUrl_F0(
   urlTraces.push(trace);
 
   if (!options.url.includes("/*") && !options.allowExternalLinks) {
-    if (!isUrlBlocked(options.url)) {
+    if (!isUrlBlocked(options.url, teamFlags)) {
       trace.usedInCompletion = true;
       return [options.url];
     }
@@ -85,6 +87,7 @@ export async function processUrl_F0(
       ignoreSitemap: false,
      includeMetadata: true,
       includeSubdomains: options.includeSubdomains,
+      flags: teamFlags,
     });
 
     let mappedLinks = mapResults.mapResults as MapDocument[];
@@ -121,6 +124,7 @@ export async function processUrl_F0(
         ignoreSitemap: false,
         includeMetadata: true,
         includeSubdomains: options.includeSubdomains,
+        flags: teamFlags,
       });
 
       mappedLinks = retryMapResults.mapResults as MapDocument[];
diff --git a/apps/api/src/lib/extract/reranker.ts b/apps/api/src/lib/extract/reranker.ts
index f9169363..62b1d266 100644
--- a/apps/api/src/lib/extract/reranker.ts
+++ b/apps/api/src/lib/extract/reranker.ts
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
 import { performRanking } from "../ranker";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../logger";
@@ -57,6 +57,7 @@ export async function rerankLinks(
   mappedLinks: MapDocument[],
   searchQuery: string,
   urlTraces: URLTrace[],
+  flags: TeamFlags,
 ): Promise<MapDocument[]> {
   // console.log("Going to rerank links");
   const mappedLinksRerank = mappedLinks.map(
@@ -74,6 +75,7 @@ export async function rerankLinks(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );
 
   // If we don't have enough high-quality links, try with lower threshold
@@ -85,6 +87,7 @@ export async function rerankLinks(
      mappedLinks,
       linksAndScores,
       extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+      flags,
     );
 
     if (filteredLinks.length === 0) {
@@ -98,7 +101,7 @@ export async function rerankLinks(
         .map((x) => mappedLinks.find((link) => link.url === x.link))
         .filter(
           (x): x is MapDocument =>
-            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
         );
     }
   }
@@ -154,13 +157,14 @@ function filterAndProcessLinks(
     originalIndex: number;
   }[],
   threshold: number,
+  flags: TeamFlags,
 ): MapDocument[] {
   return linksAndScores
     .filter((x) => x.score > threshold)
     .map((x) => mappedLinks.find((link) => link.url === x.link))
     .filter(
       (x): x is MapDocument =>
-        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
     );
 }
diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts
index 6591a41b..0197f455 100644
--- a/apps/api/src/lib/extract/url-processor.ts
+++ b/apps/api/src/lib/extract/url-processor.ts
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
 import { getMapResults } from "../../controllers/v1/map";
 import { removeDuplicateUrls } from "../validateUrl";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
@@ -93,6 +93,7 @@ export async function processUrl(
   updateExtractCallback: (links: string[]) => void,
   logger: Logger,
   costTracking: CostTracking,
+  teamFlags: TeamFlags,
 ): Promise<string[]> {
   const trace: URLTrace = {
     url: options.url,
@@ -104,7 +105,7 @@ export async function processUrl(
   urlTraces.push(trace);
 
   if (!options.url.includes("/*") && !options.allowExternalLinks) {
-    if (!isUrlBlocked(options.url)) {
+    if (!isUrlBlocked(options.url, teamFlags)) {
       trace.usedInCompletion = true;
       return [options.url];
     }
@@ -144,6 +145,7 @@ export async function processUrl(
       ignoreSitemap: false,
       includeMetadata: true,
       includeSubdomains: options.includeSubdomains,
+      flags: teamFlags,
     });
 
     let mappedLinks = mapResults.mapResults as MapDocument[];
@@ -181,6 +183,7 @@ export async function processUrl(
         ignoreSitemap: false,
         includeMetadata: true,
         includeSubdomains: options.includeSubdomains,
+        flags: teamFlags,
       });
 
       mappedLinks = retryMapResults.mapResults as MapDocument[];
diff --git a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
index 7f48872e..a3b30188 100644
--- a/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
+++ b/apps/api/src/lib/generate-llmstxt/generate-llmstxt-service.ts
@@ -12,6 +12,7 @@ import { logJob } from "../../services/logging/log_job";
 import { getModel } from "../generic-ai";
 import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { CostTracking } from "../extract/extraction-service";
+import { getACUCTeam } from "../../controllers/auth";
 interface GenerateLLMsTextServiceOptions {
   generationId: string;
   teamId: string;
@@ -72,6 +73,7 @@ export async function performGenerateLlmsTxt(
     teamId,
   });
   const costTracking = new CostTracking();
+  const acuc = await getACUCTeam(teamId);
 
   try {
     // Enforce max URL limit
@@ -116,6 +118,7 @@ export async function performGenerateLlmsTxt(
       includeSubdomains: false,
       ignoreSitemap: false,
       includeMetadata: true,
+      flags: acuc?.flags ?? null,
     });
 
     if (!mapResult || !mapResult.links) {
diff --git a/apps/api/src/routes/v1.ts b/apps/api/src/routes/v1.ts
index 70a5ec7c..781ed6af 100644
--- a/apps/api/src/routes/v1.ts
+++ b/apps/api/src/routes/v1.ts
@@ -147,8 +147,8 @@ function idempotencyMiddleware(
   })().catch((err) => next(err));
 }
 
-function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
-  if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
+function blocklistMiddleware(req: RequestWithACUC, res: Response, next: NextFunction) {
+  if (typeof req.body.url === "string" && isUrlBlocked(req.body.url, req.acuc?.flags ?? null)) {
     if (!res.headersSent) {
       return res.status(403).json({
         success: false,
@@ -267,6 +267,7 @@ v1Router.get(
 v1Router.post(
   "/llmstxt",
   authMiddleware(RateLimiterMode.Scrape),
+  blocklistMiddleware,
   wrap(generateLLMsTextController),
 );
diff --git a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts
index cbba98e1..6734aff5 100644
--- a/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts
+++ b/apps/api/src/scraper/WebScraper/utils/__tests__/blocklist.test.ts
@@ -53,22 +53,25 @@ describe("isUrlBlocked function", () => {
   });
 
   test("Blocks exact domain with and without protocol", () => {
-    expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey), null)).toBe(
       true,
     );
     expect(
       isUrlBlocked(
         decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
       ),
     ).toBe(true);
   });
@@ -77,53 +80,57 @@ describe("isUrlBlocked function", () => {
     expect(
       isUrlBlocked(
         decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
      ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
+        null,
       ),
     ).toBe(true);
   });
 
   test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
-    expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey), null)).toBe(
       true,
     );
     expect(
       isUrlBlocked(
         decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
+        null,
       ),
     ).toBe(true);
   });
 
   test("Allows unrelated domains like whateverfacebook.com", () => {
-    expect(isUrlBlocked("whateverfacebook.com")).toBe(false);
-    expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false);
+    expect(isUrlBlocked("whateverfacebook.com", null)).toBe(false);
+    expect(isUrlBlocked("https://whateverfacebook.com", null)).toBe(false);
   });
 
   test("Blocks other domains from the blocklist", () => {
-    expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey), null)).toBe(
       true,
     );
   });
 
     expect(
       isUrlBlocked(
         decryptAES(
           "4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
           hashKey,
         ),
+        null,
       ),
     ).toBe(false);
     expect(
       isUrlBlocked(
         decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
+        null,
       ),
     ).toBe(false);
     expect(
       isUrlBlocked(
         decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
+        null,
       ),
     ).toBe(false);
   });
 
   test("Should return false if the URL is invalid", () => {
-    expect(isUrlBlocked("randomstring")).toBe(false);
-    expect(isUrlBlocked("htp://bad.url")).toBe(false);
-    expect(isUrlBlocked("")).toBe(false);
+    expect(isUrlBlocked("randomstring", null)).toBe(false);
+    expect(isUrlBlocked("htp://bad.url", null)).toBe(false);
+    expect(isUrlBlocked("", null)).toBe(false);
+  });
+
+  test("Should respect flags", () => {
+    const decryptedDomain = decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey);
+
+    expect(isUrlBlocked(decryptedDomain, {
+      unblockedDomains: [decryptedDomain],
+    })).toBe(false);
   });
 });
diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index 21889795..6c44682a 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -1,6 +1,7 @@
 import { configDotenv } from "dotenv";
 import crypto from "crypto";
 import { parse } from "tldts";
+import { TeamFlags } from "../../../controllers/v1/types";
 
 configDotenv();
 
@@ -101,10 +102,15 @@ export function decryptedBlocklist(list: string[]): string[] {
     : [];
 }
 
-export function isUrlBlocked(url: string): boolean {
+export function isUrlBlocked(url: string, flags: TeamFlags): boolean {
   const lowerCaseUrl = url.trim().toLowerCase();
 
-  const blockedlist = decryptedBlocklist(urlBlocklist);
+  let blockedlist = decryptedBlocklist(urlBlocklist);
+
+  if (flags?.unblockedDomains) {
+    blockedlist = blockedlist.filter((blocked) => !flags.unblockedDomains!.includes(blocked));
+  }
+
   const decryptedUrl =
     blockedlist.find((decrypted) => lowerCaseUrl === decrypted) || lowerCaseUrl;
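Note: this is the consumer side of `unblockedDomains`, matching the new test above. A quick usage sketch (the domain literal is illustrative; real entries are compared against the decrypted blocklist values):

```ts
// Per-team unblocking: a domain in flags.unblockedDomains is removed from
// that team's view of the blocklist before any matching happens.
isUrlBlocked("https://blocked.example", null); // true, if blocklisted globally
isUrlBlocked("https://blocked.example", {
  unblockedDomains: ["blocked.example"],
}); // false, for this team only
```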
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 4e56394c..1c329c96 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -80,6 +80,7 @@ import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt
 import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
 import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
 import { CostTracking } from "../lib/extract/extraction-service";
+import { getACUCTeam } from "../controllers/auth";
 
 configDotenv();
 
@@ -144,6 +145,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
         const crawler = crawlToCrawler(
           job.data.crawl_id,
           sc,
+          (await getACUCTeam(job.data.team_id))?.flags ?? null,
           sc.originUrl!,
           job.data.crawlerOptions,
         );
@@ -871,7 +873,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
   try {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
-    const crawler = crawlToCrawler(job.data.crawl_id, sc);
+    const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
 
     logger.debug("Locking URL...");
     await lockURL(job.data.crawl_id, sc, job.data.url);
@@ -1135,7 +1137,7 @@ async function processJob(job: Job & { id: string }, token: string) {
           normalizeURL(doc.metadata.sourceURL, sc) &&
         job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
       ) {
-        const crawler = crawlToCrawler(job.data.crawl_id, sc);
+        const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
         if (
           crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
             null &&
@@ -1160,7 +1162,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         await saveCrawl(job.data.crawl_id, sc);
       }
 
-      if (isUrlBlocked(doc.metadata.url)) {
+      if (isUrlBlocked(doc.metadata.url, (await getACUCTeam(job.data.team_id))?.flags ?? null)) {
        throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
       }
 
@@ -1219,6 +1221,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       const crawler = crawlToCrawler(
         job.data.crawl_id,
         sc,
+        (await getACUCTeam(job.data.team_id))?.flags ?? null,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
         job.data.crawlerOptions,
       );
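Note: taken together, the contract after this patch is that request paths read flags off the authenticated chunk while background paths re-resolve them by team id, and both always hedge with `?? null`. A condensed sketch (signatures abridged from the hunks above; `StoredCrawl` and `WebCrawler` types elided):

```ts
// Abridged signatures introduced by this patch:
declare function isUrlBlocked(url: string, flags: TeamFlags): boolean;
declare function crawlToCrawler(
  id: string,
  sc: StoredCrawl,
  teamFlags: TeamFlags,
  newBase?: string,
  crawlerOptions?: any,
): WebCrawler;

// Request context (controllers, blocklistMiddleware):
const requestFlags = req.acuc?.flags ?? null;

// Worker/service context (queue-worker, extract, deep research, llmstxt):
const workerFlags = (await getACUCTeam(job.data.team_id))?.flags ?? null;
```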