feat(acuc): propagate team flags (FIR-1879) (#1522)

* feat(acuc): propagate team flags

* feat(flags): further functionality
Gergő Móricz authored on 2025-05-08 20:23:35 +02:00, committed by GitHub
parent 017a915ae8
commit fa581995e6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 153 additions and 52 deletions
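At a glance: the change adds a nullable TeamFlags object to the team's credit-usage chunk (ACUC) and threads it through every blocklist check and crawler construction. A condensed sketch of the new surface, assembled from the hunks below:

// Condensed from this diff; see the individual files for full context.
export type TeamFlags = {
  ignoreRobots?: boolean;      // team-level override for robots.txt handling
  unblockedDomains?: string[]; // blocklist entries this team may scrape anyway
} | null;

// Every URL check and crawler construction now takes the team's flags,
// normalized to null when no chunk is at hand:
//   isUrlBlocked(url, chunk?.flags ?? null)
//   crawlToCrawler(id, sc, chunk?.flags ?? null)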

View File

@ -103,6 +103,7 @@ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsage
planModifier: 0.1,
},
concurrency: is_extract ? 200 : 2,
+ flags: null,
is_extract,
});
@ -137,6 +138,7 @@ const mockACUC: () => AuthCreditUsageChunk = () => ({
planModifier: 0.1,
},
concurrency: 99999999,
+ flags: null,
is_extract: false,
});
@ -181,7 +183,7 @@ export async function getACUC(
const client =
Math.random() > (2/3) ? supabase_rr_service : supabase_service;
({ data, error } = await client.rpc(
"auth_credit_usage_chunk_30",
"auth_credit_usage_chunk_32",
{ input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true },
{ get: true },
));
@ -298,7 +300,7 @@ export async function getACUCTeam(
const client =
Math.random() > (2/3) ? supabase_rr_service : supabase_service;
({ data, error } = await client.rpc(
"auth_credit_usage_chunk_30_from_team",
"auth_credit_usage_chunk_32_from_team",
{ input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true },
{ get: true },
));
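The rename from auth_credit_usage_chunk_30 to auth_credit_usage_chunk_32 suggests the database function was reversioned to return the new flags column; the SQL itself is outside this diff. A hedged sketch of the resulting JS-side shape, matching the AuthCreditUsageChunk type changed later in this PR (the mocks above pin flags to null):

// Sketch only: assumes the _32 RPC returns `flags` alongside the existing columns.
const chunk: AuthCreditUsageChunk | null =
  data == null
    ? null
    : {
        ...data,
        flags: data.flags ?? null, // null when the team has no flags configured
        is_extract: isExtract,     // appended on the JS side, per the type's comment
      };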

View File

@ -115,7 +115,7 @@ export async function crawlController(req: Request, res: Response) {
.json({ error: e.message ?? e });
}
- if (isUrlBlocked(url)) {
+ if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
return res.status(403).json({
error: BLOCKLISTED_URL_MESSAGE,
});
@ -173,7 +173,7 @@ export async function crawlController(req: Request, res: Response) {
createdAt: Date.now(),
};
- const crawler = crawlToCrawler(id, sc);
+ const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
try {
sc.robots = await crawler.getRobotsTxt();

View File

@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
.json({ error: e.message ?? e });
}
- if (isUrlBlocked(url)) {
+ if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
return res.status(403).json({
error: BLOCKLISTED_URL_MESSAGE,
});
@ -112,7 +112,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
await saveCrawl(id, sc);
- const crawler = crawlToCrawler(id, sc);
+ const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
await finishCrawlKickoff(id);

View File

@ -9,6 +9,7 @@ import { RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job";
import {
fromLegacyCombo,
+ TeamFlags,
toLegacyDocument,
url as urlSchema,
} from "../v1/types";
@ -40,6 +41,7 @@ export async function scrapeHelper(
pageOptions: PageOptions,
extractorOptions: ExtractorOptions,
timeout: number,
+ flags: TeamFlags,
): Promise<{
success: boolean;
error?: string;
@ -51,7 +53,7 @@ export async function scrapeHelper(
return { success: false, error: "Url is required", returnCode: 400 };
}
- if (isUrlBlocked(url)) {
+ if (isUrlBlocked(url, flags)) {
return {
success: false,
error: BLOCKLISTED_URL_MESSAGE,
@ -241,6 +243,7 @@ export async function scrapeController(req: Request, res: Response) {
pageOptions,
extractorOptions,
timeout,
+ chunk?.flags ?? null,
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -20,6 +20,7 @@ import {
Document,
fromLegacyCombo,
fromLegacyScrapeOptions,
+ TeamFlags,
toLegacyDocument,
} from "../v1/types";
import { getJobFromGCS } from "../../lib/gcs-jobs";
@ -32,6 +33,7 @@ export async function searchHelper(
crawlerOptions: any,
pageOptions: PageOptions,
searchOptions: SearchOptions,
+ flags: TeamFlags,
): Promise<{
success: boolean;
error?: string;
@ -85,7 +87,7 @@ export async function searchHelper(
return { success: true, data: res, returnCode: 200 };
}
- res = res.filter((r) => !isUrlBlocked(r.url));
+ res = res.filter((r) => !isUrlBlocked(r.url, flags));
if (res.length > num_results) {
res = res.slice(0, num_results);
}
@ -202,6 +204,7 @@ export async function searchController(req: Request, res: Response) {
crawlerOptions,
pageOptions,
searchOptions,
+ chunk?.flags ?? null,
);
const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -23,6 +23,8 @@ import { addScrapeJobs } from "../../services/queue-jobs";
import { callWebhook } from "../../services/webhook";
import { logger as _logger } from "../../lib/logger";
import { CostTracking } from "../../lib/extract/extraction-service";
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
export async function batchScrapeController(
req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
@ -54,11 +56,24 @@ export async function batchScrapeController(
for (const u of pendingURLs) {
try {
const nu = urlSchema.parse(u);
- urls.push(nu);
+ if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
+   urls.push(nu);
+ } else {
+   invalidURLs.push(u);
+ }
} catch (_) {
invalidURLs.push(u);
}
}
+ } else {
+   if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+     if (!res.headersSent) {
+       return res.status(403).json({
+         success: false,
+         error: BLOCKLISTED_URL_MESSAGE,
+       });
+     }
+   }
}
logger.debug("Batch scrape " + id + " starting", {
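Worth noting: the two input paths treat blocked URLs differently. A pre-validated URL list quietly diverts blocked entries into invalidURLs and proceeds; a plain request body fails the whole call with a 403 if any URL is blocked. A minimal sketch of the list path as a hypothetical helper (the controller inlines this in its for-loop):

// Hypothetical helper mirroring the inline loop above.
function partitionByBlocklist(
  pendingURLs: string[],
  flags: TeamFlags,
): { urls: string[]; invalidURLs: string[] } {
  const urls: string[] = [];
  const invalidURLs: string[] = [];
  for (const u of pendingURLs) {
    try {
      const nu = urlSchema.parse(u); // throws on malformed input
      if (!isUrlBlocked(nu, flags)) {
        urls.push(nu);
      } else {
        invalidURLs.push(u); // blocked URLs are reported back, not scraped
      }
    } catch (_) {
      invalidURLs.push(u); // malformed URLs land in the same bucket
    }
  }
  return { urls, invalidURLs };
}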

View File

@ -89,7 +89,7 @@ export async function crawlController(
createdAt: Date.now(),
};
- const crawler = crawlToCrawler(id, sc);
+ const crawler = crawlToCrawler(id, sc, req.acuc.flags ?? null);
try {
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);

View File

@ -11,6 +11,8 @@ import { saveExtract } from "../../lib/extract/extract-redis";
import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
import { performExtraction } from "../../lib/extract/extraction-service";
import { performExtraction_F0 } from "../../lib/extract/fire-0/extraction-service-f0";
+ import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+ import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
export async function oldExtract(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
@ -58,6 +60,15 @@ export async function extractController(
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
req.body = extractRequestSchema.parse(req.body);
+ if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+   if (!res.headersSent) {
+     return res.status(403).json({
+       success: false,
+       error: BLOCKLISTED_URL_MESSAGE,
+     });
+   }
+ }
const extractId = crypto.randomUUID();
const jobData = {
request: req.body,

View File

@ -5,6 +5,7 @@ import {
mapRequestSchema,
RequestWithAuth,
scrapeOptions,
+ TeamFlags,
TimeoutSignal,
} from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
@ -56,6 +57,7 @@ export async function getMapResults({
abort = new AbortController().signal, // noop
mock,
filterByPath = true,
+ flags,
}: {
url: string;
search?: string;
@ -70,6 +72,7 @@ export async function getMapResults({
abort?: AbortSignal;
mock?: string;
filterByPath?: boolean;
+ flags: TeamFlags;
}): Promise<MapResult> {
const id = uuidv4();
let links: string[] = [url];
@ -88,7 +91,7 @@ export async function getMapResults({
createdAt: Date.now(),
};
- const crawler = crawlToCrawler(id, sc);
+ const crawler = crawlToCrawler(id, sc, flags);
try {
sc.robots = await crawler.getRobotsTxt(false, abort);
@ -322,6 +325,7 @@ export async function mapController(
abort: abort.signal,
mock: req.body.useMock,
filterByPath: req.body.filterByPath !== false,
+ flags: req.acuc.flags ?? null,
}),
...(req.body.timeout !== undefined ? [
new Promise((resolve, reject) => setTimeout(() => {

View File

@ -6,6 +6,7 @@ import {
SearchResponse,
searchRequestSchema,
ScrapeOptions,
+ TeamFlags,
} from "./types";
import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid";
@ -34,6 +35,7 @@ export async function searchAndScrapeSearchResult(
},
logger: Logger,
costTracking: CostTracking,
+ flags: TeamFlags,
): Promise<Document[]> {
try {
const searchResults = await search({
@ -51,7 +53,8 @@ export async function searchAndScrapeSearchResult(
},
options,
logger,
- costTracking
+ costTracking,
+ flags
)
)
);
@ -72,6 +75,7 @@ async function scrapeSearchResult(
},
logger: Logger,
costTracking: CostTracking,
+ flags: TeamFlags,
): Promise<Document> {
const jobId = uuidv4();
const jobPriority = await getJobPriority({
@ -80,7 +84,7 @@ async function scrapeSearchResult(
});
try {
- if (isUrlBlocked(searchResult.url)) {
+ if (isUrlBlocked(searchResult.url, flags)) {
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
}
logger.info("Adding scrape job", {
@ -220,7 +224,7 @@ export async function searchController(
origin: req.body.origin,
timeout: req.body.timeout,
scrapeOptions: req.body.scrapeOptions,
- }, logger, costTracking),
+ }, logger, costTracking, req.acuc?.flags ?? null),
);
const docs = await Promise.all(scrapePromises);

View File

@ -1,6 +1,5 @@
import { Request, Response } from "express";
import { z } from "zod";
- import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { countries } from "../../lib/validate-country";
import {
@ -10,7 +9,6 @@ import {
Document as V0Document,
} from "../../lib/entities";
import { InternalOptions } from "../../scraper/scrapeURL";
- import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export type Format =
| "markdown"
@ -49,7 +47,7 @@ export const url = z.preprocess(
return false;
}
}, "Invalid URL")
- .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
+ // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
);
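The blocklist refine on the shared url schema is commented out rather than deleted: schema parsing runs before any team context exists, so a schema-level check could never honor unblockedDomains. The check therefore moves to call sites that hold the request's ACUC chunk (controllers and blocklistMiddleware). A flag-aware schema factory would have been the hypothetical alternative, sketched below; this PR takes the call-site route instead:

// Hypothetical, not in the PR: a factory closing over the team's flags,
// at the cost of building a schema per request (and re-adding the two
// imports this file just dropped).
const urlForTeam = (flags: TeamFlags) =>
  url.refine((x) => !isUrlBlocked(x as string, flags), BLOCKLISTED_URL_MESSAGE);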
const strictMessage =
@ -914,11 +912,17 @@ export type AuthCreditUsageChunk = {
scrapeAgentPreview?: number;
};
concurrency: number;
+ flags: TeamFlags;
// appended on JS-side
is_extract?: boolean;
};
+ export type TeamFlags = {
+   ignoreRobots?: boolean;
+   unblockedDomains?: string[];
+ } | null;
export type AuthCreditUsageChunkFromTeam = Omit<AuthCreditUsageChunk, "api_key">;
export interface RequestWithMaybeACUC<

View File

@ -1,5 +1,5 @@
import { InternalOptions } from "../scraper/scrapeURL";
- import { ScrapeOptions } from "../controllers/v1/types";
+ import { ScrapeOptions, TeamFlags } from "../controllers/v1/types";
import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service";
import { logger as _logger } from "./logger";
@ -383,6 +383,7 @@ export async function lockURLsIndividually(
export function crawlToCrawler(
id: string,
sc: StoredCrawl,
+ teamFlags: TeamFlags,
newBase?: string,
crawlerOptions?: any,
): WebCrawler {
@ -403,7 +404,7 @@ export function crawlToCrawler(
allowExternalContentLinks:
sc.crawlerOptions?.allowExternalContentLinks ?? false,
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
- ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+ ignoreRobotsTxt: teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false,
regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
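The rewritten ignoreRobotsTxt line gives the team flag precedence over the per-crawl option. Note that ?? falls through only on null/undefined, never on an explicit false, so a team flag set to false also wins. A small illustration of the resolution order:

// Resolution order: team flag, then per-crawl option, then default false.
const resolveIgnoreRobots = (
  teamFlags: TeamFlags,
  crawlerOptions?: { ignoreRobotsTxt?: boolean },
): boolean =>
  teamFlags?.ignoreRobots ?? crawlerOptions?.ignoreRobotsTxt ?? false;

resolveIgnoreRobots({ ignoreRobots: true }, { ignoreRobotsTxt: false }); // => true
resolveIgnoreRobots(null, { ignoreRobotsTxt: true });                    // => true
resolveIgnoreRobots({}, {});                                             // => false (unset flag falls through)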

View File

@ -6,7 +6,7 @@ import { logJob } from "../../services/logging/log_job";
import { billTeam } from "../../services/billing/credit_billing";
import { ExtractOptions } from "../../controllers/v1/types";
import { CostTracking } from "../extract/extraction-service";
+ import { getACUCTeam } from "../../controllers/auth";
interface DeepResearchServiceOptions {
researchId: string;
teamId: string;
@ -45,6 +45,8 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
);
const llmService = new ResearchLLMService(logger);
+ const acuc = await getACUCTeam(teamId);
try {
while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) {
logger.debug("[Deep Research] Current depth:", state.getCurrentDepth());
@ -112,7 +114,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
fastMode: false,
blockAds: false,
},
- }, logger, costTracking);
+ }, logger, costTracking, acuc?.flags ?? null);
return response.length > 0 ? response : [];
});

View File

@ -36,7 +36,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
import { normalizeUrl } from "../canonical-url";
import { search } from "../../search";
import { buildRephraseToSerpPrompt } from "./build-prompts";
+ import { getACUCTeam } from "../../controllers/auth";
interface ExtractServiceOptions {
request: ExtractRequest;
teamId: string;
@ -134,6 +134,7 @@ export async function performExtraction(
let sources: Record<string, string[]> = {};
let costTracking = new CostTracking(subId ? null : 1.5);
+ const acuc = await getACUCTeam(teamId);
let log = {
extractId,
@ -323,6 +324,7 @@ export async function performExtraction(
},
logger.child({ module: "extract", method: "processUrl", url }),
costTracking,
+ acuc?.flags ?? null,
),
);

View File

@ -32,6 +32,7 @@ import { mixSchemaObjects_F0 } from "./helpers/mix-schema-objs-f0";
import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0";
import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0";
import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
+ import { getACUCTeam } from "../../../controllers/auth";
interface ExtractServiceOptions {
@ -77,6 +78,8 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
let singleAnswerResult: any = {};
let totalUrlsScraped = 0;
let sources: Record<string, string[]> = {};
+ const acuc = await getACUCTeam(teamId);
const logger = _logger.child({
@ -174,6 +177,7 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
});
},
logger.child({ module: "extract", method: "processUrl", url }),
+ acuc?.flags ?? null,
),
);

View File

@ -1,4 +1,4 @@
- import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+ import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
import { logger } from "../../logger";
import { CohereClient } from "cohere-ai";
@ -48,6 +48,7 @@ export async function rerankLinks_F0(
mappedLinks: MapDocument[],
searchQuery: string,
urlTraces: URLTrace[],
+ flags: TeamFlags,
): Promise<MapDocument[]> {
// console.log("Going to rerank links");
const mappedLinksRerank = mappedLinks.map(
@ -65,6 +66,7 @@ export async function rerankLinks_F0(
mappedLinks,
linksAndScores,
extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+ flags,
);
// If we don't have enough high-quality links, try with lower threshold
@ -76,6 +78,7 @@ export async function rerankLinks_F0(
mappedLinks,
linksAndScores,
extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+ flags,
);
if (filteredLinks.length === 0) {
@ -89,7 +92,7 @@ export async function rerankLinks_F0(
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
- x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+ x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
);
}
}
@ -145,13 +148,14 @@ function filterAndProcessLinks_F0(
originalIndex: number;
}[],
threshold: number,
+ flags: TeamFlags,
): MapDocument[] {
return linksAndScores
.filter((x) => x.score > threshold)
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
- x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+ x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
);
}

View File

@ -1,4 +1,4 @@
- import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+ import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
import { getMapResults } from "../../../controllers/v1/map";
import { removeDuplicateUrls } from "../../validateUrl";
import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
@ -9,6 +9,7 @@ import type { Logger } from "winston";
import { generateText } from "ai";
import { getModel } from "../../generic-ai";
import { CostTracking } from "../extraction-service";
+ import { getACUCTeam } from "../../../controllers/auth";
export async function generateBasicCompletion_FO(prompt: string) {
const { text } = await generateText({
@ -34,6 +35,7 @@ export async function processUrl_F0(
urlTraces: URLTrace[],
updateExtractCallback: (links: string[]) => void,
logger: Logger,
+ teamFlags: TeamFlags,
): Promise<string[]> {
const trace: URLTrace = {
url: options.url,
@ -45,7 +47,7 @@ export async function processUrl_F0(
urlTraces.push(trace);
if (!options.url.includes("/*") && !options.allowExternalLinks) {
- if (!isUrlBlocked(options.url)) {
+ if (!isUrlBlocked(options.url, teamFlags)) {
trace.usedInCompletion = true;
return [options.url];
}
@ -85,6 +87,7 @@ export async function processUrl_F0(
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: options.includeSubdomains,
+ flags: teamFlags,
});
let mappedLinks = mapResults.mapResults as MapDocument[];
@ -121,6 +124,7 @@ export async function processUrl_F0(
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: options.includeSubdomains,
+ flags: teamFlags,
});
mappedLinks = retryMapResults.mapResults as MapDocument[];

View File

@ -1,4 +1,4 @@
- import { MapDocument, URLTrace } from "../../controllers/v1/types";
+ import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
import { performRanking } from "../ranker";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { logger } from "../logger";
@ -57,6 +57,7 @@ export async function rerankLinks(
mappedLinks: MapDocument[],
searchQuery: string,
urlTraces: URLTrace[],
+ flags: TeamFlags,
): Promise<MapDocument[]> {
// console.log("Going to rerank links");
const mappedLinksRerank = mappedLinks.map(
@ -74,6 +75,7 @@ export async function rerankLinks(
mappedLinks,
linksAndScores,
extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+ flags,
);
// If we don't have enough high-quality links, try with lower threshold
@ -85,6 +87,7 @@ export async function rerankLinks(
mappedLinks,
linksAndScores,
extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+ flags,
);
if (filteredLinks.length === 0) {
@ -98,7 +101,7 @@ export async function rerankLinks(
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
- x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+ x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
);
}
}
@ -154,13 +157,14 @@ function filterAndProcessLinks(
originalIndex: number;
}[],
threshold: number,
+ flags: TeamFlags,
): MapDocument[] {
return linksAndScores
.filter((x) => x.score > threshold)
.map((x) => mappedLinks.find((link) => link.url === x.link))
.filter(
(x): x is MapDocument =>
- x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+ x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
);
}

View File

@ -1,4 +1,4 @@
- import { MapDocument, URLTrace } from "../../controllers/v1/types";
+ import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
import { getMapResults } from "../../controllers/v1/map";
import { removeDuplicateUrls } from "../validateUrl";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
@ -93,6 +93,7 @@ export async function processUrl(
updateExtractCallback: (links: string[]) => void,
logger: Logger,
costTracking: CostTracking,
+ teamFlags: TeamFlags,
): Promise<string[]> {
const trace: URLTrace = {
url: options.url,
@ -104,7 +105,7 @@ export async function processUrl(
urlTraces.push(trace);
if (!options.url.includes("/*") && !options.allowExternalLinks) {
- if (!isUrlBlocked(options.url)) {
+ if (!isUrlBlocked(options.url, teamFlags)) {
trace.usedInCompletion = true;
return [options.url];
}
@ -144,6 +145,7 @@ export async function processUrl(
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: options.includeSubdomains,
+ flags: teamFlags,
});
let mappedLinks = mapResults.mapResults as MapDocument[];
@ -181,6 +183,7 @@ export async function processUrl(
ignoreSitemap: false,
includeMetadata: true,
includeSubdomains: options.includeSubdomains,
+ flags: teamFlags,
});
mappedLinks = retryMapResults.mapResults as MapDocument[];

View File

@ -12,6 +12,7 @@ import { logJob } from "../../services/logging/log_job";
import { getModel } from "../generic-ai";
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { CostTracking } from "../extract/extraction-service";
+ import { getACUCTeam } from "../../controllers/auth";
interface GenerateLLMsTextServiceOptions {
generationId: string;
teamId: string;
@ -72,6 +73,7 @@ export async function performGenerateLlmsTxt(
teamId,
});
const costTracking = new CostTracking();
+ const acuc = await getACUCTeam(teamId);
try {
// Enforce max URL limit
@ -116,6 +118,7 @@ export async function performGenerateLlmsTxt(
includeSubdomains: false,
ignoreSitemap: false,
includeMetadata: true,
+ flags: acuc?.flags ?? null,
});
if (!mapResult || !mapResult.links) {

View File

@ -147,8 +147,8 @@ function idempotencyMiddleware(
})().catch((err) => next(err));
}
- function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
-   if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
+ function blocklistMiddleware(req: RequestWithACUC<any, any, any>, res: Response, next: NextFunction) {
+   if (typeof req.body.url === "string" && isUrlBlocked(req.body.url, req.acuc?.flags ?? null)) {
if (!res.headersSent) {
return res.status(403).json({
success: false,
@ -267,6 +267,7 @@ v1Router.get(
v1Router.post(
"/llmstxt",
authMiddleware(RateLimiterMode.Scrape),
+ blocklistMiddleware,
wrap(generateLLMsTextController),
);

View File

@ -53,22 +53,25 @@ describe("isUrlBlocked function", () => {
});
test("Blocks exact domain with and without protocol", () => {
expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey), null)).toBe(
true,
);
expect(
isUrlBlocked(
decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
+ null,
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
+ null,
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+ null,
),
).toBe(true);
});
@ -77,53 +80,57 @@ describe("isUrlBlocked function", () => {
expect(
isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+ null,
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
+ null,
),
).toBe(true);
expect(
isUrlBlocked(
decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
+ null,
),
).toBe(true);
});
test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey), null)).toBe(
true,
);
expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey), null)).toBe(
true,
);
expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey), null)).toBe(
true,
);
expect(
isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
+ null,
),
).toBe(true);
});
test("Allows unrelated domains like whateverfacebook.com", () => {
expect(isUrlBlocked("whateverfacebook.com")).toBe(false);
expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false);
expect(isUrlBlocked("whateverfacebook.com", null)).toBe(false);
expect(isUrlBlocked("https://whateverfacebook.com", null)).toBe(false);
});
test("Blocks other domains from the blocklist", () => {
expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey), null)).toBe(
true,
);
expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey), null)).toBe(
true,
);
expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey), null)).toBe(
true,
);
expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey), null)).toBe(
true,
);
});
@ -135,23 +142,34 @@ describe("isUrlBlocked function", () => {
"4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
hashKey,
),
+ null,
),
).toBe(false);
expect(
isUrlBlocked(
decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
+ null,
),
).toBe(false);
expect(
isUrlBlocked(
decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
+ null,
),
).toBe(false);
});
test("Should return false if the URL is invalid", () => {
expect(isUrlBlocked("randomstring")).toBe(false);
expect(isUrlBlocked("htp://bad.url")).toBe(false);
expect(isUrlBlocked("")).toBe(false);
expect(isUrlBlocked("randomstring", null)).toBe(false);
expect(isUrlBlocked("htp://bad.url", null)).toBe(false);
expect(isUrlBlocked("", null)).toBe(false);
});
test("Should respect flags", () => {
const decryptedDomain = decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey);
expect(isUrlBlocked(decryptedDomain, {
unblockedDomains: [decryptedDomain],
})).toBe(false);
});
});

View File

@ -1,6 +1,7 @@
import { configDotenv } from "dotenv";
import crypto from "crypto";
import { parse } from "tldts";
+ import { TeamFlags } from "../../../controllers/v1/types";
configDotenv();
@ -101,10 +102,15 @@ export function decryptedBlocklist(list: string[]): string[] {
: [];
}
- export function isUrlBlocked(url: string): boolean {
+ export function isUrlBlocked(url: string, flags: TeamFlags): boolean {
const lowerCaseUrl = url.trim().toLowerCase();
- const blockedlist = decryptedBlocklist(urlBlocklist);
+ let blockedlist = decryptedBlocklist(urlBlocklist);
+ if (flags?.unblockedDomains) {
+   blockedlist = blockedlist.filter((blocked) => !flags.unblockedDomains!.includes(blocked));
+ }
const decryptedUrl =
blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
lowerCaseUrl;
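The effective blocklist is now computed per call: entries named in the team's unblockedDomains are filtered out before matching, so an unblock entry must equal the decrypted blocklist string exactly. A usage sketch (the domain is illustrative, since real blocklist entries are stored AES-encrypted):

// "blocked-example.com" stands in for a decrypted blocklist entry.
isUrlBlocked("https://blocked-example.com/page", null);
// => true while the domain is on the blocklist

isUrlBlocked("https://blocked-example.com/page", {
  unblockedDomains: ["blocked-example.com"],
});
// => false: the entry is dropped from this team's effective blocklist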

View File

@ -80,6 +80,7 @@ import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt
import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
import { CostTracking } from "../lib/extract/extraction-service";
+ import { getACUCTeam } from "../controllers/auth";
configDotenv();
@ -144,6 +145,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const crawler = crawlToCrawler(
job.data.crawl_id,
sc,
+ (await getACUCTeam(job.data.team_id))?.flags ?? null,
sc.originUrl!,
job.data.crawlerOptions,
);
@ -871,7 +873,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
try {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
- const crawler = crawlToCrawler(job.data.crawl_id, sc);
+ const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
logger.debug("Locking URL...");
await lockURL(job.data.crawl_id, sc, job.data.url);
@ -1135,7 +1137,7 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.sourceURL, sc) &&
job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
) {
- const crawler = crawlToCrawler(job.data.crawl_id, sc);
+ const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
if (
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
null &&
@ -1160,7 +1162,7 @@ async function processJob(job: Job & { id: string }, token: string) {
await saveCrawl(job.data.crawl_id, sc);
}
- if (isUrlBlocked(doc.metadata.url)) {
+ if (isUrlBlocked(doc.metadata.url, (await getACUCTeam(job.data.team_id))?.flags ?? null)) {
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
}
@ -1219,6 +1221,7 @@ async function processJob(job: Job & { id: string }, token: string) {
const crawler = crawlToCrawler(
job.data.crawl_id,
sc,
+ (await getACUCTeam(job.data.team_id))?.flags ?? null,
doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
job.data.crawlerOptions,
);
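In the worker, jobs carry only a team id, so flags are re-resolved through getACUCTeam at each use. A tiny hypothetical wrapper capturing the repeated pattern (this diff inlines it at every call site):

// Hypothetical helper, not part of the diff.
async function teamFlags(teamId: string): Promise<TeamFlags> {
  return (await getACUCTeam(teamId))?.flags ?? null;
}

// e.g. const crawler = crawlToCrawler(job.data.crawl_id, sc, await teamFlags(job.data.team_id));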