feat(acuc): propagate team flags (FIR-1879) (#1522)

* feat(acuc): propagate team flags

* feat(flags): further functionality
This commit is contained in:
Gergő Móricz 2025-05-08 20:23:35 +02:00 committed by GitHub
parent 017a915ae8
commit fa581995e6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
24 changed files with 153 additions and 52 deletions

View File

@ -103,6 +103,7 @@ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsage
planModifier: 0.1, planModifier: 0.1,
}, },
concurrency: is_extract ? 200 : 2, concurrency: is_extract ? 200 : 2,
flags: null,
is_extract, is_extract,
}); });
@ -137,6 +138,7 @@ const mockACUC: () => AuthCreditUsageChunk = () => ({
planModifier: 0.1, planModifier: 0.1,
}, },
concurrency: 99999999, concurrency: 99999999,
flags: null,
is_extract: false, is_extract: false,
}); });
@ -181,7 +183,7 @@ export async function getACUC(
const client = const client =
Math.random() > (2/3) ? supabase_rr_service : supabase_service; Math.random() > (2/3) ? supabase_rr_service : supabase_service;
({ data, error } = await client.rpc( ({ data, error } = await client.rpc(
"auth_credit_usage_chunk_30", "auth_credit_usage_chunk_32",
{ input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true }, { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true },
{ get: true }, { get: true },
)); ));
@ -298,7 +300,7 @@ export async function getACUCTeam(
const client = const client =
Math.random() > (2/3) ? supabase_rr_service : supabase_service; Math.random() > (2/3) ? supabase_rr_service : supabase_service;
({ data, error } = await client.rpc( ({ data, error } = await client.rpc(
"auth_credit_usage_chunk_30_from_team", "auth_credit_usage_chunk_32_from_team",
{ input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true }, { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true },
{ get: true }, { get: true },
)); ));

View File

@ -115,7 +115,7 @@ export async function crawlController(req: Request, res: Response) {
.json({ error: e.message ?? e }); .json({ error: e.message ?? e });
} }
if (isUrlBlocked(url)) { if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
return res.status(403).json({ return res.status(403).json({
error: BLOCKLISTED_URL_MESSAGE, error: BLOCKLISTED_URL_MESSAGE,
}); });
@ -173,7 +173,7 @@ export async function crawlController(req: Request, res: Response) {
createdAt: Date.now(), createdAt: Date.now(),
}; };
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
try { try {
sc.robots = await crawler.getRobotsTxt(); sc.robots = await crawler.getRobotsTxt();

View File

@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
.json({ error: e.message ?? e }); .json({ error: e.message ?? e });
} }
if (isUrlBlocked(url)) { if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
return res.status(403).json({ return res.status(403).json({
error: BLOCKLISTED_URL_MESSAGE, error: BLOCKLISTED_URL_MESSAGE,
}); });
@ -112,7 +112,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
await saveCrawl(id, sc); await saveCrawl(id, sc);
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
await finishCrawlKickoff(id); await finishCrawlKickoff(id);

View File

@ -9,6 +9,7 @@ import { RateLimiterMode } from "../../types";
import { logJob } from "../../services/logging/log_job"; import { logJob } from "../../services/logging/log_job";
import { import {
fromLegacyCombo, fromLegacyCombo,
TeamFlags,
toLegacyDocument, toLegacyDocument,
url as urlSchema, url as urlSchema,
} from "../v1/types"; } from "../v1/types";
@ -40,6 +41,7 @@ export async function scrapeHelper(
pageOptions: PageOptions, pageOptions: PageOptions,
extractorOptions: ExtractorOptions, extractorOptions: ExtractorOptions,
timeout: number, timeout: number,
flags: TeamFlags,
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
@ -51,7 +53,7 @@ export async function scrapeHelper(
return { success: false, error: "Url is required", returnCode: 400 }; return { success: false, error: "Url is required", returnCode: 400 };
} }
if (isUrlBlocked(url)) { if (isUrlBlocked(url, flags)) {
return { return {
success: false, success: false,
error: BLOCKLISTED_URL_MESSAGE, error: BLOCKLISTED_URL_MESSAGE,
@ -241,6 +243,7 @@ export async function scrapeController(req: Request, res: Response) {
pageOptions, pageOptions,
extractorOptions, extractorOptions,
timeout, timeout,
chunk?.flags ?? null,
); );
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -20,6 +20,7 @@ import {
Document, Document,
fromLegacyCombo, fromLegacyCombo,
fromLegacyScrapeOptions, fromLegacyScrapeOptions,
TeamFlags,
toLegacyDocument, toLegacyDocument,
} from "../v1/types"; } from "../v1/types";
import { getJobFromGCS } from "../../lib/gcs-jobs"; import { getJobFromGCS } from "../../lib/gcs-jobs";
@ -32,6 +33,7 @@ export async function searchHelper(
crawlerOptions: any, crawlerOptions: any,
pageOptions: PageOptions, pageOptions: PageOptions,
searchOptions: SearchOptions, searchOptions: SearchOptions,
flags: TeamFlags,
): Promise<{ ): Promise<{
success: boolean; success: boolean;
error?: string; error?: string;
@ -85,7 +87,7 @@ export async function searchHelper(
return { success: true, data: res, returnCode: 200 }; return { success: true, data: res, returnCode: 200 };
} }
res = res.filter((r) => !isUrlBlocked(r.url)); res = res.filter((r) => !isUrlBlocked(r.url, flags));
if (res.length > num_results) { if (res.length > num_results) {
res = res.slice(0, num_results); res = res.slice(0, num_results);
} }
@ -202,6 +204,7 @@ export async function searchController(req: Request, res: Response) {
crawlerOptions, crawlerOptions,
pageOptions, pageOptions,
searchOptions, searchOptions,
chunk?.flags ?? null,
); );
const endTime = new Date().getTime(); const endTime = new Date().getTime();
const timeTakenInSeconds = (endTime - startTime) / 1000; const timeTakenInSeconds = (endTime - startTime) / 1000;

View File

@ -23,6 +23,8 @@ import { addScrapeJobs } from "../../services/queue-jobs";
import { callWebhook } from "../../services/webhook"; import { callWebhook } from "../../services/webhook";
import { logger as _logger } from "../../lib/logger"; import { logger as _logger } from "../../lib/logger";
import { CostTracking } from "../../lib/extract/extraction-service"; import { CostTracking } from "../../lib/extract/extraction-service";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
export async function batchScrapeController( export async function batchScrapeController(
req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>, req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
@ -54,11 +56,24 @@ export async function batchScrapeController(
for (const u of pendingURLs) { for (const u of pendingURLs) {
try { try {
const nu = urlSchema.parse(u); const nu = urlSchema.parse(u);
if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
urls.push(nu); urls.push(nu);
} else {
invalidURLs.push(u);
}
} catch (_) { } catch (_) {
invalidURLs.push(u); invalidURLs.push(u);
} }
} }
} else {
if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
if (!res.headersSent) {
return res.status(403).json({
success: false,
error: BLOCKLISTED_URL_MESSAGE,
});
}
}
} }
logger.debug("Batch scrape " + id + " starting", { logger.debug("Batch scrape " + id + " starting", {

View File

@ -89,7 +89,7 @@ export async function crawlController(
createdAt: Date.now(), createdAt: Date.now(),
}; };
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc, req.acuc.flags ?? null);
try { try {
sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification); sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);

View File

@ -11,6 +11,8 @@ import { saveExtract } from "../../lib/extract/extract-redis";
import { getTeamIdSyncB } from "../../lib/extract/team-id-sync"; import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
import { performExtraction } from "../../lib/extract/extraction-service"; import { performExtraction } from "../../lib/extract/extraction-service";
import { performExtraction_F0 } from "../../lib/extract/fire-0/extraction-service-f0"; import { performExtraction_F0 } from "../../lib/extract/fire-0/extraction-service-f0";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
export async function oldExtract( export async function oldExtract(
req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>, req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
@ -58,6 +60,15 @@ export async function extractController(
const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true"; const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
req.body = extractRequestSchema.parse(req.body); req.body = extractRequestSchema.parse(req.body);
if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
if (!res.headersSent) {
return res.status(403).json({
success: false,
error: BLOCKLISTED_URL_MESSAGE,
});
}
}
const extractId = crypto.randomUUID(); const extractId = crypto.randomUUID();
const jobData = { const jobData = {
request: req.body, request: req.body,

View File

@ -5,6 +5,7 @@ import {
mapRequestSchema, mapRequestSchema,
RequestWithAuth, RequestWithAuth,
scrapeOptions, scrapeOptions,
TeamFlags,
TimeoutSignal, TimeoutSignal,
} from "./types"; } from "./types";
import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
@ -56,6 +57,7 @@ export async function getMapResults({
abort = new AbortController().signal, // noop abort = new AbortController().signal, // noop
mock, mock,
filterByPath = true, filterByPath = true,
flags,
}: { }: {
url: string; url: string;
search?: string; search?: string;
@ -70,6 +72,7 @@ export async function getMapResults({
abort?: AbortSignal; abort?: AbortSignal;
mock?: string; mock?: string;
filterByPath?: boolean; filterByPath?: boolean;
flags: TeamFlags;
}): Promise<MapResult> { }): Promise<MapResult> {
const id = uuidv4(); const id = uuidv4();
let links: string[] = [url]; let links: string[] = [url];
@ -88,7 +91,7 @@ export async function getMapResults({
createdAt: Date.now(), createdAt: Date.now(),
}; };
const crawler = crawlToCrawler(id, sc); const crawler = crawlToCrawler(id, sc, flags);
try { try {
sc.robots = await crawler.getRobotsTxt(false, abort); sc.robots = await crawler.getRobotsTxt(false, abort);
@ -322,6 +325,7 @@ export async function mapController(
abort: abort.signal, abort: abort.signal,
mock: req.body.useMock, mock: req.body.useMock,
filterByPath: req.body.filterByPath !== false, filterByPath: req.body.filterByPath !== false,
flags: req.acuc.flags ?? null,
}), }),
...(req.body.timeout !== undefined ? [ ...(req.body.timeout !== undefined ? [
new Promise((resolve, reject) => setTimeout(() => { new Promise((resolve, reject) => setTimeout(() => {

View File

@ -6,6 +6,7 @@ import {
SearchResponse, SearchResponse,
searchRequestSchema, searchRequestSchema,
ScrapeOptions, ScrapeOptions,
TeamFlags,
} from "./types"; } from "./types";
import { billTeam } from "../../services/billing/credit_billing"; import { billTeam } from "../../services/billing/credit_billing";
import { v4 as uuidv4 } from "uuid"; import { v4 as uuidv4 } from "uuid";
@ -34,6 +35,7 @@ export async function searchAndScrapeSearchResult(
}, },
logger: Logger, logger: Logger,
costTracking: CostTracking, costTracking: CostTracking,
flags: TeamFlags,
): Promise<Document[]> { ): Promise<Document[]> {
try { try {
const searchResults = await search({ const searchResults = await search({
@ -51,7 +53,8 @@ export async function searchAndScrapeSearchResult(
}, },
options, options,
logger, logger,
costTracking costTracking,
flags
) )
) )
); );
@ -72,6 +75,7 @@ async function scrapeSearchResult(
}, },
logger: Logger, logger: Logger,
costTracking: CostTracking, costTracking: CostTracking,
flags: TeamFlags,
): Promise<Document> { ): Promise<Document> {
const jobId = uuidv4(); const jobId = uuidv4();
const jobPriority = await getJobPriority({ const jobPriority = await getJobPriority({
@ -80,7 +84,7 @@ async function scrapeSearchResult(
}); });
try { try {
if (isUrlBlocked(searchResult.url)) { if (isUrlBlocked(searchResult.url, flags)) {
throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE); throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
} }
logger.info("Adding scrape job", { logger.info("Adding scrape job", {
@ -220,7 +224,7 @@ export async function searchController(
origin: req.body.origin, origin: req.body.origin,
timeout: req.body.timeout, timeout: req.body.timeout,
scrapeOptions: req.body.scrapeOptions, scrapeOptions: req.body.scrapeOptions,
}, logger, costTracking), }, logger, costTracking, req.acuc?.flags ?? null),
); );
const docs = await Promise.all(scrapePromises); const docs = await Promise.all(scrapePromises);

View File

@ -1,6 +1,5 @@
import { Request, Response } from "express"; import { Request, Response } from "express";
import { z } from "zod"; import { z } from "zod";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { protocolIncluded, checkUrl } from "../../lib/validateUrl"; import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
import { countries } from "../../lib/validate-country"; import { countries } from "../../lib/validate-country";
import { import {
@ -10,7 +9,6 @@ import {
Document as V0Document, Document as V0Document,
} from "../../lib/entities"; } from "../../lib/entities";
import { InternalOptions } from "../../scraper/scrapeURL"; import { InternalOptions } from "../../scraper/scrapeURL";
import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
export type Format = export type Format =
| "markdown" | "markdown"
@ -49,7 +47,7 @@ export const url = z.preprocess(
return false; return false;
} }
}, "Invalid URL") }, "Invalid URL")
.refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE), // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
); );
const strictMessage = const strictMessage =
@ -914,11 +912,17 @@ export type AuthCreditUsageChunk = {
scrapeAgentPreview?: number; scrapeAgentPreview?: number;
}; };
concurrency: number; concurrency: number;
flags: TeamFlags;
// appended on JS-side // appended on JS-side
is_extract?: boolean; is_extract?: boolean;
}; };
/**
 * Per-team feature flags carried on `AuthCreditUsageChunk.flags`.
 *
 * `null` means no flags are available for the team; callers that may lack
 * an auth chunk normalize to `null` (e.g. `acuc?.flags ?? null`).
 */
export type TeamFlags = {
/**
 * When set, `crawlToCrawler` uses this value for the crawler's
 * `ignoreRobotsTxt` in preference to the crawl's own
 * `crawlerOptions.ignoreRobotsTxt`.
 * NOTE(review): the override uses `??`, so an explicit `false` here also
 * wins over a crawl that requested `ignoreRobotsTxt: true` — confirm intended.
 */
ignoreRobots?: boolean;
/**
 * Passed to `isUrlBlocked` as part of the flags argument.
 * Presumably a list of domains exempted from the blocklist for this team —
 * TODO confirm against the `isUrlBlocked` implementation.
 */
unblockedDomains?: string[];
} | null;
export type AuthCreditUsageChunkFromTeam = Omit<AuthCreditUsageChunk, "api_key">; export type AuthCreditUsageChunkFromTeam = Omit<AuthCreditUsageChunk, "api_key">;
export interface RequestWithMaybeACUC< export interface RequestWithMaybeACUC<

View File

@ -1,5 +1,5 @@
import { InternalOptions } from "../scraper/scrapeURL"; import { InternalOptions } from "../scraper/scrapeURL";
import { ScrapeOptions } from "../controllers/v1/types"; import { ScrapeOptions, TeamFlags } from "../controllers/v1/types";
import { WebCrawler } from "../scraper/WebScraper/crawler"; import { WebCrawler } from "../scraper/WebScraper/crawler";
import { redisConnection } from "../services/queue-service"; import { redisConnection } from "../services/queue-service";
import { logger as _logger } from "./logger"; import { logger as _logger } from "./logger";
@ -383,6 +383,7 @@ export async function lockURLsIndividually(
export function crawlToCrawler( export function crawlToCrawler(
id: string, id: string,
sc: StoredCrawl, sc: StoredCrawl,
teamFlags: TeamFlags,
newBase?: string, newBase?: string,
crawlerOptions?: any, crawlerOptions?: any,
): WebCrawler { ): WebCrawler {
@ -403,7 +404,7 @@ export function crawlToCrawler(
allowExternalContentLinks: allowExternalContentLinks:
sc.crawlerOptions?.allowExternalContentLinks ?? false, sc.crawlerOptions?.allowExternalContentLinks ?? false,
allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false, allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false, ignoreRobotsTxt: teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false,
regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false, regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth, maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0, currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,

View File

@ -6,7 +6,7 @@ import { logJob } from "../../services/logging/log_job";
import { billTeam } from "../../services/billing/credit_billing"; import { billTeam } from "../../services/billing/credit_billing";
import { ExtractOptions } from "../../controllers/v1/types"; import { ExtractOptions } from "../../controllers/v1/types";
import { CostTracking } from "../extract/extraction-service"; import { CostTracking } from "../extract/extraction-service";
import { getACUCTeam } from "../../controllers/auth";
interface DeepResearchServiceOptions { interface DeepResearchServiceOptions {
researchId: string; researchId: string;
teamId: string; teamId: string;
@ -45,6 +45,8 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
); );
const llmService = new ResearchLLMService(logger); const llmService = new ResearchLLMService(logger);
const acuc = await getACUCTeam(teamId);
try { try {
while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) { while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) {
logger.debug("[Deep Research] Current depth:", state.getCurrentDepth()); logger.debug("[Deep Research] Current depth:", state.getCurrentDepth());
@ -112,7 +114,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
fastMode: false, fastMode: false,
blockAds: false, blockAds: false,
}, },
}, logger, costTracking); }, logger, costTracking, acuc?.flags ?? null);
return response.length > 0 ? response : []; return response.length > 0 ? response : [];
}); });

View File

@ -36,7 +36,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
import { normalizeUrl } from "../canonical-url"; import { normalizeUrl } from "../canonical-url";
import { search } from "../../search"; import { search } from "../../search";
import { buildRephraseToSerpPrompt } from "./build-prompts"; import { buildRephraseToSerpPrompt } from "./build-prompts";
import { getACUCTeam } from "../../controllers/auth";
interface ExtractServiceOptions { interface ExtractServiceOptions {
request: ExtractRequest; request: ExtractRequest;
teamId: string; teamId: string;
@ -134,6 +134,7 @@ export async function performExtraction(
let sources: Record<string, string[]> = {}; let sources: Record<string, string[]> = {};
let costTracking = new CostTracking(subId ? null : 1.5); let costTracking = new CostTracking(subId ? null : 1.5);
const acuc = await getACUCTeam(teamId);
let log = { let log = {
extractId, extractId,
@ -323,6 +324,7 @@ export async function performExtraction(
}, },
logger.child({ module: "extract", method: "processUrl", url }), logger.child({ module: "extract", method: "processUrl", url }),
costTracking, costTracking,
acuc?.flags ?? null,
), ),
); );

View File

@ -32,6 +32,7 @@ import { mixSchemaObjects_F0 } from "./helpers/mix-schema-objs-f0";
import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0"; import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0";
import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0"; import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0";
import { SourceTracker_F0 } from "./helpers/source-tracker-f0"; import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
import { getACUCTeam } from "../../../controllers/auth";
interface ExtractServiceOptions { interface ExtractServiceOptions {
@ -78,6 +79,8 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
let totalUrlsScraped = 0; let totalUrlsScraped = 0;
let sources: Record<string, string[]> = {}; let sources: Record<string, string[]> = {};
const acuc = await getACUCTeam(teamId);
const logger = _logger.child({ const logger = _logger.child({
module: "extract", module: "extract",
@ -174,6 +177,7 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
}); });
}, },
logger.child({ module: "extract", method: "processUrl", url }), logger.child({ module: "extract", method: "processUrl", url }),
acuc?.flags ?? null,
), ),
); );

View File

@ -1,4 +1,4 @@
import { MapDocument, URLTrace } from "../../../controllers/v1/types"; import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
import { logger } from "../../logger"; import { logger } from "../../logger";
import { CohereClient } from "cohere-ai"; import { CohereClient } from "cohere-ai";
@ -48,6 +48,7 @@ export async function rerankLinks_F0(
mappedLinks: MapDocument[], mappedLinks: MapDocument[],
searchQuery: string, searchQuery: string,
urlTraces: URLTrace[], urlTraces: URLTrace[],
flags: TeamFlags,
): Promise<MapDocument[]> { ): Promise<MapDocument[]> {
// console.log("Going to rerank links"); // console.log("Going to rerank links");
const mappedLinksRerank = mappedLinks.map( const mappedLinksRerank = mappedLinks.map(
@ -65,6 +66,7 @@ export async function rerankLinks_F0(
mappedLinks, mappedLinks,
linksAndScores, linksAndScores,
extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE, extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
flags,
); );
// If we don't have enough high-quality links, try with lower threshold // If we don't have enough high-quality links, try with lower threshold
@ -76,6 +78,7 @@ export async function rerankLinks_F0(
mappedLinks, mappedLinks,
linksAndScores, linksAndScores,
extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE, extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
flags,
); );
if (filteredLinks.length === 0) { if (filteredLinks.length === 0) {
@ -89,7 +92,7 @@ export async function rerankLinks_F0(
.map((x) => mappedLinks.find((link) => link.url === x.link)) .map((x) => mappedLinks.find((link) => link.url === x.link))
.filter( .filter(
(x): x is MapDocument => (x): x is MapDocument =>
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
); );
} }
} }
@ -145,13 +148,14 @@ function filterAndProcessLinks_F0(
originalIndex: number; originalIndex: number;
}[], }[],
threshold: number, threshold: number,
flags: TeamFlags,
): MapDocument[] { ): MapDocument[] {
return linksAndScores return linksAndScores
.filter((x) => x.score > threshold) .filter((x) => x.score > threshold)
.map((x) => mappedLinks.find((link) => link.url === x.link)) .map((x) => mappedLinks.find((link) => link.url === x.link))
.filter( .filter(
(x): x is MapDocument => (x): x is MapDocument =>
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
); );
} }

View File

@ -1,4 +1,4 @@
import { MapDocument, URLTrace } from "../../../controllers/v1/types"; import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
import { getMapResults } from "../../../controllers/v1/map"; import { getMapResults } from "../../../controllers/v1/map";
import { removeDuplicateUrls } from "../../validateUrl"; import { removeDuplicateUrls } from "../../validateUrl";
import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
@ -9,6 +9,7 @@ import type { Logger } from "winston";
import { generateText } from "ai"; import { generateText } from "ai";
import { getModel } from "../../generic-ai"; import { getModel } from "../../generic-ai";
import { CostTracking } from "../extraction-service"; import { CostTracking } from "../extraction-service";
import { getACUCTeam } from "../../../controllers/auth";
export async function generateBasicCompletion_FO(prompt: string) { export async function generateBasicCompletion_FO(prompt: string) {
const { text } = await generateText({ const { text } = await generateText({
@ -34,6 +35,7 @@ export async function processUrl_F0(
urlTraces: URLTrace[], urlTraces: URLTrace[],
updateExtractCallback: (links: string[]) => void, updateExtractCallback: (links: string[]) => void,
logger: Logger, logger: Logger,
teamFlags: TeamFlags,
): Promise<string[]> { ): Promise<string[]> {
const trace: URLTrace = { const trace: URLTrace = {
url: options.url, url: options.url,
@ -45,7 +47,7 @@ export async function processUrl_F0(
urlTraces.push(trace); urlTraces.push(trace);
if (!options.url.includes("/*") && !options.allowExternalLinks) { if (!options.url.includes("/*") && !options.allowExternalLinks) {
if (!isUrlBlocked(options.url)) { if (!isUrlBlocked(options.url, teamFlags)) {
trace.usedInCompletion = true; trace.usedInCompletion = true;
return [options.url]; return [options.url];
} }
@ -85,6 +87,7 @@ export async function processUrl_F0(
ignoreSitemap: false, ignoreSitemap: false,
includeMetadata: true, includeMetadata: true,
includeSubdomains: options.includeSubdomains, includeSubdomains: options.includeSubdomains,
flags: teamFlags,
}); });
let mappedLinks = mapResults.mapResults as MapDocument[]; let mappedLinks = mapResults.mapResults as MapDocument[];
@ -121,6 +124,7 @@ export async function processUrl_F0(
ignoreSitemap: false, ignoreSitemap: false,
includeMetadata: true, includeMetadata: true,
includeSubdomains: options.includeSubdomains, includeSubdomains: options.includeSubdomains,
flags: teamFlags,
}); });
mappedLinks = retryMapResults.mapResults as MapDocument[]; mappedLinks = retryMapResults.mapResults as MapDocument[];

View File

@ -1,4 +1,4 @@
import { MapDocument, URLTrace } from "../../controllers/v1/types"; import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
import { performRanking } from "../ranker"; import { performRanking } from "../ranker";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
import { logger } from "../logger"; import { logger } from "../logger";
@ -57,6 +57,7 @@ export async function rerankLinks(
mappedLinks: MapDocument[], mappedLinks: MapDocument[],
searchQuery: string, searchQuery: string,
urlTraces: URLTrace[], urlTraces: URLTrace[],
flags: TeamFlags,
): Promise<MapDocument[]> { ): Promise<MapDocument[]> {
// console.log("Going to rerank links"); // console.log("Going to rerank links");
const mappedLinksRerank = mappedLinks.map( const mappedLinksRerank = mappedLinks.map(
@ -74,6 +75,7 @@ export async function rerankLinks(
mappedLinks, mappedLinks,
linksAndScores, linksAndScores,
extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE, extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
flags,
); );
// If we don't have enough high-quality links, try with lower threshold // If we don't have enough high-quality links, try with lower threshold
@ -85,6 +87,7 @@ export async function rerankLinks(
mappedLinks, mappedLinks,
linksAndScores, linksAndScores,
extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE, extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
flags,
); );
if (filteredLinks.length === 0) { if (filteredLinks.length === 0) {
@ -98,7 +101,7 @@ export async function rerankLinks(
.map((x) => mappedLinks.find((link) => link.url === x.link)) .map((x) => mappedLinks.find((link) => link.url === x.link))
.filter( .filter(
(x): x is MapDocument => (x): x is MapDocument =>
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
); );
} }
} }
@ -154,13 +157,14 @@ function filterAndProcessLinks(
originalIndex: number; originalIndex: number;
}[], }[],
threshold: number, threshold: number,
flags: TeamFlags,
): MapDocument[] { ): MapDocument[] {
return linksAndScores return linksAndScores
.filter((x) => x.score > threshold) .filter((x) => x.score > threshold)
.map((x) => mappedLinks.find((link) => link.url === x.link)) .map((x) => mappedLinks.find((link) => link.url === x.link))
.filter( .filter(
(x): x is MapDocument => (x): x is MapDocument =>
x !== undefined && x.url !== undefined && !isUrlBlocked(x.url), x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
); );
} }

View File

@ -1,4 +1,4 @@
import { MapDocument, URLTrace } from "../../controllers/v1/types"; import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
import { getMapResults } from "../../controllers/v1/map"; import { getMapResults } from "../../controllers/v1/map";
import { removeDuplicateUrls } from "../validateUrl"; import { removeDuplicateUrls } from "../validateUrl";
import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist"; import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
@ -93,6 +93,7 @@ export async function processUrl(
updateExtractCallback: (links: string[]) => void, updateExtractCallback: (links: string[]) => void,
logger: Logger, logger: Logger,
costTracking: CostTracking, costTracking: CostTracking,
teamFlags: TeamFlags,
): Promise<string[]> { ): Promise<string[]> {
const trace: URLTrace = { const trace: URLTrace = {
url: options.url, url: options.url,
@ -104,7 +105,7 @@ export async function processUrl(
urlTraces.push(trace); urlTraces.push(trace);
if (!options.url.includes("/*") && !options.allowExternalLinks) { if (!options.url.includes("/*") && !options.allowExternalLinks) {
if (!isUrlBlocked(options.url)) { if (!isUrlBlocked(options.url, teamFlags)) {
trace.usedInCompletion = true; trace.usedInCompletion = true;
return [options.url]; return [options.url];
} }
@ -144,6 +145,7 @@ export async function processUrl(
ignoreSitemap: false, ignoreSitemap: false,
includeMetadata: true, includeMetadata: true,
includeSubdomains: options.includeSubdomains, includeSubdomains: options.includeSubdomains,
flags: teamFlags,
}); });
let mappedLinks = mapResults.mapResults as MapDocument[]; let mappedLinks = mapResults.mapResults as MapDocument[];
@ -181,6 +183,7 @@ export async function processUrl(
ignoreSitemap: false, ignoreSitemap: false,
includeMetadata: true, includeMetadata: true,
includeSubdomains: options.includeSubdomains, includeSubdomains: options.includeSubdomains,
flags: teamFlags,
}); });
mappedLinks = retryMapResults.mapResults as MapDocument[]; mappedLinks = retryMapResults.mapResults as MapDocument[];

View File

@ -12,6 +12,7 @@ import { logJob } from "../../services/logging/log_job";
import { getModel } from "../generic-ai"; import { getModel } from "../generic-ai";
import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract"; import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
import { CostTracking } from "../extract/extraction-service"; import { CostTracking } from "../extract/extraction-service";
import { getACUCTeam } from "../../controllers/auth";
interface GenerateLLMsTextServiceOptions { interface GenerateLLMsTextServiceOptions {
generationId: string; generationId: string;
teamId: string; teamId: string;
@ -72,6 +73,7 @@ export async function performGenerateLlmsTxt(
teamId, teamId,
}); });
const costTracking = new CostTracking(); const costTracking = new CostTracking();
const acuc = await getACUCTeam(teamId);
try { try {
// Enforce max URL limit // Enforce max URL limit
@ -116,6 +118,7 @@ export async function performGenerateLlmsTxt(
includeSubdomains: false, includeSubdomains: false,
ignoreSitemap: false, ignoreSitemap: false,
includeMetadata: true, includeMetadata: true,
flags: acuc?.flags ?? null,
}); });
if (!mapResult || !mapResult.links) { if (!mapResult || !mapResult.links) {

View File

@ -147,8 +147,8 @@ function idempotencyMiddleware(
})().catch((err) => next(err)); })().catch((err) => next(err));
} }
function blocklistMiddleware(req: Request, res: Response, next: NextFunction) { function blocklistMiddleware(req: RequestWithACUC<any, any, any>, res: Response, next: NextFunction) {
if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) { if (typeof req.body.url === "string" && isUrlBlocked(req.body.url, req.acuc?.flags ?? null)) {
if (!res.headersSent) { if (!res.headersSent) {
return res.status(403).json({ return res.status(403).json({
success: false, success: false,
@ -267,6 +267,7 @@ v1Router.get(
v1Router.post( v1Router.post(
"/llmstxt", "/llmstxt",
authMiddleware(RateLimiterMode.Scrape), authMiddleware(RateLimiterMode.Scrape),
blocklistMiddleware,
wrap(generateLLMsTextController), wrap(generateLLMsTextController),
); );

View File

@ -53,22 +53,25 @@ describe("isUrlBlocked function", () => {
}); });
test("Blocks exact domain with and without protocol", () => { test("Blocks exact domain with and without protocol", () => {
expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey), null)).toBe(
true, true,
); );
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey), decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
null,
), ),
).toBe(true); ).toBe(true);
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey), decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
null,
), ),
).toBe(true); ).toBe(true);
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey), decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
null,
), ),
).toBe(true); ).toBe(true);
}); });
@ -77,53 +80,57 @@ describe("isUrlBlocked function", () => {
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey), decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
null,
), ),
).toBe(true); ).toBe(true);
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey), decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
null,
), ),
).toBe(true); ).toBe(true);
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey), decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
null,
), ),
).toBe(true); ).toBe(true);
}); });
test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => { test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey), null)).toBe(
true, true,
); );
expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey), null)).toBe(
true, true,
); );
expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey), null)).toBe(
true, true,
); );
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey), decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
null,
), ),
).toBe(true); ).toBe(true);
}); });
test("Allows unrelated domains like whateverfacebook.com", () => { test("Allows unrelated domains like whateverfacebook.com", () => {
expect(isUrlBlocked("whateverfacebook.com")).toBe(false); expect(isUrlBlocked("whateverfacebook.com", null)).toBe(false);
expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false); expect(isUrlBlocked("https://whateverfacebook.com", null)).toBe(false);
}); });
test("Blocks other domains from the blocklist", () => { test("Blocks other domains from the blocklist", () => {
expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey), null)).toBe(
true, true,
); );
expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey), null)).toBe(
true, true,
); );
expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey), null)).toBe(
true, true,
); );
expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe( expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey), null)).toBe(
true, true,
); );
}); });
@ -135,23 +142,34 @@ describe("isUrlBlocked function", () => {
"4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB", "4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
hashKey, hashKey,
), ),
null,
), ),
).toBe(false); ).toBe(false);
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey), decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
null,
), ),
).toBe(false); ).toBe(false);
expect( expect(
isUrlBlocked( isUrlBlocked(
decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey), decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
null,
), ),
).toBe(false); ).toBe(false);
}); });
test("Should return false if the URL is invalid", () => { test("Should return false if the URL is invalid", () => {
expect(isUrlBlocked("randomstring")).toBe(false); expect(isUrlBlocked("randomstring", null)).toBe(false);
expect(isUrlBlocked("htp://bad.url")).toBe(false); expect(isUrlBlocked("htp://bad.url", null)).toBe(false);
expect(isUrlBlocked("")).toBe(false); expect(isUrlBlocked("", null)).toBe(false);
});
test("Should respect flags", () => {
const decryptedDomain = decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey);
expect(isUrlBlocked(decryptedDomain, {
unblockedDomains: [decryptedDomain],
})).toBe(false);
}); });
}); });

View File

@ -1,6 +1,7 @@
import { configDotenv } from "dotenv"; import { configDotenv } from "dotenv";
import crypto from "crypto"; import crypto from "crypto";
import { parse } from "tldts"; import { parse } from "tldts";
import { TeamFlags } from "../../../controllers/v1/types";
configDotenv(); configDotenv();
@ -101,10 +102,15 @@ export function decryptedBlocklist(list: string[]): string[] {
: []; : [];
} }
export function isUrlBlocked(url: string): boolean { export function isUrlBlocked(url: string, flags: TeamFlags): boolean {
const lowerCaseUrl = url.trim().toLowerCase(); const lowerCaseUrl = url.trim().toLowerCase();
const blockedlist = decryptedBlocklist(urlBlocklist); let blockedlist = decryptedBlocklist(urlBlocklist);
if (flags?.unblockedDomains) {
blockedlist = blockedlist.filter((blocked) => !flags.unblockedDomains!.includes(blocked));
}
const decryptedUrl = const decryptedUrl =
blockedlist.find((decrypted) => lowerCaseUrl === decrypted) || blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
lowerCaseUrl; lowerCaseUrl;

View File

@ -80,6 +80,7 @@ import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt
import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis"; import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0"; import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
import { CostTracking } from "../lib/extract/extraction-service"; import { CostTracking } from "../lib/extract/extraction-service";
import { getACUCTeam } from "../controllers/auth";
configDotenv(); configDotenv();
@ -144,6 +145,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
const crawler = crawlToCrawler( const crawler = crawlToCrawler(
job.data.crawl_id, job.data.crawl_id,
sc, sc,
(await getACUCTeam(job.data.team_id))?.flags ?? null,
sc.originUrl!, sc.originUrl!,
job.data.crawlerOptions, job.data.crawlerOptions,
); );
@ -871,7 +873,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
try { try {
const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl; const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
const crawler = crawlToCrawler(job.data.crawl_id, sc); const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
logger.debug("Locking URL..."); logger.debug("Locking URL...");
await lockURL(job.data.crawl_id, sc, job.data.url); await lockURL(job.data.crawl_id, sc, job.data.url);
@ -1135,7 +1137,7 @@ async function processJob(job: Job & { id: string }, token: string) {
normalizeURL(doc.metadata.sourceURL, sc) && normalizeURL(doc.metadata.sourceURL, sc) &&
job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
) { ) {
const crawler = crawlToCrawler(job.data.crawl_id, sc); const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
if ( if (
crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) === crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
null && null &&
@ -1160,7 +1162,7 @@ async function processJob(job: Job & { id: string }, token: string) {
await saveCrawl(job.data.crawl_id, sc); await saveCrawl(job.data.crawl_id, sc);
} }
if (isUrlBlocked(doc.metadata.url)) { if (isUrlBlocked(doc.metadata.url, (await getACUCTeam(job.data.team_id))?.flags ?? null)) {
throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
} }
@ -1219,6 +1221,7 @@ async function processJob(job: Job & { id: string }, token: string) {
const crawler = crawlToCrawler( const crawler = crawlToCrawler(
job.data.crawl_id, job.data.crawl_id,
sc, sc,
(await getACUCTeam(job.data.team_id))?.flags ?? null,
doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!, doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
job.data.crawlerOptions, job.data.crawlerOptions,
); );