Mirror of https://github.com/mendableai/firecrawl
feat(acuc): propagate team flags (FIR-1879) (#1522)

* feat(acuc): propagate team flags
* feat(flags): further functionality

parent 017a915ae8
commit fa581995e6
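In short, this commit threads a per-team flags object from the credit-usage chunk (ACUC) through every controller and service that checks the URL blocklist or builds a crawler. A minimal sketch of the pattern repeated throughout the diff — the handler and import paths below are illustrative, not a file from this commit; `TeamFlags`, `isUrlBlocked`, `crawlToCrawler`, and `StoredCrawl` are the real names:

// Hedged sketch of the propagation pattern, using the shapes defined later in this diff.
import { TeamFlags } from "./controllers/v1/types";
import { isUrlBlocked } from "./scraper/WebScraper/utils/blocklist";
import { crawlToCrawler, StoredCrawl } from "./lib/crawl-redis";

function startCrawl(id: string, sc: StoredCrawl, url: string, flags: TeamFlags) {
  // Every blocklist check now receives the team's flags (or null)...
  if (isUrlBlocked(url, flags)) {
    throw new Error("URL is blocked for this team");
  }
  // ...and so does the crawler factory, which consults flags?.ignoreRobots.
  return crawlToCrawler(id, sc, flags);
}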
@@ -103,6 +103,7 @@ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsage
     planModifier: 0.1,
   },
   concurrency: is_extract ? 200 : 2,
+  flags: null,
   is_extract,
 });

@@ -137,6 +138,7 @@ const mockACUC: () => AuthCreditUsageChunk = () => ({
     planModifier: 0.1,
   },
   concurrency: 99999999,
+  flags: null,
   is_extract: false,
 });

@@ -181,7 +183,7 @@ export async function getACUC(
     const client =
       Math.random() > (2/3) ? supabase_rr_service : supabase_service;
     ({ data, error } = await client.rpc(
-      "auth_credit_usage_chunk_30",
+      "auth_credit_usage_chunk_32",
      { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true },
      { get: true },
    ));

@@ -298,7 +300,7 @@ export async function getACUCTeam(
     const client =
       Math.random() > (2/3) ? supabase_rr_service : supabase_service;
     ({ data, error } = await client.rpc(
-      "auth_credit_usage_chunk_30_from_team",
+      "auth_credit_usage_chunk_32_from_team",
      { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true },
      { get: true },
    ));
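Note that both RPC call sites move from auth_credit_usage_chunk_30 to auth_credit_usage_chunk_32 — presumably the database function was re-versioned so the returned chunk carries the new flags column, though the SQL migration itself is not part of this diff.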
@@ -115,7 +115,7 @@ export async function crawlController(req: Request, res: Response) {
       .json({ error: e.message ?? e });
   }

-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
     return res.status(403).json({
       error: BLOCKLISTED_URL_MESSAGE,
     });

@@ -173,7 +173,7 @@ export async function crawlController(req: Request, res: Response) {
     createdAt: Date.now(),
   };

-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);

   try {
     sc.robots = await crawler.getRobotsTxt();

@@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
       .json({ error: e.message ?? e });
   }

-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
     return res.status(403).json({
       error: BLOCKLISTED_URL_MESSAGE,
     });

@@ -112,7 +112,7 @@ export async function crawlPreviewController(req: Request, res: Response) {

   await saveCrawl(id, sc);

-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);

   await finishCrawlKickoff(id);
@@ -9,6 +9,7 @@ import { RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import {
   fromLegacyCombo,
+  TeamFlags,
   toLegacyDocument,
   url as urlSchema,
 } from "../v1/types";

@@ -40,6 +41,7 @@ export async function scrapeHelper(
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions,
   timeout: number,
+  flags: TeamFlags,
 ): Promise<{
   success: boolean;
   error?: string;

@@ -51,7 +53,7 @@ export async function scrapeHelper(
     return { success: false, error: "Url is required", returnCode: 400 };
   }

-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, flags)) {
     return {
       success: false,
       error: BLOCKLISTED_URL_MESSAGE,

@@ -241,6 +243,7 @@ export async function scrapeController(req: Request, res: Response) {
     pageOptions,
     extractorOptions,
     timeout,
+    chunk?.flags ?? null,
   );
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -20,6 +20,7 @@ import {
   Document,
   fromLegacyCombo,
   fromLegacyScrapeOptions,
+  TeamFlags,
   toLegacyDocument,
 } from "../v1/types";
 import { getJobFromGCS } from "../../lib/gcs-jobs";

@@ -32,6 +33,7 @@ export async function searchHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   searchOptions: SearchOptions,
+  flags: TeamFlags,
 ): Promise<{
   success: boolean;
   error?: string;

@@ -85,7 +87,7 @@ export async function searchHelper(
     return { success: true, data: res, returnCode: 200 };
   }

-  res = res.filter((r) => !isUrlBlocked(r.url));
+  res = res.filter((r) => !isUrlBlocked(r.url, flags));
   if (res.length > num_results) {
     res = res.slice(0, num_results);
   }

@@ -202,6 +204,7 @@ export async function searchController(req: Request, res: Response) {
     crawlerOptions,
     pageOptions,
     searchOptions,
+    chunk?.flags ?? null,
   );
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;
@@ -23,6 +23,8 @@ import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
 import { CostTracking } from "../../lib/extract/extraction-service";
+import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

 export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,

@@ -54,11 +56,24 @@ export async function batchScrapeController(
     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
+        if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
           urls.push(nu);
+        } else {
+          invalidURLs.push(u);
+        }
       } catch (_) {
         invalidURLs.push(u);
       }
     }
+  } else {
+    if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+      if (!res.headersSent) {
+        return res.status(403).json({
+          success: false,
+          error: BLOCKLISTED_URL_MESSAGE,
+        });
+      }
+    }
   }

   logger.debug("Batch scrape " + id + " starting", {
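So batch scrape now enforces the blocklist in both of its modes: when the request tolerates invalid URLs (the pendingURLs branch), a blocked URL is quietly routed into invalidURLs alongside unparseable ones; otherwise a single blocked URL rejects the whole request. The strict-mode outcome, distilled from the hunk above (BLOCKLISTED_URL_MESSAGE is a constant from ../../lib/strings):

// Not new behavior beyond the hunk — just the rejection shape a client sees.
res.status(403).json({ success: false, error: BLOCKLISTED_URL_MESSAGE });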
@@ -89,7 +89,7 @@ export async function crawlController(
     createdAt: Date.now(),
   };

-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, req.acuc.flags ?? null);

   try {
     sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);
@@ -11,6 +11,8 @@ import { saveExtract } from "../../lib/extract/extract-redis";
 import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
 import { performExtraction } from "../../lib/extract/extraction-service";
 import { performExtraction_F0 } from "../../lib/extract/fire-0/extraction-service-f0";
+import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

 export async function oldExtract(
   req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,

@@ -58,6 +60,15 @@ export async function extractController(
   const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
   req.body = extractRequestSchema.parse(req.body);

+  if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+    if (!res.headersSent) {
+      return res.status(403).json({
+        success: false,
+        error: BLOCKLISTED_URL_MESSAGE,
+      });
+    }
+  }
+
   const extractId = crypto.randomUUID();
   const jobData = {
     request: req.body,
@@ -5,6 +5,7 @@ import {
   mapRequestSchema,
   RequestWithAuth,
   scrapeOptions,
+  TeamFlags,
   TimeoutSignal,
 } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";

@@ -56,6 +57,7 @@ export async function getMapResults({
   abort = new AbortController().signal, // noop
   mock,
   filterByPath = true,
+  flags,
 }: {
   url: string;
   search?: string;

@@ -70,6 +72,7 @@ export async function getMapResults({
   abort?: AbortSignal;
   mock?: string;
   filterByPath?: boolean;
+  flags: TeamFlags;
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];

@@ -88,7 +91,7 @@ export async function getMapResults({
     createdAt: Date.now(),
   };

-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, flags);

   try {
     sc.robots = await crawler.getRobotsTxt(false, abort);

@@ -322,6 +325,7 @@ export async function mapController(
       abort: abort.signal,
       mock: req.body.useMock,
       filterByPath: req.body.filterByPath !== false,
+      flags: req.acuc.flags ?? null,
     }),
     ...(req.body.timeout !== undefined ? [
       new Promise((resolve, reject) => setTimeout(() => {
@@ -6,6 +6,7 @@ import {
   SearchResponse,
   searchRequestSchema,
   ScrapeOptions,
+  TeamFlags,
 } from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from "uuid";

@@ -34,6 +35,7 @@ export async function searchAndScrapeSearchResult(
   },
   logger: Logger,
   costTracking: CostTracking,
+  flags: TeamFlags,
 ): Promise<Document[]> {
   try {
     const searchResults = await search({

@@ -51,7 +53,8 @@ export async function searchAndScrapeSearchResult(
         },
         options,
         logger,
-        costTracking
+        costTracking,
+        flags
       )
     )
   );

@@ -72,6 +75,7 @@ async function scrapeSearchResult(
   },
   logger: Logger,
   costTracking: CostTracking,
+  flags: TeamFlags,
 ): Promise<Document> {
   const jobId = uuidv4();
   const jobPriority = await getJobPriority({

@@ -80,7 +84,7 @@ async function scrapeSearchResult(
   });

   try {
-    if (isUrlBlocked(searchResult.url)) {
+    if (isUrlBlocked(searchResult.url, flags)) {
       throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
     }
     logger.info("Adding scrape job", {

@@ -220,7 +224,7 @@ export async function searchController(
       origin: req.body.origin,
       timeout: req.body.timeout,
       scrapeOptions: req.body.scrapeOptions,
-    }, logger, costTracking),
+    }, logger, costTracking, req.acuc?.flags ?? null),
   );

   const docs = await Promise.all(scrapePromises);
@@ -1,6 +1,5 @@
 import { Request, Response } from "express";
 import { z } from "zod";
-import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 import { countries } from "../../lib/validate-country";
 import {

@@ -10,7 +9,6 @@ import {
   Document as V0Document,
 } from "../../lib/entities";
 import { InternalOptions } from "../../scraper/scrapeURL";
-import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";

 export type Format =
   | "markdown"

@@ -49,7 +47,7 @@ export const url = z.preprocess(
       return false;
     }
   }, "Invalid URL")
-    .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
+    // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
 );

 const strictMessage =

@@ -914,11 +912,17 @@ export type AuthCreditUsageChunk = {
     scrapeAgentPreview?: number;
   };
   concurrency: number;
+  flags: TeamFlags;

   // appended on JS-side
   is_extract?: boolean;
 };

+export type TeamFlags = {
+  ignoreRobots?: boolean;
+  unblockedDomains?: string[];
+} | null;
+
 export type AuthCreditUsageChunkFromTeam = Omit<AuthCreditUsageChunk, "api_key">;

 export interface RequestWithMaybeACUC<
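Since TeamFlags is nullable, every caller passes either a flags object or null; both shapes, with illustrative values (the domain string is a placeholder):

// Illustrative values only — the shape comes from the TeamFlags type above.
const noFlags: TeamFlags = null; // the common case: team has no special flags
const permissive: TeamFlags = {
  ignoreRobots: true,                    // crawler may skip robots.txt
  unblockedDomains: ["allowed.example"], // exempted from the static blocklist
};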
@@ -1,5 +1,5 @@
 import { InternalOptions } from "../scraper/scrapeURL";
-import { ScrapeOptions } from "../controllers/v1/types";
+import { ScrapeOptions, TeamFlags } from "../controllers/v1/types";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import { redisConnection } from "../services/queue-service";
 import { logger as _logger } from "./logger";

@@ -383,6 +383,7 @@ export async function lockURLsIndividually(
 export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
+  teamFlags: TeamFlags,
   newBase?: string,
   crawlerOptions?: any,
 ): WebCrawler {

@@ -403,7 +404,7 @@ export function crawlToCrawler(
     allowExternalContentLinks:
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
-    ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    ignoreRobotsTxt: teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false,
     regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
     maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
     currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,
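The ignoreRobotsTxt line above chains nullish coalescing, so the team flag wins whenever it is defined at all — even when explicitly false, because ?? only skips null and undefined. A worked example (hypothetical values):

// teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false
// { ignoreRobots: true },  any crawler option        → true
// { ignoreRobots: false }, { ignoreRobotsTxt: true } → false (team flag overrides)
// null,                    { ignoreRobotsTxt: true } → true  (falls through to the crawl option)
// null,                    {}                        → false (final default)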
@@ -6,7 +6,7 @@ import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
 import { ExtractOptions } from "../../controllers/v1/types";
 import { CostTracking } from "../extract/extraction-service";
-
+import { getACUCTeam } from "../../controllers/auth";
 interface DeepResearchServiceOptions {
   researchId: string;
   teamId: string;

@@ -45,6 +45,8 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
   );
   const llmService = new ResearchLLMService(logger);

+  const acuc = await getACUCTeam(teamId);
+
   try {
     while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) {
       logger.debug("[Deep Research] Current depth:", state.getCurrentDepth());

@@ -112,7 +114,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           fastMode: false,
           blockAds: false,
         },
-      }, logger, costTracking);
+      }, logger, costTracking, acuc?.flags ?? null);
       return response.length > 0 ? response : [];
     });
@@ -36,7 +36,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
 import { normalizeUrl } from "../canonical-url";
 import { search } from "../../search";
 import { buildRephraseToSerpPrompt } from "./build-prompts";
-
+import { getACUCTeam } from "../../controllers/auth";
 interface ExtractServiceOptions {
   request: ExtractRequest;
   teamId: string;

@@ -134,6 +134,7 @@ export async function performExtraction(
   let sources: Record<string, string[]> = {};

   let costTracking = new CostTracking(subId ? null : 1.5);
+  const acuc = await getACUCTeam(teamId);

   let log = {
     extractId,

@@ -323,6 +324,7 @@ export async function performExtraction(
         },
         logger.child({ module: "extract", method: "processUrl", url }),
         costTracking,
+        acuc?.flags ?? null,
       ),
     );
@@ -32,6 +32,7 @@ import { mixSchemaObjects_F0 } from "./helpers/mix-schema-objs-f0";
 import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0";
 import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0";
 import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
+import { getACUCTeam } from "../../../controllers/auth";


 interface ExtractServiceOptions {

@@ -78,6 +79,8 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};

+  const acuc = await getACUCTeam(teamId);
+
   const logger = _logger.child({
     module: "extract",

@@ -174,6 +177,7 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
       });
     },
     logger.child({ module: "extract", method: "processUrl", url }),
+    acuc?.flags ?? null,
   ),
 );
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
 import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../../logger";
 import { CohereClient } from "cohere-ai";

@@ -48,6 +48,7 @@ export async function rerankLinks_F0(
   mappedLinks: MapDocument[],
   searchQuery: string,
   urlTraces: URLTrace[],
+  flags: TeamFlags,
 ): Promise<MapDocument[]> {
   // console.log("Going to rerank links");
   const mappedLinksRerank = mappedLinks.map(

@@ -65,6 +66,7 @@ export async function rerankLinks_F0(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );

   // If we don't have enough high-quality links, try with lower threshold

@@ -76,6 +78,7 @@ export async function rerankLinks_F0(
       mappedLinks,
       linksAndScores,
       extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+      flags,
     );

     if (filteredLinks.length === 0) {

@@ -89,7 +92,7 @@ export async function rerankLinks_F0(
         .map((x) => mappedLinks.find((link) => link.url === x.link))
         .filter(
           (x): x is MapDocument =>
-            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
         );
     }
   }

@@ -145,13 +148,14 @@ function filterAndProcessLinks_F0(
     originalIndex: number;
   }[],
   threshold: number,
+  flags: TeamFlags,
 ): MapDocument[] {
   return linksAndScores
     .filter((x) => x.score > threshold)
     .map((x) => mappedLinks.find((link) => link.url === x.link))
     .filter(
       (x): x is MapDocument =>
-        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
     );
 }
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
 import { getMapResults } from "../../../controllers/v1/map";
 import { removeDuplicateUrls } from "../../validateUrl";
 import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";

@@ -9,6 +9,7 @@ import type { Logger } from "winston";
 import { generateText } from "ai";
 import { getModel } from "../../generic-ai";
 import { CostTracking } from "../extraction-service";
+import { getACUCTeam } from "../../../controllers/auth";

 export async function generateBasicCompletion_FO(prompt: string) {
   const { text } = await generateText({

@@ -34,6 +35,7 @@ export async function processUrl_F0(
   urlTraces: URLTrace[],
   updateExtractCallback: (links: string[]) => void,
   logger: Logger,
+  teamFlags: TeamFlags,
 ): Promise<string[]> {
   const trace: URLTrace = {
     url: options.url,

@@ -45,7 +47,7 @@ export async function processUrl_F0(
   urlTraces.push(trace);

   if (!options.url.includes("/*") && !options.allowExternalLinks) {
-    if (!isUrlBlocked(options.url)) {
+    if (!isUrlBlocked(options.url, teamFlags)) {
       trace.usedInCompletion = true;
       return [options.url];
     }

@@ -85,6 +87,7 @@ export async function processUrl_F0(
     ignoreSitemap: false,
     includeMetadata: true,
     includeSubdomains: options.includeSubdomains,
+    flags: teamFlags,
   });

   let mappedLinks = mapResults.mapResults as MapDocument[];

@@ -121,6 +124,7 @@ export async function processUrl_F0(
       ignoreSitemap: false,
       includeMetadata: true,
       includeSubdomains: options.includeSubdomains,
+      flags: teamFlags,
     });

     mappedLinks = retryMapResults.mapResults as MapDocument[];
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
 import { performRanking } from "../ranker";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../logger";

@@ -57,6 +57,7 @@ export async function rerankLinks(
   mappedLinks: MapDocument[],
   searchQuery: string,
   urlTraces: URLTrace[],
+  flags: TeamFlags,
 ): Promise<MapDocument[]> {
   // console.log("Going to rerank links");
   const mappedLinksRerank = mappedLinks.map(

@@ -74,6 +75,7 @@ export async function rerankLinks(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );

   // If we don't have enough high-quality links, try with lower threshold

@@ -85,6 +87,7 @@ export async function rerankLinks(
       mappedLinks,
       linksAndScores,
       extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+      flags,
     );

     if (filteredLinks.length === 0) {

@@ -98,7 +101,7 @@ export async function rerankLinks(
         .map((x) => mappedLinks.find((link) => link.url === x.link))
         .filter(
           (x): x is MapDocument =>
-            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+            x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
         );
     }
   }

@@ -154,13 +157,14 @@ function filterAndProcessLinks(
     originalIndex: number;
   }[],
   threshold: number,
+  flags: TeamFlags,
 ): MapDocument[] {
   return linksAndScores
     .filter((x) => x.score > threshold)
     .map((x) => mappedLinks.find((link) => link.url === x.link))
     .filter(
       (x): x is MapDocument =>
-        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
     );
 }
@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
 import { getMapResults } from "../../controllers/v1/map";
 import { removeDuplicateUrls } from "../validateUrl";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";

@@ -93,6 +93,7 @@ export async function processUrl(
   updateExtractCallback: (links: string[]) => void,
   logger: Logger,
   costTracking: CostTracking,
+  teamFlags: TeamFlags,
 ): Promise<string[]> {
   const trace: URLTrace = {
     url: options.url,

@@ -104,7 +105,7 @@ export async function processUrl(
   urlTraces.push(trace);

   if (!options.url.includes("/*") && !options.allowExternalLinks) {
-    if (!isUrlBlocked(options.url)) {
+    if (!isUrlBlocked(options.url, teamFlags)) {
       trace.usedInCompletion = true;
       return [options.url];
     }

@@ -144,6 +145,7 @@ export async function processUrl(
     ignoreSitemap: false,
     includeMetadata: true,
     includeSubdomains: options.includeSubdomains,
+    flags: teamFlags,
   });

   let mappedLinks = mapResults.mapResults as MapDocument[];

@@ -181,6 +183,7 @@ export async function processUrl(
      ignoreSitemap: false,
      includeMetadata: true,
      includeSubdomains: options.includeSubdomains,
+     flags: teamFlags,
    });

    mappedLinks = retryMapResults.mapResults as MapDocument[];
@@ -12,6 +12,7 @@ import { logJob } from "../../services/logging/log_job";
 import { getModel } from "../generic-ai";
 import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { CostTracking } from "../extract/extraction-service";
+import { getACUCTeam } from "../../controllers/auth";
 interface GenerateLLMsTextServiceOptions {
   generationId: string;
   teamId: string;

@@ -72,6 +73,7 @@ export async function performGenerateLlmsTxt(
     teamId,
   });
   const costTracking = new CostTracking();
+  const acuc = await getACUCTeam(teamId);

   try {
     // Enforce max URL limit

@@ -116,6 +118,7 @@ export async function performGenerateLlmsTxt(
     includeSubdomains: false,
     ignoreSitemap: false,
     includeMetadata: true,
+    flags: acuc?.flags ?? null,
   });

   if (!mapResult || !mapResult.links) {
@@ -147,8 +147,8 @@ function idempotencyMiddleware(
   })().catch((err) => next(err));
 }

-function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
-  if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
+function blocklistMiddleware(req: RequestWithACUC<any, any, any>, res: Response, next: NextFunction) {
+  if (typeof req.body.url === "string" && isUrlBlocked(req.body.url, req.acuc?.flags ?? null)) {
     if (!res.headersSent) {
       return res.status(403).json({
         success: false,

@@ -267,6 +267,7 @@ v1Router.get(
 v1Router.post(
   "/llmstxt",
   authMiddleware(RateLimiterMode.Scrape),
+  blocklistMiddleware,
   wrap(generateLLMsTextController),
 );
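Because blocklistMiddleware now reads req.acuc?.flags, its request type widens from Request to RequestWithACUC, and it only makes sense after authMiddleware (which presumably populates req.acuc) — the ordering used on the /llmstxt route above. Sketched with comments, same names as the hunk:

v1Router.post(
  "/llmstxt",
  authMiddleware(RateLimiterMode.Scrape), // presumably populates req.acuc
  blocklistMiddleware,                    // may then consult req.acuc?.flags
  wrap(generateLLMsTextController),
);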
@@ -53,22 +53,25 @@ describe("isUrlBlocked function", () => {
   });

   test("Blocks exact domain with and without protocol", () => {
-    expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey), null)).toBe(
       true,
     );
     expect(
       isUrlBlocked(
         decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
       ),
     ).toBe(true);
   });

@@ -77,53 +80,57 @@ describe("isUrlBlocked function", () => {
     expect(
       isUrlBlocked(
         decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
+        null,
       ),
     ).toBe(true);
   });

   test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
-    expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey), null)).toBe(
       true,
     );
     expect(
       isUrlBlocked(
         decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
+        null,
       ),
     ).toBe(true);
   });

   test("Allows unrelated domains like whateverfacebook.com", () => {
-    expect(isUrlBlocked("whateverfacebook.com")).toBe(false);
-    expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false);
+    expect(isUrlBlocked("whateverfacebook.com", null)).toBe(false);
+    expect(isUrlBlocked("https://whateverfacebook.com", null)).toBe(false);
   });

   test("Blocks other domains from the blocklist", () => {
-    expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey), null)).toBe(
       true,
     );
   });

@@ -135,23 +142,34 @@ describe("isUrlBlocked function", () => {
         "4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
         hashKey,
       ),
+      null,
     ),
   ).toBe(false);
   expect(
     isUrlBlocked(
       decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
+      null,
     ),
   ).toBe(false);
   expect(
     isUrlBlocked(
       decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
+      null,
     ),
   ).toBe(false);
 });

 test("Should return false if the URL is invalid", () => {
-    expect(isUrlBlocked("randomstring")).toBe(false);
-    expect(isUrlBlocked("htp://bad.url")).toBe(false);
-    expect(isUrlBlocked("")).toBe(false);
+    expect(isUrlBlocked("randomstring", null)).toBe(false);
+    expect(isUrlBlocked("htp://bad.url", null)).toBe(false);
+    expect(isUrlBlocked("", null)).toBe(false);
+  });
+
+  test("Should respect flags", () => {
+    const decryptedDomain = decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey);
+
+    expect(isUrlBlocked(decryptedDomain, {
+      unblockedDomains: [decryptedDomain],
+    })).toBe(false);
   });
 });
@@ -1,6 +1,7 @@
 import { configDotenv } from "dotenv";
 import crypto from "crypto";
 import { parse } from "tldts";
+import { TeamFlags } from "../../../controllers/v1/types";

 configDotenv();

@@ -101,10 +102,15 @@ export function decryptedBlocklist(list: string[]): string[] {
     : [];
 }

-export function isUrlBlocked(url: string): boolean {
+export function isUrlBlocked(url: string, flags: TeamFlags): boolean {
   const lowerCaseUrl = url.trim().toLowerCase();

-  const blockedlist = decryptedBlocklist(urlBlocklist);
+  let blockedlist = decryptedBlocklist(urlBlocklist);
+
+  if (flags?.unblockedDomains) {
+    blockedlist = blockedlist.filter((blocked) => !flags.unblockedDomains!.includes(blocked));
+  }

   const decryptedUrl =
     blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
     lowerCaseUrl;
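A quick sketch of the new parameter's effect — the domain is a placeholder, since real blocklist entries are AES-encrypted in the source:

// Assuming "blocked.example" were on the decrypted blocklist:
isUrlBlocked("https://blocked.example", null); // true — no flags, default behavior
isUrlBlocked("https://blocked.example", {
  unblockedDomains: ["blocked.example"],       // team-level exemption
});                                            // false — the entry is filtered out first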
@@ -80,6 +80,7 @@ import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt
 import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
 import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
 import { CostTracking } from "../lib/extract/extraction-service";
+import { getACUCTeam } from "../controllers/auth";

 configDotenv();

@@ -144,6 +145,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
   const crawler = crawlToCrawler(
     job.data.crawl_id,
     sc,
+    (await getACUCTeam(job.data.team_id))?.flags ?? null,
     sc.originUrl!,
     job.data.crawlerOptions,
   );

@@ -871,7 +873,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {

   try {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
-    const crawler = crawlToCrawler(job.data.crawl_id, sc);
+    const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);

     logger.debug("Locking URL...");
     await lockURL(job.data.crawl_id, sc, job.data.url);

@@ -1135,7 +1137,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       normalizeURL(doc.metadata.sourceURL, sc) &&
       job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
     ) {
-      const crawler = crawlToCrawler(job.data.crawl_id, sc);
+      const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
       if (
         crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
           null &&

@@ -1160,7 +1162,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       await saveCrawl(job.data.crawl_id, sc);
     }

-    if (isUrlBlocked(doc.metadata.url)) {
+    if (isUrlBlocked(doc.metadata.url, (await getACUCTeam(job.data.team_id))?.flags ?? null)) {
       throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
     }

@@ -1219,6 +1221,7 @@ async function processJob(job: Job & { id: string }, token: string) {
     const crawler = crawlToCrawler(
       job.data.crawl_id,
       sc,
+      (await getACUCTeam(job.data.team_id))?.flags ?? null,
       doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
       job.data.crawlerOptions,
     );