feat(acuc): propagate team flags (FIR-1879) (#1522)
* feat(acuc): propagate team flags
* feat(flags): further functionality
parent 017a915ae8
commit fa581995e6
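In short, the team's AuthCreditUsageChunk now carries a `flags: TeamFlags` field, and the blocklist and robots checks read those flags instead of purely global settings. Below is a minimal sketch of the new shapes and of the `chunk?.flags ?? null` pattern used throughout the diff; the helper and sample values are illustrative only, not part of the codebase:

```typescript
// TeamFlags and the flags field are copied from the v1 types added in this diff.
type TeamFlags = {
  ignoreRobots?: boolean;
  unblockedDomains?: string[];
} | null;

type AuthCreditUsageChunk = {
  concurrency: number;
  flags: TeamFlags; // new field (the backing RPC is bumped to auth_credit_usage_chunk_32)
  is_extract?: boolean;
  // other fields of the real chunk omitted here
};

// Illustrative helper: every controller in this change reads flags off the
// chunk when one is available and otherwise falls back to null.
function resolveFlags(chunk: AuthCreditUsageChunk | null | undefined): TeamFlags {
  return chunk?.flags ?? null;
}

const chunk: AuthCreditUsageChunk = {
  concurrency: 2,
  flags: { ignoreRobots: true, unblockedDomains: ["example.com"] },
};
console.log(resolveFlags(chunk));     // { ignoreRobots: true, unblockedDomains: ["example.com"] }
console.log(resolveFlags(undefined)); // null
```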
@@ -103,6 +103,7 @@ const mockPreviewACUC: (team_id: string, is_extract: boolean) => AuthCreditUsage
     planModifier: 0.1,
   },
   concurrency: is_extract ? 200 : 2,
+  flags: null,
   is_extract,
 });
 
@@ -137,6 +138,7 @@ const mockACUC: () => AuthCreditUsageChunk = () => ({
     planModifier: 0.1,
   },
   concurrency: 99999999,
+  flags: null,
   is_extract: false,
 });
 
@@ -181,7 +183,7 @@ export async function getACUC(
     const client =
       Math.random() > (2/3) ? supabase_rr_service : supabase_service;
     ({ data, error } = await client.rpc(
-      "auth_credit_usage_chunk_30",
+      "auth_credit_usage_chunk_32",
       { input_key: api_key, i_is_extract: isExtract, tally_untallied_credits: true },
       { get: true },
     ));
@@ -298,7 +300,7 @@ export async function getACUCTeam(
     const client =
       Math.random() > (2/3) ? supabase_rr_service : supabase_service;
     ({ data, error } = await client.rpc(
-      "auth_credit_usage_chunk_30_from_team",
+      "auth_credit_usage_chunk_32_from_team",
      { input_team: team_id, i_is_extract: isExtract, tally_untallied_credits: true },
      { get: true },
    ));

@@ -115,7 +115,7 @@ export async function crawlController(req: Request, res: Response) {
       .json({ error: e.message ?? e });
   }
 
-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
     return res.status(403).json({
       error: BLOCKLISTED_URL_MESSAGE,
     });
@@ -173,7 +173,7 @@ export async function crawlController(req: Request, res: Response) {
     createdAt: Date.now(),
   };
 
-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
 
   try {
     sc.robots = await crawler.getRobotsTxt();

@@ -43,7 +43,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
       .json({ error: e.message ?? e });
   }
 
-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, auth.chunk?.flags ?? null)) {
     return res.status(403).json({
       error: BLOCKLISTED_URL_MESSAGE,
     });
@@ -112,7 +112,7 @@ export async function crawlPreviewController(req: Request, res: Response) {
 
   await saveCrawl(id, sc);
 
-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, auth.chunk?.flags ?? null);
 
   await finishCrawlKickoff(id);
 

@@ -9,6 +9,7 @@ import { RateLimiterMode } from "../../types";
 import { logJob } from "../../services/logging/log_job";
 import {
   fromLegacyCombo,
+  TeamFlags,
   toLegacyDocument,
   url as urlSchema,
 } from "../v1/types";
@@ -40,6 +41,7 @@ export async function scrapeHelper(
   pageOptions: PageOptions,
   extractorOptions: ExtractorOptions,
   timeout: number,
+  flags: TeamFlags,
 ): Promise<{
   success: boolean;
   error?: string;
@@ -51,7 +53,7 @@ export async function scrapeHelper(
     return { success: false, error: "Url is required", returnCode: 400 };
   }
 
-  if (isUrlBlocked(url)) {
+  if (isUrlBlocked(url, flags)) {
     return {
       success: false,
       error: BLOCKLISTED_URL_MESSAGE,
@@ -241,6 +243,7 @@ export async function scrapeController(req: Request, res: Response) {
     pageOptions,
     extractorOptions,
     timeout,
+    chunk?.flags ?? null,
   );
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;

@@ -20,6 +20,7 @@ import {
   Document,
   fromLegacyCombo,
   fromLegacyScrapeOptions,
+  TeamFlags,
   toLegacyDocument,
 } from "../v1/types";
 import { getJobFromGCS } from "../../lib/gcs-jobs";
@@ -32,6 +33,7 @@ export async function searchHelper(
   crawlerOptions: any,
   pageOptions: PageOptions,
   searchOptions: SearchOptions,
+  flags: TeamFlags,
 ): Promise<{
   success: boolean;
   error?: string;
@@ -85,7 +87,7 @@ export async function searchHelper(
     return { success: true, data: res, returnCode: 200 };
   }
 
-  res = res.filter((r) => !isUrlBlocked(r.url));
+  res = res.filter((r) => !isUrlBlocked(r.url, flags));
   if (res.length > num_results) {
     res = res.slice(0, num_results);
   }
@@ -202,6 +204,7 @@ export async function searchController(req: Request, res: Response) {
     crawlerOptions,
     pageOptions,
     searchOptions,
+    chunk?.flags ?? null,
   );
   const endTime = new Date().getTime();
   const timeTakenInSeconds = (endTime - startTime) / 1000;

@@ -23,6 +23,8 @@ import { addScrapeJobs } from "../../services/queue-jobs";
 import { callWebhook } from "../../services/webhook";
 import { logger as _logger } from "../../lib/logger";
 import { CostTracking } from "../../lib/extract/extraction-service";
+import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 
 export async function batchScrapeController(
   req: RequestWithAuth<{}, BatchScrapeResponse, BatchScrapeRequest>,
@@ -54,11 +56,24 @@ export async function batchScrapeController(
     for (const u of pendingURLs) {
       try {
         const nu = urlSchema.parse(u);
-        urls.push(nu);
+        if (!isUrlBlocked(nu, req.acuc?.flags ?? null)) {
+          urls.push(nu);
+        } else {
+          invalidURLs.push(u);
+        }
       } catch (_) {
         invalidURLs.push(u);
       }
     }
+  } else {
+    if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+      if (!res.headersSent) {
+        return res.status(403).json({
+          success: false,
+          error: BLOCKLISTED_URL_MESSAGE,
+        });
+      }
+    }
   }
 
   logger.debug("Batch scrape " + id + " starting", {

@@ -89,7 +89,7 @@ export async function crawlController(
     createdAt: Date.now(),
   };
 
-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, req.acuc.flags ?? null);
 
   try {
     sc.robots = await crawler.getRobotsTxt(scrapeOptions.skipTlsVerification);

@@ -11,6 +11,8 @@ import { saveExtract } from "../../lib/extract/extract-redis";
 import { getTeamIdSyncB } from "../../lib/extract/team-id-sync";
 import { performExtraction } from "../../lib/extract/extraction-service";
 import { performExtraction_F0 } from "../../lib/extract/fire-0/extraction-service-f0";
+import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
+import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 
 export async function oldExtract(
   req: RequestWithAuth<{}, ExtractResponse, ExtractRequest>,
@@ -58,6 +60,15 @@ export async function extractController(
   const selfHosted = process.env.USE_DB_AUTHENTICATION !== "true";
   req.body = extractRequestSchema.parse(req.body);
 
+  if (req.body.urls?.some((url: string) => isUrlBlocked(url, req.acuc?.flags ?? null))) {
+    if (!res.headersSent) {
+      return res.status(403).json({
+        success: false,
+        error: BLOCKLISTED_URL_MESSAGE,
+      });
+    }
+  }
+
   const extractId = crypto.randomUUID();
   const jobData = {
     request: req.body,

@@ -5,6 +5,7 @@ import {
   mapRequestSchema,
   RequestWithAuth,
   scrapeOptions,
+  TeamFlags,
   TimeoutSignal,
 } from "./types";
 import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis";
@@ -56,6 +57,7 @@ export async function getMapResults({
   abort = new AbortController().signal, // noop
   mock,
   filterByPath = true,
+  flags,
 }: {
   url: string;
   search?: string;
@@ -70,6 +72,7 @@ export async function getMapResults({
   abort?: AbortSignal;
   mock?: string;
   filterByPath?: boolean;
+  flags: TeamFlags;
 }): Promise<MapResult> {
   const id = uuidv4();
   let links: string[] = [url];
@@ -88,7 +91,7 @@ export async function getMapResults({
     createdAt: Date.now(),
   };
 
-  const crawler = crawlToCrawler(id, sc);
+  const crawler = crawlToCrawler(id, sc, flags);
 
   try {
     sc.robots = await crawler.getRobotsTxt(false, abort);
@@ -322,6 +325,7 @@ export async function mapController(
       abort: abort.signal,
       mock: req.body.useMock,
       filterByPath: req.body.filterByPath !== false,
+      flags: req.acuc.flags ?? null,
     }),
     ...(req.body.timeout !== undefined ? [
       new Promise((resolve, reject) => setTimeout(() => {

@@ -6,6 +6,7 @@ import {
   SearchResponse,
   searchRequestSchema,
   ScrapeOptions,
+  TeamFlags,
 } from "./types";
 import { billTeam } from "../../services/billing/credit_billing";
 import { v4 as uuidv4 } from "uuid";
@@ -34,6 +35,7 @@ export async function searchAndScrapeSearchResult(
   },
   logger: Logger,
   costTracking: CostTracking,
+  flags: TeamFlags,
 ): Promise<Document[]> {
   try {
     const searchResults = await search({
@@ -51,7 +53,8 @@ export async function searchAndScrapeSearchResult(
         },
         options,
         logger,
-        costTracking
+        costTracking,
+        flags
       )
     )
   );
@@ -72,6 +75,7 @@ async function scrapeSearchResult(
   },
   logger: Logger,
   costTracking: CostTracking,
+  flags: TeamFlags,
 ): Promise<Document> {
   const jobId = uuidv4();
   const jobPriority = await getJobPriority({
@@ -80,7 +84,7 @@ async function scrapeSearchResult(
   });
 
   try {
-    if (isUrlBlocked(searchResult.url)) {
+    if (isUrlBlocked(searchResult.url, flags)) {
       throw new Error("Could not scrape url: " + BLOCKLISTED_URL_MESSAGE);
     }
     logger.info("Adding scrape job", {
@@ -220,7 +224,7 @@ export async function searchController(
         origin: req.body.origin,
         timeout: req.body.timeout,
         scrapeOptions: req.body.scrapeOptions,
-      }, logger, costTracking),
+      }, logger, costTracking, req.acuc?.flags ?? null),
     );
 
   const docs = await Promise.all(scrapePromises);

@@ -1,6 +1,5 @@
 import { Request, Response } from "express";
 import { z } from "zod";
-import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { protocolIncluded, checkUrl } from "../../lib/validateUrl";
 import { countries } from "../../lib/validate-country";
 import {
@@ -10,7 +9,6 @@ import {
   Document as V0Document,
 } from "../../lib/entities";
 import { InternalOptions } from "../../scraper/scrapeURL";
-import { BLOCKLISTED_URL_MESSAGE } from "../../lib/strings";
 
 export type Format =
   | "markdown"
@@ -49,7 +47,7 @@ export const url = z.preprocess(
       return false;
     }
   }, "Invalid URL")
-    .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
+    // .refine((x) => !isUrlBlocked(x as string), BLOCKLISTED_URL_MESSAGE),
 );
 
 const strictMessage =
@@ -914,11 +912,17 @@ export type AuthCreditUsageChunk = {
     scrapeAgentPreview?: number;
   };
   concurrency: number;
+  flags: TeamFlags;
 
   // appended on JS-side
   is_extract?: boolean;
 };
 
+export type TeamFlags = {
+  ignoreRobots?: boolean;
+  unblockedDomains?: string[];
+} | null;
+
 export type AuthCreditUsageChunkFromTeam = Omit<AuthCreditUsageChunk, "api_key">;
 
 export interface RequestWithMaybeACUC<

@@ -1,5 +1,5 @@
 import { InternalOptions } from "../scraper/scrapeURL";
-import { ScrapeOptions } from "../controllers/v1/types";
+import { ScrapeOptions, TeamFlags } from "../controllers/v1/types";
 import { WebCrawler } from "../scraper/WebScraper/crawler";
 import { redisConnection } from "../services/queue-service";
 import { logger as _logger } from "./logger";
@@ -383,6 +383,7 @@ export async function lockURLsIndividually(
 export function crawlToCrawler(
   id: string,
   sc: StoredCrawl,
+  teamFlags: TeamFlags,
   newBase?: string,
   crawlerOptions?: any,
 ): WebCrawler {
@@ -403,7 +404,7 @@ export function crawlToCrawler(
     allowExternalContentLinks:
       sc.crawlerOptions?.allowExternalContentLinks ?? false,
     allowSubdomains: sc.crawlerOptions?.allowSubdomains ?? false,
-    ignoreRobotsTxt: sc.crawlerOptions?.ignoreRobotsTxt ?? false,
+    ignoreRobotsTxt: teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false,
    regexOnFullURL: sc.crawlerOptions?.regexOnFullURL ?? false,
    maxDiscoveryDepth: sc.crawlerOptions?.maxDiscoveryDepth,
    currentDiscoveryDepth: crawlerOptions?.currentDiscoveryDepth ?? 0,

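A small sketch of the precedence encoded by the new ignoreRobotsTxt line above: a team-level ignoreRobots flag takes priority over the per-crawl option, which in turn falls back to false. The standalone helper name below is illustrative only:

```typescript
type TeamFlags = { ignoreRobots?: boolean; unblockedDomains?: string[] } | null;

// Mirrors: teamFlags?.ignoreRobots ?? sc.crawlerOptions?.ignoreRobotsTxt ?? false
function resolveIgnoreRobots(
  teamFlags: TeamFlags,
  crawlerOptions?: { ignoreRobotsTxt?: boolean },
): boolean {
  return teamFlags?.ignoreRobots ?? crawlerOptions?.ignoreRobotsTxt ?? false;
}

console.log(resolveIgnoreRobots({ ignoreRobots: true }, { ignoreRobotsTxt: false })); // true: team flag wins
console.log(resolveIgnoreRobots(null, { ignoreRobotsTxt: true }));                    // true: crawl option applies
console.log(resolveIgnoreRobots(null, {}));                                           // false: default
```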
@@ -6,7 +6,7 @@ import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
 import { ExtractOptions } from "../../controllers/v1/types";
 import { CostTracking } from "../extract/extraction-service";
-
+import { getACUCTeam } from "../../controllers/auth";
 interface DeepResearchServiceOptions {
   researchId: string;
   teamId: string;
@@ -45,6 +45,8 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
   );
   const llmService = new ResearchLLMService(logger);
 
+  const acuc = await getACUCTeam(teamId);
+
   try {
     while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) {
       logger.debug("[Deep Research] Current depth:", state.getCurrentDepth());
@@ -112,7 +114,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           fastMode: false,
           blockAds: false,
         },
-      }, logger, costTracking);
+      }, logger, costTracking, acuc?.flags ?? null);
       return response.length > 0 ? response : [];
     });
 

@@ -36,7 +36,7 @@ import { getCachedDocs, saveCachedDocs } from "./helpers/cached-docs";
 import { normalizeUrl } from "../canonical-url";
 import { search } from "../../search";
 import { buildRephraseToSerpPrompt } from "./build-prompts";
-
+import { getACUCTeam } from "../../controllers/auth";
 interface ExtractServiceOptions {
   request: ExtractRequest;
   teamId: string;
@@ -134,6 +134,7 @@ export async function performExtraction(
   let sources: Record<string, string[]> = {};
 
   let costTracking = new CostTracking(subId ? null : 1.5);
+  const acuc = await getACUCTeam(teamId);
 
   let log = {
     extractId,
@@ -323,6 +324,7 @@ export async function performExtraction(
         },
         logger.child({ module: "extract", method: "processUrl", url }),
         costTracking,
+        acuc?.flags ?? null,
       ),
     );
 

@@ -32,6 +32,7 @@ import { mixSchemaObjects_F0 } from "./helpers/mix-schema-objs-f0";
 import { singleAnswerCompletion_F0 } from "./completions/singleAnswer-f0";
 import { calculateFinalResultCost_F0, estimateTotalCost_F0 } from "./usage/llm-cost-f0";
 import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
+import { getACUCTeam } from "../../../controllers/auth";
 
 
 interface ExtractServiceOptions {
@@ -77,6 +78,8 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
   let singleAnswerResult: any = {};
   let totalUrlsScraped = 0;
   let sources: Record<string, string[]> = {};
 
+  const acuc = await getACUCTeam(teamId);
+
 
   const logger = _logger.child({
@@ -174,6 +177,7 @@ import { SourceTracker_F0 } from "./helpers/source-tracker-f0";
         });
       },
       logger.child({ module: "extract", method: "processUrl", url }),
+      acuc?.flags ?? null,
     ),
   );
 

@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
 import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../../logger";
 import { CohereClient } from "cohere-ai";
@@ -48,6 +48,7 @@ export async function rerankLinks_F0(
   mappedLinks: MapDocument[],
   searchQuery: string,
   urlTraces: URLTrace[],
+  flags: TeamFlags,
 ): Promise<MapDocument[]> {
   // console.log("Going to rerank links");
   const mappedLinksRerank = mappedLinks.map(
@@ -65,6 +66,7 @@ export async function rerankLinks_F0(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );
 
   // If we don't have enough high-quality links, try with lower threshold
@@ -76,6 +78,7 @@ export async function rerankLinks_F0(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );
 
   if (filteredLinks.length === 0) {
@@ -89,7 +92,7 @@ export async function rerankLinks_F0(
       .map((x) => mappedLinks.find((link) => link.url === x.link))
       .filter(
         (x): x is MapDocument =>
-          x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+          x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
       );
   }
 }
@@ -145,13 +148,14 @@ function filterAndProcessLinks_F0(
     originalIndex: number;
   }[],
   threshold: number,
+  flags: TeamFlags,
 ): MapDocument[] {
   return linksAndScores
     .filter((x) => x.score > threshold)
     .map((x) => mappedLinks.find((link) => link.url === x.link))
     .filter(
       (x): x is MapDocument =>
-        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
     );
 }
 

@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../../controllers/v1/types";
 import { getMapResults } from "../../../controllers/v1/map";
 import { removeDuplicateUrls } from "../../validateUrl";
 import { isUrlBlocked } from "../../../scraper/WebScraper/utils/blocklist";
@@ -9,6 +9,7 @@ import type { Logger } from "winston";
 import { generateText } from "ai";
 import { getModel } from "../../generic-ai";
 import { CostTracking } from "../extraction-service";
+import { getACUCTeam } from "../../../controllers/auth";
 
 export async function generateBasicCompletion_FO(prompt: string) {
   const { text } = await generateText({
@@ -34,6 +35,7 @@ export async function processUrl_F0(
   urlTraces: URLTrace[],
   updateExtractCallback: (links: string[]) => void,
   logger: Logger,
+  teamFlags: TeamFlags,
 ): Promise<string[]> {
   const trace: URLTrace = {
     url: options.url,
@@ -45,7 +47,7 @@ export async function processUrl_F0(
   urlTraces.push(trace);
 
   if (!options.url.includes("/*") && !options.allowExternalLinks) {
-    if (!isUrlBlocked(options.url)) {
+    if (!isUrlBlocked(options.url, teamFlags)) {
       trace.usedInCompletion = true;
       return [options.url];
     }
@@ -85,6 +87,7 @@ export async function processUrl_F0(
     ignoreSitemap: false,
     includeMetadata: true,
     includeSubdomains: options.includeSubdomains,
+    flags: teamFlags,
   });
 
   let mappedLinks = mapResults.mapResults as MapDocument[];
@@ -121,6 +124,7 @@ export async function processUrl_F0(
     ignoreSitemap: false,
     includeMetadata: true,
     includeSubdomains: options.includeSubdomains,
+    flags: teamFlags,
   });
 
   mappedLinks = retryMapResults.mapResults as MapDocument[];

@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
 import { performRanking } from "../ranker";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
 import { logger } from "../logger";
@@ -57,6 +57,7 @@ export async function rerankLinks(
   mappedLinks: MapDocument[],
   searchQuery: string,
   urlTraces: URLTrace[],
+  flags: TeamFlags,
 ): Promise<MapDocument[]> {
   // console.log("Going to rerank links");
   const mappedLinksRerank = mappedLinks.map(
@@ -74,6 +75,7 @@ export async function rerankLinks(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.INITIAL_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );
 
   // If we don't have enough high-quality links, try with lower threshold
@@ -85,6 +87,7 @@ export async function rerankLinks(
     mappedLinks,
     linksAndScores,
     extractConfig.RERANKING.FALLBACK_SCORE_THRESHOLD_FOR_RELEVANCE,
+    flags,
   );
 
   if (filteredLinks.length === 0) {
@@ -98,7 +101,7 @@ export async function rerankLinks(
       .map((x) => mappedLinks.find((link) => link.url === x.link))
       .filter(
         (x): x is MapDocument =>
-          x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+          x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
       );
   }
 }
@@ -154,13 +157,14 @@ function filterAndProcessLinks(
     originalIndex: number;
   }[],
   threshold: number,
+  flags: TeamFlags,
 ): MapDocument[] {
   return linksAndScores
     .filter((x) => x.score > threshold)
     .map((x) => mappedLinks.find((link) => link.url === x.link))
     .filter(
       (x): x is MapDocument =>
-        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url),
+        x !== undefined && x.url !== undefined && !isUrlBlocked(x.url, flags),
     );
 }
 

@@ -1,4 +1,4 @@
-import { MapDocument, URLTrace } from "../../controllers/v1/types";
+import { MapDocument, TeamFlags, URLTrace } from "../../controllers/v1/types";
 import { getMapResults } from "../../controllers/v1/map";
 import { removeDuplicateUrls } from "../validateUrl";
 import { isUrlBlocked } from "../../scraper/WebScraper/utils/blocklist";
@@ -93,6 +93,7 @@ export async function processUrl(
   updateExtractCallback: (links: string[]) => void,
   logger: Logger,
   costTracking: CostTracking,
+  teamFlags: TeamFlags,
 ): Promise<string[]> {
   const trace: URLTrace = {
     url: options.url,
@@ -104,7 +105,7 @@ export async function processUrl(
   urlTraces.push(trace);
 
   if (!options.url.includes("/*") && !options.allowExternalLinks) {
-    if (!isUrlBlocked(options.url)) {
+    if (!isUrlBlocked(options.url, teamFlags)) {
       trace.usedInCompletion = true;
       return [options.url];
     }
@@ -144,6 +145,7 @@ export async function processUrl(
     ignoreSitemap: false,
     includeMetadata: true,
     includeSubdomains: options.includeSubdomains,
+    flags: teamFlags,
   });
 
   let mappedLinks = mapResults.mapResults as MapDocument[];
@@ -181,6 +183,7 @@ export async function processUrl(
     ignoreSitemap: false,
     includeMetadata: true,
     includeSubdomains: options.includeSubdomains,
+    flags: teamFlags,
   });
 
   mappedLinks = retryMapResults.mapResults as MapDocument[];

@@ -12,6 +12,7 @@ import { logJob } from "../../services/logging/log_job";
 import { getModel } from "../generic-ai";
 import { generateCompletions } from "../../scraper/scrapeURL/transformers/llmExtract";
 import { CostTracking } from "../extract/extraction-service";
+import { getACUCTeam } from "../../controllers/auth";
 interface GenerateLLMsTextServiceOptions {
   generationId: string;
   teamId: string;
@@ -72,6 +73,7 @@ export async function performGenerateLlmsTxt(
     teamId,
   });
   const costTracking = new CostTracking();
+  const acuc = await getACUCTeam(teamId);
 
   try {
     // Enforce max URL limit
@@ -116,6 +118,7 @@ export async function performGenerateLlmsTxt(
     includeSubdomains: false,
     ignoreSitemap: false,
     includeMetadata: true,
+    flags: acuc?.flags ?? null,
   });
 
   if (!mapResult || !mapResult.links) {

@@ -147,8 +147,8 @@ function idempotencyMiddleware(
   })().catch((err) => next(err));
 }
 
-function blocklistMiddleware(req: Request, res: Response, next: NextFunction) {
-  if (typeof req.body.url === "string" && isUrlBlocked(req.body.url)) {
+function blocklistMiddleware(req: RequestWithACUC<any, any, any>, res: Response, next: NextFunction) {
+  if (typeof req.body.url === "string" && isUrlBlocked(req.body.url, req.acuc?.flags ?? null)) {
     if (!res.headersSent) {
       return res.status(403).json({
         success: false,
@@ -267,6 +267,7 @@ v1Router.get(
 v1Router.post(
   "/llmstxt",
   authMiddleware(RateLimiterMode.Scrape),
+  blocklistMiddleware,
   wrap(generateLLMsTextController),
 );
 

@@ -53,22 +53,25 @@ describe("isUrlBlocked function", () => {
   });
 
   test("Blocks exact domain with and without protocol", () => {
-    expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("KZfBtpwjOpdSoqacRbz7og==", hashKey), null)).toBe(
       true,
     );
     expect(
       isUrlBlocked(
         decryptAES("TemsdmaA9kBK9cVJTaAmZksAh4WcS5779jwuGJ26ows=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("0pCVMPgc7+IMrLjIA5lFV0ttO4rKIA14yZBb+2FDG7I=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
      ),
    ).toBe(true);
  });
@@ -77,53 +80,57 @@ describe("isUrlBlocked function", () => {
     expect(
       isUrlBlocked(
         decryptAES("m+PjIWE9E4GF3lA/B9cUMDj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("o/ClKrW6Qo0uidbD2X8cVjj3smbHhZYOGxP74UTmd3M=", hashKey),
+        null,
       ),
     ).toBe(true);
     expect(
       isUrlBlocked(
         decryptAES("Z53Ny7rvn7cBX/2bYpOZrRDosKfU7BiSM0OClb4bdWY=", hashKey),
+        null,
       ),
     ).toBe(true);
   });
 
   test("Blocks different TLDs (BLOCKED-DOMAIN.pt, BLOCKED-DOMAIN.io)", () => {
-    expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("vUMeqQdqk7ajwczYBr6prA==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("WOjW9VwGwrPu846jDo6VQg==", hashKey), null)).toBe(
       true,
     );
-    expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("Ti3vVa6sRew3wyTZ7a/Yag==", hashKey), null)).toBe(
       true,
     );
     expect(
       isUrlBlocked(
         decryptAES("0pCVMPgc7+IMrLjIA5lFV5cYWcOWC5LGWwvlbCW2GH4=", hashKey),
+        null,
       ),
     ).toBe(true);
   });
 
   test("Allows unrelated domains like whateverfacebook.com", () => {
-    expect(isUrlBlocked("whateverfacebook.com")).toBe(false);
-    expect(isUrlBlocked("https://whateverfacebook.com")).toBe(false);
+    expect(isUrlBlocked("whateverfacebook.com", null)).toBe(false);
+    expect(isUrlBlocked("https://whateverfacebook.com", null)).toBe(false);
   });
 
   test("Blocks other domains from the blocklist", () => {
-    expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey), null)).toBe(
      true,
    );
-    expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("XS61fAjZb5JfAWsyzzOoCQ==", hashKey), null)).toBe(
      true,
    );
-    expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("Indtl4yxJMHCKBGF4KABCQ==", hashKey), null)).toBe(
      true,
    );
-    expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey))).toBe(
+    expect(isUrlBlocked(decryptAES("86ZDUI7vmp4MvNq3fvZrGQ==", hashKey), null)).toBe(
      true,
    );
  });
@@ -135,23 +142,34 @@ describe("isUrlBlocked function", () => {
           "4H7Uyz6sSCwE3mne1SsGU+6gs7VssjM3e5C6qsyUPUnhsthhQp2bAQwZ9xSCJsjB",
           hashKey,
         ),
+        null,
       ),
     ).toBe(false);
     expect(
       isUrlBlocked(
         decryptAES("rNA7JWR/voEnzAqpC4QJAYgZUratpaNBCBVujdFqDb0=", hashKey),
+        null,
       ),
     ).toBe(false);
     expect(
       isUrlBlocked(
         decryptAES("ipHiDz83ep6vbIMee94+4XtxxVy1YMYWlaGnWKcG9gQ=", hashKey),
+        null,
       ),
     ).toBe(false);
   });
 
   test("Should return false if the URL is invalid", () => {
-    expect(isUrlBlocked("randomstring")).toBe(false);
-    expect(isUrlBlocked("htp://bad.url")).toBe(false);
-    expect(isUrlBlocked("")).toBe(false);
+    expect(isUrlBlocked("randomstring", null)).toBe(false);
+    expect(isUrlBlocked("htp://bad.url", null)).toBe(false);
+    expect(isUrlBlocked("", null)).toBe(false);
   });
+
+  test("Should respect flags", () => {
+    const decryptedDomain = decryptAES("e3HFXLVgxhaVoadYpwb2BA==", hashKey);
+
+    expect(isUrlBlocked(decryptedDomain, {
+      unblockedDomains: [decryptedDomain],
+    })).toBe(false);
+  });
 });

@@ -1,6 +1,7 @@
 import { configDotenv } from "dotenv";
 import crypto from "crypto";
 import { parse } from "tldts";
+import { TeamFlags } from "../../../controllers/v1/types";
 
 configDotenv();
 
@@ -101,10 +102,15 @@ export function decryptedBlocklist(list: string[]): string[] {
     : [];
 }
 
-export function isUrlBlocked(url: string): boolean {
+export function isUrlBlocked(url: string, flags: TeamFlags): boolean {
   const lowerCaseUrl = url.trim().toLowerCase();
 
-  const blockedlist = decryptedBlocklist(urlBlocklist);
+  let blockedlist = decryptedBlocklist(urlBlocklist);
+
+  if (flags?.unblockedDomains) {
+    blockedlist = blockedlist.filter((blocked) => !flags.unblockedDomains!.includes(blocked));
+  }
 
   const decryptedUrl =
     blockedlist.find((decrypted) => lowerCaseUrl === decrypted) ||
     lowerCaseUrl;

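A runnable sketch of the unblockedDomains carve-out introduced above; the real isUrlBlocked also decrypts the blocklist and parses the URL with tldts, both omitted here, and the sample domain is a placeholder:

```typescript
type TeamFlags = { ignoreRobots?: boolean; unblockedDomains?: string[] } | null;

const urlBlocklist = ["blocked.example"]; // stand-in for the decrypted blocklist

function isUrlBlockedSketch(url: string, flags: TeamFlags): boolean {
  let blockedlist = urlBlocklist;

  // Same filtering step as the diff: team flags can carve domains out of the list.
  if (flags?.unblockedDomains) {
    blockedlist = blockedlist.filter((blocked) => !flags.unblockedDomains!.includes(blocked));
  }

  const lowerCaseUrl = url.trim().toLowerCase();
  return blockedlist.some((blocked) => lowerCaseUrl.includes(blocked));
}

console.log(isUrlBlockedSketch("https://blocked.example/page", null)); // true
console.log(
  isUrlBlockedSketch("https://blocked.example/page", {
    unblockedDomains: ["blocked.example"],
  }),
); // false: unblocked for this team
```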
@@ -80,6 +80,7 @@ import { performGenerateLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt
 import { updateGeneratedLlmsTxt } from "../lib/generate-llmstxt/generate-llmstxt-redis";
 import { performExtraction_F0 } from "../lib/extract/fire-0/extraction-service-f0";
 import { CostTracking } from "../lib/extract/extraction-service";
+import { getACUCTeam } from "../controllers/auth";
 
 configDotenv();
 
@@ -144,6 +145,7 @@ async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
       const crawler = crawlToCrawler(
         job.data.crawl_id,
         sc,
+        (await getACUCTeam(job.data.team_id))?.flags ?? null,
         sc.originUrl!,
         job.data.crawlerOptions,
       );
@@ -871,7 +873,7 @@ async function processKickoffJob(job: Job & { id: string }, token: string) {
 
   try {
     const sc = (await getCrawl(job.data.crawl_id)) as StoredCrawl;
-    const crawler = crawlToCrawler(job.data.crawl_id, sc);
+    const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
 
     logger.debug("Locking URL...");
     await lockURL(job.data.crawl_id, sc, job.data.url);
@@ -1135,7 +1137,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         normalizeURL(doc.metadata.sourceURL, sc) &&
         job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape
       ) {
-        const crawler = crawlToCrawler(job.data.crawl_id, sc);
+        const crawler = crawlToCrawler(job.data.crawl_id, sc, (await getACUCTeam(job.data.team_id))?.flags ?? null);
         if (
           crawler.filterURL(doc.metadata.url, doc.metadata.sourceURL) ===
             null &&
@@ -1160,7 +1162,7 @@ async function processJob(job: Job & { id: string }, token: string) {
         await saveCrawl(job.data.crawl_id, sc);
       }
 
-      if (isUrlBlocked(doc.metadata.url)) {
+      if (isUrlBlocked(doc.metadata.url, (await getACUCTeam(job.data.team_id))?.flags ?? null)) {
         throw new Error(BLOCKLISTED_URL_MESSAGE); // TODO: make this its own error type that is ignored by error tracking
       }
 
@@ -1219,6 +1221,7 @@ async function processJob(job: Job & { id: string }, token: string) {
       const crawler = crawlToCrawler(
         job.data.crawl_id,
         sc,
+        (await getACUCTeam(job.data.team_id))?.flags ?? null,
         doc.metadata.url ?? doc.metadata.sourceURL ?? sc.originUrl!,
         job.data.crawlerOptions,
       );