Nick: introduce llm-usage cost analysis

Nicolas 2025-01-15 21:01:29 -03:00
parent de14c0a45d
commit 4db023280d
10 changed files with 8277 additions and 13 deletions

View File

@ -37,5 +37,6 @@ export async function extractStatusController(
error: extract?.error ?? undefined,
expiresAt: (await getExtractExpiry(req.params.jobId)).toISOString(),
steps: extract.showSteps ? extract.steps : undefined,
llmUsage: extract.showLLMUsage ? extract.llmUsage : undefined,
});
}

View File

@ -71,6 +71,7 @@ export async function extractController(
createdAt: Date.now(),
status: "processing",
showSteps: req.body.__experimental_streamSteps,
showLLMUsage: req.body.__experimental_llmUsage,
});
if (Sentry.isInitialized()) {

View File

@ -222,6 +222,7 @@ export const extractV1Options = z
origin: z.string().optional().default("api"),
urlTrace: z.boolean().default(false),
__experimental_streamSteps: z.boolean().default(false),
__experimental_llmUsage: z.boolean().default(false),
timeout: z.number().int().positive().finite().safe().default(60000),
})
.strict(strictMessage);
@ -840,3 +841,12 @@ export type SearchResponse =
warning?: string;
data: Document[];
};
export type TokenUsage = {
promptTokens: number;
completionTokens: number;
totalTokens: number;
step?: string;
model?: string;
};
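For context, a client opts into the new field with the experimental flag and reads the estimated cost back from the extract status response. A minimal sketch follows; the base URL, endpoint paths, and the shape of the responses outside of `llmUsage` are assumptions for illustration, not confirmed by this commit:

```ts
// Sketch only: base URL and endpoint paths are assumed for illustration.
const base = "https://api.firecrawl.dev"; // assumption
const headers = {
  "Content-Type": "application/json",
  Authorization: `Bearer ${process.env.FIRECRAWL_API_KEY}`,
};

// Start an extract job with the experimental flag enabled.
const startRes = await fetch(`${base}/v1/extract`, {
  method: "POST",
  headers,
  body: JSON.stringify({
    urls: ["https://example.com"],
    prompt: "Extract the page title",
    __experimental_llmUsage: true, // sets showLLMUsage on the stored extract
  }),
});
const { id } = (await startRes.json()) as { id: string }; // assumed response shape

// Poll the status endpoint; llmUsage (estimated USD cost) is returned only
// because the flag was set on the original request.
const statusRes = await fetch(`${base}/v1/extract/${id}`, { headers });
const status = (await statusRes.json()) as { status: string; llmUsage?: number };
console.log(status.llmUsage); // e.g. 0.0012345
```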

View File

@ -30,6 +30,8 @@ export type StoredExtract = {
error?: any;
showSteps?: boolean;
steps?: ExtractedStep[];
showLLMUsage?: boolean;
llmUsage?: number;
};
export async function saveExtract(id: string, extract: StoredExtract) {

View File

@ -1,6 +1,7 @@
import {
Document,
ExtractRequest,
TokenUsage,
toLegacyCrawlerOptions,
URLTrace,
} from "../../controllers/v1/types";
@ -31,6 +32,7 @@ import { ExtractStep, updateExtract } from "./extract-redis";
import { deduplicateObjectsArray } from "./helpers/deduplicate-objs-array";
import { mergeNullValObjs } from "./helpers/merge-null-val-objs";
import { CUSTOM_U_TEAMS } from "./config";
import { estimateCost, estimateTotalCost } from "./usage/llm-cost";
interface ExtractServiceOptions {
request: ExtractRequest;
@ -46,6 +48,8 @@ interface ExtractResult {
warning?: string;
urlTrace?: URLTrace[];
error?: string;
tokenUsageBreakdown?: TokenUsage[];
llmUsage?: number;
}
async function analyzeSchemaAndPrompt(
@ -57,6 +61,7 @@ async function analyzeSchemaAndPrompt(
multiEntityKeys: string[];
reasoning?: string;
keyIndicators?: string[];
tokenUsage: TokenUsage;
}> {
if (!schema) {
schema = await generateSchemaFromPrompt(prompt);
@ -71,8 +76,10 @@ async function analyzeSchemaAndPrompt(
keyIndicators: z.array(z.string()),
});
const model = "gpt-4o";
const result = await openai.beta.chat.completions.parse({
model: "gpt-4o",
model: model,
messages: [
{
role: "system",
@ -131,12 +138,20 @@ Schema: ${schemaString}\nPrompt: ${prompt}\nRelevant URLs: ${urls}`,
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
checkSchema.parse(result.choices[0].message.parsed);
return { isMultiEntity, multiEntityKeys, reasoning, keyIndicators };
const tokenUsage: TokenUsage = {
promptTokens: result.usage?.prompt_tokens ?? 0,
completionTokens: result.usage?.completion_tokens ?? 0,
totalTokens: result.usage?.total_tokens ?? 0,
model: model,
};
return { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage };
}
type completions = {
extract: Record<string, any>;
numTokens: number;
totalUsage: TokenUsage;
warning?: string;
};
@ -164,6 +179,10 @@ export async function performExtraction(
let multiEntityResult: any = {};
let singleAnswerResult: any = {};
// Token tracking
let tokenUsage: TokenUsage[] = [];
await updateExtract(extractId, {
status: "processing",
steps: [
@ -249,9 +268,12 @@ export async function performExtraction(
// 1. the first one is a completion that will extract the array of items
// 2. the second one is multiple completions that will extract the items from the array
let startAnalyze = Date.now();
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators } =
const { isMultiEntity, multiEntityKeys, reasoning, keyIndicators, tokenUsage: schemaAnalysisTokenUsage } =
await analyzeSchemaAndPrompt(links, reqSchema, request.prompt ?? "");
// Track schema analysis tokens
tokenUsage.push(schemaAnalysisTokenUsage);
// console.log("\nIs Multi Entity:", isMultiEntity);
// console.log("\nMulti Entity Keys:", multiEntityKeys);
// console.log("\nReasoning:", reasoning);
@ -376,6 +398,8 @@ export async function performExtraction(
true,
);
tokenUsage.push(shouldExtractCheck.totalUsage);
if (!shouldExtractCheck.extract["extract"]) {
console.log(
`Skipping extraction for ${doc.metadata.url} as content is irrelevant`,
@ -438,6 +462,11 @@ export async function performExtraction(
timeoutPromise,
])) as Awaited<ReturnType<typeof generateOpenAICompletions>>;
// Track multi-entity extraction tokens
if (multiEntityCompletion) {
tokenUsage.push(multiEntityCompletion.totalUsage);
}
// console.log(multiEntityCompletion.extract)
// if (!multiEntityCompletion.extract?.is_content_relevant) {
// console.log(`Skipping extraction for ${doc.metadata.url} as content is not relevant`);
@ -603,6 +632,11 @@ export async function performExtraction(
true,
);
// Track single answer extraction tokens
if (singleAnswerCompletions) {
tokenUsage.push(singleAnswerCompletions.totalUsage);
}
singleAnswerResult = singleAnswerCompletions.extract;
// Update token usage in traces
@ -641,7 +675,10 @@ export async function performExtraction(
);
});
// Log job
const totalTokensUsed = tokenUsage.reduce((a, b) => a + b.totalTokens, 0);
const llmUsage = estimateTotalCost(tokenUsage);
// Log job with token usage
logJob({
job_id: extractId,
success: true,
@ -654,10 +691,11 @@ export async function performExtraction(
url: request.urls.join(", "),
scrapeOptions: request,
origin: request.origin ?? "api",
num_tokens: 0, // completions?.numTokens ?? 0,
num_tokens: totalTokensUsed,
}).then(() => {
updateExtract(extractId, {
status: "completed",
llmUsage,
}).catch((error) => {
logger.error(
`Failed to update extract ${extractId} status to completed: ${error}`,
@ -671,5 +709,7 @@ export async function performExtraction(
extractId,
warning: undefined, // TODO FIX
urlTrace: request.urlTrace ? urlTraces : undefined,
llmUsage,
};
}
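The new `TokenUsage.step` and `ExtractResult.tokenUsageBreakdown` fields make a per-step accounting possible for the records pushed above. A minimal, self-contained sketch of that shape, assuming the imports already added in this file; the step labels and token counts are placeholders for illustration:

```ts
// Sketch only: step labels and token counts are assumed for illustration.
const breakdown: TokenUsage[] = [
  { promptTokens: 850, completionTokens: 120, totalTokens: 970, model: "gpt-4o", step: "schema-analysis" },
  { promptTokens: 5200, completionTokens: 640, totalTokens: 5840, model: "gpt-4o-mini", step: "multi-entity-extract" },
];

// Mirrors what performExtraction computes before logging the job:
const totalTokensUsed = breakdown.reduce((a, b) => a + b.totalTokens, 0); // 6810
const llmUsage = estimateTotalCost(breakdown); // estimated USD cost across steps
```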

View File

@ -150,16 +150,21 @@ function filterAndProcessLinks(
);
}
export type RerankerResult = {
mapDocument: MapDocument[];
tokensUsed: number;
}
export async function rerankLinksWithLLM(
mappedLinks: MapDocument[],
searchQuery: string,
urlTraces: URLTrace[],
): Promise<MapDocument[]> {
): Promise<RerankerResult> {
const chunkSize = 100;
const chunks: MapDocument[][] = [];
const TIMEOUT_MS = 20000;
const MAX_RETRIES = 2;
let totalTokensUsed = 0;
// Split mappedLinks into chunks of 100
for (let i = 0; i < mappedLinks.length; i += chunkSize) {
@ -225,6 +230,7 @@ export async function rerankLinksWithLLM(
return [];
}
totalTokensUsed += completion.numTokens || 0;
// console.log(`Chunk ${chunkIndex + 1}: Found ${completion.extract.relevantLinks.length} relevant links`);
return completion.extract.relevantLinks;
@ -252,5 +258,8 @@ export async function rerankLinksWithLLM(
.filter((link): link is MapDocument => link !== undefined);
// console.log(`Returning ${relevantLinks.length} relevant links`);
return relevantLinks;
}
return {
mapDocument: relevantLinks,
tokensUsed: totalTokensUsed,
};
}

View File

@ -199,15 +199,19 @@ export async function processUrl(
// (link, index) => `${index + 1}. URL: ${link.url}, Title: ${link.title}, Description: ${link.description}`
// );
mappedLinks = await rerankLinksWithLLM(mappedLinks, searchQuery, urlTraces);
const rerankerResult = await rerankLinksWithLLM(mappedLinks, searchQuery, urlTraces);
mappedLinks = rerankerResult.mapDocument;
let tokensUsed = rerankerResult.tokensUsed;
// 2nd Pass, useful for when the first pass returns too many links
if (mappedLinks.length > 100) {
mappedLinks = await rerankLinksWithLLM(
const rerankerResult = await rerankLinksWithLLM(
mappedLinks,
searchQuery,
urlTraces,
);
mappedLinks = rerankerResult.mapDocument;
tokensUsed += rerankerResult.tokensUsed;
}
// dumpToFile(

View File

@ -0,0 +1,53 @@
import { TokenUsage } from "../../../controllers/v1/types";
import { logger } from "../../../lib/logger";
import { modelPrices } from "./model-prices";
interface ModelPricing {
input_cost_per_token?: number;
output_cost_per_token?: number;
input_cost_per_request?: number;
mode: string;
}
export function estimateTotalCost(tokenUsage: TokenUsage[]): number {
return tokenUsage.reduce((total, usage) => {
return total + estimateCost(usage);
}, 0);
}
export function estimateCost(tokenUsage: TokenUsage): number {
let totalCost = 0;
try {
let model = tokenUsage.model ?? process.env.MODEL_NAME ?? "gpt-4o-mini";
const pricing = modelPrices[model] as ModelPricing;
if (!pricing) {
logger.error(`No pricing information found for model: ${model}`);
return 0;
}
if (pricing.mode !== "chat") {
logger.error(`Model ${model} is not a chat model`);
return 0;
}
// Add per-request cost if applicable (Only Perplexity supports this)
if (pricing.input_cost_per_request) {
totalCost += pricing.input_cost_per_request;
}
// Add token-based costs
if (pricing.input_cost_per_token) {
totalCost += tokenUsage.promptTokens * pricing.input_cost_per_token;
}
if (pricing.output_cost_per_token) {
totalCost += tokenUsage.completionTokens * pricing.output_cost_per_token;
}
return Number(totalCost.toFixed(7));
} catch (error) {
logger.error(`Error estimating cost: ${error}`);
return totalCost;
}
}
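To make the arithmetic concrete, here is a hedged walk-through of `estimateCost` for a single record; the per-token prices below are placeholders, not the values in `model-prices.ts`:

```ts
// Sketch with placeholder prices; real values come from model-prices.ts.
// Suppose modelPrices["gpt-4o"] were:
//   { input_cost_per_token: 0.0000025, output_cost_per_token: 0.00001, mode: "chat" }
const usage: TokenUsage = {
  promptTokens: 1200,
  completionTokens: 300,
  totalTokens: 1500,
  model: "gpt-4o",
};

// estimateCost(usage) would then evaluate to:
//   1200 * 0.0000025 + 300 * 0.00001 = 0.003 + 0.003 = 0.006
// rounded via Number(totalCost.toFixed(7)) -> 0.006
const cost = estimateCost(usage);
```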

File diff suppressed because it is too large

View File

@ -1,7 +1,7 @@
import OpenAI from "openai";
import { encoding_for_model } from "@dqbd/tiktoken";
import { TiktokenModel } from "@dqbd/tiktoken";
import { Document, ExtractOptions } from "../../../controllers/v1/types";
import { Document, ExtractOptions, TokenUsage } from "../../../controllers/v1/types";
import { Logger } from "winston";
import { EngineResultsTracker, Meta } from "..";
import { logger } from "../../../lib/logger";
@ -72,7 +72,7 @@ export async function generateOpenAICompletions(
markdown?: string,
previousWarning?: string,
isExtractEndpoint?: boolean,
): Promise<{ extract: any; numTokens: number; warning: string | undefined }> {
): Promise<{ extract: any; numTokens: number; warning: string | undefined; totalUsage: TokenUsage }> {
let extract: any;
let warning: string | undefined;
@ -208,6 +208,9 @@ export async function generateOpenAICompletions(
}
}
const promptTokens = (jsonCompletion.usage?.prompt_tokens ?? 0);
const completionTokens = (jsonCompletion.usage?.completion_tokens ?? 0);
// If the users actually wants the items object, they can specify it as 'required' in the schema
// otherwise, we just return the items array
if (
@ -217,7 +220,9 @@ export async function generateOpenAICompletions(
) {
extract = extract?.items;
}
return { extract, warning, numTokens };
// num tokens (just user prompt tokenized) | deprecated
// totalTokens = promptTokens + completionTokens
return { extract, warning, numTokens, totalUsage: { promptTokens, completionTokens, totalTokens: promptTokens + completionTokens, model: model } };
}
export async function performLLMExtract(
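Callers that previously read `numTokens` (marked deprecated in the comment above) can switch to the richer `totalUsage` record. A hedged sketch of the returned shape; the values are assumed for illustration:

```ts
// Sketch: consuming the new totalUsage field instead of the deprecated numTokens.
// The concrete values below are assumptions for illustration only.
const result: Awaited<ReturnType<typeof generateOpenAICompletions>> = {
  extract: { title: "Example" },
  numTokens: 42, // deprecated: only the tokenized user prompt
  warning: undefined,
  totalUsage: { promptTokens: 42, completionTokens: 10, totalTokens: 52, model: "gpt-4o-mini" },
};

const tokensBilledFor = result.totalUsage.totalTokens; // prefer this over result.numTokens
```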