feat(crawl): token budget

This commit is contained in:
yanlong.wang 2024-12-24 18:30:53 +08:00
parent b9d07e3692
commit 696536c7f2
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
3 changed files with 40 additions and 2 deletions

View File

@@ -4,7 +4,7 @@ import {
AssertionFailureError, ParamValidationError, Defer,
} from 'civkit';
import { singleton } from 'tsyringe';
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
import _ from 'lodash';
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -202,6 +202,9 @@ export class CrawlerHost extends RPCHost {
);
rpcReflect.finally(() => {
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
return;
}
if (chargeAmount) {
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
@@ -218,6 +221,9 @@ export class CrawlerHost extends RPCHost {
);
rpcReflect.finally(() => {
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
return;
}
if (chargeAmount) {
apiRoll._ref?.set({
chargeAmount,
@@ -252,6 +258,9 @@ export class CrawlerHost extends RPCHost {
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
sseStream.write({
event: 'data',
data: formatted,
@@ -284,6 +293,10 @@ export class CrawlerHost extends RPCHost {
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
if (crawlerOptions.isEarlyReturnApplicable()) {
return formatted;
}
@@ -302,6 +315,9 @@ export class CrawlerHost extends RPCHost {
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
return formatted;
}
@@ -321,6 +337,9 @@ export class CrawlerHost extends RPCHost {
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
if (crawlerOptions.isEarlyReturnApplicable()) {
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
@@ -349,6 +368,10 @@ export class CrawlerHost extends RPCHost {
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,

View File

@@ -156,6 +156,11 @@ const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
description: 'Specify referer for the page.',
in: 'header',
schema: { type: 'string' }
},
'X-Token-Budget': {
description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
in: 'header',
schema: { type: 'string' }
}
}
}
@@ -271,6 +276,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop()
referer?: string;
@Prop()
tokenBudget?: number;
static override from(input: any) {
const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -387,6 +395,13 @@ export class CrawlerOptions extends AutoCastable {
instance.cacheTolerance = instance.cacheTolerance * 1000;
}
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
if (instance.cacheTolerance) {
instance.cacheTolerance = instance.cacheTolerance * 1000;
}
return instance;
}

@@ -1 +1 @@
Subproject commit 7b3412e64166599429fa38094f4abd071a15fcd6
Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf