diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts
index 23d1f27..aedb57e 100644
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@@ -4,7 +4,7 @@ import {
     AssertionFailureError, ParamValidationError, Defer,
 } from 'civkit';
 import { singleton } from 'tsyringe';
-import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
+import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
 import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
 import _ from 'lodash';
 import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
@@ -202,6 +202,10 @@ export class CrawlerHost extends RPCHost {
             );
 
             rpcReflect.finally(() => {
+                // Budget exceeded: the request was rejected, so skip usage reporting.
+                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+                    return;
+                }
                 if (chargeAmount) {
                     auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
                         this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
@@ -218,6 +222,10 @@ export class CrawlerHost extends RPCHost {
             );
 
             rpcReflect.finally(() => {
+                // Budget exceeded: the request was rejected, so skip recording the charge.
+                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+                    return;
+                }
                 if (chargeAmount) {
                     apiRoll._ref?.set({
                         chargeAmount,
@@ -252,6 +260,9 @@ export class CrawlerHost extends RPCHost {
 
                     const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
                     chargeAmount = this.assignChargeAmount(formatted);
+                    if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+                        throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
+                    }
                     sseStream.write({
                         event: 'data',
                         data: formatted,
@@ -284,6 +295,10 @@ export class CrawlerHost extends RPCHost {
 
                 const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
                 chargeAmount = this.assignChargeAmount(formatted);
+                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+                    throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
+                }
+
                 if (crawlerOptions.isEarlyReturnApplicable()) {
                     return formatted;
                 }
@@ -302,6 +317,9 @@ export class CrawlerHost extends RPCHost {
 
             const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
             chargeAmount = this.assignChargeAmount(formatted);
+            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
+            }
 
             return formatted;
         }
@@ -321,6 +339,9 @@ export class CrawlerHost extends RPCHost {
 
             const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
             chargeAmount = this.assignChargeAmount(formatted);
+            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
+            }
 
             if (crawlerOptions.isEarlyReturnApplicable()) {
                 if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
@@ -349,6 +370,10 @@ export class CrawlerHost extends RPCHost {
 
         const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
         chargeAmount = this.assignChargeAmount(formatted);
+        if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
+            throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
+        }
+
         if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
 
             return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts
index 94f1a54..0f81cca 100644
--- a/backend/functions/src/dto/scrapping-options.ts
+++ b/backend/functions/src/dto/scrapping-options.ts
@@ -156,6 +156,11 @@ const IMAGE_RETENTION_MODE_VALUES = new Set(IMAGE_RETENTION_MODES);
                     description: 'Specify referer for the page.',
                     in: 'header',
                     schema: { type: 'string' }
+                },
+                'X-Token-Budget': {
+                    description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
+                    in: 'header',
+                    schema: { type: 'string' }
                 }
             }
         }
@@ -271,6 +276,13 @@ export class CrawlerOptions extends AutoCastable {
     @Prop()
     referer?: string;
 
+    /**
+     * Optional ceiling, in tokens, on the charge for one request.
+     * Falls back to the `X-Token-Budget` request header when not set on the input.
+     */
+    @Prop()
+    tokenBudget?: number;
+
     static override from(input: any) {
         const instance = super.from(input) as CrawlerOptions;
         const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@@ -387,6 +399,11 @@ export class CrawlerOptions extends AutoCastable {
             instance.cacheTolerance = instance.cacheTolerance * 1000;
         }
 
+        // An explicitly supplied tokenBudget wins; otherwise fall back to the X-Token-Budget header.
+        // Missing, non-numeric or zero header values leave the budget unset (no limit enforced).
+        const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
+        instance.tokenBudget ??= parseInt(tokenBudget || '', 10) || undefined;
+
         return instance;
     }
 
diff --git a/thinapps-shared b/thinapps-shared
index 7b3412e..98e9bf1 160000
--- a/thinapps-shared
+++ b/thinapps-shared
@@ -1 +1 @@
-Subproject commit 7b3412e64166599429fa38094f4abd071a15fcd6
+Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf