mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-06 06:26:59 +08:00
feat(crawl): token budget
This commit is contained in:
parent
b9d07e3692
commit
696536c7f2
@ -4,7 +4,7 @@ import {
|
||||
AssertionFailureError, ParamValidationError, Defer,
|
||||
} from 'civkit';
|
||||
import { singleton } from 'tsyringe';
|
||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
||||
import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||
import _ from 'lodash';
|
||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||
@ -202,6 +202,9 @@ export class CrawlerHost extends RPCHost {
|
||||
);
|
||||
|
||||
rpcReflect.finally(() => {
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
return;
|
||||
}
|
||||
if (chargeAmount) {
|
||||
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
||||
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
||||
@ -218,6 +221,9 @@ export class CrawlerHost extends RPCHost {
|
||||
);
|
||||
|
||||
rpcReflect.finally(() => {
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
return;
|
||||
}
|
||||
if (chargeAmount) {
|
||||
apiRoll._ref?.set({
|
||||
chargeAmount,
|
||||
@ -252,6 +258,9 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
sseStream.write({
|
||||
event: 'data',
|
||||
data: formatted,
|
||||
@ -284,6 +293,10 @@ export class CrawlerHost extends RPCHost {
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
|
||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
||||
return formatted;
|
||||
}
|
||||
@ -302,6 +315,9 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
|
||||
return formatted;
|
||||
}
|
||||
@ -321,6 +337,9 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
|
||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
@ -349,6 +368,10 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
|
@ -156,6 +156,11 @@ const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
|
||||
description: 'Specify referer for the page.',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Token-Budget': {
|
||||
description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -271,6 +276,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
referer?: string;
|
||||
|
||||
@Prop()
|
||||
tokenBudget?: number;
|
||||
|
||||
static override from(input: any) {
|
||||
const instance = super.from(input) as CrawlerOptions;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||
@ -387,6 +395,13 @@ export class CrawlerOptions extends AutoCastable {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
}
|
||||
|
||||
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
|
||||
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
||||
|
||||
if (instance.cacheTolerance) {
|
||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||
}
|
||||
|
||||
return instance;
|
||||
}
|
||||
|
||||
|
@ -1 +1 @@
|
||||
Subproject commit 7b3412e64166599429fa38094f4abd071a15fcd6
|
||||
Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf
|
Loading…
x
Reference in New Issue
Block a user