mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 15:09:10 +08:00
feat(crawl): token budget
This commit is contained in:
parent
b9d07e3692
commit
696536c7f2
@ -4,7 +4,7 @@ import {
|
|||||||
AssertionFailureError, ParamValidationError, Defer,
|
AssertionFailureError, ParamValidationError, Defer,
|
||||||
} from 'civkit';
|
} from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { AsyncContext, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
import { AsyncContext, BudgetExceededError, CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect, SecurityCompromiseError } from '../shared';
|
||||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
@ -202,6 +202,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
);
|
);
|
||||||
|
|
||||||
rpcReflect.finally(() => {
|
rpcReflect.finally(() => {
|
||||||
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (chargeAmount) {
|
if (chargeAmount) {
|
||||||
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
||||||
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
||||||
@ -218,6 +221,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
);
|
);
|
||||||
|
|
||||||
rpcReflect.finally(() => {
|
rpcReflect.finally(() => {
|
||||||
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (chargeAmount) {
|
if (chargeAmount) {
|
||||||
apiRoll._ref?.set({
|
apiRoll._ref?.set({
|
||||||
chargeAmount,
|
chargeAmount,
|
||||||
@ -252,6 +258,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
|
}
|
||||||
sseStream.write({
|
sseStream.write({
|
||||||
event: 'data',
|
event: 'data',
|
||||||
data: formatted,
|
data: formatted,
|
||||||
@ -284,6 +293,10 @@ export class CrawlerHost extends RPCHost {
|
|||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
|
||||||
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
|
}
|
||||||
|
|
||||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
if (crawlerOptions.isEarlyReturnApplicable()) {
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
@ -302,6 +315,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
|
}
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
@ -321,6 +337,9 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
|
}
|
||||||
|
|
||||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
if (crawlerOptions.isEarlyReturnApplicable()) {
|
||||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
@ -349,6 +368,10 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
|
}
|
||||||
|
|
||||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
|
@ -156,6 +156,11 @@ const IMAGE_RETENTION_MODE_VALUES = new Set<string>(IMAGE_RETENTION_MODES);
|
|||||||
description: 'Specify referer for the page.',
|
description: 'Specify referer for the page.',
|
||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Token-Budget': {
|
||||||
|
description: 'Specify a budget in tokens.\n\nIf the resulting token cost exceeds the budget, the request is rejected.',
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -271,6 +276,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
referer?: string;
|
referer?: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
tokenBudget?: number;
|
||||||
|
|
||||||
static override from(input: any) {
|
static override from(input: any) {
|
||||||
const instance = super.from(input) as CrawlerOptions;
|
const instance = super.from(input) as CrawlerOptions;
|
||||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||||
@ -387,6 +395,13 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const tokenBudget = ctx?.req.get('x-token-budget') || undefined;
|
||||||
|
instance.tokenBudget ??= parseInt(tokenBudget || '') || undefined;
|
||||||
|
|
||||||
|
if (instance.cacheTolerance) {
|
||||||
|
instance.cacheTolerance = instance.cacheTolerance * 1000;
|
||||||
|
}
|
||||||
|
|
||||||
return instance;
|
return instance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 7b3412e64166599429fa38094f4abd071a15fcd6
|
Subproject commit 98e9bf19bc6859c79eff516275cf1120e59e47bf
|
Loading…
x
Reference in New Issue
Block a user