mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 07:35:57 +08:00
fix: readerlm params
This commit is contained in:
parent
f3654a786b
commit
f7f6a98839
@ -338,7 +338,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -379,7 +379,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||||
|
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
@ -405,7 +405,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -434,7 +434,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -466,7 +466,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs, crawlOpts);
|
||||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
chargeAmount = this.assignChargeAmount(formatted, crawlerOptions);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -674,7 +674,10 @@ export class CrawlerHost extends RPCHost {
|
|||||||
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
||||||
...crawlOpts,
|
...crawlOpts,
|
||||||
engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
|
engine: crawlOpts?.engine || ENGINE_TYPE.AUTO,
|
||||||
}, crawlerOpts);
|
}, CrawlerOptions.from({
|
||||||
|
...crawlerOpts,
|
||||||
|
respondWith: 'html',
|
||||||
|
}));
|
||||||
|
|
||||||
if (!finalAutoSnapshot?.html) {
|
if (!finalAutoSnapshot?.html) {
|
||||||
throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
|
throw new AssertionFailureError(`Unexpected non HTML content for ReaderLM: ${urlToCrawl}`);
|
||||||
@ -890,7 +893,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
|
assignChargeAmount(formatted: FormattedPage, crawlerOptions?: CrawlerOptions) {
|
||||||
if (!formatted) {
|
if (!formatted) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
@ -898,7 +901,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
let amount = 0;
|
let amount = 0;
|
||||||
if (formatted.content) {
|
if (formatted.content) {
|
||||||
const x1 = estimateToken(formatted.content);
|
const x1 = estimateToken(formatted.content);
|
||||||
if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
|
if (crawlerOptions?.respondWith?.toLowerCase().includes('lm')) {
|
||||||
amount += x1 * 2;
|
amount += x1 * 2;
|
||||||
}
|
}
|
||||||
amount += x1;
|
amount += x1;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user