feat: new lm engine

This commit is contained in:
yanlong.wang 2025-01-15 17:38:49 +08:00
parent 51a4877933
commit 06f359309e
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
5 changed files with 193 additions and 80 deletions

View File

@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
import { JSDomControl } from '../services/jsdom'; import { JSDomControl } from '../services/jsdom';
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter'; import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
import { CurlControl } from '../services/curl'; import { CurlControl } from '../services/curl';
import { VlmControl } from '../services/vlm'; import { LmControl } from '../services/lm';
export interface ExtraScrappingOptions extends ScrappingOptions { export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted'; withIframe?: boolean | 'quoted';
@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost {
protected globalLogger: Logger, protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl, protected puppeteerControl: PuppeteerControl,
protected curlControl: CurlControl, protected curlControl: CurlControl,
protected vlmControl: VlmControl, protected lmControl: LmControl,
protected jsdomControl: JSDomControl, protected jsdomControl: JSDomControl,
protected snapshotFormatter: SnapshotFormatter, protected snapshotFormatter: SnapshotFormatter,
protected firebaseObjectStorage: FirebaseStorageBucketControl, protected firebaseObjectStorage: FirebaseStorageBucketControl,
@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost {
} }
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted); chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
} }
@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost {
} }
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted); chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost {
} }
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted); chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
} }
@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost {
} }
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted); chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
} }
@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost {
} }
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted); chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
} }
@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost {
return; return;
} }
if (crawlOpts?.engine === ENGINE_TYPE.VLM) { // if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts); // const rmSelectorEquivalent = [];
// if (typeof crawlOpts.removeSelector === 'string') {
// rmSelectorEquivalent.push(crawlOpts.removeSelector);
// } else if (Array.isArray(crawlOpts.removeSelector)) {
// rmSelectorEquivalent.push(...crawlOpts.removeSelector);
// }
// rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot); // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
// ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
// }, crawlerOpts);
// yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
// return;
// }
// ReaderLM engine: take a snapshot of the page with noisy elements removed,
// then stream the language model's output derived from that snapshot.
if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
    // Caller-supplied selectors first, then the elements that are always stripped
    // before the HTML is handed to the model.
    const rmSelectorEquivalent = [];
    if (typeof crawlOpts.removeSelector === 'string') {
        rmSelectorEquivalent.push(crawlOpts.removeSelector);
    } else if (Array.isArray(crawlOpts.removeSelector)) {
        rmSelectorEquivalent.push(...crawlOpts.removeSelector);
    }
    rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');

    // NOTE(review): `engine` is cleared here, presumably so getFinalSnapshot falls back
    // to its default engine selection instead of re-entering this branch — confirm.
    const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
        ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
    }, crawlerOpts);

    if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
        const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
        // readerLMFromSnapshot is declared as (schema, instruction, snapshot): the schema
        // goes first. Passing the instruction first would send it to the model as the
        // JSON schema and the schema as the instruction.
        yield* this.lmControl.readerLMFromSnapshot(jsonSchema, crawlerOpts.instruction, finalAutoSnapshot);
        return;
    }

    // No instruction and no schema: plain HTML-to-Markdown conversion.
    yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
return; return;
} }
@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost {
} }
} }
assignChargeAmount(formatted: FormattedPage) { assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
if (!formatted) { if (!formatted) {
return 0; return 0;
} }
let amount = 0; let amount = 0;
if (formatted.content) { if (formatted.content) {
amount += estimateToken(formatted.content); const x1 = estimateToken(formatted.content);
if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
amount += x1 * 2;
}
amount += x1;
} else if (formatted.description) { } else if (formatted.description) {
amount += estimateToken(formatted.description); amount += estimateToken(formatted.description);
} }
@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost {
nominalUrl?: URL, nominalUrl?: URL,
urlValidMs?: number urlValidMs?: number
) { ) {
if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) { if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
const output: FormattedPage = { const output: FormattedPage = {
title: snapshot.title, title: snapshot.title,
content: snapshot.parsed?.textContent, content: snapshot.parsed?.textContent,

View File

@ -15,6 +15,7 @@ export enum ENGINE_TYPE {
BROWSER = 'browser', BROWSER = 'browser',
DIRECT = 'direct', DIRECT = 'direct',
VLM = 'vlm', VLM = 'vlm',
READER_LM = 'readerlm-v2',
} }
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT)); const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
@ -188,7 +189,7 @@ class Viewport extends AutoCastable {
schema: { type: 'string' } schema: { type: 'string' }
}, },
'X-Engine': { 'X-Engine': {
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm', description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2',
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
}, },
@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable {
@Prop() @Prop()
viewport?: Viewport; viewport?: Viewport;
@Prop()
instruction?: string;
@Prop()
jsonSchema?: object;
static override from(input: any) { static override from(input: any) {
const instance = super.from(input) as CrawlerOptions; const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable {
if (this.injectFrameScript?.length || this.injectPageScript?.length) { if (this.injectFrameScript?.length || this.injectPageScript?.length) {
return false; return false;
} }
if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) { if (this.engine?.toLowerCase().includes('lm')) {
return false; return false;
} }

View File

@ -0,0 +1,131 @@
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { PageSnapshot } from './puppeteer';
import { Logger } from '../shared/services/logger';
import _ from 'lodash';
import { AssertionFailureError } from 'civkit';
import { LLMManager } from '../shared/services/common-llm';
const tripleBackTick = '```';
/**
 * Language-model backed converters that turn a page snapshot into text.
 *
 * Every public generator streams the model output: for each chunk received it yields
 * a copy of the input snapshot whose `parsed.textContent` holds all text received so far.
 */
@singleton()
export class LmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Collapses runs of blank lines.
     *
     * A line is kept when it has non-whitespace content, or when it is blank but the
     * line directly before it has content. Leading blank lines and every blank line
     * after the first one in a run are dropped. Line endings are normalised to `\n`.
     *
     * @param text Raw text, typically HTML.
     * @returns The text with redundant blank lines removed.
     */
    cleanRedundantEmptyLines(text: string) {
        const lines = text.split(/\r?\n/g);
        const mappedFlag = lines.map((line) => Boolean(line.trim()));

        return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
    }

    /**
     * Shared streaming loop: consumes text chunks and yields progressively more
     * complete snapshots.
     *
     * @param snapshot Snapshot to copy into every yielded value.
     * @param textStream Chunks of model output, in order.
     * @returns An async generator yielding one snapshot per received chunk.
     */
    protected async* streamIntoSnapshot(snapshot: PageSnapshot, textStream: AsyncIterable<string> | Iterable<string>) {
        // Keep a running string rather than re-joining every chunk on each iteration.
        let textContent = '';
        for await (const txt of textStream) {
            textContent += txt;
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot.parsed,
                    textContent,
                }
            };
            yield output;
        }
    }

    /**
     * Converts a page to Markdown with the `vertex-gemini-1.5-flash-002` model, giving
     * it both the cleaned HTML and a screenshot of the page.
     *
     * @param snapshot Page snapshot; must carry a screenshot in `pageshotUrl` or `pageshot`.
     * @throws AssertionFailureError when no snapshot or no screenshot is available.
     */
    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        // Checking `snapshot` explicitly lets the compiler narrow it for the accesses below;
        // a missing snapshot raises the same error as a missing screenshot.
        if (!snapshot || !pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
                // A string screenshot is treated as a URL; anything else is passed through as-is.
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        yield* this.streamIntoSnapshot(snapshot, it);
    }

    /**
     * Converts the snapshot's HTML to Markdown with the `readerlm-v2` model.
     *
     * @param snapshot Page snapshot whose `html` is sent to the model.
     * @throws AssertionFailureError when no snapshot is given.
     */
    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,

            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });

        yield* this.streamIntoSnapshot(snapshot, it);
    }

    /**
     * Runs the `readerlm-v2` model on the snapshot's HTML with a custom instruction
     * and, optionally, a JSON schema appended to the prompt.
     *
     * Mind the parameter order: the schema comes FIRST, the instruction second.
     *
     * @param schema JSON schema as text; when present it is appended to the prompt in a fenced `json` block.
     * @param instruction Instruction placed at the top of the prompt; a generic
     *   "infer useful information" instruction is used when `undefined` is passed.
     * @param snapshot Page snapshot whose `html` is sent to the model.
     * @throws AssertionFailureError when no snapshot is given.
     */
    async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });

        yield* this.streamIntoSnapshot(snapshot, it);
    }
}

View File

@ -1,64 +0,0 @@
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { PageSnapshot } from './puppeteer';
import { Logger } from '../shared/services/logger';
import _ from 'lodash';
import { AssertionFailureError } from 'civkit';
import { LLMManager } from '../shared/services/common-llm';
/**
 * Screenshot-to-Markdown converter backed by a vision language model.
 */
@singleton()
export class VlmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Streams a Markdown rendition of the page screenshot.
     *
     * For every chunk produced by the model, yields a copy of the snapshot whose
     * `parsed.textContent` contains all text received so far.
     *
     * @param snapshot Page snapshot carrying a screenshot in `pageshotUrl` or `pageshot`.
     * @throws AssertionFailureError when no screenshot is available.
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const screenshot = snapshot?.pageshotUrl || snapshot?.pageshot;
        if (!screenshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        // A string screenshot is treated as a URL; anything else is handed over unchanged.
        const image = typeof screenshot === 'string' ? new URL(screenshot) : screenshot;

        const stream = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                image,
                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],
            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        const received: string[] = [];
        for await (const piece of stream) {
            received.push(piece);

            const partial: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: received.join(''),
                }
            };
            yield partial;
        }
    }
}

@ -1 +1 @@
Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0 Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f