feat: new lm engine

2025-08-19 02:25:53 +08:00 · 2025-01-15 17:38:49 +08:00 · 2025-01-15 17:38:49 +08:00 · 06f359309e
commit 06f359309e
parent 51a4877933
5 changed files with 193 additions and 80 deletions
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
 import { JSDomControl } from '../services/jsdom';
 import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
 import { CurlControl } from '../services/curl';
-import { VlmControl } from '../services/vlm';
+import { LmControl } from '../services/lm';
 export interface ExtraScrappingOptions extends ScrappingOptions {
    withIframe?: boolean | 'quoted';
@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost {
        protected globalLogger: Logger,
        protected puppeteerControl: PuppeteerControl,
        protected curlControl: CurlControl,
-        protected vlmControl: VlmControl,
+        protected lmControl: LmControl,
        protected jsdomControl: JSDomControl,
        protected snapshotFormatter: SnapshotFormatter,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost {
                    }
                    const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
-                    chargeAmount = this.assignChargeAmount(formatted);
+                    chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
                    if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                        throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
                    }
@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost {
                }
                const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
-                chargeAmount = this.assignChargeAmount(formatted);
+                chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                    throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost {
            }
            const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
-            chargeAmount = this.assignChargeAmount(formatted);
+            chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
            }
@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost {
            }
            const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
-            chargeAmount = this.assignChargeAmount(formatted);
+            chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
            }
@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost {
        }
        const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
-        chargeAmount = this.assignChargeAmount(formatted);
+        chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
        if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
            throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
        }
@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost {
            return;
        }
-        if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
+        // if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
-            const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
+        //     const rmSelectorEquivalent = [];
        //     if (typeof crawlOpts.removeSelector === 'string') {
        //         rmSelectorEquivalent.push(crawlOpts.removeSelector);
        //     } else if (Array.isArray(crawlOpts.removeSelector)) {
        //         rmSelectorEquivalent.push(...crawlOpts.removeSelector);
        //     }
        //     rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
-            yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
+        //     const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
        //         ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
        //     }, crawlerOpts);
        //     yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
        //     return;
        // }
        // READER_LM engine: fetch a snapshot with the engine option reset, then
        // stream the ReaderLM model's output built from that snapshot's HTML.
        if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
            // Caller-supplied selectors go first, followed by a fixed list of
            // elements that are always removed for this engine.
            const rmSelectorEquivalent: string[] = [];
            if (typeof crawlOpts.removeSelector === 'string') {
                rmSelectorEquivalent.push(crawlOpts.removeSelector);
            } else if (Array.isArray(crawlOpts.removeSelector)) {
                rmSelectorEquivalent.push(...crawlOpts.removeSelector);
            }
            rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');
            const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
                ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
            }, crawlerOpts);
            if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
                const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
                // LmControl.readerLMFromSnapshot is declared as
                // (schema, instruction, snapshot): the schema must be passed first.
                // An undefined instruction falls back to the callee's default prompt.
                yield* this.lmControl.readerLMFromSnapshot(jsonSchema, crawlerOpts.instruction, finalAutoSnapshot);
                return;
            }
            // No instruction and no schema: plain HTML-to-Markdown conversion.
            yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
            return;
        }
@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost {
        }
    }
-    assignChargeAmount(formatted: FormattedPage) {
+    assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
        if (!formatted) {
            return 0;
        }
        let amount = 0;
        if (formatted.content) {
-            amount += estimateToken(formatted.content);
+            const x1 = estimateToken(formatted.content);
            if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
                amount += x1 * 2;
            }
            amount += x1;
        } else if (formatted.description) {
            amount += estimateToken(formatted.description);
        }
@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost {
        nominalUrl?: URL,
        urlValidMs?: number
    ) {
-        if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
+        if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
            const output: FormattedPage = {
                title: snapshot.title,
                content: snapshot.parsed?.textContent,
--- a/backend/functions/src/dto/scrapping-options.ts
+++ b/backend/functions/src/dto/scrapping-options.ts
@ -15,6 +15,7 @@ export enum ENGINE_TYPE {
    BROWSER = 'browser',
    DIRECT = 'direct',
    VLM = 'vlm',
    READER_LM = 'readerlm-v2',
 }
 const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
@ -188,7 +189,7 @@ class Viewport extends AutoCastable {
                    schema: { type: 'string' }
                },
                'X-Engine': {
-                    description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
+                    description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2',
                    in: 'header',
                    schema: { type: 'string' }
                },
@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable {
    @Prop()
    viewport?: Viewport;
    @Prop()
    instruction?: string;
    @Prop()
    jsonSchema?: object;
    static override from(input: any) {
        const instance = super.from(input) as CrawlerOptions;
        const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable {
        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
            return false;
        }
-        if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
+        if (this.engine?.toLowerCase().includes('lm')) {
            return false;
        }
--- a/backend/functions/src/services/lm.ts
+++ b/backend/functions/src/services/lm.ts
@ -0,0 +1,131 @@
 import { AsyncService } from 'civkit/async-service';
 import { singleton } from 'tsyringe';
 import { PageSnapshot } from './puppeteer';
 import { Logger } from '../shared/services/logger';
 import _ from 'lodash';
 import { AssertionFailureError } from 'civkit';
 import { LLMManager } from '../shared/services/common-llm';
 const tripleBackTick = '```';
@singleton()
 /**
  * Service that turns page snapshots into text by streaming completions from
  * language models through the shared `LLMManager`.
  *
  * Every generator yields a copy of the input snapshot after each received
  * chunk, with `parsed.textContent` set to all text received so far.
  */
 export class LmControl extends AsyncService {
    logger = this.globalLogger.child({ service: this.constructor.name });
    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }
    /** Waits for injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Drops blank lines except the first blank line that directly follows a
     * non-blank line, and normalises line endings to `\n`.
     *
     * @param text - Text to compact.
     * @returns The compacted text.
     */
    cleanRedundantEmptyLines(text: string) {
        const lines = text.split(/\r?\n/g);
        const mappedFlag = lines.map((line) => Boolean(line.trim()));
        // Keep a line when it has content, or when the previous line had content.
        return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
    }
    /**
     * Shared streaming loop: consumes text chunks and, after each one, yields
     * a copy of `snapshot` whose `parsed.textContent` is the concatenation of
     * every chunk received so far.
     *
     * @param snapshot - Snapshot used as the base of every yielded value.
     * @param it - Stream of text chunks produced by the model.
     */
    protected async* streamTextContent(snapshot: PageSnapshot, it: AsyncIterable<string>) {
        const chunks: string[] = [];
        for await (const txt of it) {
            chunks.push(txt);
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot.parsed,
                    textContent: chunks.join(''),
                }
            };
            yield output;
        }
    }
    /**
     * Streams a Markdown rendition of the page produced by
     * `vertex-gemini-1.5-flash-002` from the snapshot's HTML and screenshot.
     *
     * @param snapshot - Page snapshot; needs `pageshotUrl` or `pageshot`.
     * @throws AssertionFailureError when no snapshot or screenshot is available.
     */
    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
        // The explicit `!snapshot` check narrows the type for the accesses below;
        // a missing snapshot already implied a missing screenshot.
        if (!snapshot || !pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }
        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
                // A string screenshot is treated as a URL; anything else is passed through.
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],
            options: {
                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });
        yield* this.streamTextContent(snapshot, it);
        return;
    }
    /**
     * Streams the `readerlm-v2` model's Markdown conversion of the snapshot's HTML.
     *
     * @param snapshot - Page snapshot whose `html` is sent to the model.
     * @throws AssertionFailureError when no snapshot is given.
     */
    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }
        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,
            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });
        yield* this.streamTextContent(snapshot, it);
        return;
    }
    /**
     * Streams the `readerlm-v2` model's answer to an instruction about the
     * snapshot's HTML, optionally constrained by a JSON schema.
     *
     * Note the parameter order: the schema comes first, then the instruction.
     *
     * @param schema - JSON schema text appended to the prompt; omitted when falsy.
     * @param instruction - Prompt placed before the HTML; has a default.
     * @param snapshot - Page snapshot whose `html` is sent to the model.
     * @throws AssertionFailureError when no snapshot is given.
     */
    async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }
        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });
        yield* this.streamTextContent(snapshot, it);
        return;
    }
 }
--- a/backend/functions/src/services/vlm.ts
+++ b/backend/functions/src/services/vlm.ts
@ -1,64 +0,0 @@
 import { AsyncService } from 'civkit/async-service';
 import { singleton } from 'tsyringe';
 import { PageSnapshot } from './puppeteer';
 import { Logger } from '../shared/services/logger';
 import _ from 'lodash';
 import { AssertionFailureError } from 'civkit';
 import { LLMManager } from '../shared/services/common-llm';
@singleton()
 /**
  * Service that asks a vision-capable model to transcribe a page screenshot
  * into Markdown, streaming partial results as they arrive.
  */
 export class VlmControl extends AsyncService {
    logger = this.globalLogger.child({ service: this.constructor.name });
    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }
    /** Waits for injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }
    /**
     * Streams the model's Markdown transcription of the snapshot's screenshot.
     * After each chunk, yields a copy of the snapshot whose
     * `parsed.textContent` holds all text received so far.
     *
     * @param snapshot - Page snapshot; needs `pageshotUrl` or `pageshot`.
     * @throws AssertionFailureError when no screenshot is available.
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const screenshot = snapshot?.pageshotUrl || snapshot?.pageshot;
        if (!screenshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }
        // A string screenshot is treated as a URL; anything else is passed through.
        const screenshotPart = typeof screenshot === 'string' ? new URL(screenshot) : screenshot;
        const stream = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                screenshotPart,
                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],
            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });
        const pieces: string[] = [];
        for await (const piece of stream) {
            pieces.push(piece);
            const partial: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: pieces.join(''),
                }
            };
            yield partial;
        }
    }
 }
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
+Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f
		`@ -1 +1 @@`
			`Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0`				`Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f`