feat: new lm engine

2025-08-18 19:15:56 +08:00 · 2025-01-15 17:38:49 +08:00 · 2025-01-15 17:38:49 +08:00 · 06f359309e
commit 06f359309e
parent 51a4877933
5 changed files with 193 additions and 80 deletions
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
 import { JSDomControl } from '../services/jsdom';
 import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
 import { CurlControl } from '../services/curl';
-import { VlmControl } from '../services/vlm';
+import { LmControl } from '../services/lm';

 export interface ExtraScrappingOptions extends ScrappingOptions {
    withIframe?: boolean | 'quoted';
@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost {
        protected globalLogger: Logger,
        protected puppeteerControl: PuppeteerControl,
        protected curlControl: CurlControl,
-        protected vlmControl: VlmControl,
+        protected lmControl: LmControl,
        protected jsdomControl: JSDomControl,
        protected snapshotFormatter: SnapshotFormatter,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost {
                    }

                    const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
-                    chargeAmount = this.assignChargeAmount(formatted);
+                    chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
                    if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                        throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
                    }
@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost {
                }

                const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
-                chargeAmount = this.assignChargeAmount(formatted);
+                chargeAmount = this.assignChargeAmount(formatted, crawlOpts);

                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                    throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost {
            }

            const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
-            chargeAmount = this.assignChargeAmount(formatted);
+            chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
            }
@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost {
            }

            const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
-            chargeAmount = this.assignChargeAmount(formatted);
+            chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
            }
@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost {
        }

        const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
-        chargeAmount = this.assignChargeAmount(formatted);
+        chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
        if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
            throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
        }
@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost {
            return;
        }

-        if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
-            const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
+        // if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
+        //     const rmSelectorEquivalent = [];
+        //     if (typeof crawlOpts.removeSelector === 'string') {
+        //         rmSelectorEquivalent.push(crawlOpts.removeSelector);
+        //     } else if (Array.isArray(crawlOpts.removeSelector)) {
+        //         rmSelectorEquivalent.push(...crawlOpts.removeSelector);
+        //     }
+        //     rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');

-            yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
+        //     const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
+        //         ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
+        //     }, crawlerOpts);
+
+        //     yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
+
+        //     return;
+        // }
+
+        if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
+            const rmSelectorEquivalent = [];
+            if (typeof crawlOpts.removeSelector === 'string') {
+                rmSelectorEquivalent.push(crawlOpts.removeSelector);
+            } else if (Array.isArray(crawlOpts.removeSelector)) {
+                rmSelectorEquivalent.push(...crawlOpts.removeSelector);
+            }
+            rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');
+
+            const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
+                ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
+            }, crawlerOpts);
+
+            if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
+                const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
+                // Argument order must match readerLMFromSnapshot(schema, instruction, snapshot); an undefined instruction falls back to its default.
+                yield* this.lmControl.readerLMFromSnapshot(jsonSchema, crawlerOpts.instruction, finalAutoSnapshot);
+                return;
+            }
+
+            yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);

            return;
        }
@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost {
        }
    }

-    assignChargeAmount(formatted: FormattedPage) {
+    assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
        if (!formatted) {
            return 0;
        }

        let amount = 0;
        if (formatted.content) {
-            amount += estimateToken(formatted.content);
+            const x1 = estimateToken(formatted.content);
+            if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
+                amount += x1 * 2;
+            }
+            amount += x1;
        } else if (formatted.description) {
            amount += estimateToken(formatted.description);
        }
@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost {
        nominalUrl?: URL,
        urlValidMs?: number
    ) {
-        if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
+        if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
            const output: FormattedPage = {
                title: snapshot.title,
                content: snapshot.parsed?.textContent,
--- a/backend/functions/src/dto/scrapping-options.ts
+++ b/backend/functions/src/dto/scrapping-options.ts
@ -15,6 +15,7 @@ export enum ENGINE_TYPE {
    BROWSER = 'browser',
    DIRECT = 'direct',
    VLM = 'vlm',
+    READER_LM = 'readerlm-v2',
 }

 const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
@ -188,7 +189,7 @@ class Viewport extends AutoCastable {
                    schema: { type: 'string' }
                },
                'X-Engine': {
-                    description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
+                    description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2',
                    in: 'header',
                    schema: { type: 'string' }
                },
@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable {
    @Prop()
    viewport?: Viewport;

+    @Prop()
+    instruction?: string;
+
+    @Prop()
+    jsonSchema?: object;
+
    static override from(input: any) {
        const instance = super.from(input) as CrawlerOptions;
        const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable {
        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
            return false;
        }
-        if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
+        if (this.engine?.toLowerCase().includes('lm')) {
            return false;
        }

--- a/backend/functions/src/services/lm.ts
+++ b/backend/functions/src/services/lm.ts
@ -0,0 +1,131 @@
+import { AsyncService } from 'civkit/async-service';
+import { singleton } from 'tsyringe';
+
+import { PageSnapshot } from './puppeteer';
+import { Logger } from '../shared/services/logger';
+import _ from 'lodash';
+import { AssertionFailureError } from 'civkit';
+import { LLMManager } from '../shared/services/common-llm';
+
+const tripleBackTick = '```';
+
+@singleton()
+export class LmControl extends AsyncService {
+
+    logger = this.globalLogger.child({ service: this.constructor.name });
+
+    constructor(
+        protected globalLogger: Logger,
+        protected commonLLM: LLMManager
+    ) {
+        super(...arguments);
+    }
+
+    override async init() {
+        await this.dependencyReady();
+
+        this.emit('ready');
+    }
+
+    cleanRedundantEmptyLines(text: string) {
+        const lines = text.split(/\r?\n/g);
+        const mappedFlag = lines.map((line) => Boolean(line.trim()));
+
+        return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
+    }
+
+    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
+        pageshotUrl?: string,
+    }) {
+        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
+
+        if (!pageshot) {
+            throw new AssertionFailureError('Screenshot of the page is not available');
+        }
+
+        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
+            prompt: [
+                `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
+                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
+                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
+            ],
+
+            options: {
+                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
+                stream: true
+            }
+        });
+
+        const chunks: string[] = [];
+        for await (const txt of it) {
+            chunks.push(txt);
+            const output: PageSnapshot = {
+                ...snapshot,
+                parsed: {
+                    ...snapshot?.parsed,
+                    textContent: chunks.join(''),
+                }
+            };
+            yield output;
+        }
+
+        return;
+    }
+
+    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
+        if (!snapshot) {
+            throw new AssertionFailureError('Snapshot of the page is not available');
+        }
+        const it = this.commonLLM.iterRun('readerlm-v2', {
+            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,
+
+            options: {
+                // system: 'You are an AI assistant developed by Jina AI',
+                stream: true
+            }
+        });
+
+        const chunks: string[] = [];
+        for await (const txt of it) {
+            chunks.push(txt);
+            const output: PageSnapshot = {
+                ...snapshot,
+                parsed: {
+                    ...snapshot?.parsed,
+                    textContent: chunks.join(''),
+                }
+            };
+            yield output;
+        }
+
+        return;
+    }
+
+    async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) {
+        if (!snapshot) {
+            throw new AssertionFailureError('Snapshot of the page is not available');
+        }
+        const it = this.commonLLM.iterRun('readerlm-v2', {
+            prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
+            options: {
+                // system: 'You are an AI assistant developed by Jina AI',
+                stream: true
+            }
+        });
+
+        const chunks: string[] = [];
+        for await (const txt of it) {
+            chunks.push(txt);
+            const output: PageSnapshot = {
+                ...snapshot,
+                parsed: {
+                    ...snapshot?.parsed,
+                    textContent: chunks.join(''),
+                }
+            };
+            yield output;
+        }
+
+        return;
+    }
+}
--- a/backend/functions/src/services/vlm.ts
+++ b/backend/functions/src/services/vlm.ts
@ -1,64 +0,0 @@
-import { AsyncService } from 'civkit/async-service';
-import { singleton } from 'tsyringe';
-
-import { PageSnapshot } from './puppeteer';
-import { Logger } from '../shared/services/logger';
-import _ from 'lodash';
-import { AssertionFailureError } from 'civkit';
-import { LLMManager } from '../shared/services/common-llm';
-
-@singleton()
-export class VlmControl extends AsyncService {
-
-    logger = this.globalLogger.child({ service: this.constructor.name });
-
-    constructor(
-        protected globalLogger: Logger,
-        protected commonLLM: LLMManager
-    ) {
-        super(...arguments);
-    }
-
-    override async init() {
-        await this.dependencyReady();
-
-        this.emit('ready');
-    }
-
-    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
-        pageshotUrl?: string,
-    }) {
-        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
-
-        if (!pageshot) {
-            throw new AssertionFailureError('Screenshot of the page is not available');
-        }
-
-        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
-            prompt: [
-                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
-                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
-            ],
-
-            options: {
-                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
-                stream: true
-            }
-        });
-
-        const chunks: string[] = [];
-        for await (const txt of it) {
-            chunks.push(txt);
-            const output: PageSnapshot = {
-                ...snapshot,
-                parsed: {
-                    ...snapshot?.parsed,
-                    textContent: chunks.join(''),
-                }
-            };
-            yield output;
-        }
-
-        return;
-    }
-}
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
+Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f