feat: gemini to replace blip2 (#1129)

* feat: domain profile * fix * fix * fix * fix * fix * refactor: curl as direct engine * fix * wip * fix * fix * fix * fix * fix --------- Co-authored-by: Sha Zhou <sha.zhou@jina.ai>
2025-08-18 17:55:54 +08:00 · 2025-01-15 15:03:46 +08:00 · 2025-01-15 15:03:46 +08:00 · 51a4877933
commit 51a4877933
parent c19ba65391
5 changed files with 166 additions and 27 deletions
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@ -23,6 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
 import { JSDomControl } from '../services/jsdom';
 import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
 import { CurlControl } from '../services/curl';
+import { VlmControl } from '../services/vlm';

 export interface ExtraScrappingOptions extends ScrappingOptions {
    withIframe?: boolean | 'quoted';
@ -57,6 +58,7 @@ export class CrawlerHost extends RPCHost {
        protected globalLogger: Logger,
        protected puppeteerControl: PuppeteerControl,
        protected curlControl: CurlControl,
+        protected vlmControl: VlmControl,
        protected jsdomControl: JSDomControl,
        protected snapshotFormatter: SnapshotFormatter,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
@ -281,7 +283,7 @@ export class CrawlerHost extends RPCHost {
                        continue;
                    }

-                    const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
+                    const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
                    chargeAmount = this.assignChargeAmount(formatted);
                    if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                        throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -311,24 +313,25 @@ export class CrawlerHost extends RPCHost {
        if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
            for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
                lastScrapped = scrapped;
+                if (!crawlerOptions.isEarlyReturnApplicable()) {
+                    continue;
+                }
                if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
                    continue;
                }

-                const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
+                const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
                chargeAmount = this.assignChargeAmount(formatted);

                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                    throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
                }

-                if (crawlerOptions.isEarlyReturnApplicable()) {
-                    return formatted;
+                if (scrapped?.pdfs?.length && !chargeAmount) {
+                    continue;
                }

-                if (chargeAmount && scrapped?.pdfs?.length) {
-                    return formatted;
-                }
+                return formatted;
            }

            if (!lastScrapped) {
@ -338,7 +341,7 @@ export class CrawlerHost extends RPCHost {
                throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
            }

-            const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
+            const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
            chargeAmount = this.assignChargeAmount(formatted);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -356,32 +359,35 @@ export class CrawlerHost extends RPCHost {

        for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
            lastScrapped = scrapped;
+
+            if (!crawlerOptions.isEarlyReturnApplicable()) {
+                continue;
+            }
+
            if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
                continue;
            }

-            const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
+            const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
            chargeAmount = this.assignChargeAmount(formatted);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
            }

-            if (crawlerOptions.isEarlyReturnApplicable()) {
-                if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
+            if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {

-                    return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
-                        { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
-                    );
-                }
-                if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
-
-                    return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
-                        { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
-                    );
-                }
-
-                return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
+                return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
+                    { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
+                );
            }
+            if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
+
+                return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
+                    { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
+                );
+            }
+
+            return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
        }

        if (!lastScrapped) {
@ -391,7 +397,7 @@ export class CrawlerHost extends RPCHost {
            throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
        }

-        const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
+        const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
        chargeAmount = this.assignChargeAmount(formatted);
        if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
            throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -619,6 +625,14 @@ export class CrawlerHost extends RPCHost {
            return;
        }

+        if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
+            const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
+
+            yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
+
+            return;
+        }
+
        let cache;

        if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
@ -765,6 +779,10 @@ export class CrawlerHost extends RPCHost {
            crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
        }

+        if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
+            crawlOpts.favorScreenshot = true;
+        }
+
        if (opts.injectFrameScript?.length) {
            crawlOpts.injectFrameScripts = (await Promise.all(
                opts.injectFrameScript.map((x) => {
@ -792,6 +810,59 @@ export class CrawlerHost extends RPCHost {
        return crawlOpts;
    }

+    formatSnapshot( // Builds the response payload for a snapshot: VLM-engine requests are formatted inline, everything else is delegated to snapshotFormatter.
+        crawlerOptions: CrawlerOptions,
+        snapshot: PageSnapshot & {
+            screenshotUrl?: string;
+            pageshotUrl?: string;
+        },
+        nominalUrl?: URL, // only forwarded to snapshotFormatter; unused on the VLM path
+        urlValidMs?: number // only forwarded to snapshotFormatter; unused on the VLM path
+    ) {
+        if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) { // requested engine is lower-cased before the comparison
+            const output: FormattedPage = {
+                title: snapshot.title,
+                content: snapshot.parsed?.textContent, // NOTE(review): presumably the model-generated markdown on this path - confirm against VlmControl
+                url: snapshot.href,
+                pageshotUrl: snapshot.pageshotUrl,
+                [Symbol.dispose]: () => undefined, // no-op disposer
+            };
+
+            Object.defineProperty(output, 'textRepresentation', { // same text as `content`; the plain-text response paths read formatted.textRepresentation
+                value: snapshot.parsed?.textContent,
+                enumerable: false, // kept out of key enumeration, so JSON.stringify skips it
+            });
+
+            return output; // returned synchronously here; the call sites visible in this file `await` the result either way
+        }
+
+        return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
+    }
+
+    async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> { // Drains cachedScrap and resolves with the last snapshot it yielded.
+        const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions); // engine is overridden to BROWSER regardless of what opts carried
+
+        let lastSnapshot;
+        let lastError;
+        try {
+            for await (const x of it) {
+                lastSnapshot = x; // keep only the most recent snapshot
+            }
+        } catch (err) {
+            lastError = err; // captured, not thrown yet: a snapshot obtained before the failure is still usable
+        }
+
+        if (!lastSnapshot && lastError) {
+            throw lastError; // nothing usable was produced, so surface the original failure
+        }
+
+        if (!lastSnapshot) {
+            throw new AssertionFailureError(`No content available`); // iterator finished without error and without a usable snapshot
+        }
+
+        return lastSnapshot; // always truthy here; the `undefined` in the declared return type is never produced (an error after a snapshot is silently dropped)
+    }
+
    async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
        const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });

--- a/backend/functions/src/dto/scrapping-options.ts
+++ b/backend/functions/src/dto/scrapping-options.ts
@ -461,6 +461,9 @@ export class CrawlerOptions extends AutoCastable {
        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
            return false;
        }
+        if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
+            return false;
+        }

        return true;
    }
--- a/backend/functions/src/services/alt-text.ts
+++ b/backend/functions/src/services/alt-text.ts
@ -33,9 +33,10 @@ export class AltTextService extends AsyncService {
            const resized = this.canvasService.fitImageToSquareBox(img, 1024);
            const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');

-            const r = await this.imageInterrogator.interrogate('blip2', {
+            const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', {
                image: exported,
-                // prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
+                prompt: `Yield a concise image caption sentence in third person.`,
+                system: 'You are BLIP2, an image caption model.',
            });

            return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
--- a/backend/functions/src/services/vlm.ts
+++ b/backend/functions/src/services/vlm.ts
@ -0,0 +1,64 @@
+import { AsyncService } from 'civkit/async-service';
+import { singleton } from 'tsyringe';
+
+import { PageSnapshot } from './puppeteer';
+import { Logger } from '../shared/services/logger';
+import _ from 'lodash';
+import { AssertionFailureError } from 'civkit';
+import { LLMManager } from '../shared/services/common-llm';
+
+@singleton()
+export class VlmControl extends AsyncService { // Service that converts a browser page snapshot into text by streaming its page screenshot through an LLM.
+
+    logger = this.globalLogger.child({ service: this.constructor.name }); // child logger tagged with this class's name
+
+    constructor(
+        protected globalLogger: Logger,
+        protected commonLLM: LLMManager
+    ) {
+        super(...arguments); // forwards the constructor arguments to AsyncService
+    }
+
+    override async init() { // waits for dependencyReady(), then emits 'ready'
+        await this.dependencyReady();
+
+        this.emit('ready');
+    }
+
+    async* fromBrowserSnapshot(snapshot?: PageSnapshot & { // Async generator: yields one snapshot copy per streamed chunk, each carrying the text accumulated so far.
+        pageshotUrl?: string,
+    }) {
+        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot; // prefer the URL form; fall back to snapshot.pageshot
+
+        if (!pageshot) {
+            throw new AssertionFailureError('Screenshot of the page is not available'); // also reached when snapshot itself is undefined
+        }
+
+        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', { // model id is hard-coded
+            prompt: [
+                typeof pageshot === 'string' ? new URL(pageshot) : pageshot, // string -> URL object; a non-string pageshot is passed through as-is (presumably binary image data - TODO confirm)
+                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
+            ],
+
+            options: {
+                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
+                stream: true // the result is consumed chunk by chunk in the loop below
+            }
+        });
+
+        const chunks: string[] = [];
+        for await (const txt of it) {
+            chunks.push(txt);
+            const output: PageSnapshot = { // shallow copy of the input snapshot; only `parsed` is replaced
+                ...snapshot,
+                parsed: {
+                    ...snapshot?.parsed,
+                    textContent: chunks.join(''), // full text so far, not just the delta. NOTE(review): re-joined on every chunk; a running string would avoid the repeated work
+                }
+            };
+            yield output;
+        }
+
+        return; // generator completes without a return value
+    }
+}
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 439f633d464f3fd5fe288313766a43163190b60f
+Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0