feat: gemini to replace blip2 (#1129)

* feat: domain profile * fix * fix * fix * fix * fix * refactor: curl as direct engine * fix * wip * fix * fix * fix * fix * fix --------- Co-authored-by: Sha Zhou <sha.zhou@jina.ai>
2025-08-19 05:05:59 +08:00 · 2025-01-15 15:03:46 +08:00 · 2025-01-15 15:03:46 +08:00 · 51a4877933
commit 51a4877933
parent c19ba65391
5 changed files with 166 additions and 27 deletions
--- a/backend/functions/src/cloud-functions/crawler.ts
+++ b/backend/functions/src/cloud-functions/crawler.ts
@ -23,6 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
 import { JSDomControl } from '../services/jsdom';
 import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
 import { CurlControl } from '../services/curl';
 import { VlmControl } from '../services/vlm';
 export interface ExtraScrappingOptions extends ScrappingOptions {
    withIframe?: boolean | 'quoted';
@ -57,6 +58,7 @@ export class CrawlerHost extends RPCHost {
        protected globalLogger: Logger,
        protected puppeteerControl: PuppeteerControl,
        protected curlControl: CurlControl,
        protected vlmControl: VlmControl,
        protected jsdomControl: JSDomControl,
        protected snapshotFormatter: SnapshotFormatter,
        protected firebaseObjectStorage: FirebaseStorageBucketControl,
@ -281,7 +283,7 @@ export class CrawlerHost extends RPCHost {
                        continue;
                    }
-                    const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
+                    const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
                    chargeAmount = this.assignChargeAmount(formatted);
                    if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                        throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -311,24 +313,25 @@ export class CrawlerHost extends RPCHost {
        if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
            for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
                lastScrapped = scrapped;
                if (!crawlerOptions.isEarlyReturnApplicable()) {
                    continue;
                }
                if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
                    continue;
                }
-                const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
+                const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
                chargeAmount = this.assignChargeAmount(formatted);
                if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                    throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
                }
-                if (crawlerOptions.isEarlyReturnApplicable()) {
+                if (scrapped?.pdfs?.length && !chargeAmount) {
-                    return formatted;
+                    continue;
                }
-                if (chargeAmount && scrapped?.pdfs?.length) {
+                return formatted;
                    return formatted;
                }
            }
            if (!lastScrapped) {
@ -338,7 +341,7 @@ export class CrawlerHost extends RPCHost {
                throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
            }
-            const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
+            const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
            chargeAmount = this.assignChargeAmount(formatted);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -356,32 +359,35 @@ export class CrawlerHost extends RPCHost {
        for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
            lastScrapped = scrapped;
            if (!crawlerOptions.isEarlyReturnApplicable()) {
                continue;
            }
            if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
                continue;
            }
-            const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
+            const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
            chargeAmount = this.assignChargeAmount(formatted);
            if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
                throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
            }
-            if (crawlerOptions.isEarlyReturnApplicable()) {
+            if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
                if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
-                    return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
+                return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
-                        { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
+                    { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
-                    );
+                );
                }
                if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
                    return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
                        { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
                    );
                }
                return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
            }
            if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
                return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
                    { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
                );
            }
            return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
        }
        if (!lastScrapped) {
@ -391,7 +397,7 @@ export class CrawlerHost extends RPCHost {
            throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
        }
-        const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
+        const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
        chargeAmount = this.assignChargeAmount(formatted);
        if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
            throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -619,6 +625,14 @@ export class CrawlerHost extends RPCHost {
            return;
        }
        if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
            const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
            yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
            return;
        }
        let cache;
        if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
@ -765,6 +779,10 @@ export class CrawlerHost extends RPCHost {
            crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
        }
        if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
            crawlOpts.favorScreenshot = true;
        }
        if (opts.injectFrameScript?.length) {
            crawlOpts.injectFrameScripts = (await Promise.all(
                opts.injectFrameScript.map((x) => {
@ -792,6 +810,59 @@ export class CrawlerHost extends RPCHost {
        return crawlOpts;
    }
    /**
     * Formats a page snapshot into the payload returned to the client.
     *
     * For every engine except the VLM engine this simply delegates to
     * `snapshotFormatter.formatSnapshot`, forwarding `crawlerOptions.respondWith`.
     *
     * For the VLM engine the payload is assembled directly from the snapshot:
     * `title`, `content` (the snapshot's parsed text content), `url` and
     * `pageshotUrl` are copied over, a no-op `Symbol.dispose` is attached, and a
     * non-enumerable `textRepresentation` property carries the text.
     *
     * @param crawlerOptions - Request options; `engine` selects the branch and
     *   `respondWith` is forwarded to the regular formatter.
     * @param snapshot - Snapshot to format, optionally carrying uploaded
     *   screenshot/pageshot URLs.
     * @param nominalUrl - Forwarded to the regular formatter; unused for VLM.
     * @param urlValidMs - Forwarded to the regular formatter; unused for VLM.
     * @returns The VLM payload, or whatever the regular formatter returns.
     */
    formatSnapshot(
        crawlerOptions: CrawlerOptions,
        snapshot: PageSnapshot & {
            screenshotUrl?: string;
            pageshotUrl?: string;
        },
        nominalUrl?: URL,
        urlValidMs?: number
    ) {
        if (crawlerOptions.engine?.toLowerCase() !== ENGINE_TYPE.VLM) {
            return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
        }

        const text = snapshot.parsed?.textContent;
        const output: FormattedPage = {
            title: snapshot.title,
            content: text,
            url: snapshot.href,
            pageshotUrl: snapshot.pageshotUrl,
            [Symbol.dispose]: () => undefined,
        };
        Object.defineProperty(output, 'textRepresentation', {
            // Callers interpolate this into a template string; fall back to an
            // empty string so a missing text content never renders as "undefined".
            value: text ?? '',
            enumerable: false,
        });

        return output;
    }
    /**
     * Drains `cachedScrap` for `url` with the engine forced to
     * `ENGINE_TYPE.BROWSER` and resolves with the last snapshot it yielded.
     *
     * An error raised while iterating is only surfaced when no snapshot was
     * yielded at all; if at least one snapshot was seen, that snapshot is
     * returned and the error is dropped.
     *
     * Note: although the declared return type includes `undefined`, every path
     * below either returns a snapshot or throws.
     *
     * @param url - Page to scrape.
     * @param opts - Scrapping options; `engine` is overridden.
     * @param crawlerOptions - Passed through to `cachedScrap` unchanged.
     * @throws The iteration error when nothing was yielded before it occurred.
     * @throws AssertionFailureError when iteration finished without any snapshot.
     */
    async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
        const snapshots = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions);

        let latest;
        let failure;
        try {
            for await (const current of snapshots) {
                latest = current;
            }
        } catch (err) {
            failure = err;
        }

        // A usable snapshot wins over any error seen afterwards.
        if (latest) {
            return latest;
        }
        if (failure) {
            throw failure;
        }

        throw new AssertionFailureError(`No content available`);
    }
    async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
        const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
--- a/backend/functions/src/dto/scrapping-options.ts
+++ b/backend/functions/src/dto/scrapping-options.ts
@ -461,6 +461,9 @@ export class CrawlerOptions extends AutoCastable {
        if (this.injectFrameScript?.length || this.injectPageScript?.length) {
            return false;
        }
        if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
            return false;
        }
        return true;
    }
--- a/backend/functions/src/services/alt-text.ts
+++ b/backend/functions/src/services/alt-text.ts
@ -33,9 +33,10 @@ export class AltTextService extends AsyncService {
            const resized = this.canvasService.fitImageToSquareBox(img, 1024);
            const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
-            const r = await this.imageInterrogator.interrogate('blip2', {
+            const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', {
                image: exported,
-                // prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
+                prompt: `Yield a concise image caption sentence in third person.`,
                system: 'You are BLIP2, an image caption model.',
            });
            return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
--- a/backend/functions/src/services/vlm.ts
+++ b/backend/functions/src/services/vlm.ts
@ -0,0 +1,64 @@
 import { AsyncService } from 'civkit/async-service';
 import { singleton } from 'tsyringe';
 import { PageSnapshot } from './puppeteer';
 import { Logger } from '../shared/services/logger';
 import _ from 'lodash';
 import { AssertionFailureError } from 'civkit';
 import { LLMManager } from '../shared/services/common-llm';
/**
 * Service that converts a browser page snapshot into text by sending the
 * page image to the `vertex-gemini-1.5-flash-002` model through `LLMManager`.
 */
@singleton()
export class VlmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for dependencies, then signals readiness by emitting `ready`. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Streams the model's output for the snapshot's page image.
     *
     * The image is taken from `snapshot.pageshotUrl`, falling back to
     * `snapshot.pageshot`; a string value is wrapped in a `URL` before being
     * placed in the prompt. For every chunk received, a copy of the snapshot is
     * yielded whose `parsed.textContent` holds all chunks received so far,
     * concatenated.
     *
     * @param snapshot - Browser snapshot, optionally carrying a `pageshotUrl`.
     * @throws AssertionFailureError when neither a pageshot URL nor a pageshot
     *   is present on the snapshot.
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageImage = snapshot?.pageshotUrl || snapshot?.pageshot;
        if (!pageImage) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }
        const imageInput = typeof pageImage === 'string' ? new URL(pageImage) : pageImage;

        const stream = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                imageInput,
                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],
            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        const received: string[] = [];
        for await (const piece of stream) {
            received.push(piece);
            // Each yielded snapshot carries the full text accumulated so far,
            // not just the newest chunk.
            const partial: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: received.join(''),
                }
            };
            yield partial;
        }
    }
}
--- a/2
+++ b/2
@ -1 +1 @@
-Subproject commit 439f633d464f3fd5fe288313766a43163190b60f
+Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
		`@ -1 +1 @@`
			`Subproject commit 439f633d464f3fd5fe288313766a43163190b60f`				`Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0`