From 51a48779339185e5ee537582569fdc3480a0e205 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 15 Jan 2025 15:03:46 +0800 Subject: [PATCH] feat: gemini to replace blip2 (#1129) * feat: domain profile * fix * fix * fix * fix * fix * refactor: curl as direct engine * fix * wip * fix * fix * fix * fix * fix --------- Co-authored-by: Sha Zhou --- .../functions/src/cloud-functions/crawler.ts | 119 ++++++++++++++---- .../functions/src/dto/scrapping-options.ts | 3 + backend/functions/src/services/alt-text.ts | 5 +- backend/functions/src/services/vlm.ts | 64 ++++++++++ thinapps-shared | 2 +- 5 files changed, 166 insertions(+), 27 deletions(-) create mode 100644 backend/functions/src/services/vlm.ts diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index aa73681..5a98575 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -23,6 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip- import { JSDomControl } from '../services/jsdom'; import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter'; import { CurlControl } from '../services/curl'; +import { VlmControl } from '../services/vlm'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -57,6 +58,7 @@ export class CrawlerHost extends RPCHost { protected globalLogger: Logger, protected puppeteerControl: PuppeteerControl, protected curlControl: CurlControl, + protected vlmControl: VlmControl, protected jsdomControl: JSDomControl, protected snapshotFormatter: SnapshotFormatter, protected firebaseObjectStorage: FirebaseStorageBucketControl, @@ -281,7 +283,7 @@ export class CrawlerHost extends RPCHost { continue; } - const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -311,24 +313,25 @@ export class CrawlerHost extends RPCHost { if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; + if (!crawlerOptions.isEarlyReturnApplicable()) { + continue; + } if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) { continue; } - const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); } - if (crawlerOptions.isEarlyReturnApplicable()) { - return formatted; + if (scrapped?.pdfs?.length && !chargeAmount) { + continue; } - if (chargeAmount && scrapped?.pdfs?.length) { - return formatted; - } + return formatted; } if (!lastScrapped) { @@ -338,7 +341,7 @@ export class CrawlerHost extends RPCHost { throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -356,32 +359,35 @@ export class CrawlerHost extends RPCHost { for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) { lastScrapped = scrapped; + + if (!crawlerOptions.isEarlyReturnApplicable()) { + continue; + } + if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) { continue; } - const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); } - if (crawlerOptions.isEarlyReturnApplicable()) { - if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { + if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) { - return assignTransferProtocolMeta(`${formatted.textRepresentation}`, - { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } - ); - } - if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { - - return assignTransferProtocolMeta(`${formatted.textRepresentation}`, - { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } - ); - } - - return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null }); + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, + { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } } + ); } + if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) { + + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, + { code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } } + ); + } + + return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null }); } if (!lastScrapped) { @@ -391,7 +397,7 @@ export class CrawlerHost extends RPCHost { throw new AssertionFailureError(`No content available for URL ${targetUrl}`); } - const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs); + const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); chargeAmount = this.assignChargeAmount(formatted); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -619,6 +625,14 @@ export class CrawlerHost extends RPCHost { return; } + if (crawlOpts?.engine === ENGINE_TYPE.VLM) { + const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts); + + yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot); + + return; + } + let cache; if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) { @@ -765,6 +779,10 @@ export class CrawlerHost extends RPCHost { crawlOpts.extraHeaders['Accept-Language'] = opts.locale; } + if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) { + crawlOpts.favorScreenshot = true; + } + if (opts.injectFrameScript?.length) { crawlOpts.injectFrameScripts = (await Promise.all( opts.injectFrameScript.map((x) => { @@ -792,6 +810,59 @@ export class CrawlerHost extends RPCHost { return crawlOpts; } + formatSnapshot( + crawlerOptions: CrawlerOptions, + snapshot: PageSnapshot & { + screenshotUrl?: string; + pageshotUrl?: string; + }, + nominalUrl?: URL, + urlValidMs?: number + ) { + if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) { + const output: FormattedPage = { + title: snapshot.title, + content: snapshot.parsed?.textContent, + url: snapshot.href, + pageshotUrl: snapshot.pageshotUrl, + [Symbol.dispose]: () => undefined, + }; + + Object.defineProperty(output, 'textRepresentation', { + value: snapshot.parsed?.textContent, + enumerable: false, + }); + + return output; + } + + return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs); + } + + async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise { + const it = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions); + + let lastSnapshot; + let lastError; + try { + for await (const x of it) { + lastSnapshot = x; + } + } catch (err) { + lastError = err; + } + + if (!lastSnapshot && lastError) { + throw lastError; + } + + if (!lastSnapshot) { + throw new AssertionFailureError(`No content available`); + } + + return lastSnapshot; + } + async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) { const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 }); diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 88d235f..6b1c41e 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -461,6 +461,9 @@ export class CrawlerOptions extends AutoCastable { if (this.injectFrameScript?.length || this.injectPageScript?.length) { return false; } + if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) { + return false; + } return true; } diff --git a/backend/functions/src/services/alt-text.ts b/backend/functions/src/services/alt-text.ts index 713d380..c074c98 100644 --- a/backend/functions/src/services/alt-text.ts +++ b/backend/functions/src/services/alt-text.ts @@ -33,9 +33,10 @@ export class AltTextService extends AsyncService { const resized = this.canvasService.fitImageToSquareBox(img, 1024); const exported = await this.canvasService.canvasToBuffer(resized, 'image/png'); - const r = await this.imageInterrogator.interrogate('blip2', { + const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', { image: exported, - // prompt: `A formal caption in one sentence, concise and in the third person: HTML alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.` + prompt: `Yield a concise image caption sentence in third person.`, + system: 'You are BLIP2, an image caption model.', }); return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim(); diff --git a/backend/functions/src/services/vlm.ts b/backend/functions/src/services/vlm.ts new file mode 100644 index 0000000..bc0c4f8 --- /dev/null +++ b/backend/functions/src/services/vlm.ts @@ -0,0 +1,64 @@ +import { AsyncService } from 'civkit/async-service'; +import { singleton } from 'tsyringe'; + +import { PageSnapshot } from './puppeteer'; +import { Logger } from '../shared/services/logger'; +import _ from 'lodash'; +import { AssertionFailureError } from 'civkit'; +import { LLMManager } from '../shared/services/common-llm'; + +@singleton() +export class VlmControl extends AsyncService { + + logger = this.globalLogger.child({ service: this.constructor.name }); + + constructor( + protected globalLogger: Logger, + protected commonLLM: LLMManager + ) { + super(...arguments); + } + + override async init() { + await this.dependencyReady(); + + this.emit('ready'); + } + + async* fromBrowserSnapshot(snapshot?: PageSnapshot & { + pageshotUrl?: string, + }) { + const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot; + + if (!pageshot) { + throw new AssertionFailureError('Screenshot of the page is not available'); + } + + const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', { + prompt: [ + typeof pageshot === 'string' ? new URL(pageshot) : pageshot, + `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`, + ], + + options: { + system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed', + stream: true + } + }); + + const chunks: string[] = []; + for await (const txt of it) { + chunks.push(txt); + const output: PageSnapshot = { + ...snapshot, + parsed: { + ...snapshot?.parsed, + textContent: chunks.join(''), + } + }; + yield output; + } + + return; + } +} diff --git a/thinapps-shared b/thinapps-shared index 439f633..a17e580 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 439f633d464f3fd5fe288313766a43163190b60f +Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0