feat: gemini to replace blip2 (#1129)

* feat: domain profile

* fix

* fix

* fix

* fix

* fix

* refactor: curl as direct engine

* fix

* wip

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: Sha Zhou <sha.zhou@jina.ai>
This commit is contained in:
Yanlong Wang 2025-01-15 15:03:46 +08:00 committed by GitHub
parent c19ba65391
commit 51a4877933
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 166 additions and 27 deletions

View File

@ -23,6 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
import { JSDomControl } from '../services/jsdom';
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
import { CurlControl } from '../services/curl';
import { VlmControl } from '../services/vlm';
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted';
@ -57,6 +58,7 @@ export class CrawlerHost extends RPCHost {
protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl,
protected curlControl: CurlControl,
protected vlmControl: VlmControl,
protected jsdomControl: JSDomControl,
protected snapshotFormatter: SnapshotFormatter,
protected firebaseObjectStorage: FirebaseStorageBucketControl,
@ -281,7 +283,7 @@ export class CrawlerHost extends RPCHost {
continue;
}
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -311,24 +313,25 @@ export class CrawlerHost extends RPCHost {
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
if (crawlerOptions.isEarlyReturnApplicable()) {
return formatted;
if (scrapped?.pdfs?.length && !chargeAmount) {
continue;
}
if (chargeAmount && scrapped?.pdfs?.length) {
return formatted;
}
return formatted;
}
if (!lastScrapped) {
@ -338,7 +341,7 @@ export class CrawlerHost extends RPCHost {
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -356,32 +359,35 @@ export class CrawlerHost extends RPCHost {
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
lastScrapped = scrapped;
if (!crawlerOptions.isEarlyReturnApplicable()) {
continue;
}
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
continue;
}
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
}
if (crawlerOptions.isEarlyReturnApplicable()) {
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
);
}
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
);
}
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
}
if (!lastScrapped) {
@ -391,7 +397,7 @@ export class CrawlerHost extends RPCHost {
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
}
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
chargeAmount = this.assignChargeAmount(formatted);
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
@ -619,6 +625,14 @@ export class CrawlerHost extends RPCHost {
return;
}
if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
return;
}
let cache;
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
@ -765,6 +779,10 @@ export class CrawlerHost extends RPCHost {
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
}
if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
crawlOpts.favorScreenshot = true;
}
if (opts.injectFrameScript?.length) {
crawlOpts.injectFrameScripts = (await Promise.all(
opts.injectFrameScript.map((x) => {
@ -792,6 +810,59 @@ export class CrawlerHost extends RPCHost {
return crawlOpts;
}
/**
 * Formats a page snapshot for the response.
 *
 * When the requested engine is VLM, the page is assembled directly from the
 * snapshot: title, `parsed.textContent` as content, href and pageshot URL,
 * plus a non-enumerable `textRepresentation` carrying the same text.
 * For every other engine the work is delegated to `snapshotFormatter`.
 *
 * @param crawlerOptions Request options; `engine` and `respondWith` are read.
 * @param snapshot The snapshot to format.
 * @param nominalUrl Forwarded to `snapshotFormatter` (unused on the VLM path).
 * @param urlValidMs Forwarded to `snapshotFormatter` (unused on the VLM path).
 */
formatSnapshot(
    crawlerOptions: CrawlerOptions,
    snapshot: PageSnapshot & {
        screenshotUrl?: string;
        pageshotUrl?: string;
    },
    nominalUrl?: URL,
    urlValidMs?: number
) {
    const usesVlmEngine = crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM;
    if (!usesVlmEngine) {
        // Regular engines go through the shared formatter.
        return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
    }

    const vlmText = snapshot.parsed?.textContent;
    const page: FormattedPage = {
        title: snapshot.title,
        content: vlmText,
        url: snapshot.href,
        pageshotUrl: snapshot.pageshotUrl,
        [Symbol.dispose]: () => undefined,
    };
    // Non-enumerable so the text representation is left out of key enumeration of the page object.
    Object.defineProperty(page, 'textRepresentation', {
        value: vlmText,
        enumerable: false,
    });

    return page;
}
/**
 * Drains `cachedScrap` with the engine forced to BROWSER and resolves with
 * the last snapshot it produced.
 *
 * An error raised while iterating is only rethrown when no snapshot was
 * obtained at all; if at least one snapshot arrived, that snapshot wins.
 *
 * @param url The URL to scrape.
 * @param opts Scrapping options; `engine` is overridden with BROWSER.
 * @param crawlerOptions Passed through to `cachedScrap`.
 * @returns The last snapshot yielded.
 * @throws The iteration error, or an AssertionFailureError, when nothing was yielded.
 */
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
    const snapshots = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions);

    let finalSnapshot;
    let failure;
    try {
        for await (const current of snapshots) {
            finalSnapshot = current;
        }
    } catch (err) {
        // Held back: a snapshot obtained before the failure is still usable.
        failure = err;
    }

    if (finalSnapshot) {
        return finalSnapshot;
    }
    if (failure) {
        throw failure;
    }

    throw new AssertionFailureError(`No content available`);
}
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });

View File

@ -461,6 +461,9 @@ export class CrawlerOptions extends AutoCastable {
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
return false;
}
if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
return false;
}
return true;
}

View File

@ -33,9 +33,10 @@ export class AltTextService extends AsyncService {
const resized = this.canvasService.fitImageToSquareBox(img, 1024);
const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
const r = await this.imageInterrogator.interrogate('blip2', {
const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', {
image: exported,
// prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
prompt: `Yield a concise image caption sentence in third person.`,
system: 'You are BLIP2, an image caption model.',
});
return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();

View File

@ -0,0 +1,64 @@
import { AsyncService } from 'civkit/async-service';
import { singleton } from 'tsyringe';
import { PageSnapshot } from './puppeteer';
import { Logger } from '../shared/services/logger';
import _ from 'lodash';
import { AssertionFailureError } from 'civkit';
import { LLMManager } from '../shared/services/common-llm';
/**
 * Service that sends a page screenshot to an LLM and streams back
 * snapshots whose `parsed.textContent` holds the text generated so far.
 */
@singleton()
export class VlmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for dependencies, then emits `ready`. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Streams progressively longer transcriptions of the snapshot's pageshot.
     *
     * Each yielded value is a copy of the input snapshot with
     * `parsed.textContent` replaced by the concatenation of every chunk
     * received from the model up to that point.
     *
     * @param snapshot Browser snapshot; `pageshotUrl` is preferred over `pageshot`.
     * @throws AssertionFailureError when neither `pageshotUrl` nor `pageshot` is available.
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;
        if (!pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        // A string pageshot is wrapped in a URL; anything else is passed through as-is.
        const image = typeof pageshot === 'string' ? new URL(pageshot) : pageshot;
        const stream = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                image,
                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],
            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        const received: string[] = [];
        for await (const piece of stream) {
            received.push(piece);

            const partial: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: received.join(''),
                }
            };

            yield partial;
        }
    }

}

@ -1 +1 @@
Subproject commit 439f633d464f3fd5fe288313766a43163190b60f
Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0