mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 17:55:54 +08:00
feat: gemini to replace blip2 (#1129)
* feat: domain profile * fix * fix * fix * fix * fix * refactor: curl as direct engine * fix * wip * fix * fix * fix * fix * fix --------- Co-authored-by: Sha Zhou <sha.zhou@jina.ai>
This commit is contained in:
parent
c19ba65391
commit
51a4877933
@ -23,6 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
|
||||
import { JSDomControl } from '../services/jsdom';
|
||||
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||
import { CurlControl } from '../services/curl';
|
||||
import { VlmControl } from '../services/vlm';
|
||||
|
||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
withIframe?: boolean | 'quoted';
|
||||
@ -57,6 +58,7 @@ export class CrawlerHost extends RPCHost {
|
||||
protected globalLogger: Logger,
|
||||
protected puppeteerControl: PuppeteerControl,
|
||||
protected curlControl: CurlControl,
|
||||
protected vlmControl: VlmControl,
|
||||
protected jsdomControl: JSDomControl,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||
@ -281,7 +283,7 @@ export class CrawlerHost extends RPCHost {
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -311,24 +313,25 @@ export class CrawlerHost extends RPCHost {
|
||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||
continue;
|
||||
}
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
|
||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
||||
return formatted;
|
||||
if (scrapped?.pdfs?.length && !chargeAmount) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (chargeAmount && scrapped?.pdfs?.length) {
|
||||
return formatted;
|
||||
}
|
||||
return formatted;
|
||||
}
|
||||
|
||||
if (!lastScrapped) {
|
||||
@ -338,7 +341,7 @@ export class CrawlerHost extends RPCHost {
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -356,32 +359,35 @@ export class CrawlerHost extends RPCHost {
|
||||
|
||||
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
||||
lastScrapped = scrapped;
|
||||
|
||||
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
|
||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||
);
|
||||
}
|
||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||
);
|
||||
}
|
||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||
);
|
||||
}
|
||||
|
||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||
}
|
||||
|
||||
if (!lastScrapped) {
|
||||
@ -391,7 +397,7 @@ export class CrawlerHost extends RPCHost {
|
||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||
}
|
||||
|
||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -619,6 +625,14 @@ export class CrawlerHost extends RPCHost {
|
||||
return;
|
||||
}
|
||||
|
||||
if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
||||
const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
|
||||
|
||||
yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
let cache;
|
||||
|
||||
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
||||
@ -765,6 +779,10 @@ export class CrawlerHost extends RPCHost {
|
||||
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
||||
}
|
||||
|
||||
if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
||||
crawlOpts.favorScreenshot = true;
|
||||
}
|
||||
|
||||
if (opts.injectFrameScript?.length) {
|
||||
crawlOpts.injectFrameScripts = (await Promise.all(
|
||||
opts.injectFrameScript.map((x) => {
|
||||
@ -792,6 +810,59 @@ export class CrawlerHost extends RPCHost {
|
||||
return crawlOpts;
|
||||
}
|
||||
|
||||
/**
 * Builds the `FormattedPage` for a page snapshot.
 *
 * For the VLM engine the page is assembled directly from the snapshot:
 * `parsed.textContent` becomes both `content` and the non-enumerable
 * `textRepresentation` property. For every other engine the call is
 * forwarded to `snapshotFormatter.formatSnapshot`.
 *
 * @param crawlerOptions - Request options; `engine` selects the branch and
 *   `respondWith` is passed on to the snapshot formatter.
 * @param snapshot - Snapshot to format, optionally carrying screenshot/pageshot URLs.
 * @param nominalUrl - Passed through to the snapshot formatter; not used on the VLM path.
 * @param urlValidMs - Passed through to the snapshot formatter; not used on the VLM path.
 * @returns The VLM page object itself, or whatever the snapshot formatter returns.
 */
formatSnapshot(
    crawlerOptions: CrawlerOptions,
    snapshot: PageSnapshot & {
        screenshotUrl?: string;
        pageshotUrl?: string;
    },
    nominalUrl?: URL,
    urlValidMs?: number
) {
    const isVlmEngine = crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM;

    // Everything except the VLM engine goes through the regular formatter.
    if (!isVlmEngine) {
        return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
    }

    const page: FormattedPage = {
        title: snapshot.title,
        content: snapshot.parsed?.textContent,
        url: snapshot.href,
        pageshotUrl: snapshot.pageshotUrl,
        [Symbol.dispose]: () => undefined,
    };

    // Kept non-enumerable so it is readable by callers but left out of
    // enumeration-based serialization of the page object.
    Object.defineProperty(page, 'textRepresentation', {
        enumerable: false,
        value: snapshot.parsed?.textContent,
    });

    return page;
}
|
||||
|
||||
/**
 * Runs `cachedScrap` to completion with the engine forced to
 * `ENGINE_TYPE.BROWSER` and returns the last snapshot it yielded.
 *
 * An error raised during iteration is only rethrown when no snapshot was
 * yielded before it; if at least one snapshot exists, that snapshot wins
 * and the error is dropped.
 *
 * @param url - URL to scrape.
 * @param opts - Scrapping options; `engine` is overridden with the browser engine.
 * @param crawlerOptions - Forwarded unchanged to `cachedScrap`.
 * @returns The last yielded snapshot.
 * @throws The iteration error when nothing was yielded before it failed.
 * @throws AssertionFailureError when iteration finished without yielding anything.
 */
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
    const browserOpts = { ...opts, engine: ENGINE_TYPE.BROWSER };
    const snapshots = this.cachedScrap(url, browserOpts, crawlerOptions);

    let finalSnapshot;
    let failure;
    try {
        for await (const current of snapshots) {
            finalSnapshot = current;
        }
    } catch (err) {
        // Remember the failure; whether it matters is decided below.
        failure = err;
    }

    // A snapshot, even one produced before a later failure, is good enough.
    if (finalSnapshot) {
        return finalSnapshot;
    }

    if (failure) {
        throw failure;
    }

    throw new AssertionFailureError(`No content available`);
}
|
||||
|
||||
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
|
||||
const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
|
||||
|
||||
|
@ -461,6 +461,9 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||
return false;
|
||||
}
|
||||
if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
@ -33,9 +33,10 @@ export class AltTextService extends AsyncService {
|
||||
const resized = this.canvasService.fitImageToSquareBox(img, 1024);
|
||||
const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
|
||||
|
||||
const r = await this.imageInterrogator.interrogate('blip2', {
|
||||
const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', {
|
||||
image: exported,
|
||||
// prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
|
||||
prompt: `Yield a concise image caption sentence in third person.`,
|
||||
system: 'You are BLIP2, an image caption model.',
|
||||
});
|
||||
|
||||
return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
|
||||
|
64
backend/functions/src/services/vlm.ts
Normal file
64
backend/functions/src/services/vlm.ts
Normal file
@ -0,0 +1,64 @@
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { singleton } from 'tsyringe';
|
||||
|
||||
import { PageSnapshot } from './puppeteer';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import _ from 'lodash';
|
||||
import { AssertionFailureError } from 'civkit';
|
||||
import { LLMManager } from '../shared/services/common-llm';
|
||||
|
||||
/**
 * Service that turns a browser page snapshot into Markdown by sending the
 * page's screenshot to the `vertex-gemini-1.5-flash-002` model through
 * `LLMManager.iterRun`.
 */
@singleton()
export class VlmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for the dependencies to be ready, then emits `ready`. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Streams a Markdown rendition of the page captured in `snapshot`.
     *
     * The image sent to the model is `snapshot.pageshotUrl` when set,
     * otherwise `snapshot.pageshot`. Each streamed chunk produces a new
     * snapshot: a shallow copy of the input whose `parsed.textContent`
     * holds all text received so far, so the last yielded value carries
     * the complete output.
     *
     * @param snapshot - Browser snapshot, optionally carrying a `pageshotUrl`.
     * @yields A copy of `snapshot` with `parsed.textContent` set to the accumulated text.
     * @throws AssertionFailureError when neither `pageshotUrl` nor `pageshot` is available.
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        if (!pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                // A string pageshot is wrapped in a URL object; any other value is passed as-is.
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        // Keep a running string rather than re-joining every chunk received
        // so far on each iteration, which is quadratic in the chunk count.
        let markdown = '';
        for await (const txt of it) {
            markdown += txt;
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: markdown,
                }
            };
            yield output;
        }
    }
}
|
@ -1 +1 @@
|
||||
Subproject commit 439f633d464f3fd5fe288313766a43163190b60f
|
||||
Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
|
Loading…
x
Reference in New Issue
Block a user