mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 05:05:59 +08:00
feat: gemini to replace blip2 (#1129)
* feat: domain profile * fix * fix * fix * fix * fix * refactor: curl as direct engine * fix * wip * fix * fix * fix * fix * fix --------- Co-authored-by: Sha Zhou <sha.zhou@jina.ai>
This commit is contained in:
parent
c19ba65391
commit
51a4877933
@ -23,6 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
|
|||||||
import { JSDomControl } from '../services/jsdom';
|
import { JSDomControl } from '../services/jsdom';
|
||||||
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||||
import { CurlControl } from '../services/curl';
|
import { CurlControl } from '../services/curl';
|
||||||
|
import { VlmControl } from '../services/vlm';
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean | 'quoted';
|
withIframe?: boolean | 'quoted';
|
||||||
@ -57,6 +58,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
protected curlControl: CurlControl,
|
protected curlControl: CurlControl,
|
||||||
|
protected vlmControl: VlmControl,
|
||||||
protected jsdomControl: JSDomControl,
|
protected jsdomControl: JSDomControl,
|
||||||
protected snapshotFormatter: SnapshotFormatter,
|
protected snapshotFormatter: SnapshotFormatter,
|
||||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
@ -281,7 +283,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
@ -311,24 +313,25 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
|
||||||
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
|
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
|
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
if (scrapped?.pdfs?.length && !chargeAmount) {
|
||||||
return formatted;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (chargeAmount && scrapped?.pdfs?.length) {
|
return formatted;
|
||||||
return formatted;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!lastScrapped) {
|
if (!lastScrapped) {
|
||||||
@ -338,7 +341,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
@ -356,32 +359,35 @@ export class CrawlerHost extends RPCHost {
|
|||||||
|
|
||||||
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
for await (const scrapped of this.cachedScrap(targetUrl, crawlOpts, crawlerOptions)) {
|
||||||
lastScrapped = scrapped;
|
lastScrapped = scrapped;
|
||||||
|
|
||||||
|
if (!crawlerOptions.isEarlyReturnApplicable()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
if (crawlerOptions.waitForSelector || ((!scrapped?.parsed?.content || !scrapped?.title?.trim()) && !scrapped?.pdfs?.length)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlerOptions.isEarlyReturnApplicable()) {
|
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
if (crawlerOptions.respondWith === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'screenshotUrl') } }
|
||||||
);
|
);
|
||||||
}
|
|
||||||
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
|
||||||
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
|
||||||
}
|
}
|
||||||
|
if (crawlerOptions.respondWith === 'pageshot' && Reflect.get(formatted, 'pageshotUrl')) {
|
||||||
|
|
||||||
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`,
|
||||||
|
{ code: 302, envelope: null, headers: { Location: Reflect.get(formatted, 'pageshotUrl') } }
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
return assignTransferProtocolMeta(`${formatted.textRepresentation}`, { contentType: 'text/plain', envelope: null });
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!lastScrapped) {
|
if (!lastScrapped) {
|
||||||
@ -391,7 +397,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
throw new AssertionFailureError(`No content available for URL ${targetUrl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, lastScrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
@ -619,6 +625,14 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
||||||
|
const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
|
||||||
|
|
||||||
|
yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
let cache;
|
let cache;
|
||||||
|
|
||||||
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
if (!crawlerOpts || crawlerOpts.isCacheQueryApplicable()) {
|
||||||
@ -765,6 +779,10 @@ export class CrawlerHost extends RPCHost {
|
|||||||
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
crawlOpts.extraHeaders['Accept-Language'] = opts.locale;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (opts.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
||||||
|
crawlOpts.favorScreenshot = true;
|
||||||
|
}
|
||||||
|
|
||||||
if (opts.injectFrameScript?.length) {
|
if (opts.injectFrameScript?.length) {
|
||||||
crawlOpts.injectFrameScripts = (await Promise.all(
|
crawlOpts.injectFrameScripts = (await Promise.all(
|
||||||
opts.injectFrameScript.map((x) => {
|
opts.injectFrameScript.map((x) => {
|
||||||
@ -792,6 +810,59 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return crawlOpts;
|
return crawlOpts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the formatted page for a snapshot, taking the requested engine into account.
 *
 * For the VLM engine the page is assembled directly from the snapshot: title,
 * text content, URL and pageshot URL. The text is additionally exposed as a
 * non-enumerable `textRepresentation` property. For every other engine the
 * work is delegated to `snapshotFormatter.formatSnapshot` using
 * `crawlerOptions.respondWith`.
 *
 * @param crawlerOptions Request options; `engine` and `respondWith` are read here.
 * @param snapshot Page snapshot, optionally carrying screenshot/pageshot URLs.
 * @param nominalUrl Forwarded to the snapshot formatter (unused on the VLM path).
 * @param urlValidMs Forwarded to the snapshot formatter (unused on the VLM path).
 * @returns The formatted page on the VLM path, otherwise whatever the snapshot formatter returns.
 */
formatSnapshot(
    crawlerOptions: CrawlerOptions,
    snapshot: PageSnapshot & {
        screenshotUrl?: string;
        pageshotUrl?: string;
    },
    nominalUrl?: URL,
    urlValidMs?: number
) {
    const isVlmEngine = crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM;
    if (!isVlmEngine) {
        // Regular engines: the shared formatter handles every respondWith mode.
        return this.snapshotFormatter.formatSnapshot(crawlerOptions.respondWith, snapshot, nominalUrl, urlValidMs);
    }

    const vlmText = snapshot.parsed?.textContent;
    const page: FormattedPage = {
        title: snapshot.title,
        content: vlmText,
        url: snapshot.href,
        pageshotUrl: snapshot.pageshotUrl,
        [Symbol.dispose]: () => undefined,
    };

    // Non-enumerable so it is available to callers reading it explicitly
    // without showing up when the object's own keys are enumerated.
    Object.defineProperty(page, 'textRepresentation', {
        value: vlmText,
        enumerable: false,
    });

    return page;
}
|
||||||
|
|
||||||
|
/**
 * Run a scrape with the engine forced to `ENGINE_TYPE.BROWSER` until the
 * iterator is exhausted and return the last snapshot it produced.
 *
 * An error raised while iterating is only rethrown when no snapshot was
 * obtained at all; if at least one snapshot was yielded before the failure,
 * that snapshot is returned and the error is dropped.
 *
 * @param url Page to scrape.
 * @param opts Scrapping options; `engine` is overridden with the browser engine.
 * @param crawlerOptions Passed through to `cachedScrap` unchanged.
 * @returns The last snapshot yielded by `cachedScrap`.
 * @throws The iteration error when nothing was yielded, or `AssertionFailureError`
 *         when iteration finished without yielding any snapshot.
 */
async getFinalSnapshot(url: URL, opts?: ExtraScrappingOptions, crawlerOptions?: CrawlerOptions): Promise<PageSnapshot | undefined> {
    const snapshots = this.cachedScrap(url, { ...opts, engine: ENGINE_TYPE.BROWSER }, crawlerOptions);

    let finalSnapshot;
    let failure;
    try {
        for await (const current of snapshots) {
            // Keep overwriting: only the most recent snapshot matters.
            finalSnapshot = current;
        }
    } catch (err) {
        failure = err;
    }

    if (finalSnapshot) {
        return finalSnapshot;
    }
    if (failure) {
        throw failure;
    }

    throw new AssertionFailureError(`No content available`);
}
|
||||||
|
|
||||||
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
|
async simpleCrawl(mode: string, url: URL, opts?: ExtraScrappingOptions) {
|
||||||
const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
|
const it = this.cachedScrap(url, { ...opts, minIntervalMs: 500 });
|
||||||
|
|
||||||
|
@ -461,6 +461,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -33,9 +33,10 @@ export class AltTextService extends AsyncService {
|
|||||||
const resized = this.canvasService.fitImageToSquareBox(img, 1024);
|
const resized = this.canvasService.fitImageToSquareBox(img, 1024);
|
||||||
const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
|
const exported = await this.canvasService.canvasToBuffer(resized, 'image/png');
|
||||||
|
|
||||||
const r = await this.imageInterrogator.interrogate('blip2', {
|
const r = await this.imageInterrogator.interrogate('vertex-gemini-1.5-flash-002', {
|
||||||
image: exported,
|
image: exported,
|
||||||
// prompt: `A formal caption in one sentence, concise and in the third person: HTML <img> alt text of this image. Return "**NSFW**" if you don't feel comfortable captioning it.`
|
prompt: `Yield a concise image caption sentence in third person.`,
|
||||||
|
system: 'You are BLIP2, an image caption model.',
|
||||||
});
|
});
|
||||||
|
|
||||||
return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
|
return r.replaceAll(/[\n\"]|(\.\s*$)/g, '').trim();
|
||||||
|
64
backend/functions/src/services/vlm.ts
Normal file
64
backend/functions/src/services/vlm.ts
Normal file
@ -0,0 +1,64 @@
|
|||||||
|
import { AsyncService } from 'civkit/async-service';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
|
||||||
|
import { PageSnapshot } from './puppeteer';
|
||||||
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import _ from 'lodash';
|
||||||
|
import { AssertionFailureError } from 'civkit';
|
||||||
|
import { LLMManager } from '../shared/services/common-llm';
|
||||||
|
|
||||||
|
/**
 * Service that converts a browser page snapshot into text by handing the
 * page's pageshot to an LLM through `LLMManager.iterRun`.
 */
@singleton()
export class VlmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Stream progressively more complete snapshots whose `parsed.textContent`
     * holds the model output received so far.
     *
     * The image sent to the model is `snapshot.pageshotUrl` when set, otherwise
     * `snapshot.pageshot`. A string value is wrapped in a `URL`; anything else
     * is passed as-is.
     *
     * @param snapshot Browser snapshot carrying a pageshot or a pageshot URL.
     * @yields A copy of `snapshot` with `parsed.textContent` set to all text
     *         received so far, once per chunk produced by the model.
     * @throws AssertionFailureError when neither pageshot source is available.
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const image = snapshot?.pageshotUrl || snapshot?.pageshot;
        if (!image) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        const imageInput = typeof image === 'string' ? new URL(image) : image;
        const stream = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                imageInput,
                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        const received: string[] = [];
        for await (const piece of stream) {
            received.push(piece);

            // Each yield carries the full text accumulated up to this chunk.
            const partial: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot?.parsed,
                    textContent: received.join(''),
                }
            };

            yield partial;
        }
    }
}
|
@ -1 +1 @@
|
|||||||
Subproject commit 439f633d464f3fd5fe288313766a43163190b60f
|
Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
|
Loading…
x
Reference in New Issue
Block a user