mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-18 19:15:56 +08:00
feat: new lm engine
This commit is contained in:
parent
51a4877933
commit
06f359309e
@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
|
||||
import { JSDomControl } from '../services/jsdom';
|
||||
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||
import { CurlControl } from '../services/curl';
|
||||
import { VlmControl } from '../services/vlm';
|
||||
import { LmControl } from '../services/lm';
|
||||
|
||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||
withIframe?: boolean | 'quoted';
|
||||
@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost {
|
||||
protected globalLogger: Logger,
|
||||
protected puppeteerControl: PuppeteerControl,
|
||||
protected curlControl: CurlControl,
|
||||
protected vlmControl: VlmControl,
|
||||
protected lmControl: LmControl,
|
||||
protected jsdomControl: JSDomControl,
|
||||
protected snapshotFormatter: SnapshotFormatter,
|
||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||
@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
|
||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||
chargeAmount = this.assignChargeAmount(formatted);
|
||||
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||
}
|
||||
@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost {
|
||||
return;
|
||||
}
|
||||
|
||||
if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
||||
const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
|
||||
// if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
||||
// const rmSelectorEquivalent = [];
|
||||
// if (typeof crawlOpts.removeSelector === 'string') {
|
||||
// rmSelectorEquivalent.push(crawlOpts.removeSelector);
|
||||
// } else if (Array.isArray(crawlOpts.removeSelector)) {
|
||||
// rmSelectorEquivalent.push(...crawlOpts.removeSelector);
|
||||
// }
|
||||
// rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
|
||||
|
||||
yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
|
||||
// const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
||||
// ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
|
||||
// }, crawlerOpts);
|
||||
|
||||
// yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
|
||||
|
||||
// return;
|
||||
// }
|
||||
|
||||
if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
    // Build the effective removeSelector list: whatever the caller supplied
    // (a single selector string or an array of them) plus a fixed set of
    // elements that are stripped before the HTML is handed to the model.
    const rmSelectorEquivalent = [];
    if (typeof crawlOpts.removeSelector === 'string') {
        rmSelectorEquivalent.push(crawlOpts.removeSelector);
    } else if (Array.isArray(crawlOpts.removeSelector)) {
        rmSelectorEquivalent.push(...crawlOpts.removeSelector);
    }
    rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');

    // The snapshot is fetched with `engine` cleared and the extended
    // removeSelector applied.
    // NOTE(review): presumably `engine: undefined` makes getFinalSnapshot fall
    // back to its default engine selection - confirm against getFinalSnapshot.
    const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
        ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
    }, crawlerOpts);

    if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
        const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
        // readerLMFromSnapshot is declared as (schema, instruction, snapshot).
        // Both leading parameters are optional strings, so passing them in the
        // wrong order type-checks but renders the instruction as the JSON
        // schema and the schema as the instruction. The schema must go first.
        yield* this.lmControl.readerLMFromSnapshot(jsonSchema, crawlerOpts.instruction, finalAutoSnapshot);

        return;
    }

    // No instruction and no schema: plain HTML-to-Markdown conversion.
    yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);

    return;
}
|
||||
@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost {
|
||||
}
|
||||
}
|
||||
|
||||
assignChargeAmount(formatted: FormattedPage) {
|
||||
assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
|
||||
if (!formatted) {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let amount = 0;
|
||||
if (formatted.content) {
|
||||
amount += estimateToken(formatted.content);
|
||||
const x1 = estimateToken(formatted.content);
|
||||
if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
|
||||
amount += x1 * 2;
|
||||
}
|
||||
amount += x1;
|
||||
} else if (formatted.description) {
|
||||
amount += estimateToken(formatted.description);
|
||||
}
|
||||
@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost {
|
||||
nominalUrl?: URL,
|
||||
urlValidMs?: number
|
||||
) {
|
||||
if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
||||
if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
|
||||
const output: FormattedPage = {
|
||||
title: snapshot.title,
|
||||
content: snapshot.parsed?.textContent,
|
||||
|
@ -15,6 +15,7 @@ export enum ENGINE_TYPE {
|
||||
BROWSER = 'browser',
|
||||
DIRECT = 'direct',
|
||||
VLM = 'vlm',
|
||||
READER_LM = 'readerlm-v2',
|
||||
}
|
||||
|
||||
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||
@ -188,7 +189,7 @@ class Viewport extends AutoCastable {
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
'X-Engine': {
|
||||
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
|
||||
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2',
|
||||
in: 'header',
|
||||
schema: { type: 'string' }
|
||||
},
|
||||
@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable {
|
||||
@Prop()
|
||||
viewport?: Viewport;
|
||||
|
||||
@Prop()
|
||||
instruction?: string;
|
||||
|
||||
@Prop()
|
||||
jsonSchema?: object;
|
||||
|
||||
static override from(input: any) {
|
||||
const instance = super.from(input) as CrawlerOptions;
|
||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||
@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable {
|
||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||
return false;
|
||||
}
|
||||
if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
||||
if (this.engine?.toLowerCase().includes('lm')) {
|
||||
return false;
|
||||
}
|
||||
|
||||
|
131
backend/functions/src/services/lm.ts
Normal file
131
backend/functions/src/services/lm.ts
Normal file
@ -0,0 +1,131 @@
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { singleton } from 'tsyringe';
|
||||
|
||||
import { PageSnapshot } from './puppeteer';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import _ from 'lodash';
|
||||
import { AssertionFailureError } from 'civkit';
|
||||
import { LLMManager } from '../shared/services/common-llm';
|
||||
|
||||
const tripleBackTick = '```';
|
||||
|
||||
/**
 * Language-model backed converters that turn a page snapshot into text.
 *
 * Every public generator streams a completion from `LLMManager.iterRun` and,
 * for each received chunk, yields a copy of the input snapshot whose
 * `parsed.textContent` holds all text received so far.
 */
@singleton()
export class LmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Removes redundant blank lines from `text`.
     *
     * A line is kept when it has non-whitespace content or when the line
     * directly before it does. Consequently runs of blank lines collapse to a
     * single blank line and blank lines at the very start are dropped.
     * Lines are split on `\n` or `\r\n` and re-joined with `\n`.
     *
     * @param text - Text to compact.
     * @returns The compacted text.
     */
    cleanRedundantEmptyLines(text: string) {
        const lines = text.split(/\r?\n/g);
        const hasContent = lines.map((line) => Boolean(line.trim()));

        // For i === 0, hasContent[-1] is undefined, i.e. falsy.
        return lines.filter((_line, i) => hasContent[i] || hasContent[i - 1]).join('\n');
    }

    /**
     * Folds a stream of text chunks into successive snapshots.
     *
     * The text is accumulated incrementally instead of re-joining every chunk
     * received so far on each iteration.
     *
     * @param snapshot - Snapshot copied into every yielded value.
     * @param textStream - Chunks of generated text, in order.
     * @yields A copy of `snapshot` with `parsed.textContent` set to the
     *         concatenation of all chunks received so far.
     */
    protected async* yieldAccumulatedText(snapshot: PageSnapshot, textStream: AsyncIterable<string>) {
        let text = '';
        for await (const chunk of textStream) {
            text += chunk;
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot.parsed,
                    textContent: text,
                }
            };
            yield output;
        }
    }

    /**
     * Converts a page to Markdown with the 'vertex-gemini-1.5-flash-002'
     * model, using both the page HTML and its screenshot.
     *
     * @param snapshot - Page snapshot; `pageshotUrl` is preferred over
     *                   `pageshot` as the screenshot source.
     * @throws AssertionFailureError when no screenshot is available (which
     *         includes the case of a missing snapshot).
     */
    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        // A missing snapshot implies a missing pageshot, so checking both does
        // not change which inputs are rejected; it only makes the narrowing of
        // `snapshot` explicit.
        if (!snapshot || !pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
                // String screenshots are passed to the model as URL objects.
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        yield* this.yieldAccumulatedText(snapshot, it);

        return;
    }

    /**
     * Converts the snapshot's HTML to Markdown with the 'readerlm-v2' model.
     *
     * @param snapshot - Page snapshot whose `html` is sent to the model.
     * @throws AssertionFailureError when `snapshot` is missing.
     */
    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,

            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });

        yield* this.yieldAccumulatedText(snapshot, it);

        return;
    }

    /**
     * Runs the 'readerlm-v2' model on the snapshot's HTML with a free-form
     * instruction and an optional JSON schema.
     *
     * NOTE: the schema is the FIRST parameter and the instruction the second.
     * Both are optional strings, so swapping them type-checks but produces a
     * prompt with the instruction rendered as the schema and vice versa.
     *
     * @param schema - JSON schema as text; when given it is appended to the
     *                 prompt in a fenced `json` block.
     * @param instruction - Leading instruction of the prompt; a generic
     *                      "infer structured JSON" instruction when omitted.
     * @param snapshot - Page snapshot whose `html` is sent to the model.
     * @throws AssertionFailureError when `snapshot` is missing.
     */
    async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const schemaSection = schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : '';

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schemaSection}`,
            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });

        yield* this.yieldAccumulatedText(snapshot, it);

        return;
    }
}
|
@ -1,64 +0,0 @@
|
||||
import { AsyncService } from 'civkit/async-service';
|
||||
import { singleton } from 'tsyringe';
|
||||
|
||||
import { PageSnapshot } from './puppeteer';
|
||||
import { Logger } from '../shared/services/logger';
|
||||
import _ from 'lodash';
|
||||
import { AssertionFailureError } from 'civkit';
|
||||
import { LLMManager } from '../shared/services/common-llm';
|
||||
|
||||
/**
 * Vision-language-model backed converter that turns a page screenshot into
 * Markdown text via `LLMManager.iterRun`.
 */
@singleton()
export class VlmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Converts a page screenshot to Markdown with the
     * 'vertex-gemini-1.5-flash-002' model.
     *
     * @param snapshot - Page snapshot; `pageshotUrl` is preferred over
     *                   `pageshot` as the screenshot source.
     * @yields For every streamed chunk, a copy of `snapshot` whose
     *         `parsed.textContent` holds all text received so far.
     * @throws AssertionFailureError when no screenshot is available (which
     *         includes the case of a missing snapshot).
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        // A missing snapshot implies a missing pageshot, so checking both does
        // not change which inputs are rejected; it only makes the narrowing of
        // `snapshot` explicit.
        if (!snapshot || !pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                // String screenshots are passed to the model as URL objects.
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        // Accumulate incrementally rather than re-joining every chunk received
        // so far on each iteration.
        let text = '';
        for await (const txt of it) {
            text += txt;
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot.parsed,
                    textContent: text,
                }
            };
            yield output;
        }

        return;
    }
}
|
@ -1 +1 @@
|
||||
Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
|
||||
Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f
|
Loading…
x
Reference in New Issue
Block a user