mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader.git
synced 2025-08-19 02:25:53 +08:00
feat: new lm engine
This commit is contained in:
parent
51a4877933
commit
06f359309e
@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip-
|
|||||||
import { JSDomControl } from '../services/jsdom';
|
import { JSDomControl } from '../services/jsdom';
|
||||||
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter';
|
||||||
import { CurlControl } from '../services/curl';
|
import { CurlControl } from '../services/curl';
|
||||||
import { VlmControl } from '../services/vlm';
|
import { LmControl } from '../services/lm';
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
withIframe?: boolean | 'quoted';
|
withIframe?: boolean | 'quoted';
|
||||||
@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
protected globalLogger: Logger,
|
protected globalLogger: Logger,
|
||||||
protected puppeteerControl: PuppeteerControl,
|
protected puppeteerControl: PuppeteerControl,
|
||||||
protected curlControl: CurlControl,
|
protected curlControl: CurlControl,
|
||||||
protected vlmControl: VlmControl,
|
protected lmControl: LmControl,
|
||||||
protected jsdomControl: JSDomControl,
|
protected jsdomControl: JSDomControl,
|
||||||
protected snapshotFormatter: SnapshotFormatter,
|
protected snapshotFormatter: SnapshotFormatter,
|
||||||
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
protected firebaseObjectStorage: FirebaseStorageBucketControl,
|
||||||
@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||||
|
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs);
|
||||||
chargeAmount = this.assignChargeAmount(formatted);
|
chargeAmount = this.assignChargeAmount(formatted, crawlOpts);
|
||||||
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) {
|
||||||
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`);
|
||||||
}
|
}
|
||||||
@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
// if (crawlOpts?.engine === ENGINE_TYPE.VLM) {
|
||||||
const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts);
|
// const rmSelectorEquivalent = [];
|
||||||
|
// if (typeof crawlOpts.removeSelector === 'string') {
|
||||||
|
// rmSelectorEquivalent.push(crawlOpts.removeSelector);
|
||||||
|
// } else if (Array.isArray(crawlOpts.removeSelector)) {
|
||||||
|
// rmSelectorEquivalent.push(...crawlOpts.removeSelector);
|
||||||
|
// }
|
||||||
|
// rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav');
|
||||||
|
|
||||||
yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot);
|
// const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
||||||
|
// ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER
|
||||||
|
// }, crawlerOpts);
|
||||||
|
|
||||||
|
// yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot);
|
||||||
|
|
||||||
|
// return;
|
||||||
|
// }
|
||||||
|
|
||||||
|
if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) {
|
||||||
|
const rmSelectorEquivalent = [];
|
||||||
|
if (typeof crawlOpts.removeSelector === 'string') {
|
||||||
|
rmSelectorEquivalent.push(crawlOpts.removeSelector);
|
||||||
|
} else if (Array.isArray(crawlOpts.removeSelector)) {
|
||||||
|
rmSelectorEquivalent.push(...crawlOpts.removeSelector);
|
||||||
|
}
|
||||||
|
rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option');
|
||||||
|
|
||||||
|
const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, {
|
||||||
|
...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined
|
||||||
|
}, crawlerOpts);
|
||||||
|
|
||||||
|
if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) {
|
||||||
|
const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined;
|
||||||
|
yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot);
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot);
|
||||||
|
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
assignChargeAmount(formatted: FormattedPage) {
|
assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) {
|
||||||
if (!formatted) {
|
if (!formatted) {
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
let amount = 0;
|
let amount = 0;
|
||||||
if (formatted.content) {
|
if (formatted.content) {
|
||||||
amount += estimateToken(formatted.content);
|
const x1 = estimateToken(formatted.content);
|
||||||
|
if (scrappingOptions?.engine?.toLowerCase().includes('lm')) {
|
||||||
|
amount += x1 * 2;
|
||||||
|
}
|
||||||
|
amount += x1;
|
||||||
} else if (formatted.description) {
|
} else if (formatted.description) {
|
||||||
amount += estimateToken(formatted.description);
|
amount += estimateToken(formatted.description);
|
||||||
}
|
}
|
||||||
@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
nominalUrl?: URL,
|
nominalUrl?: URL,
|
||||||
urlValidMs?: number
|
urlValidMs?: number
|
||||||
) {
|
) {
|
||||||
if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
if (crawlerOptions.engine?.toLowerCase().includes('lm')) {
|
||||||
const output: FormattedPage = {
|
const output: FormattedPage = {
|
||||||
title: snapshot.title,
|
title: snapshot.title,
|
||||||
content: snapshot.parsed?.textContent,
|
content: snapshot.parsed?.textContent,
|
||||||
|
@ -15,6 +15,7 @@ export enum ENGINE_TYPE {
|
|||||||
BROWSER = 'browser',
|
BROWSER = 'browser',
|
||||||
DIRECT = 'direct',
|
DIRECT = 'direct',
|
||||||
VLM = 'vlm',
|
VLM = 'vlm',
|
||||||
|
READER_LM = 'readerlm-v2',
|
||||||
}
|
}
|
||||||
|
|
||||||
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
const CONTENT_FORMAT_VALUES = new Set<string>(Object.values(CONTENT_FORMAT));
|
||||||
@ -188,7 +189,7 @@ class Viewport extends AutoCastable {
|
|||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
'X-Engine': {
|
'X-Engine': {
|
||||||
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm',
|
description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2',
|
||||||
in: 'header',
|
in: 'header',
|
||||||
schema: { type: 'string' }
|
schema: { type: 'string' }
|
||||||
},
|
},
|
||||||
@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
viewport?: Viewport;
|
viewport?: Viewport;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
instruction?: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
jsonSchema?: object;
|
||||||
|
|
||||||
static override from(input: any) {
|
static override from(input: any) {
|
||||||
const instance = super.from(input) as CrawlerOptions;
|
const instance = super.from(input) as CrawlerOptions;
|
||||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||||
@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
if (this.injectFrameScript?.length || this.injectPageScript?.length) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) {
|
if (this.engine?.toLowerCase().includes('lm')) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
131
backend/functions/src/services/lm.ts
Normal file
131
backend/functions/src/services/lm.ts
Normal file
@ -0,0 +1,131 @@
|
|||||||
|
import { AsyncService } from 'civkit/async-service';
|
||||||
|
import { singleton } from 'tsyringe';
|
||||||
|
|
||||||
|
import { PageSnapshot } from './puppeteer';
|
||||||
|
import { Logger } from '../shared/services/logger';
|
||||||
|
import _ from 'lodash';
|
||||||
|
import { AssertionFailureError } from 'civkit';
|
||||||
|
import { LLMManager } from '../shared/services/common-llm';
|
||||||
|
|
||||||
|
const tripleBackTick = '```';
|
||||||
|
|
||||||
|
/**
 * Service that turns page snapshots into text by streaming a language model
 * through the injected `LLMManager`.
 *
 * Every public generator yields a new `PageSnapshot` per received chunk, whose
 * `parsed.textContent` holds all text received so far.
 */
@singleton()
export class LmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();

        this.emit('ready');
    }

    /**
     * Drops blank lines that are not directly preceded by a non-blank line,
     * so leading blank lines disappear and each run of blank lines collapses to one.
     *
     * @param text Input text; both `\n` and `\r\n` line breaks are recognised.
     * @returns The remaining lines joined with `\n`.
     */
    cleanRedundantEmptyLines(text: string) {
        const lines = text.split(/\r?\n/g);
        const mappedFlag = lines.map((line) => Boolean(line.trim()));

        // i - 1 is -1 for the first line; the lookup is undefined, i.e. falsy.
        return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n');
    }

    /**
     * Shared streaming loop: collects text chunks and, after each one, yields a
     * copy of `snapshot` whose `parsed.textContent` is the concatenation so far.
     *
     * @param snapshot Base snapshot copied into every yielded value.
     * @param it Iterable of text chunks produced by the model run.
     */
    protected async* accumulateIntoSnapshot(snapshot: PageSnapshot, it: AsyncIterable<string> | Iterable<string>) {
        const chunks: string[] = [];
        for await (const txt of it) {
            chunks.push(txt);
            const output: PageSnapshot = {
                ...snapshot,
                parsed: {
                    ...snapshot.parsed,
                    textContent: chunks.join(''),
                }
            };
            yield output;
        }
    }

    /**
     * Converts a page to Markdown with the `vertex-gemini-1.5-flash-002` model,
     * prompting it with the cleaned HTML plus the page screenshot.
     *
     * @param snapshot Snapshot carrying `pageshotUrl` or `pageshot`.
     * @throws AssertionFailureError when neither screenshot field is available.
     */
    async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot;

        // A missing snapshot implies a missing screenshot; the error is the same either way.
        if (!snapshot || !pageshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [
                `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`,
                // String screenshots are handed over as URL objects, anything else as-is.
                typeof pageshot === 'string' ? new URL(pageshot) : pageshot,
                `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`,
            ],

            options: {
                system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        yield* this.accumulateIntoSnapshot(snapshot, it);
    }

    /**
     * Asks the `readerlm-v2` model to extract the main content of the snapshot's
     * HTML as Markdown.
     *
     * @param snapshot Snapshot whose `html` is embedded in the prompt.
     * @throws AssertionFailureError when no snapshot is given.
     */
    async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`,

            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });

        yield* this.accumulateIntoSnapshot(snapshot, it);
    }

    /**
     * Runs the `readerlm-v2` model with a caller-supplied instruction and an
     * optional JSON schema against the snapshot's HTML.
     *
     * The parameter order is `(instruction, schema, snapshot)`, matching how the
     * crawler invokes this method. Previously the first two parameters were
     * declared the other way round, so the instruction was rendered as the
     * schema and the schema as the instruction.
     *
     * @param instruction Task description placed before the HTML; a generic
     *   default is used when `undefined` is passed.
     * @param schema Serialized JSON schema appended after the HTML when truthy.
     * @param snapshot Snapshot whose `html` is embedded in the prompt.
     * @throws AssertionFailureError when no snapshot is given.
     */
    async* readerLMFromSnapshot(
        instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.',
        schema?: string,
        snapshot?: PageSnapshot
    ) {
        if (!snapshot) {
            throw new AssertionFailureError('Snapshot of the page is not available');
        }

        const it = this.commonLLM.iterRun('readerlm-v2', {
            prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`,
            options: {
                // system: 'You are an AI assistant developed by Jina AI',
                stream: true
            }
        });

        yield* this.accumulateIntoSnapshot(snapshot, it);
    }
}
|
@ -1,64 +0,0 @@
|
|||||||
import { AsyncService } from 'civkit/async-service';
|
|
||||||
import { singleton } from 'tsyringe';
|
|
||||||
|
|
||||||
import { PageSnapshot } from './puppeteer';
|
|
||||||
import { Logger } from '../shared/services/logger';
|
|
||||||
import _ from 'lodash';
|
|
||||||
import { AssertionFailureError } from 'civkit';
|
|
||||||
import { LLMManager } from '../shared/services/common-llm';
|
|
||||||
|
|
||||||
/**
 * Service that converts a page screenshot into Markdown by streaming the
 * `vertex-gemini-1.5-flash-002` model through the injected `LLMManager`.
 */
@singleton()
export class VlmControl extends AsyncService {

    logger = this.globalLogger.child({ service: this.constructor.name });

    constructor(
        protected globalLogger: Logger,
        protected commonLLM: LLMManager
    ) {
        super(...arguments);
    }

    /** Waits for the injected dependencies, then signals readiness. */
    override async init() {
        await this.dependencyReady();
        this.emit('ready');
    }

    /**
     * Streams a Markdown rendition of the page screenshot.
     *
     * After every received chunk a copy of the snapshot is yielded whose
     * `parsed.textContent` holds all text received so far.
     *
     * @param snapshot Snapshot carrying `pageshotUrl` or `pageshot`.
     * @throws AssertionFailureError when neither screenshot field is available.
     */
    async* fromBrowserSnapshot(snapshot?: PageSnapshot & {
        pageshotUrl?: string,
    }) {
        const screenshot = snapshot?.pageshotUrl || snapshot?.pageshot;
        if (!screenshot) {
            throw new AssertionFailureError('Screenshot of the page is not available');
        }

        // String screenshots are handed over as URL objects, anything else as-is.
        const imagePart = typeof screenshot === 'string' ? new URL(screenshot) : screenshot;
        const instruction = `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`;

        const stream = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', {
            prompt: [imagePart, instruction],
            options: {
                system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed',
                stream: true
            }
        });

        const received: string[] = [];
        for await (const piece of stream) {
            received.push(piece);
            const textSoFar = received.join('');
            const partial: PageSnapshot = {
                ...snapshot,
                parsed: { ...snapshot?.parsed, textContent: textSoFar }
            };
            yield partial;
        }
    }
}
|
|
@ -1 +1 @@
|
|||||||
Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0
|
Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f
|
Loading…
x
Reference in New Issue
Block a user