diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 5a98575..936eafa 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -23,7 +23,7 @@ import { FirebaseRoundTripChecker } from '../shared/services/firebase-roundtrip- import { JSDomControl } from '../services/jsdom'; import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter'; import { CurlControl } from '../services/curl'; -import { VlmControl } from '../services/vlm'; +import { LmControl } from '../services/lm'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -58,7 +58,7 @@ export class CrawlerHost extends RPCHost { protected globalLogger: Logger, protected puppeteerControl: PuppeteerControl, protected curlControl: CurlControl, - protected vlmControl: VlmControl, + protected lmControl: LmControl, protected jsdomControl: JSDomControl, protected snapshotFormatter: SnapshotFormatter, protected firebaseObjectStorage: FirebaseStorageBucketControl, @@ -284,7 +284,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); - chargeAmount = this.assignChargeAmount(formatted); + chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); } @@ -321,7 +321,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); - chargeAmount = this.assignChargeAmount(formatted); + chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); @@ -342,7 +342,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); - chargeAmount = this.assignChargeAmount(formatted); + chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); } @@ -369,7 +369,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, scrapped, targetUrl, this.urlValidMs); - chargeAmount = this.assignChargeAmount(formatted); + chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); } @@ -398,7 +398,7 @@ export class CrawlerHost extends RPCHost { } const formatted = await this.formatSnapshot(crawlerOptions, lastScrapped, targetUrl, this.urlValidMs); - chargeAmount = this.assignChargeAmount(formatted); + chargeAmount = this.assignChargeAmount(formatted, crawlOpts); if (crawlerOptions.tokenBudget && chargeAmount > crawlerOptions.tokenBudget) { throw new BudgetExceededError(`Token budget (${crawlerOptions.tokenBudget}) exceeded, intended charge amount ${chargeAmount}.`); } @@ -625,10 +625,45 @@ export class CrawlerHost extends RPCHost { return; } - if (crawlOpts?.engine === ENGINE_TYPE.VLM) { - const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, crawlOpts, crawlerOpts); + // if (crawlOpts?.engine === ENGINE_TYPE.VLM) { + // const rmSelectorEquivalent = []; + // if (typeof crawlOpts.removeSelector === 'string') { + // rmSelectorEquivalent.push(crawlOpts.removeSelector); + // } else if (Array.isArray(crawlOpts.removeSelector)) { + // rmSelectorEquivalent.push(...crawlOpts.removeSelector); + // } + // rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option,header,footer,nav'); - yield* this.vlmControl.fromBrowserSnapshot(finalBrowserSnapshot); + // const finalBrowserSnapshot = await this.getFinalSnapshot(urlToCrawl, { + // ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: ENGINE_TYPE.BROWSER + // }, crawlerOpts); + + // yield* this.lmControl.geminiFromBrowserSnapshot(finalBrowserSnapshot); + + // return; + // } + + if (crawlOpts?.engine === ENGINE_TYPE.READER_LM) { + const rmSelectorEquivalent = []; + if (typeof crawlOpts.removeSelector === 'string') { + rmSelectorEquivalent.push(crawlOpts.removeSelector); + } else if (Array.isArray(crawlOpts.removeSelector)) { + rmSelectorEquivalent.push(...crawlOpts.removeSelector); + } + rmSelectorEquivalent.push('script,link,style,meta,textarea,select>option'); + + const finalAutoSnapshot = await this.getFinalSnapshot(urlToCrawl, { + ...crawlOpts, removeSelector: rmSelectorEquivalent, engine: undefined + }, crawlerOpts); + + if (crawlerOpts?.instruction || crawlerOpts?.jsonSchema) { + const jsonSchema = crawlerOpts.jsonSchema ? JSON.stringify(crawlerOpts.jsonSchema, undefined, 2) : undefined; + yield* this.lmControl.readerLMFromSnapshot(crawlerOpts.instruction, jsonSchema, finalAutoSnapshot); + + return; + } + + yield* this.lmControl.readerLMMarkdownFromSnapshot(finalAutoSnapshot); return; } @@ -669,14 +704,18 @@ export class CrawlerHost extends RPCHost { } } - assignChargeAmount(formatted: FormattedPage) { + assignChargeAmount(formatted: FormattedPage, scrappingOptions?: ExtraScrappingOptions) { if (!formatted) { return 0; } let amount = 0; if (formatted.content) { - amount += estimateToken(formatted.content); + const x1 = estimateToken(formatted.content); + if (scrappingOptions?.engine?.toLowerCase().includes('lm')) { + amount += x1 * 2; + } + amount += x1; } else if (formatted.description) { amount += estimateToken(formatted.description); } @@ -819,7 +858,7 @@ export class CrawlerHost extends RPCHost { nominalUrl?: URL, urlValidMs?: number ) { - if (crawlerOptions.engine?.toLowerCase() === ENGINE_TYPE.VLM) { + if (crawlerOptions.engine?.toLowerCase().includes('lm')) { const output: FormattedPage = { title: snapshot.title, content: snapshot.parsed?.textContent, diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 6b1c41e..af2188a 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -15,6 +15,7 @@ export enum ENGINE_TYPE { BROWSER = 'browser', DIRECT = 'direct', VLM = 'vlm', + READER_LM = 'readerlm-v2', } const CONTENT_FORMAT_VALUES = new Set(Object.values(CONTENT_FORMAT)); @@ -188,7 +189,7 @@ class Viewport extends AutoCastable { schema: { type: 'string' } }, 'X-Engine': { - description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm', + description: 'Specify the engine to use for crawling.\n\nSupported: browser, direct, vlm, readerlm-v2', in: 'header', schema: { type: 'string' } }, @@ -317,6 +318,12 @@ export class CrawlerOptions extends AutoCastable { @Prop() viewport?: Viewport; + @Prop() + instruction?: string; + + @Prop() + jsonSchema?: object; + static override from(input: any) { const instance = super.from(input) as CrawlerOptions; const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { @@ -461,7 +468,7 @@ export class CrawlerOptions extends AutoCastable { if (this.injectFrameScript?.length || this.injectPageScript?.length) { return false; } - if (this.engine?.toLowerCase() === ENGINE_TYPE.VLM) { + if (this.engine?.toLowerCase().includes('lm')) { return false; } diff --git a/backend/functions/src/services/lm.ts b/backend/functions/src/services/lm.ts new file mode 100644 index 0000000..6c30f6b --- /dev/null +++ b/backend/functions/src/services/lm.ts @@ -0,0 +1,131 @@ +import { AsyncService } from 'civkit/async-service'; +import { singleton } from 'tsyringe'; + +import { PageSnapshot } from './puppeteer'; +import { Logger } from '../shared/services/logger'; +import _ from 'lodash'; +import { AssertionFailureError } from 'civkit'; +import { LLMManager } from '../shared/services/common-llm'; + +const tripleBackTick = '```'; + +@singleton() +export class LmControl extends AsyncService { + + logger = this.globalLogger.child({ service: this.constructor.name }); + + constructor( + protected globalLogger: Logger, + protected commonLLM: LLMManager + ) { + super(...arguments); + } + + override async init() { + await this.dependencyReady(); + + this.emit('ready'); + } + + cleanRedundantEmptyLines(text: string) { + const lines = text.split(/\r?\n/g); + const mappedFlag = lines.map((line) => Boolean(line.trim())); + + return lines.filter((_line, i) => mappedFlag[i] || mappedFlag[i - 1]).join('\n'); + } + + async* geminiFromBrowserSnapshot(snapshot?: PageSnapshot & { + pageshotUrl?: string, + }) { + const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot; + + if (!pageshot) { + throw new AssertionFailureError('Screenshot of the page is not available'); + } + + const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', { + prompt: [ + `HTML: \n${this.cleanRedundantEmptyLines(snapshot.html)}\n\nSCREENSHOT: \n`, + typeof pageshot === 'string' ? new URL(pageshot) : pageshot, + `Convert this webpage into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`, + ], + + options: { + system: 'You are ReaderLM-v7, a model that generates Markdown source files only. No HTML, notes and chit-chats allowed', + stream: true + } + }); + + const chunks: string[] = []; + for await (const txt of it) { + chunks.push(txt); + const output: PageSnapshot = { + ...snapshot, + parsed: { + ...snapshot?.parsed, + textContent: chunks.join(''), + } + }; + yield output; + } + + return; + } + + async* readerLMMarkdownFromSnapshot(snapshot?: PageSnapshot) { + if (!snapshot) { + throw new AssertionFailureError('Snapshot of the page is not available'); + } + const it = this.commonLLM.iterRun('readerlm-v2', { + prompt: `Extract the main content from the given HTML and convert it to Markdown format.\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n`, + + options: { + // system: 'You are an AI assistant developed by Jina AI', + stream: true + } + }); + + const chunks: string[] = []; + for await (const txt of it) { + chunks.push(txt); + const output: PageSnapshot = { + ...snapshot, + parsed: { + ...snapshot?.parsed, + textContent: chunks.join(''), + } + }; + yield output; + } + + return; + } + + async* readerLMFromSnapshot(schema?: string, instruction: string = 'Infer useful information from the HTML and present it in a structured JSON object.', snapshot?: PageSnapshot) { + if (!snapshot) { + throw new AssertionFailureError('Snapshot of the page is not available'); + } + const it = this.commonLLM.iterRun('readerlm-v2', { + prompt: `${instruction}\n\n${tripleBackTick}html\n${this.cleanRedundantEmptyLines(snapshot.html)}\n${tripleBackTick}\n${schema ? `The JSON schema:\n${tripleBackTick}json\n${schema}\n${tripleBackTick}\n` : ''}`, + options: { + // system: 'You are an AI assistant developed by Jina AI', + stream: true + } + }); + + const chunks: string[] = []; + for await (const txt of it) { + chunks.push(txt); + const output: PageSnapshot = { + ...snapshot, + parsed: { + ...snapshot?.parsed, + textContent: chunks.join(''), + } + }; + yield output; + } + + return; + } +} diff --git a/backend/functions/src/services/vlm.ts b/backend/functions/src/services/vlm.ts deleted file mode 100644 index bc0c4f8..0000000 --- a/backend/functions/src/services/vlm.ts +++ /dev/null @@ -1,64 +0,0 @@ -import { AsyncService } from 'civkit/async-service'; -import { singleton } from 'tsyringe'; - -import { PageSnapshot } from './puppeteer'; -import { Logger } from '../shared/services/logger'; -import _ from 'lodash'; -import { AssertionFailureError } from 'civkit'; -import { LLMManager } from '../shared/services/common-llm'; - -@singleton() -export class VlmControl extends AsyncService { - - logger = this.globalLogger.child({ service: this.constructor.name }); - - constructor( - protected globalLogger: Logger, - protected commonLLM: LLMManager - ) { - super(...arguments); - } - - override async init() { - await this.dependencyReady(); - - this.emit('ready'); - } - - async* fromBrowserSnapshot(snapshot?: PageSnapshot & { - pageshotUrl?: string, - }) { - const pageshot = snapshot?.pageshotUrl || snapshot?.pageshot; - - if (!pageshot) { - throw new AssertionFailureError('Screenshot of the page is not available'); - } - - const it = this.commonLLM.iterRun('vertex-gemini-1.5-flash-002', { - prompt: [ - typeof pageshot === 'string' ? new URL(pageshot) : pageshot, - `Convert this webpage screenshot into a markdown source file that does not contain HTML tags, retaining the page language and visual structures.`, - ], - - options: { - system: 'You are Reader-LM-v7, an OCR model that generates Markdown source files only. No HTML, notes and chit-chats allowed', - stream: true - } - }); - - const chunks: string[] = []; - for await (const txt of it) { - chunks.push(txt); - const output: PageSnapshot = { - ...snapshot, - parsed: { - ...snapshot?.parsed, - textContent: chunks.join(''), - } - }; - yield output; - } - - return; - } -} diff --git a/thinapps-shared b/thinapps-shared index a17e580..ee28974 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit a17e58017ee2075edeef79893fc1bf398eeb99d0 +Subproject commit ee28974871e4d68c53ff82aca6cfdef8ed19a26f