feat: index brief in JSON format

This commit is contained in:
yanlong.wang 2024-05-23 12:06:07 +08:00
parent 1c944562f7
commit 8eee95119d
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
2 changed files with 39 additions and 16 deletions

View File

@ -21,6 +21,7 @@ import { randomUUID } from 'crypto';
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
import { countGPTToken as estimateToken } from '../shared/utils/openai';
import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
const md5Hasher = new HashManager('md5', 'hex');
@ -44,6 +45,16 @@ export interface FormattedPage {
toString: () => string;
}
const indexProto = {
toString: function (): string {
return _(this)
.toPairs()
.map(([k, v]) => k ? `[${_.upperFirst(_.lowerCase(k))}] ${v}` : '')
.value()
.join('\n') + '\n';
}
};
@singleton()
export class CrawlerHost extends RPCHost {
logger = this.globalLogger.child({ service: this.constructor.name });
@ -54,12 +65,6 @@ export class CrawlerHost extends RPCHost {
cacheValidMs = 1000 * 3600;
urlValidMs = 1000 * 3600 * 4;
indexText = `[Usage1] https://r.jina.ai/YOUR_URL
[Usage2] https://s.jina.ai/YOUR_SEARCH_QUERY
[Homepage] https://jina.ai/reader
[Source code] https://github.com/jina-ai/reader
`;
constructor(
protected globalLogger: Logger,
protected puppeteerControl: PuppeteerControl,
@ -89,6 +94,25 @@ export class CrawlerHost extends RPCHost {
this.emit('ready');
}
getIndex(user?: JinaEmbeddingsTokenAccount) {
const indexObject: Record<string, string | number | undefined> = Object.create(indexProto);
Object.assign(indexObject, {
usage1: 'https://r.jina.ai/YOUR_URL',
usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY',
homepage: 'https://jina.ai/reader',
sourceCode: 'https://github.com/jina-ai/reader',
});
if (user) {
indexObject[''] = undefined;
indexObject.authenticatedAs = `${user.user_id} (${user.full_name})`;
indexObject.balanceLeft = user.wallet.total_balance;
}
return indexObject;
}
getTurndown(noRules?: boolean | string) {
const turnDownService = new TurndownService();
if (!noRules) {
@ -497,12 +521,11 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
const noSlashURL = ctx.req.url.slice(1);
if (!noSlashURL) {
const latestUser = uid ? await auth.assertUser() : undefined;
const authMixin = latestUser ? `
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
[Balance left] ${latestUser.wallet.total_balance}
` : '';
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
return this.getIndex(latestUser);
}
return assignTransferProtocolMeta(`${this.indexText}${authMixin}`,
return assignTransferProtocolMeta(`${this.getIndex(latestUser)}`,
{ contentType: 'text/plain', envelope: null }
);
}

View File

@ -152,12 +152,12 @@ export class SearcherHost extends RPCHost {
const noSlashPath = ctx.req.url.slice(1);
if (!noSlashPath) {
const latestUser = uid ? await auth.assertUser() : undefined;
const authMixin = latestUser ? `
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
[Balance left] ${latestUser.wallet.total_balance}
` : '';
if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) {
return assignTransferProtocolMeta(`${this.crawler.indexText}${authMixin}`,
return this.crawler.getIndex(latestUser);
}
return assignTransferProtocolMeta(`${this.crawler.getIndex(latestUser)}`,
{ contentType: 'text/plain', envelope: null }
);
}