mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-16 05:15:56 +08:00
feat: jina paywall (#49)
* feat: integrate with jina embeddings paywall
This commit is contained in:
parent
2e025d10cf
commit
8cfd0d67dc
42
backend/functions/package-lock.json
generated
42
backend/functions/package-lock.json
generated
@ -178,6 +178,16 @@
|
|||||||
"node": ">=6.9.0"
|
"node": ">=6.9.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/@babel/helper-compilation-targets/node_modules/lru-cache": {
|
||||||
|
"version": "5.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
||||||
|
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
|
||||||
|
"dev": true,
|
||||||
|
"peer": true,
|
||||||
|
"dependencies": {
|
||||||
|
"yallist": "^3.0.2"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/@babel/helper-compilation-targets/node_modules/semver": {
|
"node_modules/@babel/helper-compilation-targets/node_modules/semver": {
|
||||||
"version": "6.3.1",
|
"version": "6.3.1",
|
||||||
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
|
"resolved": "https://registry.npmjs.org/semver/-/semver-6.3.1.tgz",
|
||||||
@ -6251,6 +6261,17 @@
|
|||||||
"node": ">=10.19.0"
|
"node": ">=10.19.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
"node_modules/http2-wrapper/node_modules/quick-lru": {
|
||||||
|
"version": "5.1.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
|
||||||
|
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
|
||||||
|
"engines": {
|
||||||
|
"node": ">=10"
|
||||||
|
},
|
||||||
|
"funding": {
|
||||||
|
"url": "https://github.com/sponsors/sindresorhus"
|
||||||
|
}
|
||||||
|
},
|
||||||
"node_modules/https-proxy-agent": {
|
"node_modules/https-proxy-agent": {
|
||||||
"version": "5.0.1",
|
"version": "5.0.1",
|
||||||
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz",
|
||||||
@ -8059,16 +8080,6 @@
|
|||||||
"node": ">=8"
|
"node": ">=8"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/lru-cache": {
|
|
||||||
"version": "5.1.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-5.1.1.tgz",
|
|
||||||
"integrity": "sha512-KpNARQA3Iwv+jTA0utUVVbrh+Jlrr1Fv0e56GGzAFOXN7dk/FviaDW8LHmK52DlcH4WP2n6gI8vN1aesBFgo9w==",
|
|
||||||
"dev": true,
|
|
||||||
"peer": true,
|
|
||||||
"dependencies": {
|
|
||||||
"yallist": "^3.0.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/lru-memoizer": {
|
"node_modules/lru-memoizer": {
|
||||||
"version": "2.2.0",
|
"version": "2.2.0",
|
||||||
"resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
|
"resolved": "https://registry.npmjs.org/lru-memoizer/-/lru-memoizer-2.2.0.tgz",
|
||||||
@ -9852,17 +9863,6 @@
|
|||||||
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
|
"integrity": "sha512-tYC1Q1hgyRuHgloV/YXs2w15unPVh8qfu/qCTfhTYamaw7fyhumKa2yGpdSo87vY32rIclj+4fWYQXUMs9EHvg==",
|
||||||
"optional": true
|
"optional": true
|
||||||
},
|
},
|
||||||
"node_modules/quick-lru": {
|
|
||||||
"version": "5.1.1",
|
|
||||||
"resolved": "https://registry.npmjs.org/quick-lru/-/quick-lru-5.1.1.tgz",
|
|
||||||
"integrity": "sha512-WuyALRjWPDGtt/wzJiadO5AXY+8hZ80hVpe6MyivgraREW751X3SbhRvG3eLKOYN+8VEvqLcf3wdnt44Z4S4SA==",
|
|
||||||
"engines": {
|
|
||||||
"node": ">=10"
|
|
||||||
},
|
|
||||||
"funding": {
|
|
||||||
"url": "https://github.com/sponsors/sindresorhus"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"node_modules/range-parser": {
|
"node_modules/range-parser": {
|
||||||
"version": "1.2.1",
|
"version": "1.2.1",
|
||||||
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
|
"resolved": "https://registry.npmjs.org/range-parser/-/range-parser-1.2.1.tgz",
|
||||||
|
@ -5,7 +5,7 @@ import {
|
|||||||
AssertionFailureError, ParamValidationError,
|
AssertionFailureError, ParamValidationError,
|
||||||
} from 'civkit';
|
} from 'civkit';
|
||||||
import { singleton } from 'tsyringe';
|
import { singleton } from 'tsyringe';
|
||||||
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
import { CloudHTTPv2, Ctx, FirebaseStorageBucketControl, InsufficientBalanceError, Logger, OutputServerEventStream, RPCReflect } from '../shared';
|
||||||
import { RateLimitControl } from '../shared/services/rate-limit';
|
import { RateLimitControl } from '../shared/services/rate-limit';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
import { PageSnapshot, PuppeteerControl, ScrappingOptions } from '../services/puppeteer';
|
||||||
@ -19,6 +19,9 @@ import { Crawled } from '../db/crawled';
|
|||||||
import { tidyMarkdown } from '../utils/markdown';
|
import { tidyMarkdown } from '../utils/markdown';
|
||||||
import { cleanAttribute } from '../utils/misc';
|
import { cleanAttribute } from '../utils/misc';
|
||||||
import { randomUUID } from 'crypto';
|
import { randomUUID } from 'crypto';
|
||||||
|
import { JinaEmbeddingsAuthDTO } from '../shared/dto/jina-embeddings-auth';
|
||||||
|
|
||||||
|
import { countGPTToken as estimateToken } from '../shared/utils/openai';
|
||||||
|
|
||||||
const md5Hasher = new HashManager('md5', 'hex');
|
const md5Hasher = new HashManager('md5', 'hex');
|
||||||
|
|
||||||
@ -296,23 +299,55 @@ ${this.content}
|
|||||||
req: Request,
|
req: Request,
|
||||||
res: Response,
|
res: Response,
|
||||||
},
|
},
|
||||||
|
auth: JinaEmbeddingsAuthDTO
|
||||||
) {
|
) {
|
||||||
if (ctx.req.ip) {
|
const uid = await auth.solveUID();
|
||||||
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'], [
|
let chargeAmount = 0;
|
||||||
// 100 requests per minute
|
|
||||||
new Date(Date.now() - 60 * 1000), 100
|
|
||||||
]);
|
|
||||||
}
|
|
||||||
|
|
||||||
const noSlashURL = ctx.req.url.slice(1);
|
const noSlashURL = ctx.req.url.slice(1);
|
||||||
if (!noSlashURL) {
|
if (!noSlashURL) {
|
||||||
|
const latestUser = uid ? await auth.assertUser() : undefined;
|
||||||
|
const authMixin = latestUser ? `
|
||||||
|
[Authenticated as] ${latestUser.user_id} (${latestUser.full_name})
|
||||||
|
[Balance left] ${latestUser.wallet.total_balance}
|
||||||
|
` : '';
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
|
return assignTransferProtocolMeta(`[Usage] https://r.jina.ai/YOUR_URL
|
||||||
[Homepage] https://jina.ai/reader
|
[Homepage] https://jina.ai/reader
|
||||||
[Source code] https://github.com/jina-ai/reader
|
[Source code] https://github.com/jina-ai/reader
|
||||||
`,
|
${authMixin}`,
|
||||||
{ contentType: 'text/plain', envelope: null }
|
{ contentType: 'text/plain', envelope: null }
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (uid) {
|
||||||
|
const user = await auth.assertUser();
|
||||||
|
if (!(user.wallet.total_balance > 0)) {
|
||||||
|
throw new InsufficientBalanceError(`Account balance not enough to run this query, please recharge.`);
|
||||||
|
}
|
||||||
|
|
||||||
|
await this.rateLimitControl.simpleRPCUidBasedLimit(rpcReflect, uid, ['CRAWL'],
|
||||||
|
[
|
||||||
|
// 1000 requests per minute
|
||||||
|
new Date(Date.now() - 60 * 1000), 1000
|
||||||
|
]
|
||||||
|
);
|
||||||
|
|
||||||
|
rpcReflect.finally(() => {
|
||||||
|
if (chargeAmount) {
|
||||||
|
auth.reportUsage(chargeAmount, 'reader-crawl').catch((err) => {
|
||||||
|
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} else if (ctx.req.ip) {
|
||||||
|
await this.rateLimitControl.simpleRpcIPBasedLimit(rpcReflect, ctx.req.ip, ['CRAWL'],
|
||||||
|
[
|
||||||
|
// 100 requests per minute
|
||||||
|
new Date(Date.now() - 60 * 1000), 100
|
||||||
|
]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
let urlToCrawl;
|
let urlToCrawl;
|
||||||
try {
|
try {
|
||||||
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
urlToCrawl = new URL(normalizeUrl(noSlashURL.trim(), { stripWWW: false, removeTrailingSlash: false, removeSingleSlash: false }));
|
||||||
@ -364,7 +399,7 @@ ${this.content}
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
||||||
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
sseStream.write({
|
sseStream.write({
|
||||||
event: 'data',
|
event: 'data',
|
||||||
data: formatted,
|
data: formatted,
|
||||||
@ -392,6 +427,7 @@ ${this.content}
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
||||||
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
|
|
||||||
return formatted;
|
return formatted;
|
||||||
}
|
}
|
||||||
@ -400,7 +436,10 @@ ${this.content}
|
|||||||
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
throw new AssertionFailureError(`No content available for URL ${urlToCrawl}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
return await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
||||||
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
|
|
||||||
|
return formatted;
|
||||||
}
|
}
|
||||||
|
|
||||||
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
|
for await (const scrapped of this.cachedScrap(urlToCrawl, crawlOpts, noCache)) {
|
||||||
@ -410,6 +449,7 @@ ${this.content}
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(customMode, scrapped, urlToCrawl);
|
||||||
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted}`,
|
||||||
@ -425,6 +465,7 @@ ${this.content}
|
|||||||
}
|
}
|
||||||
|
|
||||||
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
const formatted = await this.formatSnapshot(customMode, lastScrapped, urlToCrawl);
|
||||||
|
chargeAmount = this.getChargeAmount(formatted);
|
||||||
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
if (customMode === 'screenshot' && Reflect.get(formatted, 'screenshotUrl')) {
|
||||||
|
|
||||||
return assignTransferProtocolMeta(`${formatted}`,
|
return assignTransferProtocolMeta(`${formatted}`,
|
||||||
@ -563,4 +604,21 @@ ${this.content}
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Works out how many tokens to bill for a formatted crawl result.
 *
 * The first truthy value among `content`, `text` and `html` is used as the
 * textual payload; when it is a string its size is measured with
 * `estimateToken`. Otherwise, a result carrying `screenshotUrl` or
 * `screenshot` is billed a flat 765 tokens.
 *
 * @param formatted - The formatted snapshot that is about to be returned.
 * @returns The amount to charge, or `0` when nothing billable is present.
 */
getChargeAmount(formatted: { [k: string]: any; }): number {
    const textContent = formatted?.content || formatted?.text || formatted?.html;

    if (typeof textContent === 'string') {
        return estimateToken(textContent);
    }

    // Guarded like the lookups above so a nullish `formatted` falls through
    // to the "nothing to charge" result instead of throwing a TypeError.
    const imageContent = formatted?.screenshotUrl || formatted?.screenshot;

    if (imageContent) {
        // OpenAI image token count for 1024x1024 image
        return 765;
    }

    // Callers in this file keep the charge in a variable initialised to 0 and
    // skip usage reporting when it is falsy, so 0 keeps the value numeric
    // without changing that behaviour.
    return 0;
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit d3bb3a7335ec9d96c68d1edf1b66fdf5e2fe5b7c
|
Subproject commit 584791b789cd483dab18735416744b4d10130993
|
Loading…
x
Reference in New Issue
Block a user