From 6f657ae19ad94c23a24fa50fa5300df73aa5b113 Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Tue, 8 Apr 2025 13:39:47 +0800 Subject: [PATCH 01/14] saas: remove sourcecode tip --- src/api/crawler.ts | 7 ------- src/api/serp.ts | 1 - 2 files changed, 8 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 8312bce..f8d38e3 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -162,17 +162,10 @@ export class CrawlerHost extends RPCHost { async getIndex(auth?: JinaEmbeddingsAuthDTO) { const indexObject: Record = Object.create(indexProto); - // Object.assign(indexObject, { - // usage1: `${ctx.origin}/YOUR_URL`, - // usage2: `${ctx.origin}/search/YOUR_SEARCH_QUERY`, - // homepage: 'https://jina.ai/reader', - // sourceCode: 'https://github.com/jina-ai/reader', - // }); Object.assign(indexObject, { usage1: 'https://r.jina.ai/YOUR_URL', usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', homepage: 'https://jina.ai/reader', - sourceCode: 'https://github.com/jina-ai/reader', }); await auth?.solveUID(); diff --git a/src/api/serp.ts b/src/api/serp.ts index b367b10..ca0edb5 100644 --- a/src/api/serp.ts +++ b/src/api/serp.ts @@ -69,7 +69,6 @@ export class SerpHost extends RPCHost { usage2: 'https://s.jina.ai/YOUR_SEARCH_QUERY', usage3: `${ctx.origin}/?q=YOUR_SEARCH_QUERY`, homepage: 'https://jina.ai/reader', - sourceCode: 'https://github.com/jina-ai/reader', }); if (auth && auth.user) { From 0cf4a9ede13adc65c15e2a29c33a3134d51dbe47 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 10 Apr 2025 18:15:51 +0800 Subject: [PATCH 02/14] saas: use alternative endpoint for auth --- src/dto/jina-embeddings-auth.ts | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/dto/jina-embeddings-auth.ts b/src/dto/jina-embeddings-auth.ts index 01439e3..aadf522 100644 --- a/src/dto/jina-embeddings-auth.ts +++ b/src/dto/jina-embeddings-auth.ts @@ -146,7 +146,10 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable { } try { - const r = await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken); + // TODO: go back using validateToken after performance issue fixed + const r = ((account?.wallet?.total_balance || 0) > 0) ? + await this.jinaEmbeddingsDashboard.authorization(this.bearerToken) : + await this.jinaEmbeddingsDashboard.validateToken(this.bearerToken); const brief = r.data; const draftAccount = JinaEmbeddingsTokenAccount.from({ ...account, ...brief, _id: this.bearerToken, From b6ac1782dc7046ededf10b8b35e3e140c096af3d Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 10 Apr 2025 21:47:41 +0800 Subject: [PATCH 03/14] fix: remove invalid cache lru --- src/dto/jina-embeddings-auth.ts | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/src/dto/jina-embeddings-auth.ts b/src/dto/jina-embeddings-auth.ts index aadf522..ffd6625 100644 --- a/src/dto/jina-embeddings-auth.ts +++ b/src/dto/jina-embeddings-auth.ts @@ -18,17 +18,8 @@ import envConfig from '../shared/services/secrets'; import { JinaEmbeddingsDashboardHTTP } from '../shared/3rd-party/jina-embeddings'; import { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; -import { LRUCache } from 'lru-cache'; - const authDtoLogger = logger.child({ service: 'JinaAuthDTO' }); -const invalidTokenLRU = new LRUCache({ - max: 256, - ttl: 60 * 60 * 1000, - updateAgeOnGet: false, - updateAgeOnHas: false, -}); - const THE_VERY_SAME_JINA_EMBEDDINGS_CLIENT = new JinaEmbeddingsDashboardHTTP(envConfig.JINA_EMBEDDINGS_DASHBOARD_API_KEY); @@ -91,12 +82,6 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable { }); } - if (invalidTokenLRU.get(this.bearerToken)) { - throw new AuthenticationFailedError({ - message: 'Invalid API key, please get a new one from https://jina.ai' - }); - } - let firestoreDegradation = false; let account; try { @@ -165,7 +150,6 @@ export class JinaEmbeddingsAuthDTO extends AutoCastable { authDtoLogger.warn(`Failed to get user brief: ${err}`, { err: marshalErrorLike(err) }); if (err?.status === 401) { - invalidTokenLRU.set(this.bearerToken, true); throw new AuthenticationFailedError({ message: 'Invalid API key, please get a new one from https://jina.ai' }); From 5f83d862dd717a125bde79d80d6ec5b873c5d490 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 11 Apr 2025 11:50:13 +0800 Subject: [PATCH 04/14] fix: encoding of from file snapshots --- src/services/snapshot-formatter.ts | 9 ++++---- src/utils/encoding.ts | 34 ++++++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 src/utils/encoding.ts diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index ffb6792..ab7eb32 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -14,7 +14,7 @@ import { cleanAttribute } from '../utils/misc'; import _ from 'lodash'; import { STATUS_CODES } from 'http'; import type { CrawlerOptions } from '../dto/crawler-options'; -import { readFile } from 'fs/promises'; +import { readFile } from '../utils/encoding'; import { pathToFileURL } from 'url'; import { countGPTToken } from '../shared/utils/openai'; @@ -804,7 +804,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; overrideContentType = undefined; } - const contentType = (overrideContentType || await file.mimeType).toLowerCase(); + const contentType: string = (overrideContentType || await file.mimeType).toLowerCase(); const fileName = overrideFileName || `${url.origin}${url.pathname}`; const snapshot: PageSnapshot = { title: '', @@ -821,11 +821,12 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; return snapshot; } try { + const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8'; if (contentType.startsWith('text/html')) { if ((await file.size) > 1024 * 1024 * 32) { throw new AssertionFailureError(`Failed to access ${url}: file too large`); } - snapshot.html = await readFile(await file.filePath, { encoding: 'utf-8' }); + snapshot.html = await readFile(await file.filePath, encoding); return snapshot; } @@ -833,7 +834,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; if ((await file.size) > 1024 * 1024 * 32) { throw new AssertionFailureError(`Failed to access ${url}: file too large`); } - snapshot.text = await readFile(await file.filePath, { encoding: 'utf-8' }); + snapshot.text = await readFile(await file.filePath, encoding); snapshot.html = `
${snapshot.text}
`; return snapshot; diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts new file mode 100644 index 0000000..0f34a9b --- /dev/null +++ b/src/utils/encoding.ts @@ -0,0 +1,34 @@ +import { createReadStream } from 'fs'; +import { Readable } from 'stream'; +import { TextDecoderStream } from 'stream/web'; + +export async function decodeFileStream( + fileStream: Readable, + encoding: string = 'utf-8', +): Promise { + const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); + Readable.toWeb(fileStream).pipeThrough(decodeStream); + const chunks = []; + + for await (const chunk of decodeStream.readable) { + chunks.push(chunk); + } + + return chunks.join(''); +} + + +export async function readFile( + filePath: string, + encoding: string = 'utf-8', +): Promise { + const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); + Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream); + const chunks = []; + + for await (const chunk of decodeStream.readable) { + chunks.push(chunk); + } + + return chunks.join(''); +} \ No newline at end of file From 75a4dbdd79931df8e73783b10414615f86788762 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Fri, 11 Apr 2025 22:36:49 +0800 Subject: [PATCH 05/14] fix: meta charset hint --- src/services/snapshot-formatter.ts | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/services/snapshot-formatter.ts b/src/services/snapshot-formatter.ts index ab7eb32..fbc06b5 100644 --- a/src/services/snapshot-formatter.ts +++ b/src/services/snapshot-formatter.ts @@ -827,6 +827,10 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; throw new AssertionFailureError(`Failed to access ${url}: file too large`); } snapshot.html = await readFile(await file.filePath, encoding); + const innerCharset = snapshot.html.slice(0, 1024).match(/]+text\/html;\s*?charset=([^>"]+)\"/i)?.[1]?.toLowerCase(); + if (innerCharset && innerCharset !== encoding) { + snapshot.html = await readFile(await file.filePath, innerCharset); + } return snapshot; } From fe269216489b113698b5adbcab395c6a127edca4 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Mon, 14 Apr 2025 19:30:07 +0800 Subject: [PATCH 06/14] saas: refactor proxy provider (#1186) * wip * fix --- .github/workflows/cd.yml | 4 ++-- src/api/crawler.ts | 18 ++++++++++++++---- src/services/serp/google.ts | 4 ++-- thinapps-shared | 2 +- 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index a039641..f9a6ee2 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -84,6 +84,6 @@ jobs: - name: Deploy SEARCH-EU with Tag run: | gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2 - - name: Deploy SERP-JP with Tag + - name: Deploy SERP-HK with Tag run: | - gcloud beta run deploy serp-jp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-northeast1 --async --min-instances 0 --deploy-health-check --use-http2 \ No newline at end of file + gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2 \ No newline at end of file diff --git a/src/api/crawler.ts b/src/api/crawler.ts index f8d38e3..6e4fa5b 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -41,13 +41,13 @@ import { } from '../services/errors'; import { countGPTToken as estimateToken } from '../shared/utils/openai'; -import { ProxyProvider } from '../shared/services/proxy-provider'; +import { ProxyProviderService } from '../shared/services/proxy-provider'; import { FirebaseStorageBucketControl } from '../shared/services/firebase-storage-bucket'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { RobotsTxtService } from '../services/robots-text'; import { TempFileManager } from '../services/temp-file'; import { MiscService } from '../services/misc'; -import { HTTPServiceError } from 'civkit'; +import { HTTPServiceError } from 'civkit/http'; import { GeoIPService } from '../services/geoip'; export interface ExtraScrappingOptions extends ScrappingOptions { @@ -87,7 +87,7 @@ export class CrawlerHost extends RPCHost { protected puppeteerControl: PuppeteerControl, protected curlControl: CurlControl, protected cfBrowserRendering: CFBrowserRendering, - protected proxyProvider: ProxyProvider, + protected proxyProvider: ProxyProviderService, protected lmControl: LmControl, protected jsdomControl: JSDomControl, protected snapshotFormatter: SnapshotFormatter, @@ -1232,6 +1232,7 @@ export class CrawlerHost extends RPCHost { }; } + proxyIterMap = new WeakMap>(); @retryWith((err) => { if (err instanceof ServiceBadApproachError) { return false; @@ -1250,8 +1251,17 @@ export class CrawlerHost extends RPCHost { if (opts?.allocProxy === 'none') { return this.curlControl.sideLoad(url, opts); } + let proxy; + if (opts) { + let it = this.proxyIterMap.get(opts); + if (!it) { + it = this.proxyProvider.iterAlloc(this.figureOutBestProxyCountry(opts)); + this.proxyIterMap.set(opts, it); + } + proxy = (await it.next()).value; + } - const proxy = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts)); + proxy ??= await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts)); this.logger.debug(`Proxy allocated`, { proxy: proxy.href }); const r = await this.curlControl.sideLoad(url, { ...opts, diff --git a/src/services/serp/google.ts b/src/services/serp/google.ts index 3f82d3e..23ba669 100644 --- a/src/services/serp/google.ts +++ b/src/services/serp/google.ts @@ -12,7 +12,7 @@ import { ApplicationError } from 'civkit/civ-rpc'; import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors'; import { parseJSONText } from 'civkit/vectorize'; import { retryWith } from 'civkit/decorators'; -import { ProxyProvider } from '../../shared/services/proxy-provider'; +import { ProxyProviderService } from '../../shared/services/proxy-provider'; @singleton() export class GoogleSERP extends AsyncService { @@ -24,7 +24,7 @@ export class GoogleSERP extends AsyncService { protected puppeteerControl: SERPSpecializedPuppeteerControl, protected jsDomControl: JSDomControl, protected curlControl: CurlControl, - protected proxyProvider: ProxyProvider, + protected proxyProvider: ProxyProviderService, ) { const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl); super(...filteredDeps); diff --git a/thinapps-shared b/thinapps-shared index 424f50c..6fac869 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 424f50ca8b6277d74185e16aa67ff2b366d9f727 +Subproject commit 6fac86977536a7b7440edba8d4cf2a1f0e769e8c From 5ba93067d269b5409c6d75c2cee02a18538967d5 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Mon, 14 Apr 2025 21:31:33 +0800 Subject: [PATCH 07/14] saas: use android ua --- src/services/serp/puppeteer.ts | 6 ++++-- thinapps-shared | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/src/services/serp/puppeteer.ts b/src/services/serp/puppeteer.ts index 5157879..3b9cdc2 100644 --- a/src/services/serp/puppeteer.ts +++ b/src/services/serp/puppeteer.ts @@ -233,6 +233,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { livePages = new Set(); lastPageCratedAt: number = 0; ua: string = ''; + effectiveUA: string = ''; protected _REPORT_FUNCTION_NAME = 'bingo'; @@ -299,7 +300,8 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { }); this.ua = await this.browser.userAgent(); this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); - this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, '')); + this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Linux; Android 10; K)'); + this.curlControl.impersonateChrome(this.effectiveUA); await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); @@ -322,7 +324,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { } const preparations = []; - preparations.push(page.setUserAgent(this.ua.replace(/Headless/i, ''))); + preparations.push(page.setUserAgent(this.effectiveUA)); // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); preparations.push(page.setBypassCSP(true)); diff --git a/thinapps-shared b/thinapps-shared index 6fac869..165a1fe 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 6fac86977536a7b7440edba8d4cf2a1f0e769e8c +Subproject commit 165a1fe0cb12728f320c081226c28d7f7b53b0ed From 6b1bfdaf1a9d72ca3e7e4c08a167134f71d1e80b Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Mon, 14 Apr 2025 21:48:09 +0800 Subject: [PATCH 08/14] fix: ua switch --- src/api/crawler.ts | 4 ++-- src/services/puppeteer.ts | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 6e4fa5b..6e63068 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -153,8 +153,8 @@ export class CrawlerHost extends RPCHost { override async init() { await this.dependencyReady(); - if (this.puppeteerControl.ua) { - this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, '')); + if (this.puppeteerControl.effectiveUA) { + this.curlControl.impersonateChrome(this.puppeteerControl.effectiveUA); } this.emit('ready'); diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 72f588d..b7388df 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -510,6 +510,7 @@ export class PuppeteerControl extends AsyncService { pagePhase = new WeakMap(); lastPageCratedAt: number = 0; ua: string = ''; + effectiveUA: string = ''; concurrentRequestsPerPage: number = 32; pageReqCtrl = new WeakMap(); @@ -582,7 +583,8 @@ export class PuppeteerControl extends AsyncService { }); this.ua = await this.browser.userAgent(); this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); - this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, '')); + this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Linux; Android 10; K)'); + this.curlControl.impersonateChrome(this.effectiveUA); await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); @@ -615,7 +617,7 @@ export class PuppeteerControl extends AsyncService { } const preparations = []; - preparations.push(page.setUserAgent(this.ua.replace(/Headless/i, ''))); + preparations.push(page.setUserAgent(this.effectiveUA)); // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); preparations.push(page.setBypassCSP(true)); From dc4605d1d5d22485be53eddb2caeaa2b3c63653c Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Mon, 14 Apr 2025 22:03:37 +0800 Subject: [PATCH 09/14] saas: use windows ua for more desktop screenshot --- src/services/puppeteer.ts | 2 +- src/services/serp/puppeteer.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index b7388df..67840af 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -583,7 +583,7 @@ export class PuppeteerControl extends AsyncService { }); this.ua = await this.browser.userAgent(); this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); - this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Linux; Android 10; K)'); + this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'); this.curlControl.impersonateChrome(this.effectiveUA); await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); diff --git a/src/services/serp/puppeteer.ts b/src/services/serp/puppeteer.ts index 3b9cdc2..40a0932 100644 --- a/src/services/serp/puppeteer.ts +++ b/src/services/serp/puppeteer.ts @@ -300,7 +300,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { }); this.ua = await this.browser.userAgent(); this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); - this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Linux; Android 10; K)'); + this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'); this.curlControl.impersonateChrome(this.effectiveUA); await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); From 75a1283a914202e19d3f9795c23224ae0c687f14 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Mon, 14 Apr 2025 22:52:34 +0800 Subject: [PATCH 10/14] bump: deps --- thinapps-shared | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/thinapps-shared b/thinapps-shared index 165a1fe..f30577e 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 165a1fe0cb12728f320c081226c28d7f7b53b0ed +Subproject commit f30577e73eca151f9d62198ae3b60d0172befaed From e6fdd87294214907d7d1095aae916565824d283a Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Tue, 15 Apr 2025 18:40:41 +0800 Subject: [PATCH 11/14] fix: saas issus --- src/api/searcher-serper.ts | 20 ++++++++++++++------ src/api/serp.ts | 19 ++++++++++--------- src/services/curl.ts | 19 ++++++++++++++++--- src/services/errors.ts | 30 +----------------------------- src/services/serper-search.ts | 6 +++++- thinapps-shared | 2 +- 6 files changed, 47 insertions(+), 49 deletions(-) diff --git a/src/api/searcher-serper.ts b/src/api/searcher-serper.ts index d36ff28..462bde3 100644 --- a/src/api/searcher-serper.ts +++ b/src/api/searcher-serper.ts @@ -6,7 +6,7 @@ import { marshalErrorLike } from 'civkit/lang'; import { objHashMd5B64Of } from 'civkit/hash'; import _ from 'lodash'; -import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; +import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit'; import { CrawlerHost, ExtraScrappingOptions } from './crawler'; import { SerperSearchResult } from '../db/searched'; @@ -19,8 +19,16 @@ import { AsyncLocalContext } from '../services/async-context'; import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry'; import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; -import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors'; -import { SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperSearchResponse, SerperWebSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; +import { InsufficientBalanceError } from '../services/errors'; +import { + SerperImageSearchResponse, + SerperNewsSearchResponse, + SerperSearchQueryParams, + SerperSearchResponse, + SerperWebSearchResponse, + WORLD_COUNTRIES, + WORLD_LANGUAGES +} from '../shared/3rd-party/serper-search'; import { toAsyncGenerator } from '../utils/misc'; import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { LRUCache } from 'lru-cache'; @@ -218,10 +226,10 @@ export class SearcherHost extends RPCHost { } const now = Date.now(); let tgtDate; - if (err.retryAfter) { - tgtDate = new Date(now + err.retryAfter * 1000); - } else if (err.retryAfterDate) { + if (err.retryAfterDate) { tgtDate = err.retryAfterDate; + } else if (err.retryAfter) { + tgtDate = new Date(now + err.retryAfter * 1000); } if (tgtDate) { diff --git a/src/api/serp.ts b/src/api/serp.ts index ca0edb5..74fca90 100644 --- a/src/api/serp.ts +++ b/src/api/serp.ts @@ -7,14 +7,14 @@ import { import { marshalErrorLike } from 'civkit/lang'; import _ from 'lodash'; -import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; +import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit'; import { GlobalLogger } from '../services/logger'; import { AsyncLocalContext } from '../services/async-context'; import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry'; import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; -import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors'; +import { InsufficientBalanceError } from '../services/errors'; import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; import { GoogleSERP } from '../services/serp/google'; import { WebSearchEntry } from '../services/serp/compat'; @@ -172,10 +172,11 @@ export class SerpHost extends RPCHost { const now = new Date(); const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf()); if (blockedTimeRemaining > 0) { - throw RateLimitTriggeredError.from({ - message: `Per UID rate limit exceeded (async)`, - retryAfter: Math.ceil(blockedTimeRemaining / 1000), - }); + this.logger.warn(`Rate limit triggered for ${uid}, this request should have been blocked`); + // throw RateLimitTriggeredError.from({ + // message: `Per UID rate limit exceeded (async)`, + // retryAfter: Math.ceil(blockedTimeRemaining / 1000), + // }); } } @@ -229,10 +230,10 @@ export class SerpHost extends RPCHost { } const now = Date.now(); let tgtDate; - if (err.retryAfter) { - tgtDate = new Date(now + err.retryAfter * 1000); - } else if (err.retryAfterDate) { + if (err.retryAfterDate) { tgtDate = err.retryAfterDate; + } else if (err.retryAfter) { + tgtDate = new Date(now + err.retryAfter * 1000); } if (tgtDate) { diff --git a/src/services/curl.ts b/src/services/curl.ts index bc61661..fdb7539 100644 --- a/src/services/curl.ts +++ b/src/services/curl.ts @@ -61,10 +61,23 @@ export class CurlControl extends AsyncService { } curlImpersonateHeader(curl: Curl, headers?: object) { + let uaPlatform = this.platform; + if (this.ua.includes('Windows')) { + uaPlatform = 'Windows'; + } else if (this.ua.includes('Android')) { + uaPlatform = 'Android'; + } else if (this.ua.includes('iPhone') || this.ua.includes('iPad') || this.ua.includes('iPod')) { + uaPlatform = 'iOS'; + } else if (this.ua.includes('CrOS')) { + uaPlatform = 'Chrome OS'; + } else if (this.ua.includes('Macintosh')) { + uaPlatform = 'macOS'; + } + const mixinHeaders: Record = { - 'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`, - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': this.platform, + 'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`, + 'Sec-Ch-Ua-Mobile': '?0', + 'Sec-Ch-Ua-Platform': `"${uaPlatform}"`, 'Upgrade-Insecure-Requests': '1', 'User-Agent': this.ua, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', diff --git a/src/services/errors.ts b/src/services/errors.ts index 44246db..51344a2 100644 --- a/src/services/errors.ts +++ b/src/services/errors.ts @@ -1,4 +1,4 @@ -import { ApplicationError, Prop, RPC_TRANSFER_PROTOCOL_META_SYMBOL, StatusCode } from 'civkit/civ-rpc'; +import { ApplicationError, StatusCode } from 'civkit/civ-rpc'; import _ from 'lodash'; import dayjs from 'dayjs'; import utc from 'dayjs/plugin/utc'; @@ -46,31 +46,3 @@ export class SecurityCompromiseError extends ApplicationError { } @StatusCode(41201) export class BatchSizeTooLargeError extends ApplicationError { } - - -@StatusCode(42903) -export class RateLimitTriggeredError extends ApplicationError { - - @Prop({ - desc: 'Retry after seconds', - }) - retryAfter?: number; - - @Prop({ - desc: 'Retry after date', - }) - retryAfterDate?: Date; - - protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() { - const retryAfter = this.retryAfter || this.retryAfterDate; - if (!retryAfter) { - return super[RPC_TRANSFER_PROTOCOL_META_SYMBOL]; - } - - return _.merge(_.cloneDeep(super[RPC_TRANSFER_PROTOCOL_META_SYMBOL]), { - headers: { - 'Retry-After': `${retryAfter instanceof Date ? dayjs(retryAfter).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]') : retryAfter}`, - } - }); - } -} diff --git a/src/services/serper-search.ts b/src/services/serper-search.ts index 4a6c41c..59698de 100644 --- a/src/services/serper-search.ts +++ b/src/services/serper-search.ts @@ -56,6 +56,7 @@ export class SerperSearchService extends AsyncService { let maxTries = 3; while (maxTries--) { + const t0 = Date.now(); try { this.logger.debug(`Doing external search`, query); let r; @@ -101,11 +102,14 @@ export class SerperSearchService extends AsyncService { break; } } + const dt = Date.now() - t0; this.blackHoleDetector.itWorked(); + this.logger.debug(`External search took ${dt}ms`, { searchDt: dt, variant }); return r.parsed; } catch (err: any) { - this.logger.error(`${variant} search failed: ${err?.message}`, { err: marshalErrorLike(err) }); + const dt = Date.now() - t0; + this.logger.error(`${variant} search failed: ${err?.message}`, { searchDt: dt, err: marshalErrorLike(err) }); if (err?.status === 429) { await delay(500 + 1000 * Math.random()); continue; diff --git a/thinapps-shared b/thinapps-shared index f30577e..580ea72 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit f30577e73eca151f9d62198ae3b60d0172befaed +Subproject commit 580ea72e0eddaa115b85dabf29de41d079ecd2d0 From c72e0d3f171a7b3a25e075f32126f54b51f4d1ab Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Tue, 15 Apr 2025 19:12:04 +0800 Subject: [PATCH 12/14] fix --- src/api/crawler.ts | 2 -- thinapps-shared | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 6e63068..d0826f4 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -1293,8 +1293,6 @@ export class CrawlerHost extends RPCHost { if (opts.countryHint) { if (this.proxyProvider.supports(opts.countryHint)) { draft ??= opts.countryHint; - } else if (opts.countryHint === 'cn') { - draft ??= 'hk'; } } diff --git a/thinapps-shared b/thinapps-shared index 580ea72..9a32fc3 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 580ea72e0eddaa115b85dabf29de41d079ecd2d0 +Subproject commit 9a32fc38abea6f42fe8f5bec452ccba0d1d0089d From 6ccf56bf99b82d7d37fbc80a4e23e13bb1da13d6 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 16 Apr 2025 15:20:36 +0800 Subject: [PATCH 13/14] bump: deps --- package-lock.json | 14 ++++++-------- package.json | 2 +- src/api/searcher-serper.ts | 17 +++++++++++++++-- src/api/serp.ts | 16 ++++++++++++++-- thinapps-shared | 2 +- 5 files changed, 37 insertions(+), 14 deletions(-) diff --git a/package-lock.json b/package-lock.json index 35f1d1e..9aa825d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -47,7 +47,7 @@ "tld-extract": "^2.1.0", "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", - "undici": "^5.24.0" + "undici": "^7.8.0" }, "devDependencies": { "@types/archiver": "^5.3.4", @@ -12500,14 +12500,12 @@ } }, "node_modules/undici": { - "version": "5.28.4", - "resolved": "https://registry.npmjs.org/undici/-/undici-5.28.4.tgz", - "integrity": "sha512-72RFADWFqKmUb2hmmvNODKL3p9hcB6Gt2DOQMis1SEBaV6a4MH8soBvzg+95CYhCKPFedut2JY9bMfrDl9D23g==", - "dependencies": { - "@fastify/busboy": "^2.0.0" - }, + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.8.0.tgz", + "integrity": "sha512-vFv1GA99b7eKO1HG/4RPu2Is3FBTWBrmzqzO0mz+rLxN3yXkE4mqRcb8g8fHxzX4blEysrNZLqg5RbJLqX5buA==", + "license": "MIT", "engines": { - "node": ">=14.0" + "node": ">=20.18.1" } }, "node_modules/undici-types": { diff --git a/package.json b/package.json index ae35f72..09bfa7c 100644 --- a/package.json +++ b/package.json @@ -56,7 +56,7 @@ "tld-extract": "^2.1.0", "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", - "undici": "^5.24.0" + "undici": "^7.8.0" }, "devDependencies": { "@types/archiver": "^5.3.4", diff --git a/src/api/searcher-serper.ts b/src/api/searcher-serper.ts index 462bde3..8314c8f 100644 --- a/src/api/searcher-serper.ts +++ b/src/api/searcher-serper.ts @@ -32,6 +32,7 @@ import { import { toAsyncGenerator } from '../utils/misc'; import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { LRUCache } from 'lru-cache'; +import { API_CALL_STATUS } from '../shared/db/api-roll'; const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase()); @@ -256,8 +257,20 @@ export class SearcherHost extends RPCHost { auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => { this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) }); }); - const apiRoll = await apiRollPromise; - apiRoll.chargeAmount = chargeAmount; + try { + const apiRoll = await apiRollPromise; + apiRoll.chargeAmount = chargeAmount; + + } catch (err) { + await this.rateLimitControl.record({ + uid, + tags: [rpcReflect.name.toUpperCase()], + status: API_CALL_STATUS.SUCCESS, + chargeAmount, + }).save().catch((err) => { + this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) }); + }); + } } }); diff --git a/src/api/serp.ts b/src/api/serp.ts index 74fca90..2e0f363 100644 --- a/src/api/serp.ts +++ b/src/api/serp.ts @@ -25,6 +25,7 @@ import { SERPResult } from '../db/searched'; import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper'; import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { LRUCache } from 'lru-cache'; +import { API_CALL_STATUS } from '../shared/db/api-roll'; const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase()); @@ -261,8 +262,19 @@ export class SerpHost extends RPCHost { auth.reportUsage(chargeAmount, `reader-search`).catch((err) => { this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) }); }); - const apiRoll = await apiRollPromise; - apiRoll.chargeAmount = chargeAmount; + try { + const apiRoll = await apiRollPromise; + apiRoll.chargeAmount = chargeAmount; + } catch (err) { + await this.rateLimitControl.record({ + uid, + tags: [rpcReflect.name.toUpperCase()], + status: API_CALL_STATUS.SUCCESS, + chargeAmount, + }).save().catch((err) => { + this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) }); + }); + } } }); diff --git a/thinapps-shared b/thinapps-shared index 9a32fc3..f89255c 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 9a32fc38abea6f42fe8f5bec452ccba0d1d0089d +Subproject commit f89255cd6546641f72eefba140a4aef96a0e03fc From 48cff2b974d2928be5f977eb2a9858657ed3cd7c Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Wed, 16 Apr 2025 15:36:22 +0800 Subject: [PATCH 14/14] fix: add logging to serp --- src/api/serp.ts | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/api/serp.ts b/src/api/serp.ts index 2e0f363..292c05f 100644 --- a/src/api/serp.ts +++ b/src/api/serp.ts @@ -480,25 +480,30 @@ export class SerpHost extends RPCHost { let lastError; outerLoop: for (const client of this.iterProviders(provider)) { + const t0 = Date.now(); try { switch (variant) { case 'images': { r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]); - break outerLoop; + break; } case 'news': { r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]); - break outerLoop; + break; } case 'web': default: { r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]); - break outerLoop; + break; } } + const dt = Date.now() - t0; + this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name }); + break outerLoop; } catch (err) { lastError = err; - this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err }); + const dt = Date.now() - t0; + this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, }); } }