diff --git a/.github/workflows/cd.yml b/.github/workflows/cd.yml index a039641..f9a6ee2 100644 --- a/.github/workflows/cd.yml +++ b/.github/workflows/cd.yml @@ -84,6 +84,6 @@ jobs: - name: Deploy SEARCH-EU with Tag run: | gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2 - - name: Deploy SERP-JP with Tag + - name: Deploy SERP-HK with Tag run: | - gcloud beta run deploy serp-jp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-northeast1 --async --min-instances 0 --deploy-health-check --use-http2 \ No newline at end of file + gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2 \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 854f457..60a2839 100644 --- a/package-lock.json +++ b/package-lock.json @@ -48,7 +48,7 @@ "tld-extract": "^2.1.0", "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", - "undici": "^5.24.0" + "undici": "^7.8.0" }, "devDependencies": { "@types/archiver": "^5.3.4", @@ -12494,14 +12494,12 @@ } }, "node_modules/undici": { - "version": "5.28.4", - "resolved": "https://registry.npmjs.org/undici/-/undici-5.28.4.tgz", - "integrity": "sha512-72RFADWFqKmUb2hmmvNODKL3p9hcB6Gt2DOQMis1SEBaV6a4MH8soBvzg+95CYhCKPFedut2JY9bMfrDl9D23g==", - "dependencies": { - "@fastify/busboy": "^2.0.0" - }, + "version": "7.8.0", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.8.0.tgz", + "integrity": "sha512-vFv1GA99b7eKO1HG/4RPu2Is3FBTWBrmzqzO0mz+rLxN3yXkE4mqRcb8g8fHxzX4blEysrNZLqg5RbJLqX5buA==", + "license": "MIT", "engines": { - "node": ">=14.0" + "node": ">=20.18.1" } }, "node_modules/undici-types": { diff --git a/package.json b/package.json index cedf320..dca9d22 100644 --- a/package.json +++ b/package.json @@ -57,7 +57,7 @@ "tld-extract": "^2.1.0", "turndown": "^7.1.3", "turndown-plugin-gfm": "^1.0.2", - "undici": "^5.24.0" + "undici": "^7.8.0" }, "devDependencies": { "@types/archiver": "^5.3.4", diff --git a/src/api/crawler.ts b/src/api/crawler.ts index 176a1d7..d0826f4 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -47,7 +47,7 @@ import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { RobotsTxtService } from '../services/robots-text'; import { TempFileManager } from '../services/temp-file'; import { MiscService } from '../services/misc'; -import { HTTPServiceError } from 'civkit'; +import { HTTPServiceError } from 'civkit/http'; import { GeoIPService } from '../services/geoip'; export interface ExtraScrappingOptions extends ScrappingOptions { @@ -153,8 +153,8 @@ export class CrawlerHost extends RPCHost { override async init() { await this.dependencyReady(); - if (this.puppeteerControl.ua) { - this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, '')); + if (this.puppeteerControl.effectiveUA) { + this.curlControl.impersonateChrome(this.puppeteerControl.effectiveUA); } this.emit('ready'); @@ -1232,6 +1232,7 @@ export class CrawlerHost extends RPCHost { }; } + proxyIterMap = new WeakMap>(); @retryWith((err) => { if (err instanceof ServiceBadApproachError) { return false; @@ -1250,8 +1251,17 @@ export class CrawlerHost extends RPCHost { if (opts?.allocProxy === 'none') { return this.curlControl.sideLoad(url, opts); } + let proxy; + if (opts) { + let it = this.proxyIterMap.get(opts); + if (!it) { + it = this.proxyProvider.iterAlloc(this.figureOutBestProxyCountry(opts)); + this.proxyIterMap.set(opts, it); + } + proxy = (await it.next()).value; + } - const proxy = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts)); + proxy ??= await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts)); this.logger.debug(`Proxy allocated`, { proxy: proxy.href }); const r = await this.curlControl.sideLoad(url, { ...opts, @@ -1283,8 +1293,6 @@ export class CrawlerHost extends RPCHost { if (opts.countryHint) { if (this.proxyProvider.supports(opts.countryHint)) { draft ??= opts.countryHint; - } else if (opts.countryHint === 'cn') { - draft ??= 'hk'; } } diff --git a/src/api/searcher-serper.ts b/src/api/searcher-serper.ts index d36ff28..8314c8f 100644 --- a/src/api/searcher-serper.ts +++ b/src/api/searcher-serper.ts @@ -6,7 +6,7 @@ import { marshalErrorLike } from 'civkit/lang'; import { objHashMd5B64Of } from 'civkit/hash'; import _ from 'lodash'; -import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; +import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit'; import { CrawlerHost, ExtraScrappingOptions } from './crawler'; import { SerperSearchResult } from '../db/searched'; @@ -19,11 +19,20 @@ import { AsyncLocalContext } from '../services/async-context'; import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry'; import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; -import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors'; -import { SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperSearchResponse, SerperWebSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; +import { InsufficientBalanceError } from '../services/errors'; +import { + SerperImageSearchResponse, + SerperNewsSearchResponse, + SerperSearchQueryParams, + SerperSearchResponse, + SerperWebSearchResponse, + WORLD_COUNTRIES, + WORLD_LANGUAGES +} from '../shared/3rd-party/serper-search'; import { toAsyncGenerator } from '../utils/misc'; import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { LRUCache } from 'lru-cache'; +import { API_CALL_STATUS } from '../shared/db/api-roll'; const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase()); @@ -218,10 +227,10 @@ export class SearcherHost extends RPCHost { } const now = Date.now(); let tgtDate; - if (err.retryAfter) { - tgtDate = new Date(now + err.retryAfter * 1000); - } else if (err.retryAfterDate) { + if (err.retryAfterDate) { tgtDate = err.retryAfterDate; + } else if (err.retryAfter) { + tgtDate = new Date(now + err.retryAfter * 1000); } if (tgtDate) { @@ -248,8 +257,20 @@ export class SearcherHost extends RPCHost { auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => { this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) }); }); - const apiRoll = await apiRollPromise; - apiRoll.chargeAmount = chargeAmount; + try { + const apiRoll = await apiRollPromise; + apiRoll.chargeAmount = chargeAmount; + + } catch (err) { + await this.rateLimitControl.record({ + uid, + tags: [rpcReflect.name.toUpperCase()], + status: API_CALL_STATUS.SUCCESS, + chargeAmount, + }).save().catch((err) => { + this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) }); + }); + } } }); diff --git a/src/api/serp.ts b/src/api/serp.ts index ca0edb5..292c05f 100644 --- a/src/api/serp.ts +++ b/src/api/serp.ts @@ -7,14 +7,14 @@ import { import { marshalErrorLike } from 'civkit/lang'; import _ from 'lodash'; -import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit'; +import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit'; import { GlobalLogger } from '../services/logger'; import { AsyncLocalContext } from '../services/async-context'; import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry'; import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; -import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors'; +import { InsufficientBalanceError } from '../services/errors'; import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; import { GoogleSERP } from '../services/serp/google'; import { WebSearchEntry } from '../services/serp/compat'; @@ -25,6 +25,7 @@ import { SERPResult } from '../db/searched'; import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper'; import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { LRUCache } from 'lru-cache'; +import { API_CALL_STATUS } from '../shared/db/api-roll'; const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase()); @@ -172,10 +173,11 @@ export class SerpHost extends RPCHost { const now = new Date(); const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf()); if (blockedTimeRemaining > 0) { - throw RateLimitTriggeredError.from({ - message: `Per UID rate limit exceeded (async)`, - retryAfter: Math.ceil(blockedTimeRemaining / 1000), - }); + this.logger.warn(`Rate limit triggered for ${uid}, this request should have been blocked`); + // throw RateLimitTriggeredError.from({ + // message: `Per UID rate limit exceeded (async)`, + // retryAfter: Math.ceil(blockedTimeRemaining / 1000), + // }); } } @@ -229,10 +231,10 @@ export class SerpHost extends RPCHost { } const now = Date.now(); let tgtDate; - if (err.retryAfter) { - tgtDate = new Date(now + err.retryAfter * 1000); - } else if (err.retryAfterDate) { + if (err.retryAfterDate) { tgtDate = err.retryAfterDate; + } else if (err.retryAfter) { + tgtDate = new Date(now + err.retryAfter * 1000); } if (tgtDate) { @@ -260,8 +262,19 @@ export class SerpHost extends RPCHost { auth.reportUsage(chargeAmount, `reader-search`).catch((err) => { this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) }); }); - const apiRoll = await apiRollPromise; - apiRoll.chargeAmount = chargeAmount; + try { + const apiRoll = await apiRollPromise; + apiRoll.chargeAmount = chargeAmount; + } catch (err) { + await this.rateLimitControl.record({ + uid, + tags: [rpcReflect.name.toUpperCase()], + status: API_CALL_STATUS.SUCCESS, + chargeAmount, + }).save().catch((err) => { + this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) }); + }); + } } }); @@ -467,25 +480,30 @@ export class SerpHost extends RPCHost { let lastError; outerLoop: for (const client of this.iterProviders(provider)) { + const t0 = Date.now(); try { switch (variant) { case 'images': { r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]); - break outerLoop; + break; } case 'news': { r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]); - break outerLoop; + break; } case 'web': default: { r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]); - break outerLoop; + break; } } + const dt = Date.now() - t0; + this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name }); + break outerLoop; } catch (err) { lastError = err; - this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err }); + const dt = Date.now() - t0; + this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, }); } } diff --git a/src/services/curl.ts b/src/services/curl.ts index bc61661..fdb7539 100644 --- a/src/services/curl.ts +++ b/src/services/curl.ts @@ -61,10 +61,23 @@ export class CurlControl extends AsyncService { } curlImpersonateHeader(curl: Curl, headers?: object) { + let uaPlatform = this.platform; + if (this.ua.includes('Windows')) { + uaPlatform = 'Windows'; + } else if (this.ua.includes('Android')) { + uaPlatform = 'Android'; + } else if (this.ua.includes('iPhone') || this.ua.includes('iPad') || this.ua.includes('iPod')) { + uaPlatform = 'iOS'; + } else if (this.ua.includes('CrOS')) { + uaPlatform = 'Chrome OS'; + } else if (this.ua.includes('Macintosh')) { + uaPlatform = 'macOS'; + } + const mixinHeaders: Record = { - 'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`, - 'sec-ch-ua-mobile': '?0', - 'sec-ch-ua-platform': this.platform, + 'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`, + 'Sec-Ch-Ua-Mobile': '?0', + 'Sec-Ch-Ua-Platform': `"${uaPlatform}"`, 'Upgrade-Insecure-Requests': '1', 'User-Agent': this.ua, 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7', diff --git a/src/services/errors.ts b/src/services/errors.ts index 44246db..51344a2 100644 --- a/src/services/errors.ts +++ b/src/services/errors.ts @@ -1,4 +1,4 @@ -import { ApplicationError, Prop, RPC_TRANSFER_PROTOCOL_META_SYMBOL, StatusCode } from 'civkit/civ-rpc'; +import { ApplicationError, StatusCode } from 'civkit/civ-rpc'; import _ from 'lodash'; import dayjs from 'dayjs'; import utc from 'dayjs/plugin/utc'; @@ -46,31 +46,3 @@ export class SecurityCompromiseError extends ApplicationError { } @StatusCode(41201) export class BatchSizeTooLargeError extends ApplicationError { } - - -@StatusCode(42903) -export class RateLimitTriggeredError extends ApplicationError { - - @Prop({ - desc: 'Retry after seconds', - }) - retryAfter?: number; - - @Prop({ - desc: 'Retry after date', - }) - retryAfterDate?: Date; - - protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() { - const retryAfter = this.retryAfter || this.retryAfterDate; - if (!retryAfter) { - return super[RPC_TRANSFER_PROTOCOL_META_SYMBOL]; - } - - return _.merge(_.cloneDeep(super[RPC_TRANSFER_PROTOCOL_META_SYMBOL]), { - headers: { - 'Retry-After': `${retryAfter instanceof Date ? dayjs(retryAfter).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]') : retryAfter}`, - } - }); - } -} diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 72f588d..67840af 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -510,6 +510,7 @@ export class PuppeteerControl extends AsyncService { pagePhase = new WeakMap(); lastPageCratedAt: number = 0; ua: string = ''; + effectiveUA: string = ''; concurrentRequestsPerPage: number = 32; pageReqCtrl = new WeakMap(); @@ -582,7 +583,8 @@ export class PuppeteerControl extends AsyncService { }); this.ua = await this.browser.userAgent(); this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); - this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, '')); + this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'); + this.curlControl.impersonateChrome(this.effectiveUA); await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); @@ -615,7 +617,7 @@ export class PuppeteerControl extends AsyncService { } const preparations = []; - preparations.push(page.setUserAgent(this.ua.replace(/Headless/i, ''))); + preparations.push(page.setUserAgent(this.effectiveUA)); // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); preparations.push(page.setBypassCSP(true)); diff --git a/src/services/serp/puppeteer.ts b/src/services/serp/puppeteer.ts index 5157879..40a0932 100644 --- a/src/services/serp/puppeteer.ts +++ b/src/services/serp/puppeteer.ts @@ -233,6 +233,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { livePages = new Set(); lastPageCratedAt: number = 0; ua: string = ''; + effectiveUA: string = ''; protected _REPORT_FUNCTION_NAME = 'bingo'; @@ -299,7 +300,8 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { }); this.ua = await this.browser.userAgent(); this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`); - this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, '')); + this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'); + this.curlControl.impersonateChrome(this.effectiveUA); await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); @@ -322,7 +324,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { } const preparations = []; - preparations.push(page.setUserAgent(this.ua.replace(/Headless/i, ''))); + preparations.push(page.setUserAgent(this.effectiveUA)); // preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`)); // preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`)); preparations.push(page.setBypassCSP(true)); diff --git a/src/services/serper-search.ts b/src/services/serper-search.ts index 4a6c41c..59698de 100644 --- a/src/services/serper-search.ts +++ b/src/services/serper-search.ts @@ -56,6 +56,7 @@ export class SerperSearchService extends AsyncService { let maxTries = 3; while (maxTries--) { + const t0 = Date.now(); try { this.logger.debug(`Doing external search`, query); let r; @@ -101,11 +102,14 @@ export class SerperSearchService extends AsyncService { break; } } + const dt = Date.now() - t0; this.blackHoleDetector.itWorked(); + this.logger.debug(`External search took ${dt}ms`, { searchDt: dt, variant }); return r.parsed; } catch (err: any) { - this.logger.error(`${variant} search failed: ${err?.message}`, { err: marshalErrorLike(err) }); + const dt = Date.now() - t0; + this.logger.error(`${variant} search failed: ${err?.message}`, { searchDt: dt, err: marshalErrorLike(err) }); if (err?.status === 429) { await delay(500 + 1000 * Math.random()); continue; diff --git a/thinapps-shared b/thinapps-shared index 0a59b8f..f89255c 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit 0a59b8f84c03b2099a9785769bedc98573e65847 +Subproject commit f89255cd6546641f72eefba140a4aef96a0e03fc