mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-19 01:15:51 +08:00
Merge branch 'main' into mongodb
This commit is contained in:
commit
efaeabb49e
4
.github/workflows/cd.yml
vendored
4
.github/workflows/cd.yml
vendored
@ -84,6 +84,6 @@ jobs:
|
|||||||
- name: Deploy SEARCH-EU with Tag
|
- name: Deploy SEARCH-EU with Tag
|
||||||
run: |
|
run: |
|
||||||
gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
gcloud beta run deploy search-eu --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/search.js --region europe-west1 --async --min-instances 0 --deploy-health-check --use-http2
|
||||||
- name: Deploy SERP-JP with Tag
|
- name: Deploy SERP-HK with Tag
|
||||||
run: |
|
run: |
|
||||||
gcloud beta run deploy serp-jp --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-northeast1 --async --min-instances 0 --deploy-health-check --use-http2
|
gcloud beta run deploy serp-hk --image us-docker.pkg.dev/reader-6b7dc/jina-reader/reader@${{steps.container.outputs.imageid}} --tag ${{ env.RELEASE_VERSION }} --command '' --args build/stand-alone/serp.js --region asia-east2 --async --min-instances 0 --deploy-health-check --use-http2
|
14
package-lock.json
generated
14
package-lock.json
generated
@ -48,7 +48,7 @@
|
|||||||
"tld-extract": "^2.1.0",
|
"tld-extract": "^2.1.0",
|
||||||
"turndown": "^7.1.3",
|
"turndown": "^7.1.3",
|
||||||
"turndown-plugin-gfm": "^1.0.2",
|
"turndown-plugin-gfm": "^1.0.2",
|
||||||
"undici": "^5.24.0"
|
"undici": "^7.8.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/archiver": "^5.3.4",
|
"@types/archiver": "^5.3.4",
|
||||||
@ -12494,14 +12494,12 @@
|
|||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/undici": {
|
"node_modules/undici": {
|
||||||
"version": "5.28.4",
|
"version": "7.8.0",
|
||||||
"resolved": "https://registry.npmjs.org/undici/-/undici-5.28.4.tgz",
|
"resolved": "https://registry.npmjs.org/undici/-/undici-7.8.0.tgz",
|
||||||
"integrity": "sha512-72RFADWFqKmUb2hmmvNODKL3p9hcB6Gt2DOQMis1SEBaV6a4MH8soBvzg+95CYhCKPFedut2JY9bMfrDl9D23g==",
|
"integrity": "sha512-vFv1GA99b7eKO1HG/4RPu2Is3FBTWBrmzqzO0mz+rLxN3yXkE4mqRcb8g8fHxzX4blEysrNZLqg5RbJLqX5buA==",
|
||||||
"dependencies": {
|
"license": "MIT",
|
||||||
"@fastify/busboy": "^2.0.0"
|
|
||||||
},
|
|
||||||
"engines": {
|
"engines": {
|
||||||
"node": ">=14.0"
|
"node": ">=20.18.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"node_modules/undici-types": {
|
"node_modules/undici-types": {
|
||||||
|
@ -57,7 +57,7 @@
|
|||||||
"tld-extract": "^2.1.0",
|
"tld-extract": "^2.1.0",
|
||||||
"turndown": "^7.1.3",
|
"turndown": "^7.1.3",
|
||||||
"turndown-plugin-gfm": "^1.0.2",
|
"turndown-plugin-gfm": "^1.0.2",
|
||||||
"undici": "^5.24.0"
|
"undici": "^7.8.0"
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/archiver": "^5.3.4",
|
"@types/archiver": "^5.3.4",
|
||||||
|
@ -47,7 +47,7 @@ import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
|||||||
import { RobotsTxtService } from '../services/robots-text';
|
import { RobotsTxtService } from '../services/robots-text';
|
||||||
import { TempFileManager } from '../services/temp-file';
|
import { TempFileManager } from '../services/temp-file';
|
||||||
import { MiscService } from '../services/misc';
|
import { MiscService } from '../services/misc';
|
||||||
import { HTTPServiceError } from 'civkit';
|
import { HTTPServiceError } from 'civkit/http';
|
||||||
import { GeoIPService } from '../services/geoip';
|
import { GeoIPService } from '../services/geoip';
|
||||||
|
|
||||||
export interface ExtraScrappingOptions extends ScrappingOptions {
|
export interface ExtraScrappingOptions extends ScrappingOptions {
|
||||||
@ -153,8 +153,8 @@ export class CrawlerHost extends RPCHost {
|
|||||||
override async init() {
|
override async init() {
|
||||||
await this.dependencyReady();
|
await this.dependencyReady();
|
||||||
|
|
||||||
if (this.puppeteerControl.ua) {
|
if (this.puppeteerControl.effectiveUA) {
|
||||||
this.curlControl.impersonateChrome(this.puppeteerControl.ua.replace(/Headless/i, ''));
|
this.curlControl.impersonateChrome(this.puppeteerControl.effectiveUA);
|
||||||
}
|
}
|
||||||
|
|
||||||
this.emit('ready');
|
this.emit('ready');
|
||||||
@ -1232,6 +1232,7 @@ export class CrawlerHost extends RPCHost {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
proxyIterMap = new WeakMap<ExtraScrappingOptions, ReturnType<ProxyProviderService['iterAlloc']>>();
|
||||||
@retryWith((err) => {
|
@retryWith((err) => {
|
||||||
if (err instanceof ServiceBadApproachError) {
|
if (err instanceof ServiceBadApproachError) {
|
||||||
return false;
|
return false;
|
||||||
@ -1250,8 +1251,17 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (opts?.allocProxy === 'none') {
|
if (opts?.allocProxy === 'none') {
|
||||||
return this.curlControl.sideLoad(url, opts);
|
return this.curlControl.sideLoad(url, opts);
|
||||||
}
|
}
|
||||||
|
let proxy;
|
||||||
|
if (opts) {
|
||||||
|
let it = this.proxyIterMap.get(opts);
|
||||||
|
if (!it) {
|
||||||
|
it = this.proxyProvider.iterAlloc(this.figureOutBestProxyCountry(opts));
|
||||||
|
this.proxyIterMap.set(opts, it);
|
||||||
|
}
|
||||||
|
proxy = (await it.next()).value;
|
||||||
|
}
|
||||||
|
|
||||||
const proxy = await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
|
proxy ??= await this.proxyProvider.alloc(this.figureOutBestProxyCountry(opts));
|
||||||
this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
|
this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
|
||||||
const r = await this.curlControl.sideLoad(url, {
|
const r = await this.curlControl.sideLoad(url, {
|
||||||
...opts,
|
...opts,
|
||||||
@ -1283,8 +1293,6 @@ export class CrawlerHost extends RPCHost {
|
|||||||
if (opts.countryHint) {
|
if (opts.countryHint) {
|
||||||
if (this.proxyProvider.supports(opts.countryHint)) {
|
if (this.proxyProvider.supports(opts.countryHint)) {
|
||||||
draft ??= opts.countryHint;
|
draft ??= opts.countryHint;
|
||||||
} else if (opts.countryHint === 'cn') {
|
|
||||||
draft ??= 'hk';
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6,7 +6,7 @@ import { marshalErrorLike } from 'civkit/lang';
|
|||||||
import { objHashMd5B64Of } from 'civkit/hash';
|
import { objHashMd5B64Of } from 'civkit/hash';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
|
|
||||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit';
|
||||||
|
|
||||||
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
import { CrawlerHost, ExtraScrappingOptions } from './crawler';
|
||||||
import { SerperSearchResult } from '../db/searched';
|
import { SerperSearchResult } from '../db/searched';
|
||||||
@ -19,11 +19,20 @@ import { AsyncLocalContext } from '../services/async-context';
|
|||||||
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
||||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||||
import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors';
|
import { InsufficientBalanceError } from '../services/errors';
|
||||||
import { SerperImageSearchResponse, SerperNewsSearchResponse, SerperSearchQueryParams, SerperSearchResponse, SerperWebSearchResponse, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
import {
|
||||||
|
SerperImageSearchResponse,
|
||||||
|
SerperNewsSearchResponse,
|
||||||
|
SerperSearchQueryParams,
|
||||||
|
SerperSearchResponse,
|
||||||
|
SerperWebSearchResponse,
|
||||||
|
WORLD_COUNTRIES,
|
||||||
|
WORLD_LANGUAGES
|
||||||
|
} from '../shared/3rd-party/serper-search';
|
||||||
import { toAsyncGenerator } from '../utils/misc';
|
import { toAsyncGenerator } from '../utils/misc';
|
||||||
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||||
import { LRUCache } from 'lru-cache';
|
import { LRUCache } from 'lru-cache';
|
||||||
|
import { API_CALL_STATUS } from '../shared/db/api-roll';
|
||||||
|
|
||||||
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
|
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
|
||||||
|
|
||||||
@ -218,10 +227,10 @@ export class SearcherHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
let tgtDate;
|
let tgtDate;
|
||||||
if (err.retryAfter) {
|
if (err.retryAfterDate) {
|
||||||
tgtDate = new Date(now + err.retryAfter * 1000);
|
|
||||||
} else if (err.retryAfterDate) {
|
|
||||||
tgtDate = err.retryAfterDate;
|
tgtDate = err.retryAfterDate;
|
||||||
|
} else if (err.retryAfter) {
|
||||||
|
tgtDate = new Date(now + err.retryAfter * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tgtDate) {
|
if (tgtDate) {
|
||||||
@ -248,8 +257,20 @@ export class SearcherHost extends RPCHost {
|
|||||||
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
auth.reportUsage(chargeAmount, `reader-${rpcReflect.name}`).catch((err) => {
|
||||||
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
||||||
});
|
});
|
||||||
|
try {
|
||||||
const apiRoll = await apiRollPromise;
|
const apiRoll = await apiRollPromise;
|
||||||
apiRoll.chargeAmount = chargeAmount;
|
apiRoll.chargeAmount = chargeAmount;
|
||||||
|
|
||||||
|
} catch (err) {
|
||||||
|
await this.rateLimitControl.record({
|
||||||
|
uid,
|
||||||
|
tags: [rpcReflect.name.toUpperCase()],
|
||||||
|
status: API_CALL_STATUS.SUCCESS,
|
||||||
|
chargeAmount,
|
||||||
|
}).save().catch((err) => {
|
||||||
|
this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) });
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
@ -7,14 +7,14 @@ import {
|
|||||||
import { marshalErrorLike } from 'civkit/lang';
|
import { marshalErrorLike } from 'civkit/lang';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
|
|
||||||
import { RateLimitControl, RateLimitDesc } from '../shared/services/rate-limit';
|
import { RateLimitControl, RateLimitDesc, RateLimitTriggeredError } from '../shared/services/rate-limit';
|
||||||
|
|
||||||
import { GlobalLogger } from '../services/logger';
|
import { GlobalLogger } from '../services/logger';
|
||||||
import { AsyncLocalContext } from '../services/async-context';
|
import { AsyncLocalContext } from '../services/async-context';
|
||||||
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
import { Context, Ctx, Method, Param, RPCReflect } from '../services/registry';
|
||||||
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
import { OutputServerEventStream } from '../lib/transform-server-event-stream';
|
||||||
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
|
||||||
import { InsufficientBalanceError, RateLimitTriggeredError } from '../services/errors';
|
import { InsufficientBalanceError } from '../services/errors';
|
||||||
import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
|
||||||
import { GoogleSERP } from '../services/serp/google';
|
import { GoogleSERP } from '../services/serp/google';
|
||||||
import { WebSearchEntry } from '../services/serp/compat';
|
import { WebSearchEntry } from '../services/serp/compat';
|
||||||
@ -25,6 +25,7 @@ import { SERPResult } from '../db/searched';
|
|||||||
import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper';
|
import { SerperBingSearchService, SerperGoogleSearchService } from '../services/serp/serper';
|
||||||
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
|
||||||
import { LRUCache } from 'lru-cache';
|
import { LRUCache } from 'lru-cache';
|
||||||
|
import { API_CALL_STATUS } from '../shared/db/api-roll';
|
||||||
|
|
||||||
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
|
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
|
||||||
|
|
||||||
@ -172,10 +173,11 @@ export class SerpHost extends RPCHost {
|
|||||||
const now = new Date();
|
const now = new Date();
|
||||||
const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
|
const blockedTimeRemaining = (highFreqKey.blockedUntil.valueOf() - now.valueOf());
|
||||||
if (blockedTimeRemaining > 0) {
|
if (blockedTimeRemaining > 0) {
|
||||||
throw RateLimitTriggeredError.from({
|
this.logger.warn(`Rate limit triggered for ${uid}, this request should have been blocked`);
|
||||||
message: `Per UID rate limit exceeded (async)`,
|
// throw RateLimitTriggeredError.from({
|
||||||
retryAfter: Math.ceil(blockedTimeRemaining / 1000),
|
// message: `Per UID rate limit exceeded (async)`,
|
||||||
});
|
// retryAfter: Math.ceil(blockedTimeRemaining / 1000),
|
||||||
|
// });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -229,10 +231,10 @@ export class SerpHost extends RPCHost {
|
|||||||
}
|
}
|
||||||
const now = Date.now();
|
const now = Date.now();
|
||||||
let tgtDate;
|
let tgtDate;
|
||||||
if (err.retryAfter) {
|
if (err.retryAfterDate) {
|
||||||
tgtDate = new Date(now + err.retryAfter * 1000);
|
|
||||||
} else if (err.retryAfterDate) {
|
|
||||||
tgtDate = err.retryAfterDate;
|
tgtDate = err.retryAfterDate;
|
||||||
|
} else if (err.retryAfter) {
|
||||||
|
tgtDate = new Date(now + err.retryAfter * 1000);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tgtDate) {
|
if (tgtDate) {
|
||||||
@ -260,8 +262,19 @@ export class SerpHost extends RPCHost {
|
|||||||
auth.reportUsage(chargeAmount, `reader-search`).catch((err) => {
|
auth.reportUsage(chargeAmount, `reader-search`).catch((err) => {
|
||||||
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
this.logger.warn(`Unable to report usage for ${uid}`, { err: marshalErrorLike(err) });
|
||||||
});
|
});
|
||||||
|
try {
|
||||||
const apiRoll = await apiRollPromise;
|
const apiRoll = await apiRollPromise;
|
||||||
apiRoll.chargeAmount = chargeAmount;
|
apiRoll.chargeAmount = chargeAmount;
|
||||||
|
} catch (err) {
|
||||||
|
await this.rateLimitControl.record({
|
||||||
|
uid,
|
||||||
|
tags: [rpcReflect.name.toUpperCase()],
|
||||||
|
status: API_CALL_STATUS.SUCCESS,
|
||||||
|
chargeAmount,
|
||||||
|
}).save().catch((err) => {
|
||||||
|
this.logger.warn(`Failed to save rate limit record`, { err: marshalErrorLike(err) });
|
||||||
|
});
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@ -467,25 +480,30 @@ export class SerpHost extends RPCHost {
|
|||||||
let lastError;
|
let lastError;
|
||||||
outerLoop:
|
outerLoop:
|
||||||
for (const client of this.iterProviders(provider)) {
|
for (const client of this.iterProviders(provider)) {
|
||||||
|
const t0 = Date.now();
|
||||||
try {
|
try {
|
||||||
switch (variant) {
|
switch (variant) {
|
||||||
case 'images': {
|
case 'images': {
|
||||||
r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]);
|
r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]);
|
||||||
break outerLoop;
|
break;
|
||||||
}
|
}
|
||||||
case 'news': {
|
case 'news': {
|
||||||
r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]);
|
r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]);
|
||||||
break outerLoop;
|
break;
|
||||||
}
|
}
|
||||||
case 'web':
|
case 'web':
|
||||||
default: {
|
default: {
|
||||||
r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]);
|
r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
|
||||||
break outerLoop;
|
break outerLoop;
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
lastError = err;
|
lastError = err;
|
||||||
this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err });
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.warn(`Failed to do ${variant} search using ${client.constructor.name}`, { err, variant, searchDt: dt, });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -61,10 +61,23 @@ export class CurlControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
curlImpersonateHeader(curl: Curl, headers?: object) {
|
curlImpersonateHeader(curl: Curl, headers?: object) {
|
||||||
|
let uaPlatform = this.platform;
|
||||||
|
if (this.ua.includes('Windows')) {
|
||||||
|
uaPlatform = 'Windows';
|
||||||
|
} else if (this.ua.includes('Android')) {
|
||||||
|
uaPlatform = 'Android';
|
||||||
|
} else if (this.ua.includes('iPhone') || this.ua.includes('iPad') || this.ua.includes('iPod')) {
|
||||||
|
uaPlatform = 'iOS';
|
||||||
|
} else if (this.ua.includes('CrOS')) {
|
||||||
|
uaPlatform = 'Chrome OS';
|
||||||
|
} else if (this.ua.includes('Macintosh')) {
|
||||||
|
uaPlatform = 'macOS';
|
||||||
|
}
|
||||||
|
|
||||||
const mixinHeaders: Record<string, string> = {
|
const mixinHeaders: Record<string, string> = {
|
||||||
'sch-ch-ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
|
'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
|
||||||
'sec-ch-ua-mobile': '?0',
|
'Sec-Ch-Ua-Mobile': '?0',
|
||||||
'sec-ch-ua-platform': this.platform,
|
'Sec-Ch-Ua-Platform': `"${uaPlatform}"`,
|
||||||
'Upgrade-Insecure-Requests': '1',
|
'Upgrade-Insecure-Requests': '1',
|
||||||
'User-Agent': this.ua,
|
'User-Agent': this.ua,
|
||||||
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
|
||||||
|
@ -1,4 +1,4 @@
|
|||||||
import { ApplicationError, Prop, RPC_TRANSFER_PROTOCOL_META_SYMBOL, StatusCode } from 'civkit/civ-rpc';
|
import { ApplicationError, StatusCode } from 'civkit/civ-rpc';
|
||||||
import _ from 'lodash';
|
import _ from 'lodash';
|
||||||
import dayjs from 'dayjs';
|
import dayjs from 'dayjs';
|
||||||
import utc from 'dayjs/plugin/utc';
|
import utc from 'dayjs/plugin/utc';
|
||||||
@ -46,31 +46,3 @@ export class SecurityCompromiseError extends ApplicationError { }
|
|||||||
|
|
||||||
@StatusCode(41201)
|
@StatusCode(41201)
|
||||||
export class BatchSizeTooLargeError extends ApplicationError { }
|
export class BatchSizeTooLargeError extends ApplicationError { }
|
||||||
|
|
||||||
|
|
||||||
@StatusCode(42903)
|
|
||||||
export class RateLimitTriggeredError extends ApplicationError {
|
|
||||||
|
|
||||||
@Prop({
|
|
||||||
desc: 'Retry after seconds',
|
|
||||||
})
|
|
||||||
retryAfter?: number;
|
|
||||||
|
|
||||||
@Prop({
|
|
||||||
desc: 'Retry after date',
|
|
||||||
})
|
|
||||||
retryAfterDate?: Date;
|
|
||||||
|
|
||||||
protected override get [RPC_TRANSFER_PROTOCOL_META_SYMBOL]() {
|
|
||||||
const retryAfter = this.retryAfter || this.retryAfterDate;
|
|
||||||
if (!retryAfter) {
|
|
||||||
return super[RPC_TRANSFER_PROTOCOL_META_SYMBOL];
|
|
||||||
}
|
|
||||||
|
|
||||||
return _.merge(_.cloneDeep(super[RPC_TRANSFER_PROTOCOL_META_SYMBOL]), {
|
|
||||||
headers: {
|
|
||||||
'Retry-After': `${retryAfter instanceof Date ? dayjs(retryAfter).utc().format('ddd, DD MMM YYYY HH:mm:ss [GMT]') : retryAfter}`,
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
@ -510,6 +510,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
pagePhase = new WeakMap<Page, 'idle' | 'active' | 'background'>();
|
pagePhase = new WeakMap<Page, 'idle' | 'active' | 'background'>();
|
||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
ua: string = '';
|
ua: string = '';
|
||||||
|
effectiveUA: string = '';
|
||||||
|
|
||||||
concurrentRequestsPerPage: number = 32;
|
concurrentRequestsPerPage: number = 32;
|
||||||
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
pageReqCtrl = new WeakMap<Page, PageReqCtrlKit>();
|
||||||
@ -582,7 +583,8 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
});
|
});
|
||||||
this.ua = await this.browser.userAgent();
|
this.ua = await this.browser.userAgent();
|
||||||
this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
|
this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
|
||||||
this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, ''));
|
this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
|
||||||
|
this.curlControl.impersonateChrome(this.effectiveUA);
|
||||||
|
|
||||||
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
|
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
|
||||||
|
|
||||||
@ -615,7 +617,7 @@ export class PuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
const preparations = [];
|
const preparations = [];
|
||||||
|
|
||||||
preparations.push(page.setUserAgent(this.ua.replace(/Headless/i, '')));
|
preparations.push(page.setUserAgent(this.effectiveUA));
|
||||||
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
||||||
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
||||||
preparations.push(page.setBypassCSP(true));
|
preparations.push(page.setBypassCSP(true));
|
||||||
|
@ -233,6 +233,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
|||||||
livePages = new Set<Page>();
|
livePages = new Set<Page>();
|
||||||
lastPageCratedAt: number = 0;
|
lastPageCratedAt: number = 0;
|
||||||
ua: string = '';
|
ua: string = '';
|
||||||
|
effectiveUA: string = '';
|
||||||
|
|
||||||
protected _REPORT_FUNCTION_NAME = 'bingo';
|
protected _REPORT_FUNCTION_NAME = 'bingo';
|
||||||
|
|
||||||
@ -299,7 +300,8 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
|||||||
});
|
});
|
||||||
this.ua = await this.browser.userAgent();
|
this.ua = await this.browser.userAgent();
|
||||||
this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
|
this.logger.info(`Browser launched: ${this.browser.process()?.pid}, ${this.ua}`);
|
||||||
this.curlControl.impersonateChrome(this.ua.replace(/Headless/i, ''));
|
this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
|
||||||
|
this.curlControl.impersonateChrome(this.effectiveUA);
|
||||||
|
|
||||||
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
|
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
|
||||||
|
|
||||||
@ -322,7 +324,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
|||||||
}
|
}
|
||||||
const preparations = [];
|
const preparations = [];
|
||||||
|
|
||||||
preparations.push(page.setUserAgent(this.ua.replace(/Headless/i, '')));
|
preparations.push(page.setUserAgent(this.effectiveUA));
|
||||||
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
// preparations.push(page.setUserAgent(`Slackbot-LinkExpanding 1.0 (+https://api.slack.com/robots)`));
|
||||||
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
// preparations.push(page.setUserAgent(`Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; GPTBot/1.0; +https://openai.com/gptbot)`));
|
||||||
preparations.push(page.setBypassCSP(true));
|
preparations.push(page.setBypassCSP(true));
|
||||||
|
@ -56,6 +56,7 @@ export class SerperSearchService extends AsyncService {
|
|||||||
let maxTries = 3;
|
let maxTries = 3;
|
||||||
|
|
||||||
while (maxTries--) {
|
while (maxTries--) {
|
||||||
|
const t0 = Date.now();
|
||||||
try {
|
try {
|
||||||
this.logger.debug(`Doing external search`, query);
|
this.logger.debug(`Doing external search`, query);
|
||||||
let r;
|
let r;
|
||||||
@ -101,11 +102,14 @@ export class SerperSearchService extends AsyncService {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
const dt = Date.now() - t0;
|
||||||
this.blackHoleDetector.itWorked();
|
this.blackHoleDetector.itWorked();
|
||||||
|
this.logger.debug(`External search took ${dt}ms`, { searchDt: dt, variant });
|
||||||
|
|
||||||
return r.parsed;
|
return r.parsed;
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
this.logger.error(`${variant} search failed: ${err?.message}`, { err: marshalErrorLike(err) });
|
const dt = Date.now() - t0;
|
||||||
|
this.logger.error(`${variant} search failed: ${err?.message}`, { searchDt: dt, err: marshalErrorLike(err) });
|
||||||
if (err?.status === 429) {
|
if (err?.status === 429) {
|
||||||
await delay(500 + 1000 * Math.random());
|
await delay(500 + 1000 * Math.random());
|
||||||
continue;
|
continue;
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit 0a59b8f84c03b2099a9785769bedc98573e65847
|
Subproject commit f89255cd6546641f72eefba140a4aef96a0e03fc
|
Loading…
x
Reference in New Issue
Block a user