From 2b5d865d9c8c0026a8ecb76ed5010bb6326a9b26 Mon Sep 17 00:00:00 2001 From: Yanlong Wang Date: Thu, 8 May 2025 11:13:47 +0800 Subject: [PATCH] serp: new trick --- .vscode/launch.json | 4 +- package-lock.json | 3 +- package.json | 1 + src/api/crawler.ts | 12 +- src/api/searcher.ts | 62 ++++-- src/api/serp.ts | 64 +++--- src/services/curl.ts | 167 +++++++++++++-- src/services/puppeteer.ts | 10 +- src/services/serp/common-serp.ts | 133 ++++++++++++ src/services/serp/google.ts | 357 ++++++++++++++++++++++++++++--- src/services/serp/internal.ts | 6 - src/services/serp/puppeteer.ts | 55 ++--- src/utils/encoding.ts | 20 +- 13 files changed, 754 insertions(+), 140 deletions(-) create mode 100644 src/services/serp/common-serp.ts diff --git a/.vscode/launch.json b/.vscode/launch.json index c8a4a5c..63a53ce 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -110,8 +110,8 @@ ], "env": { "GCLOUD_PROJECT": "reader-6b7dc", - "PREFERRED_PROXY_COUNTRY": "hk", - "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk", + "PREFERRED_PROXY_COUNTRY": "us", + // "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk", "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib" }, "cwd": "${workspaceFolder}", diff --git a/package-lock.json b/package-lock.json index 9aa825d..164576a 100644 --- a/package-lock.json +++ b/package-lock.json @@ -23,6 +23,7 @@ "express": "^4.19.2", "firebase-admin": "^12.1.0", "firebase-functions": "^6.1.1", + "generic-pool": "^3.9.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", "koa": "^2.16.0", @@ -6249,7 +6250,7 @@ "version": "3.9.0", "resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz", "integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==", - "optional": true, + "license": "MIT", "engines": { "node": ">= 4" } diff --git a/package.json b/package.json index 09bfa7c..5b55e29 100644 --- a/package.json +++ b/package.json @@ -32,6 +32,7 @@ "express": "^4.19.2", "firebase-admin": "^12.1.0", "firebase-functions": "^6.1.1", + "generic-pool": "^3.9.0", "htmlparser2": "^9.0.0", "jose": "^5.1.0", "koa": "^2.16.0", diff --git a/src/api/crawler.ts b/src/api/crawler.ts index c5679bc..1266e3d 100644 --- a/src/api/crawler.ts +++ b/src/api/crawler.ts @@ -49,6 +49,7 @@ import { TempFileManager } from '../services/temp-file'; import { MiscService } from '../services/misc'; import { HTTPServiceError } from 'civkit/http'; import { GeoIPService } from '../services/geoip'; +import { writeFile } from 'fs/promises'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -1145,7 +1146,16 @@ export class CrawlerHost extends RPCHost { if (pdfUrl.startsWith('http')) { const sideLoaded = scrappingOptions?.sideLoad?.impersonate[pdfUrl]; if (sideLoaded?.status === 200 && sideLoaded.body) { - snapshotCopy.pdfs[0] = pathToFileURL(await sideLoaded?.body.filePath).href; + let filePath = ''; + if (sideLoaded.body instanceof Blob) { + const tmpPath = this.tempFileManager.alloc(); + await writeFile(tmpPath, sideLoaded.body.stream()); + this.tempFileManager.bindPathTo(this.threadLocal.ctx, tmpPath); + filePath = tmpPath; + } else { + filePath = await sideLoaded.body.filePath; + } + snapshotCopy.pdfs[0] = pathToFileURL(filePath).href; return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs); } diff --git a/src/api/searcher.ts b/src/api/searcher.ts index 95fa4cc..b4fc011 100644 --- a/src/api/searcher.ts +++ b/src/api/searcher.ts @@ -27,8 +27,10 @@ import { LRUCache } from 'lru-cache'; import { API_CALL_STATUS } from '../shared/db/api-roll'; import { SERPResult } from '../db/searched'; import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; -import { InternalJinaSerpService } from '../services/serp/internal'; import { WebSearchEntry } from '../services/serp/compat'; +import { CommonGoogleSERP } from '../services/serp/common-serp'; +import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google'; +import { InternalJinaSerpService } from '../services/serp/internal'; const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase()); @@ -72,6 +74,9 @@ export class SearcherHost extends RPCHost { protected serperGoogle: SerperGoogleSearchService, protected serperBing: SerperBingSearchService, protected jinaSerp: InternalJinaSerpService, + protected commonGoogleSerp: CommonGoogleSERP, + protected googleSERP: GoogleSERP, + protected googleSERPOld: GoogleSERPOldFashion, ) { super(...arguments); @@ -715,26 +720,32 @@ export class SearcherHost extends RPCHost { } } - *iterProviders(preference?: string, variant?: string) { + *iterProviders(preference?: string, _variant?: string) { if (preference === 'bing') { yield this.serperBing; - yield variant === 'web' ? this.jinaSerp : this.serperGoogle; + yield this.googleSERP; + yield this.jinaSerp; yield this.serperGoogle; + yield this.commonGoogleSerp; return; } if (preference === 'google') { - yield variant === 'web' ? this.jinaSerp : this.serperGoogle; - yield this.serperGoogle; + yield this.googleSERP; + yield this.jinaSerp; yield this.serperGoogle; + yield this.commonGoogleSerp; + yield this.googleSERPOld; return; } - yield variant === 'web' ? this.jinaSerp : this.serperGoogle; - yield this.serperGoogle; + yield this.googleSERP; + yield this.jinaSerp; yield this.serperGoogle; + yield this.commonGoogleSerp; + yield this.googleSERPOld; } async cachedSearch(variant: 'web' | 'news' | 'images', query: Record, noCache?: boolean): Promise { @@ -767,22 +778,27 @@ export class SearcherHost extends RPCHost { outerLoop: for (const client of this.iterProviders(provider, variant)) { const t0 = Date.now(); - try { - switch (variant) { - case 'images': { - r = await Reflect.apply(client.imageSearch, client, [query]); - break; - } - case 'news': { - r = await Reflect.apply(client.newsSearch, client, [query]); - break; - } - case 'web': - default: { - r = await Reflect.apply(client.webSearch, client, [query]); - break; - } + let func; + switch (variant) { + case 'images': { + func = Reflect.get(client, 'imageSearch'); + break; } + case 'news': { + func = Reflect.get(client, 'newsSearch'); + break; + } + case 'web': + default: { + func = Reflect.get(client, 'webSearch'); + break; + } + } + if (!func) { + continue; + } + try { + r = await Reflect.apply(func, client, [query]); const dt = Date.now() - t0; this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name }); break outerLoop; @@ -806,6 +822,8 @@ export class SearcherHost extends RPCHost { this.batchedCaches.push(record); } else if (lastError) { throw lastError; + } else if (!r) { + throw new AssertionFailureError(`No provider can do ${variant} search atm.`); } return r as WebSearchEntry[]; diff --git a/src/api/serp.ts b/src/api/serp.ts index 914f36c..cbd4618 100644 --- a/src/api/serp.ts +++ b/src/api/serp.ts @@ -3,6 +3,7 @@ import { RPCHost, RPCReflection, assignMeta, RawString, ParamValidationError, assignTransferProtocolMeta, + AssertionFailureError, } from 'civkit/civ-rpc'; import { marshalErrorLike } from 'civkit/lang'; import _ from 'lodash'; @@ -16,7 +17,7 @@ import { OutputServerEventStream } from '../lib/transform-server-event-stream'; import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth'; import { InsufficientBalanceError } from '../services/errors'; import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search'; -import { GoogleSERP } from '../services/serp/google'; +import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google'; import { WebSearchEntry } from '../services/serp/compat'; import { CrawlerOptions } from '../dto/crawler-options'; import { ScrappingOptions } from '../services/serp/puppeteer'; @@ -26,6 +27,7 @@ import { SerperBingSearchService, SerperGoogleSearchService } from '../services/ import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account'; import { LRUCache } from 'lru-cache'; import { API_CALL_STATUS } from '../shared/db/api-roll'; +import { CommonGoogleSERP } from '../services/serp/common-serp'; import { InternalJinaSerpService } from '../services/serp/internal'; const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase()); @@ -91,8 +93,10 @@ export class SerpHost extends RPCHost { protected rateLimitControl: RateLimitControl, protected threadLocal: AsyncLocalContext, protected googleSerp: GoogleSERP, + protected googleSerpOld: GoogleSERPOldFashion, protected serperGoogle: SerperGoogleSearchService, protected serperBing: SerperBingSearchService, + protected commonGoogleSerp: CommonGoogleSERP, protected jinaSerp: InternalJinaSerpService, ) { super(...arguments); @@ -157,7 +161,7 @@ export class SerpHost extends RPCHost { @Param('num', { validate: (v: number) => v >= 0 && v <= 20 }) num?: number, @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string, - @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) _hl?: string, + @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string, @Param('location') location?: string, @Param('page') page?: number, @Param('fallback') fallback?: boolean, @@ -318,7 +322,7 @@ export class SerpHost extends RPCHost { q, num, gl, - // hl, + hl, location, page, }, crawlerOptions); @@ -451,27 +455,32 @@ export class SerpHost extends RPCHost { return result; } - *iterProviders(preference?: string, variant?: string) { + *iterProviders(preference?: string, _variant?: string) { if (preference === 'bing') { yield this.serperBing; - yield this.serperGoogle; yield this.googleSerp; + yield this.jinaSerp; + yield this.serperGoogle; + yield this.commonGoogleSerp; + yield this.googleSerpOld; return; } if (preference === 'google') { - yield this.googleSerp; yield this.googleSerp; yield this.serperGoogle; + yield this.commonGoogleSerp; + yield this.googleSerpOld; return; } - // yield variant === 'web' ? this.jinaSerp : this.serperGoogle; - yield this.serperGoogle - yield this.serperGoogle; yield this.googleSerp; + yield this.jinaSerp; + yield this.serperGoogle; + yield this.commonGoogleSerp; + yield this.googleSerpOld; } async cachedSearch(variant: 'web' | 'news' | 'images', query: Record, opts: CrawlerOptions) { @@ -506,22 +515,27 @@ export class SerpHost extends RPCHost { outerLoop: for (const client of this.iterProviders(provider, variant)) { const t0 = Date.now(); - try { - switch (variant) { - case 'images': { - r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]); - break; - } - case 'news': { - r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]); - break; - } - case 'web': - default: { - r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]); - break; - } + let func; + switch (variant) { + case 'images': { + func = Reflect.get(client, 'imageSearch'); + break; } + case 'news': { + func = Reflect.get(client, 'newsSearch'); + break; + } + case 'web': + default: { + func = Reflect.get(client, 'webSearch'); + break; + } + } + if (!func) { + continue; + } + try { + r = await Reflect.apply(func, client, [query, scrappingOptions]); const dt = Date.now() - t0; this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name }); break outerLoop; @@ -544,6 +558,8 @@ export class SerpHost extends RPCHost { this.batchedCaches.push(record); } else if (lastError) { throw lastError; + } else if (!r) { + throw new AssertionFailureError(`No provider can do ${variant} search atm.`); } return r; diff --git a/src/services/curl.ts b/src/services/curl.ts index fdb7539..ae3ca73 100644 --- a/src/services/curl.ts +++ b/src/services/curl.ts @@ -16,7 +16,7 @@ import { Readable } from 'stream'; import { AsyncLocalContext } from './async-context'; import { BlackHoleDetector } from './blackhole-detector'; -export interface CURLScrappingOptions extends ScrappingOptions { +export interface CURLScrappingOptions extends ScrappingOptions { method?: string; body?: string | Buffer; } @@ -75,7 +75,7 @@ export class CurlControl extends AsyncService { } const mixinHeaders: Record = { - 'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`, + 'Sec-Ch-Ua': `"Google Chrome";v="${this.chromeVersion}", "Not-A.Brand";v="8", "Chromium";v="${this.chromeVersion}"`, 'Sec-Ch-Ua-Mobile': '?0', 'Sec-Ch-Ua-Platform': `"${uaPlatform}"`, 'Upgrade-Insecure-Requests': '1', @@ -108,11 +108,11 @@ export class CurlControl extends AsyncService { return curl; } - urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) { + urlToStream(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) { return new Promise<{ statusCode: number, statusText?: string, - data?: FancyFile, + data?: Readable, headers: HeaderInfo[], }>((resolve, reject) => { let contentType = ''; @@ -122,7 +122,7 @@ export class CurlControl extends AsyncService { curl.setOpt(Curl.option.FOLLOWLOCATION, false); curl.setOpt(Curl.option.SSL_VERIFYPEER, false); curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000); - curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000); + curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 1_600); curl.setOpt(Curl.option.LOW_SPEED_LIMIT, 32768); curl.setOpt(Curl.option.LOW_SPEED_TIME, 5_000); if (crawlOpts?.method) { @@ -193,7 +193,7 @@ export class CurlControl extends AsyncService { }); curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB let status = -1; - let statusText: string|undefined; + let statusText: string | undefined; let contentEncoding = ''; curl.once('end', () => { if (curlStream) { @@ -302,13 +302,10 @@ export class CurlControl extends AsyncService { } } - const fpath = this.tempFileManager.alloc(); - const fancyFile = FancyFile.auto(stream, fpath); - this.tempFileManager.bindPathTo(fancyFile, fpath); resolve({ statusCode: status, statusText, - data: fancyFile, + data: stream, headers: headers as HeaderInfo[], }); }); @@ -324,7 +321,19 @@ export class CurlControl extends AsyncService { let nextHopUrl = urlToCrawl; const fakeHeaderInfos: HeaderInfo[] = []; do { - const r = await this.urlToFile1Shot(nextHopUrl, opts); + const s = await this.urlToStream(nextHopUrl, opts); + const r = { ...s } as { + statusCode: number, + statusText?: string, + data?: FancyFile, + headers: HeaderInfo[], + }; + if (r.data) { + const fpath = this.tempFileManager.alloc(); + const fancyFile = FancyFile.auto(r.data, fpath); + this.tempFileManager.bindPathTo(fancyFile, fpath); + r.data = fancyFile; + } if ([301, 302, 303, 307, 308].includes(r.statusCode)) { fakeHeaderInfos.push(...r.headers); @@ -375,7 +384,7 @@ export class CurlControl extends AsyncService { const curlResult = await this.urlToFile(targetUrl, crawlOpts); this.blackHoleDetector.itWorked(); let finalURL = targetUrl; - const sideLoadOpts: CURLScrappingOptions['sideLoad'] = { + const sideLoadOpts: CURLScrappingOptions['sideLoad'] = { impersonate: {}, proxyOrigin: {}, }; @@ -421,6 +430,140 @@ export class CurlControl extends AsyncService { }; } + async urlToBlob(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) { + let leftRedirection = 6; + let cookieRedirects = 0; + let opts = { ...crawlOpts }; + let nextHopUrl = urlToCrawl; + const fakeHeaderInfos: HeaderInfo[] = []; + do { + const s = await this.urlToStream(nextHopUrl, opts); + const r = { ...s } as { + statusCode: number, + statusText?: string, + data?: Blob, + headers: HeaderInfo[], + }; + + + const headers = r.headers[r.headers.length - 1]; + if ([301, 302, 303, 307, 308].includes(r.statusCode)) { + fakeHeaderInfos.push(...r.headers); + const location: string | undefined = headers.Location || headers.location; + + const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie']; + if (setCookieHeader) { + const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader]; + const parsed = cookieAssignments.filter(Boolean).map((x) => parseSetCookieString(x, { decodeValues: true })); + if (parsed.length) { + opts.cookies = [...(opts.cookies || []), ...parsed]; + } + if (!location) { + cookieRedirects += 1; + } + } + + if (!location && !setCookieHeader) { + // Follow curl behavior + if (s.data) { + const chunks: Buffer[] = []; + s.data.on('data', (chunk) => { + chunks.push(chunk); + }); + await new Promise((resolve, reject) => { + s.data!.once('end', resolve); + s.data!.once('error', reject); + }); + r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] }); + } + return { + statusCode: r.statusCode, + data: r.data, + headers: fakeHeaderInfos.concat(r.headers), + }; + } + if (!location && cookieRedirects > 1) { + throw new ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`); + } + + nextHopUrl = new URL(location || '', nextHopUrl); + leftRedirection -= 1; + continue; + } + + if (s.data) { + const chunks: Buffer[] = []; + s.data.on('data', (chunk) => { + chunks.push(chunk); + }); + await new Promise((resolve, reject) => { + s.data!.once('end', resolve); + s.data!.once('error', reject); + }); + r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] }); + } + + return { + statusCode: r.statusCode, + statusText: r.statusText, + data: r.data, + headers: fakeHeaderInfos.concat(r.headers), + }; + } while (leftRedirection > 0); + + throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`); + } + + async sideLoadBlob(targetUrl: URL, crawlOpts?: CURLScrappingOptions) { + const curlResult = await this.urlToBlob(targetUrl, crawlOpts); + this.blackHoleDetector.itWorked(); + let finalURL = targetUrl; + const sideLoadOpts: CURLScrappingOptions['sideLoad'] = { + impersonate: {}, + proxyOrigin: {}, + }; + for (const headers of curlResult.headers) { + sideLoadOpts.impersonate[finalURL.href] = { + status: headers.result?.code || -1, + headers: _.omit(headers, 'result'), + contentType: headers['Content-Type'] || headers['content-type'], + }; + if (crawlOpts?.proxyUrl) { + sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl; + } + if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) { + const location = headers.Location || headers.location; + if (location) { + finalURL = new URL(location, finalURL); + } + } + } + const lastHeaders = curlResult.headers[curlResult.headers.length - 1]; + const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (curlResult.data?.type) || 'application/octet-stream'; + const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition']; + const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop(); + + if (sideLoadOpts.impersonate[finalURL.href] && (curlResult.data?.size)) { + sideLoadOpts.impersonate[finalURL.href].body = curlResult.data; + } + + // This should keep the file from being garbage collected and deleted until this asyncContext/request is done. + this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data); + + return { + finalURL, + sideLoadOpts, + chain: curlResult.headers, + status: curlResult.statusCode, + statusText: curlResult.statusText, + headers: lastHeaders, + contentType, + contentDisposition, + fileName, + file: curlResult.data + }; + } + digestCurlCode(code: CurlCode, msg: string) { switch (code) { // 400 User errors diff --git a/src/services/puppeteer.ts b/src/services/puppeteer.ts index 3b0eb5a..5e03ecb 100644 --- a/src/services/puppeteer.ts +++ b/src/services/puppeteer.ts @@ -79,7 +79,7 @@ export interface ExtendedSnapshot extends PageSnapshot { imgs: ImgBrief[]; } -export interface ScrappingOptions { +export interface ScrappingOptions { proxyUrl?: string; cookies?: Cookie[]; favorScreenshot?: boolean; @@ -101,7 +101,7 @@ export interface ScrappingOptions { status: number; headers: { [k: string]: string | string[]; }; contentType?: string; - body?: FancyFile; + body?: T; }; }; proxyOrigin: { [origin: string]: string; }; @@ -912,7 +912,11 @@ export class PuppeteerControl extends AsyncService { if (impersonate) { let body; if (impersonate.body) { - body = await readFile(await impersonate.body.filePath); + if (impersonate.body instanceof Blob) { + body = new Uint8Array(await impersonate.body.arrayBuffer()); + } else { + body = await readFile(await impersonate.body.filePath); + } if (req.isInterceptResolutionHandled()) { return; } diff --git a/src/services/serp/common-serp.ts b/src/services/serp/common-serp.ts new file mode 100644 index 0000000..bc1faf6 --- /dev/null +++ b/src/services/serp/common-serp.ts @@ -0,0 +1,133 @@ +import { singleton } from 'tsyringe'; +import { AsyncService } from 'civkit/async-service'; +import { GlobalLogger } from '../logger'; +import { JSDomControl } from '../jsdom'; +import _ from 'lodash'; +import { WebSearchEntry } from './compat'; +import { ServiceBadAttemptError } from '../errors'; +import commonSerpClients, { CommonSerpImageResponse, CommonSerpNewsResponse, CommonSerpWebResponse } from '../../shared/3rd-party/common-serp'; +import { AsyncLocalContext } from '../async-context'; + +@singleton() +export class CommonGoogleSERP extends AsyncService { + logger = this.globalLogger.child({ service: this.constructor.name }); + googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com'; + + protected ctxIteratorMap = new WeakMap>(); + + constructor( + protected globalLogger: GlobalLogger, + protected jsDomControl: JSDomControl, + protected asyncContext: AsyncLocalContext, + + ) { + super(...arguments); + } + + override async init() { + await this.dependencyReady(); + + this.emit('ready'); + } + + *iterClients() { + if (!commonSerpClients.length) { + return; + } + while (true) { + yield* commonSerpClients; + } + } + + getClient() { + const ctx = this.asyncContext.ctx; + const it = this.ctxIteratorMap.get(ctx) || this.iterClients(); + this.ctxIteratorMap.set(ctx, it); + const client = it.next().value; + if (!client) { + throw new ServiceBadAttemptError('No client available'); + } + return client; + } + + + digestQuery(query: { [k: string]: any; }) { + const url = new URL(`https://${this.googleDomain}/search`); + const clone = { ...query }; + const num = clone.num || 10; + if (clone.page) { + const page = parseInt(clone.page); + delete clone.page; + clone.start = (page - 1) * num; + if (clone.start === 0) { + delete clone.start; + } + } + if (clone.location) { + delete clone.location; + } + + for (const [k, v] of Object.entries(clone)) { + if (v === undefined || v === null) { + continue; + } + url.searchParams.set(k, `${v}`); + } + + return url; + } + + async webSearch(query: { [k: string]: any; }) { + const url = this.digestQuery(query); + + const client = this.getClient(); + + const r = await client.queryJSON(url.href) as CommonSerpWebResponse; + + return r.organic.map((x)=> ({ + link: x.link, + title: x.title, + snippet: x.description, + variant: 'web', + })) as WebSearchEntry[]; + } + + async newsSearch(query: { [k: string]: any; }) { + const url = this.digestQuery(query); + + url.searchParams.set('tbm', 'nws'); + + const client = this.getClient(); + + const r = await client.queryJSON(url.href) as CommonSerpNewsResponse; + + return r.news.map((x)=> ({ + link: x.link, + title: x.title, + snippet: x.description, + source: x.source, + date: x.date, + imageUrl: x.image, + variant: 'news', + })) as WebSearchEntry[]; + } + + async imageSearch(query: { [k: string]: any; }) { + const url = this.digestQuery(query); + + url.searchParams.set('tbm', 'isch'); + + const client = this.getClient(); + + const r = await client.queryJSON(url.href) as CommonSerpImageResponse; + + return r.images.map((x)=> ({ + link: x.link, + title: x.title, + snippet: x.image_alt, + source: x.source, + imageUrl: x.image, + variant: 'images', + })) as WebSearchEntry[]; + } +} diff --git a/src/services/serp/google.ts b/src/services/serp/google.ts index 23ba669..7d4b6c8 100644 --- a/src/services/serp/google.ts +++ b/src/services/serp/google.ts @@ -7,24 +7,74 @@ import _ from 'lodash'; import { WebSearchEntry } from './compat'; import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer'; import { CurlControl } from '../curl'; -import { readFile } from 'fs/promises'; import { ApplicationError } from 'civkit/civ-rpc'; import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors'; import { parseJSONText } from 'civkit/vectorize'; -import { retryWith } from 'civkit/decorators'; -import { ProxyProviderService } from '../../shared/services/proxy-provider'; +import { retry, retryWith } from 'civkit/decorators'; +import { SERPProxyProviderService } from '../../shared/services/proxy-provider'; +import { readBlob } from '../../utils/encoding'; +import { createContext, Script } from 'vm'; +import { BrowserContext } from 'puppeteer'; +import { createPool } from 'generic-pool'; +import { randomBytes } from 'crypto'; +import { AsyncLocalContext } from '../async-context'; + +interface SerpContext { + proxyUrl?: string; + browserContext?: BrowserContext; + validTill?: Date; + magicId?: string; +} @singleton() export class GoogleSERP extends AsyncService { logger = this.globalLogger.child({ service: this.constructor.name }); googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com'; + nativeIPHealthy = true; + + contextPool = createPool({ + create: () => { + return this.createContext(); + }, + destroy: async (ctx: SerpContext) => { + if (ctx.browserContext) { + try { + await ctx.browserContext.close(); + } catch (err) { + this.logger.warn('Error closing browser context', { err }); + } + } + }, + validate: async (ctx: SerpContext) => { + if (ctx.validTill && ctx.validTill > (new Date(Date.now() + 5 * 60 * 1000))) { + return true; + } + + return !ctx.proxyUrl; + }, + }, { + max: 3_000, + testOnBorrow: true, + }); + + protected async createContext() { + await this.serviceReady(); + this.asyncLocalContext.ctx.ctxIsNew = true; + + return { + magicId: randomBytes(17).toString('base64url'), + validTill: new Date(Date.now() + 30 * 60 * 1000), + } as SerpContext; + } + constructor( protected globalLogger: GlobalLogger, protected puppeteerControl: SERPSpecializedPuppeteerControl, protected jsDomControl: JSDomControl, protected curlControl: CurlControl, - protected proxyProvider: ProxyProviderService, + protected proxyProvider: SERPProxyProviderService, + protected asyncLocalContext: AsyncLocalContext, ) { const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl); super(...filteredDeps); @@ -36,6 +86,15 @@ export class GoogleSERP extends AsyncService { this.emit('ready'); } + nativeIPBlocked() { + this.nativeIPHealthy = false; + this.logger.warn('Native IP is not healthy.'); + setTimeout(() => { + this.nativeIPHealthy = true; + this.logger.debug('Presume native IP healthy again after timeout.'); + }, 1000 * 60 * 60); + } + @retryWith((err) => { if (err instanceof ServiceBadApproachError) { return false; @@ -51,15 +110,13 @@ export class GoogleSERP extends AsyncService { return undefined; }, 3) async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) { - if (opts?.allocProxy === 'none') { - return this.curlControl.sideLoad(url, opts); + if (opts?.allocProxy === 'none' || opts?.proxyUrl) { + return this.curlControl.sideLoadBlob(url, opts); } - const proxy = await this.proxyProvider.alloc( - process.env.PREFERRED_PROXY_COUNTRY || 'auto' - ); + const proxy = await this.proxyProvider.alloc(); this.logger.debug(`Proxy allocated`, { proxy: proxy.href }); - const r = await this.curlControl.sideLoad(url, { + const r = await this.curlControl.sideLoadBlob(url, { ...opts, proxyUrl: proxy.href, }); @@ -101,48 +158,113 @@ export class GoogleSERP extends AsyncService { return url; } + @retry(2) async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { const url = this.digestQuery(query); + const origHref = url.href; + if (!url.searchParams.has('start')) { + url.searchParams.set('start', '0'); + } + url.searchParams.set('asearch', 'arc'); + + const ctx = await this.contextPool.acquire(); + + url.searchParams.set('async', getMagicAsyncParam(query.start, ctx.magicId)); + + const sideLoaded = await this.sideLoadWithAllocatedProxy(url, { + ...opts, + allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'), + proxyUrl: ctx.proxyUrl, + }).catch((err) => { + this.contextPool.destroy(ctx); + + return Promise.reject(err); + }); + + if ('proxy' in sideLoaded) { + ctx.proxyUrl = sideLoaded.proxy.href; + ctx.validTill = new Date(Date.now() + 30 * 60 * 1000); + } + + if (sideLoaded.status === 200) { + this.contextPool.release(ctx); + } else { + if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) { + this.nativeIPBlocked(); + } + this.contextPool.destroy(ctx); + } - const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts); if (opts && sideLoaded.sideLoadOpts) { opts.sideLoad = sideLoaded.sideLoadOpts; } - const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts); + if (!sideLoaded.file) { + throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.'); + } + const contentType = sideLoaded.contentType; + const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8'; - return snapshot; - } - - async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { - const url = this.digestQuery(query); - - url.searchParams.set('tbm', 'nws'); - - const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts); - if (opts && sideLoaded.sideLoadOpts) { - opts.sideLoad = sideLoaded.sideLoadOpts; + let html = await readBlob(sideLoaded.file, encoding); + let innerCharset; + const peek = html.slice(0, 1024); + innerCharset ??= peek.match(/]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase(); + innerCharset ??= peek.match(/]+charset="([^>"]+)\"/i)?.[1]?.toLowerCase(); + if (innerCharset && innerCharset !== encoding) { + html = await readBlob(sideLoaded.file, innerCharset); } - const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts); + const jsdom = this.jsDomControl.linkedom.parseHTML(html, { location: { href: origHref } }); + try { + const r = runGetWebSearchResultsScript(createContext(jsdom)); - return snapshot; + return r; + } catch (err) { + throw new ServiceBadAttemptError({ + message: 'Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.', + err + }); + } } + @retry(2) async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { const url = this.digestQuery(query); url.searchParams.set('tbm', 'isch'); url.searchParams.set('asearch', 'isch'); url.searchParams.set('async', `_fmt:json,p:1,ijn:${query.start ? Math.floor(query.start / (query.num || 10)) : 0}`); + const ctx = await this.contextPool.acquire(); - const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts); + const sideLoaded = await this.sideLoadWithAllocatedProxy(url, { + ...opts, + proxyUrl: ctx.proxyUrl, + allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'), + }).catch((err) => { + this.contextPool.destroy(ctx); + + return Promise.reject(err); + }); + + if ('proxy' in sideLoaded) { + ctx.proxyUrl = sideLoaded.proxy.href; + ctx.validTill = new Date(Date.now() + 30 * 60 * 1000); + } + + if (sideLoaded.status === 200) { + this.contextPool.release(ctx); + } else { + this.contextPool.destroy(ctx); + if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) { + this.nativeIPBlocked(); + } + } if (sideLoaded.status !== 200 || !sideLoaded.file) { throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.'); } - const jsonTxt = (await readFile((await sideLoaded.file.filePath))).toString(); + const jsonTxt = (await readBlob(sideLoaded.file)).toString(); const rJSON = parseJSONText(jsonTxt.slice(jsonTxt.indexOf('{"ischj":'))); return _.get(rJSON, 'ischj.metadata').map((x: any) => { @@ -161,6 +283,91 @@ export class GoogleSERP extends AsyncService { } } + +@singleton() +export class GoogleSERPOldFashion extends GoogleSERP { + override async createContext() { + await this.serviceReady(); + this.asyncLocalContext.ctx.ctxIsNew = true; + + return { + browserContext: await this.puppeteerControl.browser.createBrowserContext(), + magicId: randomBytes(17).toString('base64url'), + validTill: new Date(Date.now() + 30 * 60 * 1000), + } as SerpContext; + } + + override async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { + const url = this.digestQuery(query); + + const ctx = await this.contextPool.acquire(); + + const sideLoaded = await this.sideLoadWithAllocatedProxy(url, { + ...opts, + proxyUrl: ctx.proxyUrl, + }).catch((err) => { + this.contextPool.destroy(ctx); + + return Promise.reject(err); + }); + + if ('proxy' in sideLoaded) { + ctx.proxyUrl = sideLoaded.proxy.href; + ctx.validTill = new Date(Date.now() + 30 * 60 * 1000); + } + + if (sideLoaded.status === 200) { + this.contextPool.release(ctx); + } else { + this.contextPool.destroy(ctx); + } + + if (opts && sideLoaded.sideLoadOpts) { + opts.sideLoad = sideLoaded.sideLoadOpts; + } + + const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts); + + return snapshot; + } + + async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) { + const url = this.digestQuery(query); + + url.searchParams.set('tbm', 'nws'); + + const ctx = await this.contextPool.acquire(); + + const sideLoaded = await this.sideLoadWithAllocatedProxy(url, { + ...opts, + proxyUrl: ctx.proxyUrl, + }).catch((err) => { + this.contextPool.destroy(ctx); + + return Promise.reject(err); + }); + + if ('proxy' in sideLoaded) { + ctx.proxyUrl = sideLoaded.proxy.href; + ctx.validTill = new Date(Date.now() + 30 * 60 * 1000); + } + + const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, { + ...opts, + proxyUrl: ctx.proxyUrl, + browserContext: ctx.browserContext, + }).catch((err) => { + this.contextPool.destroy(ctx); + + return Promise.reject(err); + }); + + this.contextPool.release(ctx); + + return snapshot; + } +} + async function getWebSearchResults() { if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) { throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.'); @@ -254,6 +461,96 @@ async function getWebSearchResults() { }; }).filter(Boolean) as WebSearchEntry[]; } +function getWebSearchResultsSync() { + const wrapper1 = document.querySelector('div[data-async-context^="query"]'); + + if (!wrapper1) { + return undefined; + } + + const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || ''); + + if (!query) { + return undefined; + } + + const candidates = Array.from(wrapper1.querySelectorAll('div[lang],div[data-surl]')); + + return candidates.map((x, pos) => { + const primaryLink = x.querySelector('a:not([href="#"])'); + if (!primaryLink) { + return undefined; + } + const url = primaryLink.getAttribute('href'); + + if (primaryLink.querySelector('div[role="heading"]')) { + // const spans = primaryLink.querySelectorAll('span'); + // const title = spans[0]?.textContent; + // const source = spans[1]?.textContent; + // const date = spans[spans.length - 1].textContent; + + // return { + // link: url, + // title, + // source, + // date, + // variant: 'video' + // }; + return undefined; + } + + const title = primaryLink.querySelector('h3')?.textContent; + const source = Array.from(primaryLink.querySelectorAll('span')).find((x) => x.textContent)?.textContent; + const cite = primaryLink.querySelector('cite[role=text]')?.textContent; + let date = cite?.split('ยท')[1]?.trim(); + const snippets = Array.from(x.querySelectorAll('div[data-sncf*="1"] span')); + let snippet = snippets[snippets.length - 1]?.textContent; + if (!snippet) { + snippet = x.querySelector('div.IsZvec')?.textContent?.trim() || null; + } + date ??= snippets[snippets.length - 2]?.textContent?.trim(); + const imageUrl = x.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src'); + let siteLinks = Array.from(x.querySelectorAll('div[data-sncf*="3"] a[href]')).map((l) => { + return { + link: l.getAttribute('href'), + title: l.textContent, + }; + }); + const perhapsParent = x.parentElement?.closest('div[data-hveid]'); + if (!siteLinks?.length && perhapsParent) { + const candidates = Array.from(perhapsParent.querySelectorAll('td h3')); + if (candidates.length) { + siteLinks = candidates.map((l) => { + const link = l.querySelector('a'); + if (!link) { + return undefined; + } + const snippet = l.nextElementSibling?.textContent; + return { + link: link.getAttribute('href'), + title: link.textContent, + snippet, + }; + }).filter(Boolean) as any; + } + } + + return { + link: url, + title, + source, + date, + snippet: snippet ?? undefined, + imageUrl: imageUrl?.startsWith('data:') ? undefined : imageUrl, + siteLinks: siteLinks.length ? siteLinks : undefined, + variant: 'web', + }; + }).filter(Boolean) as WebSearchEntry[]; +} +const script = new Script(`(${getWebSearchResultsSync.toString()})()`); +function runGetWebSearchResultsScript(ctx: object) { + return script.runInContext(ctx); +} async function getNewsSearchResults() { if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) { @@ -305,4 +602,10 @@ async function getNewsSearchResults() { variant: 'news', }; }).filter(Boolean) as WebSearchEntry[]; +} + +function getMagicAsyncParam(start: number = 0, inputArcid?: string) { + const arcid = inputArcid || randomBytes(17).toString('base64url'); + + return `arc_id:srp_${arcid}_1${start.toString().padStart(2, '0')},use_ac:true,_fmt:prog`; } \ No newline at end of file diff --git a/src/services/serp/internal.ts b/src/services/serp/internal.ts index 1303369..2c6d8d2 100644 --- a/src/services/serp/internal.ts +++ b/src/services/serp/internal.ts @@ -67,11 +67,5 @@ export class InternalJinaSerpService extends AsyncService { async webSearch(query: SerperSearchQueryParams) { return this.doSearch('web', query); } - async imageSearch(query: SerperSearchQueryParams) { - return this.doSearch('images', query); - } - async newsSearch(query: SerperSearchQueryParams) { - return this.doSearch('news', query); - } } diff --git a/src/services/serp/puppeteer.ts b/src/services/serp/puppeteer.ts index 40a0932..0bfffd4 100644 --- a/src/services/serp/puppeteer.ts +++ b/src/services/serp/puppeteer.ts @@ -2,7 +2,7 @@ import _ from 'lodash'; import { readFile } from 'fs/promises'; import { container, singleton } from 'tsyringe'; -import type { Browser, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer'; +import type { Browser, BrowserContext, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer'; import type { Cookie } from 'set-cookie-parser'; import puppeteer, { TimeoutError } from 'puppeteer'; @@ -21,6 +21,7 @@ import { BlackHoleDetector } from '../blackhole-detector'; export interface ScrappingOptions { + browserContext?: BrowserContext; proxyUrl?: string; cookies?: Cookie[]; overrideUserAgent?: string; @@ -38,7 +39,7 @@ export interface ScrappingOptions { status: number; headers: { [k: string]: string | string[]; }; contentType?: string; - body?: FancyFile; + body?: FancyFile | Blob; }; }; proxyOrigin: { [origin: string]: string; }; @@ -226,8 +227,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { browser!: Browser; logger = this.globalLogger.child({ service: this.constructor.name }); - __loadedPage: Page[] = []; - finalizerMap = new WeakMap>(); snMap = new WeakMap(); livePages = new Set(); @@ -251,7 +250,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { let crippledTimes = 0; this.on('crippled', () => { crippledTimes += 1; - this.__loadedPage.length = 0; this.livePages.clear(); if (crippledTimes > 5) { process.nextTick(() => { @@ -303,20 +301,16 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'); this.curlControl.impersonateChrome(this.effectiveUA); - await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r)); - this.emit('ready'); } - async newPage(bewareDeadLock: any = false) { - if (!bewareDeadLock) { - await this.serviceReady(); - } + async newPage(context?: BrowserContext) { + await this.serviceReady(); const sn = this._sn++; let page; + context ??= await this.browser.createBrowserContext(); try { - const dedicatedContext = await this.browser.createBrowserContext(); - page = await dedicatedContext.newPage(); + page = await context.newPage(); } catch (err: any) { this.logger.warn(`Failed to create page ${sn}`, { err }); this.browser.process()?.kill('SIGKILL'); @@ -347,23 +341,9 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { return page; } - async getNextPage() { - let thePage: Page | undefined; - if (this.__loadedPage.length) { - thePage = this.__loadedPage.shift(); - if (this.__loadedPage.length <= 1) { - process.nextTick(() => { - this.newPage() - .then((r) => this.__loadedPage.push(r)) - .catch((err) => { - this.logger.warn(`Failed to load new page ahead of time`, { err }); - }); - }); - } - } - + async getNextPage(context?: BrowserContext) { + const thePage = await this.newPage(context); if (!thePage) { - thePage = await this.newPage(); } const timer = setTimeout(() => { @@ -387,14 +367,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { const sn = this.snMap.get(page); this.logger.debug(`Closing page ${sn}`); await Promise.race([ - (async () => { - const ctx = page.browserContext(); - try { - await page.close(); - } finally { - await ctx.close(); - } - })(), + page.close(), delay(5000) ]).catch((err) => { this.logger.error(`Failed to destroy page ${sn}`, { err }); @@ -405,7 +378,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { async controlledScrap(parsedUrl: URL, func: (this: void) => Promise, options: ScrappingOptions = {}): Promise { // parsedUrl.search = ''; const url = parsedUrl.toString(); - const page = await this.getNextPage(); + const page = await this.getNextPage(options.browserContext); this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx); page.on('response', (_resp) => { this.blackHoleDetector.itWorked(); @@ -452,7 +425,11 @@ export class SERPSpecializedPuppeteerControl extends AsyncService { if (impersonate) { let body; if (impersonate.body) { - body = await readFile(await impersonate.body.filePath); + if (impersonate.body instanceof Blob) { + body = new Uint8Array(await impersonate.body.arrayBuffer()); + } else { + body = await readFile(await impersonate.body.filePath); + } if (req.isInterceptResolutionHandled()) { return; } diff --git a/src/utils/encoding.ts b/src/utils/encoding.ts index 0f34a9b..7ffe4ee 100644 --- a/src/utils/encoding.ts +++ b/src/utils/encoding.ts @@ -1,13 +1,12 @@ import { createReadStream } from 'fs'; import { Readable } from 'stream'; -import { TextDecoderStream } from 'stream/web'; export async function decodeFileStream( fileStream: Readable, encoding: string = 'utf-8', ): Promise { const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); - Readable.toWeb(fileStream).pipeThrough(decodeStream); + (Readable.toWeb(fileStream) as ReadableStream).pipeThrough(decodeStream); const chunks = []; for await (const chunk of decodeStream.readable) { @@ -23,7 +22,22 @@ export async function readFile( encoding: string = 'utf-8', ): Promise { const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); - Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream); + (Readable.toWeb(createReadStream(filePath)) as ReadableStream).pipeThrough(decodeStream); + const chunks = []; + + for await (const chunk of decodeStream.readable) { + chunks.push(chunk); + } + + return chunks.join(''); +} + +export async function readBlob( + blob: Blob, + encoding: string = 'utf-8', +): Promise { + const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false }); + blob.stream().pipeThrough(decodeStream); const chunks = []; for await (const chunk of decodeStream.readable) {