mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-19 00:05:59 +08:00

serp: new trick

parent 7bc70a1955
commit 2b5d865d9c

.vscode/launch.json (vendored, 4 changes)
@@ -110,8 +110,8 @@
],
"env": {
"GCLOUD_PROJECT": "reader-6b7dc",
"PREFERRED_PROXY_COUNTRY": "hk",
"OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
"PREFERRED_PROXY_COUNTRY": "us",
// "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
},
"cwd": "${workspaceFolder}",
package-lock.json (generated, 3 changes)
@@ -23,6 +23,7 @@
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.1",
"generic-pool": "^3.9.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"koa": "^2.16.0",
@@ -6249,7 +6250,7 @@
"version": "3.9.0",
"resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz",
"integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==",
"optional": true,
"license": "MIT",
"engines": {
"node": ">= 4"
}

@@ -32,6 +32,7 @@
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.1",
"generic-pool": "^3.9.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"koa": "^2.16.0",
@@ -49,6 +49,7 @@ import { TempFileManager } from '../services/temp-file';
import { MiscService } from '../services/misc';
import { HTTPServiceError } from 'civkit/http';
import { GeoIPService } from '../services/geoip';
import { writeFile } from 'fs/promises';

export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted';
@@ -1145,7 +1146,16 @@ export class CrawlerHost extends RPCHost {
if (pdfUrl.startsWith('http')) {
const sideLoaded = scrappingOptions?.sideLoad?.impersonate[pdfUrl];
if (sideLoaded?.status === 200 && sideLoaded.body) {
snapshotCopy.pdfs[0] = pathToFileURL(await sideLoaded?.body.filePath).href;
let filePath = '';
if (sideLoaded.body instanceof Blob) {
const tmpPath = this.tempFileManager.alloc();
await writeFile(tmpPath, sideLoaded.body.stream());
this.tempFileManager.bindPathTo(this.threadLocal.ctx, tmpPath);
filePath = tmpPath;
} else {
filePath = await sideLoaded.body.filePath;
}
snapshotCopy.pdfs[0] = pathToFileURL(filePath).href;
return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs);
}
@@ -27,8 +27,10 @@ import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll';
import { SERPResult } from '../db/searched';
import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { InternalJinaSerpService } from '../services/serp/internal';
import { WebSearchEntry } from '../services/serp/compat';
import { CommonGoogleSERP } from '../services/serp/common-serp';
import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google';
import { InternalJinaSerpService } from '../services/serp/internal';

const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());

@@ -72,6 +74,9 @@ export class SearcherHost extends RPCHost {
protected serperGoogle: SerperGoogleSearchService,
protected serperBing: SerperBingSearchService,
protected jinaSerp: InternalJinaSerpService,
protected commonGoogleSerp: CommonGoogleSERP,
protected googleSERP: GoogleSERP,
protected googleSERPOld: GoogleSERPOldFashion,
) {
super(...arguments);

@@ -715,26 +720,32 @@ export class SearcherHost extends RPCHost {
}
}

*iterProviders(preference?: string, variant?: string) {
*iterProviders(preference?: string, _variant?: string) {
if (preference === 'bing') {
yield this.serperBing;
yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.googleSERP;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;

return;
}

if (preference === 'google') {
yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.serperGoogle;
yield this.googleSERP;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSERPOld;

return;
}

yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.serperGoogle;
yield this.googleSERP;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSERPOld;
}

async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, noCache?: boolean): Promise<WebSearchEntry[]> {
@@ -767,22 +778,27 @@ export class SearcherHost extends RPCHost {
outerLoop:
for (const client of this.iterProviders(provider, variant)) {
const t0 = Date.now();
try {
switch (variant) {
case 'images': {
r = await Reflect.apply(client.imageSearch, client, [query]);
break;
}
case 'news': {
r = await Reflect.apply(client.newsSearch, client, [query]);
break;
}
case 'web':
default: {
r = await Reflect.apply(client.webSearch, client, [query]);
break;
}
let func;
switch (variant) {
case 'images': {
func = Reflect.get(client, 'imageSearch');
break;
}
case 'news': {
func = Reflect.get(client, 'newsSearch');
break;
}
case 'web':
default: {
func = Reflect.get(client, 'webSearch');
break;
}
}
if (!func) {
continue;
}
try {
r = await Reflect.apply(func, client, [query]);
const dt = Date.now() - t0;
this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
break outerLoop;
@@ -806,6 +822,8 @@ export class SearcherHost extends RPCHost {
this.batchedCaches.push(record);
} else if (lastError) {
throw lastError;
} else if (!r) {
throw new AssertionFailureError(`No provider can do ${variant} search atm.`);
}

return r as WebSearchEntry[];
@@ -3,6 +3,7 @@ import {
RPCHost, RPCReflection, assignMeta, RawString,
ParamValidationError,
assignTransferProtocolMeta,
AssertionFailureError,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import _ from 'lodash';
@@ -16,7 +17,7 @@ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { InsufficientBalanceError } from '../services/errors';
import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { GoogleSERP } from '../services/serp/google';
import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google';
import { WebSearchEntry } from '../services/serp/compat';
import { CrawlerOptions } from '../dto/crawler-options';
import { ScrappingOptions } from '../services/serp/puppeteer';
@@ -26,6 +27,7 @@ import { SerperBingSearchService, SerperGoogleSearchService } from '../services/
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll';
import { CommonGoogleSERP } from '../services/serp/common-serp';
import { InternalJinaSerpService } from '../services/serp/internal';

const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
@@ -91,8 +93,10 @@ export class SerpHost extends RPCHost {
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncLocalContext,
protected googleSerp: GoogleSERP,
protected googleSerpOld: GoogleSERPOldFashion,
protected serperGoogle: SerperGoogleSearchService,
protected serperBing: SerperBingSearchService,
protected commonGoogleSerp: CommonGoogleSERP,
protected jinaSerp: InternalJinaSerpService,
) {
super(...arguments);
@@ -157,7 +161,7 @@ export class SerpHost extends RPCHost {
@Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
num?: number,
@Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) _hl?: string,
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
@Param('location') location?: string,
@Param('page') page?: number,
@Param('fallback') fallback?: boolean,
@@ -318,7 +322,7 @@ export class SerpHost extends RPCHost {
q,
num,
gl,
// hl,
hl,
location,
page,
}, crawlerOptions);
@@ -451,27 +455,32 @@ export class SerpHost extends RPCHost {
return result;
}

*iterProviders(preference?: string, variant?: string) {
*iterProviders(preference?: string, _variant?: string) {
if (preference === 'bing') {
yield this.serperBing;
yield this.serperGoogle;
yield this.googleSerp;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSerpOld;

return;
}

if (preference === 'google') {
yield this.googleSerp;
yield this.googleSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSerpOld;

return;
}

// yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.serperGoogle
yield this.serperGoogle;
yield this.googleSerp;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSerpOld;
}

async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, opts: CrawlerOptions) {
@@ -506,22 +515,27 @@ export class SerpHost extends RPCHost {
outerLoop:
for (const client of this.iterProviders(provider, variant)) {
const t0 = Date.now();
try {
switch (variant) {
case 'images': {
r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]);
break;
}
case 'news': {
r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]);
break;
}
case 'web':
default: {
r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]);
break;
}
let func;
switch (variant) {
case 'images': {
func = Reflect.get(client, 'imageSearch');
break;
}
case 'news': {
func = Reflect.get(client, 'newsSearch');
break;
}
case 'web':
default: {
func = Reflect.get(client, 'webSearch');
break;
}
}
if (!func) {
continue;
}
try {
r = await Reflect.apply(func, client, [query, scrappingOptions]);
const dt = Date.now() - t0;
this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
break outerLoop;
@@ -544,6 +558,8 @@ export class SerpHost extends RPCHost {
this.batchedCaches.push(record);
} else if (lastError) {
throw lastError;
} else if (!r) {
throw new AssertionFailureError(`No provider can do ${variant} search atm.`);
}

return r;
@ -16,7 +16,7 @@ import { Readable } from 'stream';
|
||||
import { AsyncLocalContext } from './async-context';
|
||||
import { BlackHoleDetector } from './blackhole-detector';
|
||||
|
||||
export interface CURLScrappingOptions extends ScrappingOptions {
|
||||
export interface CURLScrappingOptions<T = any> extends ScrappingOptions<T> {
|
||||
method?: string;
|
||||
body?: string | Buffer;
|
||||
}
|
||||
@ -75,7 +75,7 @@ export class CurlControl extends AsyncService {
|
||||
}
|
||||
|
||||
const mixinHeaders: Record<string, string> = {
|
||||
'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
|
||||
'Sec-Ch-Ua': `"Google Chrome";v="${this.chromeVersion}", "Not-A.Brand";v="8", "Chromium";v="${this.chromeVersion}"`,
|
||||
'Sec-Ch-Ua-Mobile': '?0',
|
||||
'Sec-Ch-Ua-Platform': `"${uaPlatform}"`,
|
||||
'Upgrade-Insecure-Requests': '1',
|
||||
@ -108,11 +108,11 @@ export class CurlControl extends AsyncService {
|
||||
return curl;
|
||||
}
|
||||
|
||||
urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
||||
urlToStream(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
||||
return new Promise<{
|
||||
statusCode: number,
|
||||
statusText?: string,
|
||||
data?: FancyFile,
|
||||
data?: Readable,
|
||||
headers: HeaderInfo[],
|
||||
}>((resolve, reject) => {
|
||||
let contentType = '';
|
||||
@ -122,7 +122,7 @@ export class CurlControl extends AsyncService {
|
||||
curl.setOpt(Curl.option.FOLLOWLOCATION, false);
|
||||
curl.setOpt(Curl.option.SSL_VERIFYPEER, false);
|
||||
curl.setOpt(Curl.option.TIMEOUT_MS, crawlOpts?.timeoutMs || 30_000);
|
||||
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 3_000);
|
||||
curl.setOpt(Curl.option.CONNECTTIMEOUT_MS, 1_600);
|
||||
curl.setOpt(Curl.option.LOW_SPEED_LIMIT, 32768);
|
||||
curl.setOpt(Curl.option.LOW_SPEED_TIME, 5_000);
|
||||
if (crawlOpts?.method) {
|
||||
@ -193,7 +193,7 @@ export class CurlControl extends AsyncService {
|
||||
});
|
||||
curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
|
||||
let status = -1;
|
||||
let statusText: string|undefined;
|
||||
let statusText: string | undefined;
|
||||
let contentEncoding = '';
|
||||
curl.once('end', () => {
|
||||
if (curlStream) {
|
||||
@ -302,13 +302,10 @@ export class CurlControl extends AsyncService {
|
||||
}
|
||||
}
|
||||
|
||||
const fpath = this.tempFileManager.alloc();
|
||||
const fancyFile = FancyFile.auto(stream, fpath);
|
||||
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
||||
resolve({
|
||||
statusCode: status,
|
||||
statusText,
|
||||
data: fancyFile,
|
||||
data: stream,
|
||||
headers: headers as HeaderInfo[],
|
||||
});
|
||||
});
|
||||
@ -324,7 +321,19 @@ export class CurlControl extends AsyncService {
|
||||
let nextHopUrl = urlToCrawl;
|
||||
const fakeHeaderInfos: HeaderInfo[] = [];
|
||||
do {
|
||||
const r = await this.urlToFile1Shot(nextHopUrl, opts);
|
||||
const s = await this.urlToStream(nextHopUrl, opts);
|
||||
const r = { ...s } as {
|
||||
statusCode: number,
|
||||
statusText?: string,
|
||||
data?: FancyFile,
|
||||
headers: HeaderInfo[],
|
||||
};
|
||||
if (r.data) {
|
||||
const fpath = this.tempFileManager.alloc();
|
||||
const fancyFile = FancyFile.auto(r.data, fpath);
|
||||
this.tempFileManager.bindPathTo(fancyFile, fpath);
|
||||
r.data = fancyFile;
|
||||
}
|
||||
|
||||
if ([301, 302, 303, 307, 308].includes(r.statusCode)) {
|
||||
fakeHeaderInfos.push(...r.headers);
|
||||
@ -375,7 +384,7 @@ export class CurlControl extends AsyncService {
|
||||
const curlResult = await this.urlToFile(targetUrl, crawlOpts);
|
||||
this.blackHoleDetector.itWorked();
|
||||
let finalURL = targetUrl;
|
||||
const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
|
||||
const sideLoadOpts: CURLScrappingOptions<FancyFile>['sideLoad'] = {
|
||||
impersonate: {},
|
||||
proxyOrigin: {},
|
||||
};
|
||||
@ -421,6 +430,140 @@ export class CurlControl extends AsyncService {
|
||||
};
|
||||
}
|
||||
|
||||
async urlToBlob(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
|
||||
let leftRedirection = 6;
|
||||
let cookieRedirects = 0;
|
||||
let opts = { ...crawlOpts };
|
||||
let nextHopUrl = urlToCrawl;
|
||||
const fakeHeaderInfos: HeaderInfo[] = [];
|
||||
do {
|
||||
const s = await this.urlToStream(nextHopUrl, opts);
|
||||
const r = { ...s } as {
|
||||
statusCode: number,
|
||||
statusText?: string,
|
||||
data?: Blob,
|
||||
headers: HeaderInfo[],
|
||||
};
|
||||
|
||||
|
||||
const headers = r.headers[r.headers.length - 1];
|
||||
if ([301, 302, 303, 307, 308].includes(r.statusCode)) {
|
||||
fakeHeaderInfos.push(...r.headers);
|
||||
const location: string | undefined = headers.Location || headers.location;
|
||||
|
||||
const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie'];
|
||||
if (setCookieHeader) {
|
||||
const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
|
||||
const parsed = cookieAssignments.filter(Boolean).map((x) => parseSetCookieString(x, { decodeValues: true }));
|
||||
if (parsed.length) {
|
||||
opts.cookies = [...(opts.cookies || []), ...parsed];
|
||||
}
|
||||
if (!location) {
|
||||
cookieRedirects += 1;
|
||||
}
|
||||
}
|
||||
|
||||
if (!location && !setCookieHeader) {
|
||||
// Follow curl behavior
|
||||
if (s.data) {
|
||||
const chunks: Buffer[] = [];
|
||||
s.data.on('data', (chunk) => {
|
||||
chunks.push(chunk);
|
||||
});
|
||||
await new Promise((resolve, reject) => {
|
||||
s.data!.once('end', resolve);
|
||||
s.data!.once('error', reject);
|
||||
});
|
||||
r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] });
|
||||
}
|
||||
return {
|
||||
statusCode: r.statusCode,
|
||||
data: r.data,
|
||||
headers: fakeHeaderInfos.concat(r.headers),
|
||||
};
|
||||
}
|
||||
if (!location && cookieRedirects > 1) {
|
||||
throw new ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
|
||||
}
|
||||
|
||||
nextHopUrl = new URL(location || '', nextHopUrl);
|
||||
leftRedirection -= 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (s.data) {
|
||||
const chunks: Buffer[] = [];
|
||||
s.data.on('data', (chunk) => {
|
||||
chunks.push(chunk);
|
||||
});
|
||||
await new Promise((resolve, reject) => {
|
||||
s.data!.once('end', resolve);
|
||||
s.data!.once('error', reject);
|
||||
});
|
||||
r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] });
|
||||
}
|
||||
|
||||
return {
|
||||
statusCode: r.statusCode,
|
||||
statusText: r.statusText,
|
||||
data: r.data,
|
||||
headers: fakeHeaderInfos.concat(r.headers),
|
||||
};
|
||||
} while (leftRedirection > 0);
|
||||
|
||||
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`);
|
||||
}
|
||||
|
||||
async sideLoadBlob(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
|
||||
const curlResult = await this.urlToBlob(targetUrl, crawlOpts);
|
||||
this.blackHoleDetector.itWorked();
|
||||
let finalURL = targetUrl;
|
||||
const sideLoadOpts: CURLScrappingOptions<Blob>['sideLoad'] = {
|
||||
impersonate: {},
|
||||
proxyOrigin: {},
|
||||
};
|
||||
for (const headers of curlResult.headers) {
|
||||
sideLoadOpts.impersonate[finalURL.href] = {
|
||||
status: headers.result?.code || -1,
|
||||
headers: _.omit(headers, 'result'),
|
||||
contentType: headers['Content-Type'] || headers['content-type'],
|
||||
};
|
||||
if (crawlOpts?.proxyUrl) {
|
||||
sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
|
||||
}
|
||||
if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
|
||||
const location = headers.Location || headers.location;
|
||||
if (location) {
|
||||
finalURL = new URL(location, finalURL);
|
||||
}
|
||||
}
|
||||
}
|
||||
const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
|
||||
const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (curlResult.data?.type) || 'application/octet-stream';
|
||||
const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
|
||||
const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();
|
||||
|
||||
if (sideLoadOpts.impersonate[finalURL.href] && (curlResult.data?.size)) {
|
||||
sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
|
||||
}
|
||||
|
||||
// This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
|
||||
this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);
|
||||
|
||||
return {
|
||||
finalURL,
|
||||
sideLoadOpts,
|
||||
chain: curlResult.headers,
|
||||
status: curlResult.statusCode,
|
||||
statusText: curlResult.statusText,
|
||||
headers: lastHeaders,
|
||||
contentType,
|
||||
contentDisposition,
|
||||
fileName,
|
||||
file: curlResult.data
|
||||
};
|
||||
}
|
||||
|
||||
digestCurlCode(code: CurlCode, msg: string) {
|
||||
switch (code) {
|
||||
// 400 User errors
|
||||
|
@@ -79,7 +79,7 @@ export interface ExtendedSnapshot extends PageSnapshot {
imgs: ImgBrief[];
}

export interface ScrappingOptions {
export interface ScrappingOptions<T = FancyFile | Blob> {
proxyUrl?: string;
cookies?: Cookie[];
favorScreenshot?: boolean;
@@ -101,7 +101,7 @@ export interface ScrappingOptions {
status: number;
headers: { [k: string]: string | string[]; };
contentType?: string;
body?: FancyFile;
body?: T;
};
};
proxyOrigin: { [origin: string]: string; };
@@ -912,7 +912,11 @@ export class PuppeteerControl extends AsyncService {
if (impersonate) {
let body;
if (impersonate.body) {
body = await readFile(await impersonate.body.filePath);
if (impersonate.body instanceof Blob) {
body = new Uint8Array(await impersonate.body.arrayBuffer());
} else {
body = await readFile(await impersonate.body.filePath);
}
if (req.isInterceptResolutionHandled()) {
return;
}
src/services/serp/common-serp.ts (new file, 133 lines)
@@ -0,0 +1,133 @@
import { singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { GlobalLogger } from '../logger';
import { JSDomControl } from '../jsdom';
import _ from 'lodash';
import { WebSearchEntry } from './compat';
import { ServiceBadAttemptError } from '../errors';
import commonSerpClients, { CommonSerpImageResponse, CommonSerpNewsResponse, CommonSerpWebResponse } from '../../shared/3rd-party/common-serp';
import { AsyncLocalContext } from '../async-context';

@singleton()
export class CommonGoogleSERP extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name });
googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';

protected ctxIteratorMap = new WeakMap<object, ReturnType<CommonGoogleSERP['iterClients']>>();

constructor(
protected globalLogger: GlobalLogger,
protected jsDomControl: JSDomControl,
protected asyncContext: AsyncLocalContext,

) {
super(...arguments);
}

override async init() {
await this.dependencyReady();

this.emit('ready');
}

*iterClients() {
if (!commonSerpClients.length) {
return;
}
while (true) {
yield* commonSerpClients;
}
}

getClient() {
const ctx = this.asyncContext.ctx;
const it = this.ctxIteratorMap.get(ctx) || this.iterClients();
this.ctxIteratorMap.set(ctx, it);
const client = it.next().value;
if (!client) {
throw new ServiceBadAttemptError('No client available');
}
return client;
}


digestQuery(query: { [k: string]: any; }) {
const url = new URL(`https://${this.googleDomain}/search`);
const clone = { ...query };
const num = clone.num || 10;
if (clone.page) {
const page = parseInt(clone.page);
delete clone.page;
clone.start = (page - 1) * num;
if (clone.start === 0) {
delete clone.start;
}
}
if (clone.location) {
delete clone.location;
}

for (const [k, v] of Object.entries(clone)) {
if (v === undefined || v === null) {
continue;
}
url.searchParams.set(k, `${v}`);
}

return url;
}

async webSearch(query: { [k: string]: any; }) {
const url = this.digestQuery(query);

const client = this.getClient();

const r = await client.queryJSON(url.href) as CommonSerpWebResponse;

return r.organic.map((x)=> ({
link: x.link,
title: x.title,
snippet: x.description,
variant: 'web',
})) as WebSearchEntry[];
}

async newsSearch(query: { [k: string]: any; }) {
const url = this.digestQuery(query);

url.searchParams.set('tbm', 'nws');

const client = this.getClient();

const r = await client.queryJSON(url.href) as CommonSerpNewsResponse;

return r.news.map((x)=> ({
link: x.link,
title: x.title,
snippet: x.description,
source: x.source,
date: x.date,
imageUrl: x.image,
variant: 'news',
})) as WebSearchEntry[];
}

async imageSearch(query: { [k: string]: any; }) {
const url = this.digestQuery(query);

url.searchParams.set('tbm', 'isch');

const client = this.getClient();

const r = await client.queryJSON(url.href) as CommonSerpImageResponse;

return r.images.map((x)=> ({
link: x.link,
title: x.title,
snippet: x.image_alt,
source: x.source,
imageUrl: x.image,
variant: 'images',
})) as WebSearchEntry[];
}
}
@ -7,24 +7,74 @@ import _ from 'lodash';
|
||||
import { WebSearchEntry } from './compat';
|
||||
import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer';
|
||||
import { CurlControl } from '../curl';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { ApplicationError } from 'civkit/civ-rpc';
|
||||
import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
|
||||
import { parseJSONText } from 'civkit/vectorize';
|
||||
import { retryWith } from 'civkit/decorators';
|
||||
import { ProxyProviderService } from '../../shared/services/proxy-provider';
|
||||
import { retry, retryWith } from 'civkit/decorators';
|
||||
import { SERPProxyProviderService } from '../../shared/services/proxy-provider';
|
||||
import { readBlob } from '../../utils/encoding';
|
||||
import { createContext, Script } from 'vm';
|
||||
import { BrowserContext } from 'puppeteer';
|
||||
import { createPool } from 'generic-pool';
|
||||
import { randomBytes } from 'crypto';
|
||||
import { AsyncLocalContext } from '../async-context';
|
||||
|
||||
interface SerpContext {
|
||||
proxyUrl?: string;
|
||||
browserContext?: BrowserContext;
|
||||
validTill?: Date;
|
||||
magicId?: string;
|
||||
}
|
||||
|
||||
@singleton()
|
||||
export class GoogleSERP extends AsyncService {
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';
|
||||
|
||||
nativeIPHealthy = true;
|
||||
|
||||
contextPool = createPool({
|
||||
create: () => {
|
||||
return this.createContext();
|
||||
},
|
||||
destroy: async (ctx: SerpContext) => {
|
||||
if (ctx.browserContext) {
|
||||
try {
|
||||
await ctx.browserContext.close();
|
||||
} catch (err) {
|
||||
this.logger.warn('Error closing browser context', { err });
|
||||
}
|
||||
}
|
||||
},
|
||||
validate: async (ctx: SerpContext) => {
|
||||
if (ctx.validTill && ctx.validTill > (new Date(Date.now() + 5 * 60 * 1000))) {
|
||||
return true;
|
||||
}
|
||||
|
||||
return !ctx.proxyUrl;
|
||||
},
|
||||
}, {
|
||||
max: 3_000,
|
||||
testOnBorrow: true,
|
||||
});
|
||||
|
||||
protected async createContext() {
|
||||
await this.serviceReady();
|
||||
this.asyncLocalContext.ctx.ctxIsNew = true;
|
||||
|
||||
return {
|
||||
magicId: randomBytes(17).toString('base64url'),
|
||||
validTill: new Date(Date.now() + 30 * 60 * 1000),
|
||||
} as SerpContext;
|
||||
}
|
||||
|
||||
constructor(
|
||||
protected globalLogger: GlobalLogger,
|
||||
protected puppeteerControl: SERPSpecializedPuppeteerControl,
|
||||
protected jsDomControl: JSDomControl,
|
||||
protected curlControl: CurlControl,
|
||||
protected proxyProvider: ProxyProviderService,
|
||||
protected proxyProvider: SERPProxyProviderService,
|
||||
protected asyncLocalContext: AsyncLocalContext,
|
||||
) {
|
||||
const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
|
||||
super(...filteredDeps);
|
||||
@ -36,6 +86,15 @@ export class GoogleSERP extends AsyncService {
|
||||
this.emit('ready');
|
||||
}
|
||||
|
||||
nativeIPBlocked() {
|
||||
this.nativeIPHealthy = false;
|
||||
this.logger.warn('Native IP is not healthy.');
|
||||
setTimeout(() => {
|
||||
this.nativeIPHealthy = true;
|
||||
this.logger.debug('Presume native IP healthy again after timeout.');
|
||||
}, 1000 * 60 * 60);
|
||||
}
|
||||
|
||||
@retryWith((err) => {
|
||||
if (err instanceof ServiceBadApproachError) {
|
||||
return false;
|
||||
@ -51,15 +110,13 @@ export class GoogleSERP extends AsyncService {
|
||||
return undefined;
|
||||
}, 3)
|
||||
async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) {
|
||||
if (opts?.allocProxy === 'none') {
|
||||
return this.curlControl.sideLoad(url, opts);
|
||||
if (opts?.allocProxy === 'none' || opts?.proxyUrl) {
|
||||
return this.curlControl.sideLoadBlob(url, opts);
|
||||
}
|
||||
|
||||
const proxy = await this.proxyProvider.alloc(
|
||||
process.env.PREFERRED_PROXY_COUNTRY || 'auto'
|
||||
);
|
||||
const proxy = await this.proxyProvider.alloc();
|
||||
this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
|
||||
const r = await this.curlControl.sideLoad(url, {
|
||||
const r = await this.curlControl.sideLoadBlob(url, {
|
||||
...opts,
|
||||
proxyUrl: proxy.href,
|
||||
});
|
||||
@ -101,48 +158,113 @@ export class GoogleSERP extends AsyncService {
|
||||
return url;
|
||||
}
|
||||
|
||||
@retry(2)
|
||||
async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
|
||||
const url = this.digestQuery(query);
|
||||
const origHref = url.href;
|
||||
if (!url.searchParams.has('start')) {
|
||||
url.searchParams.set('start', '0');
|
||||
}
|
||||
url.searchParams.set('asearch', 'arc');
|
||||
|
||||
const ctx = await this.contextPool.acquire();
|
||||
|
||||
url.searchParams.set('async', getMagicAsyncParam(query.start, ctx.magicId));
|
||||
|
||||
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
|
||||
...opts,
|
||||
allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'),
|
||||
proxyUrl: ctx.proxyUrl,
|
||||
}).catch((err) => {
|
||||
this.contextPool.destroy(ctx);
|
||||
|
||||
return Promise.reject(err);
|
||||
});
|
||||
|
||||
if ('proxy' in sideLoaded) {
|
||||
ctx.proxyUrl = sideLoaded.proxy.href;
|
||||
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
|
||||
}
|
||||
|
||||
if (sideLoaded.status === 200) {
|
||||
this.contextPool.release(ctx);
|
||||
} else {
|
||||
if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) {
|
||||
this.nativeIPBlocked();
|
||||
}
|
||||
this.contextPool.destroy(ctx);
|
||||
}
|
||||
|
||||
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
|
||||
if (opts && sideLoaded.sideLoadOpts) {
|
||||
opts.sideLoad = sideLoaded.sideLoadOpts;
|
||||
}
|
||||
|
||||
const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);
|
||||
if (!sideLoaded.file) {
|
||||
throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
|
||||
}
|
||||
const contentType = sideLoaded.contentType;
|
||||
const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
|
||||
async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
|
||||
const url = this.digestQuery(query);
|
||||
|
||||
url.searchParams.set('tbm', 'nws');
|
||||
|
||||
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
|
||||
if (opts && sideLoaded.sideLoadOpts) {
|
||||
opts.sideLoad = sideLoaded.sideLoadOpts;
|
||||
let html = await readBlob(sideLoaded.file, encoding);
|
||||
let innerCharset;
|
||||
const peek = html.slice(0, 1024);
|
||||
innerCharset ??= peek.match(/<meta[^>]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase();
|
||||
innerCharset ??= peek.match(/<meta[^>]+charset="([^>"]+)\"/i)?.[1]?.toLowerCase();
|
||||
if (innerCharset && innerCharset !== encoding) {
|
||||
html = await readBlob(sideLoaded.file, innerCharset);
|
||||
}
|
||||
|
||||
const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts);
|
||||
const jsdom = this.jsDomControl.linkedom.parseHTML(html, { location: { href: origHref } });
|
||||
try {
|
||||
const r = runGetWebSearchResultsScript(createContext(jsdom));
|
||||
|
||||
return snapshot;
|
||||
return r;
|
||||
} catch (err) {
|
||||
throw new ServiceBadAttemptError({
|
||||
message: 'Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.',
|
||||
err
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@retry(2)
|
||||
async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
|
||||
const url = this.digestQuery(query);
|
||||
|
||||
url.searchParams.set('tbm', 'isch');
|
||||
url.searchParams.set('asearch', 'isch');
|
||||
url.searchParams.set('async', `_fmt:json,p:1,ijn:${query.start ? Math.floor(query.start / (query.num || 10)) : 0}`);
|
||||
const ctx = await this.contextPool.acquire();
|
||||
|
||||
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
|
||||
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
|
||||
...opts,
|
||||
proxyUrl: ctx.proxyUrl,
|
||||
allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'),
|
||||
}).catch((err) => {
|
||||
this.contextPool.destroy(ctx);
|
||||
|
||||
return Promise.reject(err);
|
||||
});
|
||||
|
||||
if ('proxy' in sideLoaded) {
|
||||
ctx.proxyUrl = sideLoaded.proxy.href;
|
||||
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
|
||||
}
|
||||
|
||||
if (sideLoaded.status === 200) {
|
||||
this.contextPool.release(ctx);
|
||||
} else {
|
||||
this.contextPool.destroy(ctx);
|
||||
if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) {
|
||||
this.nativeIPBlocked();
|
||||
}
|
||||
}
|
||||
|
||||
if (sideLoaded.status !== 200 || !sideLoaded.file) {
|
||||
throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
|
||||
}
|
||||
|
||||
const jsonTxt = (await readFile((await sideLoaded.file.filePath))).toString();
|
||||
const jsonTxt = (await readBlob(sideLoaded.file)).toString();
|
||||
const rJSON = parseJSONText(jsonTxt.slice(jsonTxt.indexOf('{"ischj":')));
|
||||
|
||||
return _.get(rJSON, 'ischj.metadata').map((x: any) => {
|
||||
@ -161,6 +283,91 @@ export class GoogleSERP extends AsyncService {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@singleton()
|
||||
export class GoogleSERPOldFashion extends GoogleSERP {
|
||||
override async createContext() {
|
||||
await this.serviceReady();
|
||||
this.asyncLocalContext.ctx.ctxIsNew = true;
|
||||
|
||||
return {
|
||||
browserContext: await this.puppeteerControl.browser.createBrowserContext(),
|
||||
magicId: randomBytes(17).toString('base64url'),
|
||||
validTill: new Date(Date.now() + 30 * 60 * 1000),
|
||||
} as SerpContext;
|
||||
}
|
||||
|
||||
override async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
|
||||
const url = this.digestQuery(query);
|
||||
|
||||
const ctx = await this.contextPool.acquire();
|
||||
|
||||
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
|
||||
...opts,
|
||||
proxyUrl: ctx.proxyUrl,
|
||||
}).catch((err) => {
|
||||
this.contextPool.destroy(ctx);
|
||||
|
||||
return Promise.reject(err);
|
||||
});
|
||||
|
||||
if ('proxy' in sideLoaded) {
|
||||
ctx.proxyUrl = sideLoaded.proxy.href;
|
||||
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
|
||||
}
|
||||
|
||||
if (sideLoaded.status === 200) {
|
||||
this.contextPool.release(ctx);
|
||||
} else {
|
||||
this.contextPool.destroy(ctx);
|
||||
}
|
||||
|
||||
if (opts && sideLoaded.sideLoadOpts) {
|
||||
opts.sideLoad = sideLoaded.sideLoadOpts;
|
||||
}
|
||||
|
||||
const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
|
||||
async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
|
||||
const url = this.digestQuery(query);
|
||||
|
||||
url.searchParams.set('tbm', 'nws');
|
||||
|
||||
const ctx = await this.contextPool.acquire();
|
||||
|
||||
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
|
||||
...opts,
|
||||
proxyUrl: ctx.proxyUrl,
|
||||
}).catch((err) => {
|
||||
this.contextPool.destroy(ctx);
|
||||
|
||||
return Promise.reject(err);
|
||||
});
|
||||
|
||||
if ('proxy' in sideLoaded) {
|
||||
ctx.proxyUrl = sideLoaded.proxy.href;
|
||||
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
|
||||
}
|
||||
|
||||
const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, {
|
||||
...opts,
|
||||
proxyUrl: ctx.proxyUrl,
|
||||
browserContext: ctx.browserContext,
|
||||
}).catch((err) => {
|
||||
this.contextPool.destroy(ctx);
|
||||
|
||||
return Promise.reject(err);
|
||||
});
|
||||
|
||||
this.contextPool.release(ctx);
|
||||
|
||||
return snapshot;
|
||||
}
|
||||
}
|
||||
|
||||
async function getWebSearchResults() {
|
||||
if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
|
||||
throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
|
||||
@ -254,6 +461,96 @@ async function getWebSearchResults() {
|
||||
};
|
||||
}).filter(Boolean) as WebSearchEntry[];
|
||||
}
|
||||
function getWebSearchResultsSync() {
|
||||
const wrapper1 = document.querySelector('div[data-async-context^="query"]');
|
||||
|
||||
if (!wrapper1) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || '');
|
||||
|
||||
if (!query) {
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const candidates = Array.from(wrapper1.querySelectorAll('div[lang],div[data-surl]'));
|
||||
|
||||
return candidates.map((x, pos) => {
|
||||
const primaryLink = x.querySelector('a:not([href="#"])');
|
||||
if (!primaryLink) {
|
||||
return undefined;
|
||||
}
|
||||
const url = primaryLink.getAttribute('href');
|
||||
|
||||
if (primaryLink.querySelector('div[role="heading"]')) {
|
||||
// const spans = primaryLink.querySelectorAll('span');
|
||||
// const title = spans[0]?.textContent;
|
||||
// const source = spans[1]?.textContent;
|
||||
// const date = spans[spans.length - 1].textContent;
|
||||
|
||||
// return {
|
||||
// link: url,
|
||||
// title,
|
||||
// source,
|
||||
// date,
|
||||
// variant: 'video'
|
||||
// };
|
||||
return undefined;
|
||||
}
|
||||
|
||||
const title = primaryLink.querySelector('h3')?.textContent;
|
||||
const source = Array.from(primaryLink.querySelectorAll('span')).find((x) => x.textContent)?.textContent;
|
||||
const cite = primaryLink.querySelector('cite[role=text]')?.textContent;
|
||||
let date = cite?.split('·')[1]?.trim();
|
||||
const snippets = Array.from(x.querySelectorAll('div[data-sncf*="1"] span'));
|
||||
let snippet = snippets[snippets.length - 1]?.textContent;
|
||||
if (!snippet) {
|
||||
snippet = x.querySelector('div.IsZvec')?.textContent?.trim() || null;
|
||||
}
|
||||
date ??= snippets[snippets.length - 2]?.textContent?.trim();
|
||||
const imageUrl = x.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');
|
||||
let siteLinks = Array.from(x.querySelectorAll('div[data-sncf*="3"] a[href]')).map((l) => {
|
||||
return {
|
||||
link: l.getAttribute('href'),
|
||||
title: l.textContent,
|
||||
};
|
||||
});
|
||||
const perhapsParent = x.parentElement?.closest('div[data-hveid]');
|
||||
if (!siteLinks?.length && perhapsParent) {
|
||||
const candidates = Array.from(perhapsParent.querySelectorAll('td h3'));
|
||||
if (candidates.length) {
|
||||
siteLinks = candidates.map((l) => {
|
||||
const link = l.querySelector('a');
|
||||
if (!link) {
|
||||
return undefined;
|
||||
}
|
||||
const snippet = l.nextElementSibling?.textContent;
|
||||
return {
|
||||
link: link.getAttribute('href'),
|
||||
title: link.textContent,
|
||||
snippet,
|
||||
};
|
||||
}).filter(Boolean) as any;
|
||||
}
|
||||
}
|
||||
|
||||
return {
|
||||
link: url,
|
||||
title,
|
||||
source,
|
||||
date,
|
||||
snippet: snippet ?? undefined,
|
||||
imageUrl: imageUrl?.startsWith('data:') ? undefined : imageUrl,
|
||||
siteLinks: siteLinks.length ? siteLinks : undefined,
|
||||
variant: 'web',
|
||||
};
|
||||
}).filter(Boolean) as WebSearchEntry[];
|
||||
}
|
||||
const script = new Script(`(${getWebSearchResultsSync.toString()})()`);
|
||||
function runGetWebSearchResultsScript(ctx: object) {
|
||||
return script.runInContext(ctx);
|
||||
}
|
||||
|
||||
async function getNewsSearchResults() {
|
||||
if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
|
||||
@ -306,3 +603,9 @@ async function getNewsSearchResults() {
|
||||
};
|
||||
}).filter(Boolean) as WebSearchEntry[];
|
||||
}
|
||||
|
||||
function getMagicAsyncParam(start: number = 0, inputArcid?: string) {
|
||||
const arcid = inputArcid || randomBytes(17).toString('base64url');
|
||||
|
||||
return `arc_id:srp_${arcid}_1${start.toString().padStart(2, '0')},use_ac:true,_fmt:prog`;
|
||||
}
|
@@ -67,11 +67,5 @@ export class InternalJinaSerpService extends AsyncService {
async webSearch(query: SerperSearchQueryParams) {
return this.doSearch('web', query);
}
async imageSearch(query: SerperSearchQueryParams) {
return this.doSearch('images', query);
}
async newsSearch(query: SerperSearchQueryParams) {
return this.doSearch('news', query);
}

}
@ -2,7 +2,7 @@ import _ from 'lodash';
|
||||
import { readFile } from 'fs/promises';
|
||||
import { container, singleton } from 'tsyringe';
|
||||
|
||||
import type { Browser, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
|
||||
import type { Browser, BrowserContext, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
|
||||
import type { Cookie } from 'set-cookie-parser';
|
||||
import puppeteer, { TimeoutError } from 'puppeteer';
|
||||
|
||||
@ -21,6 +21,7 @@ import { BlackHoleDetector } from '../blackhole-detector';
|
||||
|
||||
|
||||
export interface ScrappingOptions {
|
||||
browserContext?: BrowserContext;
|
||||
proxyUrl?: string;
|
||||
cookies?: Cookie[];
|
||||
overrideUserAgent?: string;
|
||||
@ -38,7 +39,7 @@ export interface ScrappingOptions {
|
||||
status: number;
|
||||
headers: { [k: string]: string | string[]; };
|
||||
contentType?: string;
|
||||
body?: FancyFile;
|
||||
body?: FancyFile | Blob;
|
||||
};
|
||||
};
|
||||
proxyOrigin: { [origin: string]: string; };
|
||||
@ -226,8 +227,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
||||
browser!: Browser;
|
||||
logger = this.globalLogger.child({ service: this.constructor.name });
|
||||
|
||||
__loadedPage: Page[] = [];
|
||||
|
||||
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
|
||||
snMap = new WeakMap<Page, number>();
|
||||
livePages = new Set<Page>();
|
||||
@ -251,7 +250,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
||||
let crippledTimes = 0;
|
||||
this.on('crippled', () => {
|
||||
crippledTimes += 1;
|
||||
this.__loadedPage.length = 0;
|
||||
this.livePages.clear();
|
||||
if (crippledTimes > 5) {
|
||||
process.nextTick(() => {
|
||||
@ -303,20 +301,16 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
||||
this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
|
||||
this.curlControl.impersonateChrome(this.effectiveUA);
|
||||
|
||||
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
|
||||
|
||||
this.emit('ready');
|
||||
}
|
||||
|
||||
async newPage<T>(bewareDeadLock: any = false) {
|
||||
if (!bewareDeadLock) {
|
||||
await this.serviceReady();
|
||||
}
|
||||
async newPage<T>(context?: BrowserContext) {
|
||||
await this.serviceReady();
|
||||
const sn = this._sn++;
|
||||
let page;
|
||||
context ??= await this.browser.createBrowserContext();
|
||||
try {
|
||||
const dedicatedContext = await this.browser.createBrowserContext();
|
||||
page = await dedicatedContext.newPage();
|
||||
page = await context.newPage();
|
||||
} catch (err: any) {
|
||||
this.logger.warn(`Failed to create page ${sn}`, { err });
|
||||
this.browser.process()?.kill('SIGKILL');
|
||||
@ -347,23 +341,9 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
||||
return page;
|
||||
}
|
||||
|
||||
async getNextPage() {
|
||||
let thePage: Page | undefined;
|
||||
if (this.__loadedPage.length) {
|
||||
thePage = this.__loadedPage.shift();
|
||||
if (this.__loadedPage.length <= 1) {
|
||||
process.nextTick(() => {
|
||||
this.newPage()
|
||||
.then((r) => this.__loadedPage.push(r))
|
||||
.catch((err) => {
|
||||
this.logger.warn(`Failed to load new page ahead of time`, { err });
|
||||
});
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
async getNextPage(context?: BrowserContext) {
|
||||
const thePage = await this.newPage(context);
|
||||
if (!thePage) {
|
||||
thePage = await this.newPage();
|
||||
}
|
||||
|
||||
const timer = setTimeout(() => {
|
||||
@ -387,14 +367,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
||||
const sn = this.snMap.get(page);
|
||||
this.logger.debug(`Closing page ${sn}`);
|
||||
await Promise.race([
|
||||
(async () => {
|
||||
const ctx = page.browserContext();
|
||||
try {
|
||||
await page.close();
|
||||
} finally {
|
||||
await ctx.close();
|
||||
}
|
||||
})(),
|
||||
page.close(),
|
||||
delay(5000)
|
||||
]).catch((err) => {
|
||||
this.logger.error(`Failed to destroy page ${sn}`, { err });
|
||||
@ -405,7 +378,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
||||
async controlledScrap<T>(parsedUrl: URL, func: (this: void) => Promise<T>, options: ScrappingOptions = {}): Promise<T> {
|
||||
// parsedUrl.search = '';
|
||||
const url = parsedUrl.toString();
|
||||
const page = await this.getNextPage();
|
||||
const page = await this.getNextPage(options.browserContext);
|
||||
this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
|
||||
page.on('response', (_resp) => {
|
||||
this.blackHoleDetector.itWorked();
|
||||
@ -452,7 +425,11 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
|
||||
if (impersonate) {
|
||||
let body;
|
||||
if (impersonate.body) {
|
||||
body = await readFile(await impersonate.body.filePath);
|
||||
if (impersonate.body instanceof Blob) {
|
||||
body = new Uint8Array(await impersonate.body.arrayBuffer());
|
||||
} else {
|
||||
body = await readFile(await impersonate.body.filePath);
|
||||
}
|
||||
if (req.isInterceptResolutionHandled()) {
|
||||
return;
|
||||
}
|
||||
|
@@ -1,13 +1,12 @@
import { createReadStream } from 'fs';
import { Readable } from 'stream';
import { TextDecoderStream } from 'stream/web';

export async function decodeFileStream(
fileStream: Readable,
encoding: string = 'utf-8',
): Promise<string> {
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
Readable.toWeb(fileStream).pipeThrough(decodeStream);
(Readable.toWeb(fileStream) as ReadableStream).pipeThrough(decodeStream);
const chunks = [];

for await (const chunk of decodeStream.readable) {
@@ -23,7 +22,22 @@ export async function readFile(
encoding: string = 'utf-8',
): Promise<string> {
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream);
(Readable.toWeb(createReadStream(filePath)) as ReadableStream).pipeThrough(decodeStream);
const chunks = [];

for await (const chunk of decodeStream.readable) {
chunks.push(chunk);
}

return chunks.join('');
}

export async function readBlob(
blob: Blob,
encoding: string = 'utf-8',
): Promise<string> {
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
blob.stream().pipeThrough(decodeStream);
const chunks = [];

for await (const chunk of decodeStream.readable) {