serp: new trick

Yanlong Wang 2025-05-08 15:21:20 +08:00
parent 5f07900eab
commit a178723579
No known key found for this signature in database
GPG Key ID: C0A623C0BADF9F37
15 changed files with 784 additions and 142 deletions

.vscode/launch.json vendored

@ -90,6 +90,7 @@
],
"env": {
"GCLOUD_PROJECT": "reader-6b7dc",
"PREFERRED_PROXY_COUNTRY": "us",
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
},
"cwd": "${workspaceFolder}",
@ -110,8 +111,8 @@
],
"env": {
"GCLOUD_PROJECT": "reader-6b7dc",
"PREFERRED_PROXY_COUNTRY": "hk",
"OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
"PREFERRED_PROXY_COUNTRY": "us",
// "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
"LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
},
"cwd": "${workspaceFolder}",


@ -18,10 +18,13 @@ Or just visit these URLs (**Read**) https://r.jina.ai/https://github.com/jina-ai
## Updates
- **2024-10-08**: Introduced an `adaptive crawler`. It can recursively crawl a website and extract the pages most relevant to a given webpage.
- **2024-07-15**: To restrict the results of `s.jina.ai` to a certain domain/website, you can set e.g. `site=jina.ai` in the query parameters, which enables in-site search (see the sketch after this list). For more options, [try our updated live-demo](https://jina.ai/reader/#apiform).
- **2024-07-01**: We have resolved a DDoS attack and other traffic abuse ongoing since June 27th. We also found a bug introduced on June 28th that could cause higher latency for some websites. Both the attack and the bug have been resolved; if you experienced high latency on r.jina.ai between June 27th and 30th, it should be back to normal now.
- **2024-05-30**: Reader can now read arbitrary PDFs from any URL! Check out [this PDF result from NASA.gov](https://r.jina.ai/https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf) vs [the original](https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf).
- **2024-05-15**: We introduced a new endpoint `s.jina.ai` that searches the web and returns the top-5 results, each in an LLM-friendly format. [Read more about this new feature here](https://jina.ai/news/jina-reader-for-search-grounding-to-improve-factuality-of-llms).
- **2024-05-08**: Image captioning is off by default for better latency. To turn it on, set `x-with-generated-alt: true` in the request header.
- **2024-05-03**: We finally resolved a DDoS attack that had been ongoing since April 29th. Our API is now more reliable and scalable than ever!
- **2024-04-24**: You now have more fine-grained control over the Reader API [using headers](#using-request-headers), e.g. forwarding cookies and using an HTTP proxy.
- **2024-04-15**: Reader now supports image reading! It captions all images at the specified URL and adds `Image [idx]: [caption]` as an alt tag (if they initially lack one), enabling downstream LLMs to use the images in reasoning, summarizing, etc. [See example here](https://x.com/JinaAI_/status/1780094402071023926).
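
A minimal sketch of the in-site search mentioned in the **2024-07-15** note (any HTTP client works; plain `fetch` shown here):

```ts
// Restrict s.jina.ai results to a single site via the `site` query parameter.
const res = await fetch('https://s.jina.ai/When%20was%20Jina%20AI%20founded%3F?site=jina.ai');
console.log(await res.text()); // top results, as LLM-friendly text, from jina.ai only
```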
@ -151,8 +154,15 @@ All images in that page that lack `alt` tag can be auto-captioned by a VLM (visi
curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page
```
## How it works
[![Ask DeepWiki](https://deepwiki.com/badge.svg)](https://deepwiki.com/jina-ai/reader)
## Install
You will need the following tools to run the project:
- Node v18 (the build fails for Node versions >18)
```bash
git clone git@github.com:jina-ai/reader.git
npm install
```
## What is `thinapps-shared` submodule?

package-lock.json generated

@ -23,6 +23,7 @@
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.1",
"generic-pool": "^3.9.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"koa": "^2.16.0",
@ -6249,7 +6250,7 @@
"version": "3.9.0",
"resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz",
"integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==",
"optional": true,
"license": "MIT",
"engines": {
"node": ">= 4"
}


@ -32,6 +32,7 @@
"express": "^4.19.2",
"firebase-admin": "^12.1.0",
"firebase-functions": "^6.1.1",
"generic-pool": "^3.9.0",
"htmlparser2": "^9.0.0",
"jose": "^5.1.0",
"koa": "^2.16.0",


@ -49,6 +49,7 @@ import { TempFileManager } from '../services/temp-file';
import { MiscService } from '../services/misc';
import { HTTPServiceError } from 'civkit/http';
import { GeoIPService } from '../services/geoip';
import { writeFile } from 'fs/promises';
export interface ExtraScrappingOptions extends ScrappingOptions {
withIframe?: boolean | 'quoted';
@ -1145,7 +1146,16 @@ export class CrawlerHost extends RPCHost {
if (pdfUrl.startsWith('http')) {
const sideLoaded = scrappingOptions?.sideLoad?.impersonate[pdfUrl];
if (sideLoaded?.status === 200 && sideLoaded.body) {
snapshotCopy.pdfs[0] = pathToFileURL(await sideLoaded?.body.filePath).href;
let filePath = '';
if (sideLoaded.body instanceof Blob) {
const tmpPath = this.tempFileManager.alloc();
await writeFile(tmpPath, sideLoaded.body.stream());
this.tempFileManager.bindPathTo(this.threadLocal.ctx, tmpPath);
filePath = tmpPath;
} else {
filePath = await sideLoaded.body.filePath;
}
snapshotCopy.pdfs[0] = pathToFileURL(filePath).href;
return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs);
}
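
A minimal sketch of the Blob-to-temp-file spill above, assuming Node 18+ where `fs/promises.writeFile` accepts a web `ReadableStream` (the helper name is illustrative):

```ts
import { writeFile } from 'fs/promises';

// Side-loaded PDF bodies may now arrive in memory as a Blob, but the PDF
// pipeline needs a real file path, so the Blob is streamed to a temp file.
async function spillBlobToFile(blob: Blob, tmpPath: string): Promise<string> {
    await writeFile(tmpPath, blob.stream());
    return tmpPath;
}
```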


@ -27,8 +27,10 @@ import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll';
import { SERPResult } from '../db/searched';
import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { InternalJinaSerpService } from '../services/serp/internal';
import { WebSearchEntry } from '../services/serp/compat';
import { CommonGoogleSERP } from '../services/serp/common-serp';
import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google';
import { InternalJinaSerpService } from '../services/serp/internal';
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
@ -72,6 +74,9 @@ export class SearcherHost extends RPCHost {
protected serperGoogle: SerperGoogleSearchService,
protected serperBing: SerperBingSearchService,
protected jinaSerp: InternalJinaSerpService,
protected commonGoogleSerp: CommonGoogleSERP,
protected googleSERP: GoogleSERP,
protected googleSERPOld: GoogleSERPOldFashion,
) {
super(...arguments);
@ -715,26 +720,32 @@ export class SearcherHost extends RPCHost {
}
}
*iterProviders(preference?: string, variant?: string) {
*iterProviders(preference?: string, _variant?: string) {
if (preference === 'bing') {
yield this.serperBing;
yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.googleSERP;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
return;
}
if (preference === 'google') {
yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.serperGoogle;
yield this.googleSERP;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSERPOld;
return;
}
yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.serperGoogle;
yield this.googleSERP;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSERPOld;
}
async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, noCache?: boolean): Promise<WebSearchEntry[]> {
@ -767,22 +778,27 @@ export class SearcherHost extends RPCHost {
outerLoop:
for (const client of this.iterProviders(provider, variant)) {
const t0 = Date.now();
try {
let func;
switch (variant) {
case 'images': {
r = await Reflect.apply(client.imageSearch, client, [query]);
func = Reflect.get(client, 'imageSearch');
break;
}
case 'news': {
r = await Reflect.apply(client.newsSearch, client, [query]);
func = Reflect.get(client, 'newsSearch');
break;
}
case 'web':
default: {
r = await Reflect.apply(client.webSearch, client, [query]);
func = Reflect.get(client, 'webSearch');
break;
}
}
if (!func) {
continue;
}
try {
r = await Reflect.apply(func, client, [query]);
const dt = Date.now() - t0;
this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
break outerLoop;
@ -806,6 +822,8 @@ export class SearcherHost extends RPCHost {
this.batchedCaches.push(record);
} else if (lastError) {
throw lastError;
} else if (!r) {
throw new AssertionFailureError(`No provider can do ${variant} search atm.`);
}
return r as WebSearchEntry[];
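
The dispatch above replaces direct method calls with a `Reflect.get` lookup, so providers that do not implement a variant are skipped instead of failing the request. A self-contained sketch of the pattern shared by `SearcherHost` here and `SerpHost` below (illustrative names, not the repo's actual types):

```ts
type SearchFn = (query: Record<string, any>) => Promise<unknown[]>;

interface SearchProvider {
    webSearch?: SearchFn;
    newsSearch?: SearchFn;
    imageSearch?: SearchFn;
}

const METHOD_BY_VARIANT = {
    web: 'webSearch',
    news: 'newsSearch',
    images: 'imageSearch',
} as const;

async function firstSuccessful(
    providers: Iterable<SearchProvider>,
    variant: keyof typeof METHOD_BY_VARIANT,
    query: Record<string, any>,
): Promise<unknown[]> {
    let lastError: unknown;
    for (const client of providers) {
        const func = Reflect.get(client, METHOD_BY_VARIANT[variant]) as SearchFn | undefined;
        if (!func) {
            continue; // provider cannot serve this variant; try the next one
        }
        try {
            return await Reflect.apply(func, client, [query]);
        } catch (err) {
            lastError = err; // remember and fall through to the next provider
        }
    }
    throw lastError ?? new Error(`No provider can do ${variant} search atm.`);
}
```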


@ -3,6 +3,7 @@ import {
RPCHost, RPCReflection, assignMeta, RawString,
ParamValidationError,
assignTransferProtocolMeta,
AssertionFailureError,
} from 'civkit/civ-rpc';
import { marshalErrorLike } from 'civkit/lang';
import _ from 'lodash';
@ -16,7 +17,7 @@ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
import { InsufficientBalanceError } from '../services/errors';
import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
import { GoogleSERP } from '../services/serp/google';
import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google';
import { WebSearchEntry } from '../services/serp/compat';
import { CrawlerOptions } from '../dto/crawler-options';
import { ScrappingOptions } from '../services/serp/puppeteer';
@ -26,6 +27,7 @@ import { SerperBingSearchService, SerperGoogleSearchService } from '../services/
import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
import { LRUCache } from 'lru-cache';
import { API_CALL_STATUS } from '../shared/db/api-roll';
import { CommonGoogleSERP } from '../services/serp/common-serp';
import { InternalJinaSerpService } from '../services/serp/internal';
const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
@ -91,8 +93,10 @@ export class SerpHost extends RPCHost {
protected rateLimitControl: RateLimitControl,
protected threadLocal: AsyncLocalContext,
protected googleSerp: GoogleSERP,
protected googleSerpOld: GoogleSERPOldFashion,
protected serperGoogle: SerperGoogleSearchService,
protected serperBing: SerperBingSearchService,
protected commonGoogleSerp: CommonGoogleSERP,
protected jinaSerp: InternalJinaSerpService,
) {
super(...arguments);
@ -157,7 +161,7 @@ export class SerpHost extends RPCHost {
@Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
num?: number,
@Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) _hl?: string,
@Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
@Param('location') location?: string,
@Param('page') page?: number,
@Param('fallback') fallback?: boolean,
@ -318,7 +322,7 @@ export class SerpHost extends RPCHost {
q,
num,
gl,
// hl,
hl,
location,
page,
}, crawlerOptions);
@ -451,27 +455,32 @@ export class SerpHost extends RPCHost {
return result;
}
*iterProviders(preference?: string, variant?: string) {
*iterProviders(preference?: string, _variant?: string) {
if (preference === 'bing') {
yield this.serperBing;
yield this.serperGoogle;
yield this.googleSerp;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSerpOld;
return;
}
if (preference === 'google') {
yield this.googleSerp;
yield this.googleSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSerpOld;
return;
}
// yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
yield this.serperGoogle
yield this.serperGoogle;
yield this.googleSerp;
yield this.jinaSerp;
yield this.serperGoogle;
yield this.commonGoogleSerp;
yield this.googleSerpOld;
}
async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, opts: CrawlerOptions) {
@ -506,22 +515,27 @@ export class SerpHost extends RPCHost {
outerLoop:
for (const client of this.iterProviders(provider, variant)) {
const t0 = Date.now();
try {
let func;
switch (variant) {
case 'images': {
r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]);
func = Reflect.get(client, 'imageSearch');
break;
}
case 'news': {
r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]);
func = Reflect.get(client, 'newsSearch');
break;
}
case 'web':
default: {
r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]);
func = Reflect.get(client, 'webSearch');
break;
}
}
if (!func) {
continue;
}
try {
r = await Reflect.apply(func, client, [query, scrappingOptions]);
const dt = Date.now() - t0;
this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
break outerLoop;
@ -544,6 +558,8 @@ export class SerpHost extends RPCHost {
this.batchedCaches.push(record);
} else if (lastError) {
throw lastError;
} else if (!r) {
throw new AssertionFailureError(`No provider can do ${variant} search atm.`);
}
return r;


@ -16,7 +16,7 @@ import { Readable } from 'stream';
import { AsyncLocalContext } from './async-context';
import { BlackHoleDetector } from './blackhole-detector';
export interface CURLScrappingOptions extends ScrappingOptions {
export interface CURLScrappingOptions<T = any> extends ScrappingOptions<T> {
method?: string;
body?: string | Buffer;
}
@ -75,7 +75,7 @@ export class CurlControl extends AsyncService {
}
const mixinHeaders: Record<string, string> = {
'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
'Sec-Ch-Ua': `"Google Chrome";v="${this.chromeVersion}", "Not-A.Brand";v="8", "Chromium";v="${this.chromeVersion}"`,
'Sec-Ch-Ua-Mobile': '?0',
'Sec-Ch-Ua-Platform': `"${uaPlatform}"`,
'Upgrade-Insecure-Requests': '1',
@ -108,11 +108,11 @@ export class CurlControl extends AsyncService {
return curl;
}
urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
urlToStream(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
return new Promise<{
statusCode: number,
statusText?: string,
data?: FancyFile,
data?: Readable,
headers: HeaderInfo[],
}>((resolve, reject) => {
let contentType = '';
@ -193,7 +193,7 @@ export class CurlControl extends AsyncService {
});
curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
let status = -1;
let statusText: string|undefined;
let statusText: string | undefined;
let contentEncoding = '';
curl.once('end', () => {
if (curlStream) {
@ -302,13 +302,10 @@ export class CurlControl extends AsyncService {
}
}
const fpath = this.tempFileManager.alloc();
const fancyFile = FancyFile.auto(stream, fpath);
this.tempFileManager.bindPathTo(fancyFile, fpath);
resolve({
statusCode: status,
statusText,
data: fancyFile,
data: stream,
headers: headers as HeaderInfo[],
});
});
@ -324,7 +321,19 @@ export class CurlControl extends AsyncService {
let nextHopUrl = urlToCrawl;
const fakeHeaderInfos: HeaderInfo[] = [];
do {
const r = await this.urlToFile1Shot(nextHopUrl, opts);
const s = await this.urlToStream(nextHopUrl, opts);
const r = { ...s } as {
statusCode: number,
statusText?: string,
data?: FancyFile,
headers: HeaderInfo[],
};
if (r.data) {
const fpath = this.tempFileManager.alloc();
const fancyFile = FancyFile.auto(r.data, fpath);
this.tempFileManager.bindPathTo(fancyFile, fpath);
r.data = fancyFile;
}
if ([301, 302, 303, 307, 308].includes(r.statusCode)) {
fakeHeaderInfos.push(...r.headers);
@ -375,7 +384,7 @@ export class CurlControl extends AsyncService {
const curlResult = await this.urlToFile(targetUrl, crawlOpts);
this.blackHoleDetector.itWorked();
let finalURL = targetUrl;
const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
const sideLoadOpts: CURLScrappingOptions<FancyFile>['sideLoad'] = {
impersonate: {},
proxyOrigin: {},
};
@ -421,6 +430,140 @@ export class CurlControl extends AsyncService {
};
}
async urlToBlob(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
let leftRedirection = 6;
let cookieRedirects = 0;
let opts = { ...crawlOpts };
let nextHopUrl = urlToCrawl;
const fakeHeaderInfos: HeaderInfo[] = [];
do {
const s = await this.urlToStream(nextHopUrl, opts);
const r = { ...s } as {
statusCode: number,
statusText?: string,
data?: Blob,
headers: HeaderInfo[],
};
const headers = r.headers[r.headers.length - 1];
if ([301, 302, 303, 307, 308].includes(r.statusCode)) {
fakeHeaderInfos.push(...r.headers);
const location: string | undefined = headers.Location || headers.location;
const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie'];
if (setCookieHeader) {
const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
const parsed = cookieAssignments.filter(Boolean).map((x) => parseSetCookieString(x, { decodeValues: true }));
if (parsed.length) {
opts.cookies = [...(opts.cookies || []), ...parsed];
}
if (!location) {
cookieRedirects += 1;
}
}
if (!location && !setCookieHeader) {
// Follow curl behavior
if (s.data) {
const chunks: Buffer[] = [];
s.data.on('data', (chunk) => {
chunks.push(chunk);
});
await new Promise((resolve, reject) => {
s.data!.once('end', resolve);
s.data!.once('error', reject);
});
r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] });
}
return {
statusCode: r.statusCode,
data: r.data,
headers: fakeHeaderInfos.concat(r.headers),
};
}
if (!location && cookieRedirects > 1) {
throw new ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
}
nextHopUrl = new URL(location || '', nextHopUrl);
leftRedirection -= 1;
continue;
}
if (s.data) {
const chunks: Buffer[] = [];
s.data.on('data', (chunk) => {
chunks.push(chunk);
});
await new Promise((resolve, reject) => {
s.data!.once('end', resolve);
s.data!.once('error', reject);
});
r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] });
}
return {
statusCode: r.statusCode,
statusText: r.statusText,
data: r.data,
headers: fakeHeaderInfos.concat(r.headers),
};
} while (leftRedirection > 0);
throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`);
}
async sideLoadBlob(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
const curlResult = await this.urlToBlob(targetUrl, crawlOpts);
this.blackHoleDetector.itWorked();
let finalURL = targetUrl;
const sideLoadOpts: CURLScrappingOptions<Blob>['sideLoad'] = {
impersonate: {},
proxyOrigin: {},
};
for (const headers of curlResult.headers) {
sideLoadOpts.impersonate[finalURL.href] = {
status: headers.result?.code || -1,
headers: _.omit(headers, 'result'),
contentType: headers['Content-Type'] || headers['content-type'],
};
if (crawlOpts?.proxyUrl) {
sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
}
if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
const location = headers.Location || headers.location;
if (location) {
finalURL = new URL(location, finalURL);
}
}
}
const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (curlResult.data?.type) || 'application/octet-stream';
const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();
if (sideLoadOpts.impersonate[finalURL.href] && (curlResult.data?.size)) {
sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
}
// This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);
return {
finalURL,
sideLoadOpts,
chain: curlResult.headers,
status: curlResult.statusCode,
statusText: curlResult.statusText,
headers: lastHeaders,
contentType,
contentDisposition,
fileName,
file: curlResult.data
};
}
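
A hedged usage sketch of the new `sideLoadBlob` (the wrapper function is illustrative; the field names come from the return value above):

```ts
// Fetch a resource once over curl-impersonate, keep the body in memory as a
// Blob, and forward the recorded response chain so a downstream browser can
// replay it instead of re-downloading.
async function fetchAsBlob(curl: CurlControl, target: string) {
    const r = await curl.sideLoadBlob(new URL(target));
    if (r.status !== 200 || !r.file) {
        throw new Error(`Fetch failed: ${r.status} ${r.statusText ?? ''}`);
    }
    return {
        blob: r.file,               // in-memory Blob
        contentType: r.contentType, // from the last response headers
        fileName: r.fileName,       // from Content-Disposition or the URL path
        sideLoad: r.sideLoadOpts,   // usable as ScrappingOptions['sideLoad']
    };
}
```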
digestCurlCode(code: CurlCode, msg: string) {
switch (code) {
// 400 User errors


@ -79,7 +79,7 @@ export interface ExtendedSnapshot extends PageSnapshot {
imgs: ImgBrief[];
}
export interface ScrappingOptions {
export interface ScrappingOptions<T = FancyFile | Blob> {
proxyUrl?: string;
cookies?: Cookie[];
favorScreenshot?: boolean;
@ -101,7 +101,7 @@ export interface ScrappingOptions {
status: number;
headers: { [k: string]: string | string[]; };
contentType?: string;
body?: FancyFile;
body?: T;
};
};
proxyOrigin: { [origin: string]: string; };
@ -912,7 +912,11 @@ export class PuppeteerControl extends AsyncService {
if (impersonate) {
let body;
if (impersonate.body) {
if (impersonate.body instanceof Blob) {
body = new Uint8Array(await impersonate.body.arrayBuffer());
} else {
body = await readFile(await impersonate.body.filePath);
}
if (req.isInterceptResolutionHandled()) {
return;
}


@ -0,0 +1,133 @@
import { singleton } from 'tsyringe';
import { AsyncService } from 'civkit/async-service';
import { GlobalLogger } from '../logger';
import { JSDomControl } from '../jsdom';
import _ from 'lodash';
import { WebSearchEntry } from './compat';
import { ServiceBadAttemptError } from '../errors';
import commonSerpClients, { CommonSerpImageResponse, CommonSerpNewsResponse, CommonSerpWebResponse } from '../../shared/3rd-party/common-serp';
import { AsyncLocalContext } from '../async-context';
@singleton()
export class CommonGoogleSERP extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name });
googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';
protected ctxIteratorMap = new WeakMap<object, ReturnType<CommonGoogleSERP['iterClients']>>();
constructor(
protected globalLogger: GlobalLogger,
protected jsDomControl: JSDomControl,
protected asyncContext: AsyncLocalContext,
) {
super(...arguments);
}
override async init() {
await this.dependencyReady();
this.emit('ready');
}
*iterClients() {
if (!commonSerpClients.length) {
return;
}
while (true) {
yield* commonSerpClients;
}
}
getClient() {
const ctx = this.asyncContext.ctx;
const it = this.ctxIteratorMap.get(ctx) || this.iterClients();
this.ctxIteratorMap.set(ctx, it);
const client = it.next().value;
if (!client) {
throw new ServiceBadAttemptError('No client available');
}
return client;
}
digestQuery(query: { [k: string]: any; }) {
const url = new URL(`https://${this.googleDomain}/search`);
const clone = { ...query };
const num = clone.num || 10;
if (clone.page) {
const page = parseInt(clone.page);
delete clone.page;
clone.start = (page - 1) * num;
if (clone.start === 0) {
delete clone.start;
}
}
if (clone.location) {
delete clone.location;
}
for (const [k, v] of Object.entries(clone)) {
if (v === undefined || v === null) {
continue;
}
url.searchParams.set(k, `${v}`);
}
return url;
}
async webSearch(query: { [k: string]: any; }) {
const url = this.digestQuery(query);
const client = this.getClient();
const r = await client.queryJSON(url.href) as CommonSerpWebResponse;
return r.organic.map((x)=> ({
link: x.link,
title: x.title,
snippet: x.description,
variant: 'web',
})) as WebSearchEntry[];
}
async newsSearch(query: { [k: string]: any; }) {
const url = this.digestQuery(query);
url.searchParams.set('tbm', 'nws');
const client = this.getClient();
const r = await client.queryJSON(url.href) as CommonSerpNewsResponse;
return r.news.map((x)=> ({
link: x.link,
title: x.title,
snippet: x.description,
source: x.source,
date: x.date,
imageUrl: x.image,
variant: 'news',
})) as WebSearchEntry[];
}
async imageSearch(query: { [k: string]: any; }) {
const url = this.digestQuery(query);
url.searchParams.set('tbm', 'isch');
const client = this.getClient();
const r = await client.queryJSON(url.href) as CommonSerpImageResponse;
return r.images.map((x)=> ({
link: x.link,
title: x.title,
snippet: x.image_alt,
source: x.source,
imageUrl: x.image,
variant: 'images',
})) as WebSearchEntry[];
}
}
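
A worked example of `digestQuery`'s pagination mapping (the query values are made up): `page: 3` with `num: 10` becomes `start = (3 - 1) * 10 = 20`, `page: 1` yields no `start` parameter at all, and `location` is always dropped.

```ts
// Assuming an instance resolved e.g. via tsyringe's container:
const serp = container.resolve(CommonGoogleSERP);
const url = serp.digestQuery({ q: 'jina reader', page: 3, num: 10 });
console.log(url.href);
// → https://www.google.com/search?q=jina+reader&num=10&start=20
```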


@ -7,24 +7,74 @@ import _ from 'lodash';
import { WebSearchEntry } from './compat';
import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer';
import { CurlControl } from '../curl';
import { readFile } from 'fs/promises';
import { ApplicationError } from 'civkit/civ-rpc';
import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
import { parseJSONText } from 'civkit/vectorize';
import { retryWith } from 'civkit/decorators';
import { ProxyProviderService } from '../../shared/services/proxy-provider';
import { retry, retryWith } from 'civkit/decorators';
import { SERPProxyProviderService } from '../../shared/services/proxy-provider';
import { readBlob } from '../../utils/encoding';
import { createContext, Script } from 'vm';
import { BrowserContext } from 'puppeteer';
import { createPool } from 'generic-pool';
import { randomBytes } from 'crypto';
import { AsyncLocalContext } from '../async-context';
interface SerpContext {
proxyUrl?: string;
browserContext?: BrowserContext;
validTill?: Date;
magicId?: string;
}
@singleton()
export class GoogleSERP extends AsyncService {
logger = this.globalLogger.child({ service: this.constructor.name });
googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';
nativeIPHealthy = true;
contextPool = createPool({
create: () => {
return this.createContext();
},
destroy: async (ctx: SerpContext) => {
if (ctx.browserContext) {
try {
await ctx.browserContext.close();
} catch (err) {
this.logger.warn('Error closing browser context', { err });
}
}
},
validate: async (ctx: SerpContext) => {
if (ctx.validTill && ctx.validTill > (new Date(Date.now() + 5 * 60 * 1000))) {
return true;
}
return !ctx.proxyUrl;
},
}, {
max: 3_000,
testOnBorrow: true,
});
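// With testOnBorrow enabled, acquire() runs validate() first: a context stays
// valid while its validTill is more than 5 minutes away; past that, only
// proxyless contexts pass, and proxy-backed ones are destroyed and recreated.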
protected async createContext() {
await this.serviceReady();
this.asyncLocalContext.ctx.ctxIsNew = true;
return {
magicId: randomBytes(17).toString('base64url'),
validTill: new Date(Date.now() + 30 * 60 * 1000),
} as SerpContext;
}
constructor(
protected globalLogger: GlobalLogger,
protected puppeteerControl: SERPSpecializedPuppeteerControl,
protected jsDomControl: JSDomControl,
protected curlControl: CurlControl,
protected proxyProvider: ProxyProviderService,
protected proxyProvider: SERPProxyProviderService,
protected asyncLocalContext: AsyncLocalContext,
) {
const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
super(...filteredDeps);
@ -36,6 +86,15 @@ export class GoogleSERP extends AsyncService {
this.emit('ready');
}
nativeIPBlocked() {
this.nativeIPHealthy = false;
this.logger.warn('Native IP is not healthy.');
setTimeout(() => {
this.nativeIPHealthy = true;
this.logger.debug('Presume native IP healthy again after timeout.');
}, 1000 * 60 * 60);
}
@retryWith((err) => {
if (err instanceof ServiceBadApproachError) {
return false;
@ -51,15 +110,13 @@ export class GoogleSERP extends AsyncService {
return undefined;
}, 3)
async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) {
if (opts?.allocProxy === 'none') {
return this.curlControl.sideLoad(url, opts);
if (opts?.allocProxy === 'none' || opts?.proxyUrl) {
return this.curlControl.sideLoadBlob(url, opts);
}
const proxy = await this.proxyProvider.alloc(
process.env.PREFERRED_PROXY_COUNTRY || 'auto'
);
const proxy = await this.proxyProvider.alloc();
this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
const r = await this.curlControl.sideLoad(url, {
const r = await this.curlControl.sideLoadBlob(url, {
...opts,
proxyUrl: proxy.href,
});
@ -101,48 +158,126 @@ export class GoogleSERP extends AsyncService {
return url;
}
@retry(2)
async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
const url = this.digestQuery(query);
const origHref = url.href;
if (!url.searchParams.has('start')) {
url.searchParams.set('start', '0');
}
url.searchParams.set('asearch', 'arc');
const ctx = await this.contextPool.acquire();
url.searchParams.set('async', getMagicAsyncParam(query.start, ctx.magicId));
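// The new trick: hit Google's progressive-render endpoint (asearch=arc plus
// _fmt:prog in the async param) over plain curl, so the SERP comes back as an
// HTML fragment that linkedom parses below, with no full browser render.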
const t0 = performance.now();
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
...opts,
allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'),
proxyUrl: ctx.proxyUrl,
timeoutMs: 3_700
}).catch((err) => {
this.contextPool.destroy(ctx);
return Promise.reject(err);
});
const dt = performance.now() - t0;
if ('proxy' in sideLoaded) {
ctx.proxyUrl = sideLoaded.proxy.href;
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
}
if (sideLoaded.status === 200) {
if (dt < 1_700) {
this.contextPool.release(ctx);
} else {
this.contextPool.destroy(ctx);
}
} else {
if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) {
this.nativeIPBlocked();
}
this.contextPool.destroy(ctx);
throw new ServiceBadAttemptError({
message: 'Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.',
});
}
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
if (opts && sideLoaded.sideLoadOpts) {
opts.sideLoad = sideLoaded.sideLoadOpts;
}
const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);
if (!sideLoaded.file) {
throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
}
const contentType = sideLoaded.contentType;
const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
return snapshot;
let html = await readBlob(sideLoaded.file, encoding);
let innerCharset;
const peek = html.slice(0, 1024);
innerCharset ??= peek.match(/<meta[^>]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase();
innerCharset ??= peek.match(/<meta[^>]+charset="([^>"]+)\"/i)?.[1]?.toLowerCase();
if (innerCharset && innerCharset !== encoding) {
html = await readBlob(sideLoaded.file, innerCharset);
}
async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
const url = this.digestQuery(query);
url.searchParams.set('tbm', 'nws');
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
if (opts && sideLoaded.sideLoadOpts) {
opts.sideLoad = sideLoaded.sideLoadOpts;
const jsdom = this.jsDomControl.linkedom.parseHTML(html, { location: { href: origHref } });
try {
const r = runGetWebSearchResultsScript(createContext(jsdom));
if (!Array.isArray(r)) {
throw new Error('Failed to parse response as SERP results');
}
const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts);
return snapshot;
return r;
} catch (err) {
throw new ServiceBadAttemptError({
message: 'Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.',
err
});
}
}
@retry(2)
async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
const url = this.digestQuery(query);
url.searchParams.set('tbm', 'isch');
url.searchParams.set('asearch', 'isch');
url.searchParams.set('async', `_fmt:json,p:1,ijn:${query.start ? Math.floor(query.start / (query.num || 10)) : 0}`);
const ctx = await this.contextPool.acquire();
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
...opts,
proxyUrl: ctx.proxyUrl,
allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'),
}).catch((err) => {
this.contextPool.destroy(ctx);
return Promise.reject(err);
});
if ('proxy' in sideLoaded) {
ctx.proxyUrl = sideLoaded.proxy.href;
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
}
if (sideLoaded.status === 200) {
this.contextPool.release(ctx);
} else {
this.contextPool.destroy(ctx);
if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) {
this.nativeIPBlocked();
}
}
if (sideLoaded.status !== 200 || !sideLoaded.file) {
throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
}
const jsonTxt = (await readFile((await sideLoaded.file.filePath))).toString();
const jsonTxt = (await readBlob(sideLoaded.file)).toString();
const rJSON = parseJSONText(jsonTxt.slice(jsonTxt.indexOf('{"ischj":')));
return _.get(rJSON, 'ischj.metadata').map((x: any) => {
@ -161,6 +296,95 @@ export class GoogleSERP extends AsyncService {
}
}
@singleton()
export class GoogleSERPOldFashion extends GoogleSERP {
override async createContext() {
await this.serviceReady();
this.asyncLocalContext.ctx.ctxIsNew = true;
return {
browserContext: await this.puppeteerControl.browser.createBrowserContext(),
magicId: randomBytes(17).toString('base64url'),
validTill: new Date(Date.now() + 30 * 60 * 1000),
} as SerpContext;
}
override async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
const url = this.digestQuery(query);
const ctx = await this.contextPool.acquire();
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
...opts,
proxyUrl: ctx.proxyUrl,
}).catch((err) => {
this.contextPool.destroy(ctx);
return Promise.reject(err);
});
if ('proxy' in sideLoaded) {
ctx.proxyUrl = sideLoaded.proxy.href;
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
}
if (sideLoaded.status === 200) {
this.contextPool.release(ctx);
} else {
this.contextPool.destroy(ctx);
}
if (opts && sideLoaded.sideLoadOpts) {
opts.sideLoad = sideLoaded.sideLoadOpts;
}
const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);
if (!Array.isArray(snapshot)) {
throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
}
return snapshot;
}
async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
const url = this.digestQuery(query);
url.searchParams.set('tbm', 'nws');
const ctx = await this.contextPool.acquire();
const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
...opts,
proxyUrl: ctx.proxyUrl,
}).catch((err) => {
this.contextPool.destroy(ctx);
return Promise.reject(err);
});
if ('proxy' in sideLoaded) {
ctx.proxyUrl = sideLoaded.proxy.href;
ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
}
const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, {
...opts,
proxyUrl: ctx.proxyUrl,
browserContext: ctx.browserContext,
}).catch((err) => {
this.contextPool.destroy(ctx);
return Promise.reject(err);
});
this.contextPool.release(ctx);
return snapshot;
}
}
async function getWebSearchResults() {
if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
@ -254,6 +478,96 @@ async function getWebSearchResults() {
};
}).filter(Boolean) as WebSearchEntry[];
}
function getWebSearchResultsSync() {
const wrapper1 = document.querySelector('div[data-async-context^="query"]');
if (!wrapper1) {
return undefined;
}
const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || '');
if (!query) {
return undefined;
}
const candidates = Array.from(wrapper1.querySelectorAll('div[lang],div[data-surl]'));
return candidates.map((x, pos) => {
const primaryLink = x.querySelector('a:not([href="#"])');
if (!primaryLink) {
return undefined;
}
const url = primaryLink.getAttribute('href');
if (primaryLink.querySelector('div[role="heading"]')) {
// const spans = primaryLink.querySelectorAll('span');
// const title = spans[0]?.textContent;
// const source = spans[1]?.textContent;
// const date = spans[spans.length - 1].textContent;
// return {
// link: url,
// title,
// source,
// date,
// variant: 'video'
// };
return undefined;
}
const title = primaryLink.querySelector('h3')?.textContent;
const source = Array.from(primaryLink.querySelectorAll('span')).find((x) => x.textContent)?.textContent;
const cite = primaryLink.querySelector('cite[role=text]')?.textContent;
let date = cite?.split('·')[1]?.trim();
const snippets = Array.from(x.querySelectorAll('div[data-sncf*="1"] span'));
let snippet = snippets[snippets.length - 1]?.textContent;
if (!snippet) {
snippet = x.querySelector('div.IsZvec')?.textContent?.trim() || null;
}
date ??= snippets[snippets.length - 2]?.textContent?.trim();
const imageUrl = x.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');
let siteLinks = Array.from(x.querySelectorAll('div[data-sncf*="3"] a[href]')).map((l) => {
return {
link: l.getAttribute('href'),
title: l.textContent,
};
});
const perhapsParent = x.parentElement?.closest('div[data-hveid]');
if (!siteLinks?.length && perhapsParent) {
const candidates = Array.from(perhapsParent.querySelectorAll('td h3'));
if (candidates.length) {
siteLinks = candidates.map((l) => {
const link = l.querySelector('a');
if (!link) {
return undefined;
}
const snippet = l.nextElementSibling?.textContent;
return {
link: link.getAttribute('href'),
title: link.textContent,
snippet,
};
}).filter(Boolean) as any;
}
}
return {
link: url,
title,
source,
date,
snippet: snippet ?? undefined,
imageUrl: imageUrl?.startsWith('data:') ? undefined : imageUrl,
siteLinks: siteLinks.length ? siteLinks : undefined,
variant: 'web',
};
}).filter(Boolean) as WebSearchEntry[];
}
const script = new Script(`(${getWebSearchResultsSync.toString()})()`);
function runGetWebSearchResultsScript(ctx: object) {
return script.runInContext(ctx);
}
async function getNewsSearchResults() {
if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
@ -306,3 +620,9 @@ async function getNewsSearchResults() {
};
}).filter(Boolean) as WebSearchEntry[];
}
function getMagicAsyncParam(start: number = 0, inputArcid?: string) {
const arcid = inputArcid || randomBytes(17).toString('base64url');
return `arc_id:srp_${arcid}_1${start.toString().padStart(2, '0')},use_ac:true,_fmt:prog`;
}
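
For reference, the `async` parameter this helper produces looks like the following (fixed `inputArcid` for illustration):

```ts
getMagicAsyncParam(0, 'abc');  // 'arc_id:srp_abc_100,use_ac:true,_fmt:prog'
getMagicAsyncParam(10, 'abc'); // 'arc_id:srp_abc_110,use_ac:true,_fmt:prog'
// Without inputArcid, a random 17-byte base64url id is generated per call.
```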


@ -67,11 +67,5 @@ export class InternalJinaSerpService extends AsyncService {
async webSearch(query: SerperSearchQueryParams) {
return this.doSearch('web', query);
}
async imageSearch(query: SerperSearchQueryParams) {
return this.doSearch('images', query);
}
async newsSearch(query: SerperSearchQueryParams) {
return this.doSearch('news', query);
}
}


@ -2,7 +2,7 @@ import _ from 'lodash';
import { readFile } from 'fs/promises';
import { container, singleton } from 'tsyringe';
import type { Browser, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
import type { Browser, BrowserContext, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
import type { Cookie } from 'set-cookie-parser';
import puppeteer, { TimeoutError } from 'puppeteer';
@ -21,6 +21,7 @@ import { BlackHoleDetector } from '../blackhole-detector';
export interface ScrappingOptions {
browserContext?: BrowserContext;
proxyUrl?: string;
cookies?: Cookie[];
overrideUserAgent?: string;
@ -38,7 +39,7 @@ export interface ScrappingOptions {
status: number;
headers: { [k: string]: string | string[]; };
contentType?: string;
body?: FancyFile;
body?: FancyFile | Blob;
};
};
proxyOrigin: { [origin: string]: string; };
@ -226,8 +227,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
browser!: Browser;
logger = this.globalLogger.child({ service: this.constructor.name });
__loadedPage: Page[] = [];
finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
snMap = new WeakMap<Page, number>();
livePages = new Set<Page>();
@ -251,7 +250,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
let crippledTimes = 0;
this.on('crippled', () => {
crippledTimes += 1;
this.__loadedPage.length = 0;
this.livePages.clear();
if (crippledTimes > 5) {
process.nextTick(() => {
@ -303,20 +301,16 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
this.curlControl.impersonateChrome(this.effectiveUA);
await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
this.emit('ready');
}
async newPage<T>(bewareDeadLock: any = false) {
if (!bewareDeadLock) {
async newPage<T>(context?: BrowserContext) {
await this.serviceReady();
}
const sn = this._sn++;
let page;
context ??= await this.browser.createBrowserContext();
try {
const dedicatedContext = await this.browser.createBrowserContext();
page = await dedicatedContext.newPage();
page = await context.newPage();
} catch (err: any) {
this.logger.warn(`Failed to create page ${sn}`, { err });
this.browser.process()?.kill('SIGKILL');
@ -347,23 +341,9 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
return page;
}
async getNextPage() {
let thePage: Page | undefined;
if (this.__loadedPage.length) {
thePage = this.__loadedPage.shift();
if (this.__loadedPage.length <= 1) {
process.nextTick(() => {
this.newPage()
.then((r) => this.__loadedPage.push(r))
.catch((err) => {
this.logger.warn(`Failed to load new page ahead of time`, { err });
});
});
}
}
async getNextPage(context?: BrowserContext) {
const thePage = await this.newPage(context);
if (!thePage) {
thePage = await this.newPage();
}
const timer = setTimeout(() => {
@ -387,14 +367,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
const sn = this.snMap.get(page);
this.logger.debug(`Closing page ${sn}`);
await Promise.race([
(async () => {
const ctx = page.browserContext();
try {
await page.close();
} finally {
await ctx.close();
}
})(),
page.close(),
delay(5000)
]).catch((err) => {
this.logger.error(`Failed to destroy page ${sn}`, { err });
@ -405,7 +378,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
async controlledScrap<T>(parsedUrl: URL, func: (this: void) => Promise<T>, options: ScrappingOptions = {}): Promise<T> {
// parsedUrl.search = '';
const url = parsedUrl.toString();
const page = await this.getNextPage();
const page = await this.getNextPage(options.browserContext);
this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
page.on('response', (_resp) => {
this.blackHoleDetector.itWorked();
@ -452,7 +425,11 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
if (impersonate) {
let body;
if (impersonate.body) {
if (impersonate.body instanceof Blob) {
body = new Uint8Array(await impersonate.body.arrayBuffer());
} else {
body = await readFile(await impersonate.body.filePath);
}
if (req.isInterceptResolutionHandled()) {
return;
}


@ -1,13 +1,12 @@
import { createReadStream } from 'fs';
import { Readable } from 'stream';
import { TextDecoderStream } from 'stream/web';
export async function decodeFileStream(
fileStream: Readable,
encoding: string = 'utf-8',
): Promise<string> {
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
Readable.toWeb(fileStream).pipeThrough(decodeStream);
(Readable.toWeb(fileStream) as ReadableStream).pipeThrough(decodeStream);
const chunks = [];
for await (const chunk of decodeStream.readable) {
@ -23,7 +22,22 @@ export async function readFile(
encoding: string = 'utf-8',
): Promise<string> {
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream);
(Readable.toWeb(createReadStream(filePath)) as ReadableStream).pipeThrough(decodeStream);
const chunks = [];
for await (const chunk of decodeStream.readable) {
chunks.push(chunk);
}
return chunks.join('');
}
export async function readBlob(
blob: Blob,
encoding: string = 'utf-8',
): Promise<string> {
const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
blob.stream().pipeThrough(decodeStream);
const chunks = [];
for await (const chunk of decodeStream.readable) {
chunks.push(chunk);
}
return chunks.join('');
}
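
A quick usage sketch (mirrors how `google.ts` above decodes side-loaded HTML; the charset value is illustrative):

```ts
declare const blob: Blob; // e.g. the `file` returned by CurlControl.sideLoadBlob

// Decode using the charset advertised in the Content-Type header,
// falling back to utf-8 when omitted.
const html = await readBlob(blob, 'gb2312');
```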

@ -1 +1 @@
Subproject commit 02279d88bc3940a08a92cb18cf8877d57cb49b82
Subproject commit 8b78eab54d78868d44065bfc59c413fe6bd4929d