mirror of https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-19 01:25:51 +08:00

serp: new trick

This commit is contained in:
parent 5f07900eab
commit a178723579
.vscode/launch.json (vendored, 5 changes)

@@ -90,6 +90,7 @@
             ],
             "env": {
                 "GCLOUD_PROJECT": "reader-6b7dc",
+                "PREFERRED_PROXY_COUNTRY": "us",
                 "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
             },
             "cwd": "${workspaceFolder}",
@@ -110,8 +111,8 @@
             ],
             "env": {
                 "GCLOUD_PROJECT": "reader-6b7dc",
-                "PREFERRED_PROXY_COUNTRY": "hk",
-                "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
+                "PREFERRED_PROXY_COUNTRY": "us",
+                // "OVERRIDE_GOOGLE_DOMAIN": "www.google.com.hk",
                 "LD_PRELOAD": "/usr/local/lib/libcurl-impersonate-chrome.dylib"
             },
             "cwd": "${workspaceFolder}",
README.md (14 changes)

@@ -18,10 +18,13 @@ Or just visit these URLs (**Read**) https://r.jina.ai/https://github.com/jina-ai
 
 ## Updates
 
+- **2024-10-08**: Introduced an `adaptive crawler`. It can recursively crawl the website and extract the most relevant pages for a given webpage.
 - **2024-07-15**: To restrict the results of `s.jina.ai` to certain domain/website, you can set e.g. `site=jina.ai` in the query parameters, which enables in-site search. For more options, [try our updated live-demo](https://jina.ai/reader/#apiform).
+- **2024-07-01**: We have resolved a DDoS attack and other traffic abusing since June 27th. We also found a bug introduced on June 28th which may cause higher latency for some websites. The attack and the bug have been solved; if you have experienced high latency of r.jina.ai between June 27th-30th, it should back to normal now.
 - **2024-05-30**: Reader can now read abitrary PDF from any URL! Check out [this PDF result from NASA.gov](https://r.jina.ai/https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf) vs [the original](https://www.nasa.gov/wp-content/uploads/2023/01/55583main_vision_space_exploration2.pdf).
 - **2024-05-15**: We introduced a new endpoint `s.jina.ai` that searches on the web and return top-5 results, each in a LLM-friendly format. [Read more about this new feature here](https://jina.ai/news/jina-reader-for-search-grounding-to-improve-factuality-of-llms).
 - **2024-05-08**: Image caption is off by default for better latency. To turn it on, set `x-with-generated-alt: true` in the request header.
+- **2024-05-03**: We finally resolved a DDoS attack since April 29th. Now our API is much more reliable and scalable than ever!
 - **2024-04-24**: You now have more fine-grained control over Reader API [using headers](#using-request-headers), e.g. forwarding cookies, using HTTP proxy.
 - **2024-04-15**: Reader now supports image reading! It captions all images at the specified URL and adds `Image [idx]: [caption]` as an alt tag (if they initially lack one). This enables downstream LLMs to interact with the images in reasoning, summarizing etc. [See example here](https://x.com/JinaAI_/status/1780094402071023926).
 
@@ -151,8 +154,15 @@ All images in that page that lack `alt` tag can be auto-captioned by a VLM (visi
 curl -H "X-With-Generated-Alt: true" https://r.jina.ai/https://en.m.wikipedia.org/wiki/Main_Page
 ```
 
-## How it works
-[](https://deepwiki.com/jina-ai/reader)
+## Install
+
+You will need the following tools to run the project:
+- Node v18 (The build fails for Node version >18)
+
+```bash
+git clone git@github.com:jina-ai/reader.git
+npm install
+```
 
 ## What is `thinapps-shared` submodule?
 
package-lock.json (generated, 3 changes)

@@ -23,6 +23,7 @@
                 "express": "^4.19.2",
                 "firebase-admin": "^12.1.0",
                 "firebase-functions": "^6.1.1",
+                "generic-pool": "^3.9.0",
                 "htmlparser2": "^9.0.0",
                 "jose": "^5.1.0",
                 "koa": "^2.16.0",
@@ -6249,7 +6250,7 @@
             "version": "3.9.0",
             "resolved": "https://registry.npmjs.org/generic-pool/-/generic-pool-3.9.0.tgz",
             "integrity": "sha512-hymDOu5B53XvN4QT9dBmZxPX4CWhBPPLguTZ9MMFeFa/Kg0xWVfylOVNlJji/E7yTZWFd/q9GO5TxDLq156D7g==",
-            "optional": true,
+            "license": "MIT",
             "engines": {
                 "node": ">= 4"
             }
@@ -32,6 +32,7 @@
         "express": "^4.19.2",
         "firebase-admin": "^12.1.0",
         "firebase-functions": "^6.1.1",
+        "generic-pool": "^3.9.0",
         "htmlparser2": "^9.0.0",
         "jose": "^5.1.0",
         "koa": "^2.16.0",
@@ -49,6 +49,7 @@ import { TempFileManager } from '../services/temp-file';
 import { MiscService } from '../services/misc';
 import { HTTPServiceError } from 'civkit/http';
 import { GeoIPService } from '../services/geoip';
+import { writeFile } from 'fs/promises';
 
 export interface ExtraScrappingOptions extends ScrappingOptions {
     withIframe?: boolean | 'quoted';
@@ -1145,7 +1146,16 @@
         if (pdfUrl.startsWith('http')) {
             const sideLoaded = scrappingOptions?.sideLoad?.impersonate[pdfUrl];
             if (sideLoaded?.status === 200 && sideLoaded.body) {
-                snapshotCopy.pdfs[0] = pathToFileURL(await sideLoaded?.body.filePath).href;
+                let filePath = '';
+                if (sideLoaded.body instanceof Blob) {
+                    const tmpPath = this.tempFileManager.alloc();
+                    await writeFile(tmpPath, sideLoaded.body.stream());
+                    this.tempFileManager.bindPathTo(this.threadLocal.ctx, tmpPath);
+                    filePath = tmpPath;
+                } else {
+                    filePath = await sideLoaded.body.filePath;
+                }
+                snapshotCopy.pdfs[0] = pathToFileURL(filePath).href;
                 return this.snapshotFormatter.formatSnapshot(mode, snapshotCopy, nominalUrl, urlValidMs);
             }
 
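The PDF hunk above now has to accept two shapes of side-loaded body: the legacy file-backed FancyFile and the new in-memory Blob produced by the curl side-load path. A minimal sketch of the same normalization, where `FancyFileLike` and `allocTempPath` are hypothetical stand-ins for the project's own FancyFile and TempFileManager, assuming recent Node where fs/promises can consume an async-iterable web stream:

```ts
import { writeFile } from 'fs/promises';
import { pathToFileURL } from 'url';

// Hypothetical stand-in for the project's FancyFile, for illustration only.
interface FancyFileLike { filePath: Promise<string> | string; }

// Normalize either body shape to a local file path, then to a file:// URL.
async function bodyToFileURL(body: Blob | FancyFileLike, allocTempPath: () => string): Promise<string> {
    let filePath: string;
    if (body instanceof Blob) {
        // A Blob has no backing file yet: spill it to a freshly allocated temp path.
        filePath = allocTempPath();
        // Assumption: web ReadableStream is async-iterable (Node >= 18), so writeFile accepts it.
        await writeFile(filePath, body.stream() as unknown as AsyncIterable<Uint8Array>);
    } else {
        // FancyFile already owns a (possibly still-pending) path on disk.
        filePath = await body.filePath;
    }
    return pathToFileURL(filePath).href;
}
```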
@@ -27,8 +27,10 @@ import { LRUCache } from 'lru-cache';
 import { API_CALL_STATUS } from '../shared/db/api-roll';
 import { SERPResult } from '../db/searched';
 import { SerperSearchQueryParams, WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
-import { InternalJinaSerpService } from '../services/serp/internal';
 import { WebSearchEntry } from '../services/serp/compat';
+import { CommonGoogleSERP } from '../services/serp/common-serp';
+import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google';
+import { InternalJinaSerpService } from '../services/serp/internal';
 
 const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
 
@@ -72,6 +74,9 @@ export class SearcherHost extends RPCHost {
         protected serperGoogle: SerperGoogleSearchService,
         protected serperBing: SerperBingSearchService,
         protected jinaSerp: InternalJinaSerpService,
+        protected commonGoogleSerp: CommonGoogleSERP,
+        protected googleSERP: GoogleSERP,
+        protected googleSERPOld: GoogleSERPOldFashion,
     ) {
         super(...arguments);
 
@@ -715,26 +720,32 @@ export class SearcherHost extends RPCHost {
             }
         }
     }
 
-    *iterProviders(preference?: string, variant?: string) {
+    *iterProviders(preference?: string, _variant?: string) {
         if (preference === 'bing') {
             yield this.serperBing;
-            yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
+            yield this.googleSERP;
+            yield this.jinaSerp;
             yield this.serperGoogle;
+            yield this.commonGoogleSerp;
 
             return;
         }
 
         if (preference === 'google') {
-            yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
-            yield this.serperGoogle;
+            yield this.googleSERP;
+            yield this.jinaSerp;
             yield this.serperGoogle;
+            yield this.commonGoogleSerp;
+            yield this.googleSERPOld;
 
             return;
         }
 
-        yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
-        yield this.serperGoogle;
+        yield this.googleSERP;
+        yield this.jinaSerp;
         yield this.serperGoogle;
+        yield this.commonGoogleSerp;
+        yield this.googleSERPOld;
     }
 
     async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, noCache?: boolean): Promise<WebSearchEntry[]> {
@@ -767,22 +778,27 @@ export class SearcherHost extends RPCHost {
         outerLoop:
         for (const client of this.iterProviders(provider, variant)) {
             const t0 = Date.now();
-            try {
+            let func;
             switch (variant) {
                 case 'images': {
-                    r = await Reflect.apply(client.imageSearch, client, [query]);
+                    func = Reflect.get(client, 'imageSearch');
                     break;
-                }
-                case 'news': {
-                    r = await Reflect.apply(client.newsSearch, client, [query]);
-                    break;
-                }
-                case 'web':
-                default: {
-                    r = await Reflect.apply(client.webSearch, client, [query]);
-                    break;
-                }
                 }
+                case 'news': {
+                    func = Reflect.get(client, 'newsSearch');
+                    break;
+                }
+                case 'web':
+                default: {
+                    func = Reflect.get(client, 'webSearch');
+                    break;
+                }
+            }
+            if (!func) {
+                continue;
+            }
+            try {
+                r = await Reflect.apply(func, client, [query]);
             const dt = Date.now() - t0;
             this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
             break outerLoop;
@@ -806,6 +822,8 @@ export class SearcherHost extends RPCHost {
             this.batchedCaches.push(record);
         } else if (lastError) {
             throw lastError;
+        } else if (!r) {
+            throw new AssertionFailureError(`No provider can do ${variant} search atm.`);
         }
 
         return r as WebSearchEntry[];
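The cachedSearch change above swaps direct method calls for a Reflect.get lookup: a provider that simply lacks, say, newsSearch no longer throws a TypeError — the loop skips it and falls through to the next provider. A minimal sketch of the pattern in isolation (the provider interface is an illustrative stand-in):

```ts
type Variant = 'web' | 'news' | 'images';

// Illustrative provider shape: any subset of the three methods may exist.
interface SearchProvider {
    webSearch?(query: Record<string, any>): Promise<unknown[]>;
    newsSearch?(query: Record<string, any>): Promise<unknown[]>;
    imageSearch?(query: Record<string, any>): Promise<unknown[]>;
}

async function searchWithFallback(providers: Iterable<SearchProvider>, variant: Variant, query: Record<string, any>) {
    const methodName = variant === 'images' ? 'imageSearch' : variant === 'news' ? 'newsSearch' : 'webSearch';
    let lastError: unknown;
    for (const client of providers) {
        // Reflect.get tolerates providers that do not implement this variant.
        const func = Reflect.get(client, methodName);
        if (typeof func !== 'function') {
            continue; // move on to the next provider instead of crashing
        }
        try {
            return await Reflect.apply(func, client, [query]);
        } catch (err) {
            lastError = err; // remember the failure, try the next provider
        }
    }
    throw lastError ?? new Error(`No provider can do ${variant} search atm.`);
}
```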
@@ -3,6 +3,7 @@ import {
     RPCHost, RPCReflection, assignMeta, RawString,
     ParamValidationError,
     assignTransferProtocolMeta,
+    AssertionFailureError,
 } from 'civkit/civ-rpc';
 import { marshalErrorLike } from 'civkit/lang';
 import _ from 'lodash';
@@ -16,7 +17,7 @@ import { OutputServerEventStream } from '../lib/transform-server-event-stream';
 import { JinaEmbeddingsAuthDTO } from '../dto/jina-embeddings-auth';
 import { InsufficientBalanceError } from '../services/errors';
 import { WORLD_COUNTRIES, WORLD_LANGUAGES } from '../shared/3rd-party/serper-search';
-import { GoogleSERP } from '../services/serp/google';
+import { GoogleSERP, GoogleSERPOldFashion } from '../services/serp/google';
 import { WebSearchEntry } from '../services/serp/compat';
 import { CrawlerOptions } from '../dto/crawler-options';
 import { ScrappingOptions } from '../services/serp/puppeteer';
@@ -26,6 +27,7 @@ import { SerperBingSearchService, SerperGoogleSearchService } from '../services/
 import type { JinaEmbeddingsTokenAccount } from '../shared/db/jina-embeddings-token-account';
 import { LRUCache } from 'lru-cache';
 import { API_CALL_STATUS } from '../shared/db/api-roll';
+import { CommonGoogleSERP } from '../services/serp/common-serp';
 import { InternalJinaSerpService } from '../services/serp/internal';
 
 const WORLD_COUNTRY_CODES = Object.keys(WORLD_COUNTRIES).map((x) => x.toLowerCase());
@@ -91,8 +93,10 @@ export class SerpHost extends RPCHost {
         protected rateLimitControl: RateLimitControl,
         protected threadLocal: AsyncLocalContext,
         protected googleSerp: GoogleSERP,
+        protected googleSerpOld: GoogleSERPOldFashion,
         protected serperGoogle: SerperGoogleSearchService,
         protected serperBing: SerperBingSearchService,
+        protected commonGoogleSerp: CommonGoogleSERP,
         protected jinaSerp: InternalJinaSerpService,
     ) {
         super(...arguments);
@@ -157,7 +161,7 @@ export class SerpHost extends RPCHost {
         @Param('num', { validate: (v: number) => v >= 0 && v <= 20 })
         num?: number,
         @Param('gl', { validate: (v: string) => WORLD_COUNTRY_CODES.includes(v?.toLowerCase()) }) gl?: string,
-        @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) _hl?: string,
+        @Param('hl', { validate: (v: string) => WORLD_LANGUAGES.some(l => l.code === v) }) hl?: string,
         @Param('location') location?: string,
         @Param('page') page?: number,
         @Param('fallback') fallback?: boolean,
@@ -318,7 +322,7 @@ export class SerpHost extends RPCHost {
             q,
             num,
             gl,
-            // hl,
+            hl,
             location,
             page,
         }, crawlerOptions);
@@ -451,27 +455,32 @@ export class SerpHost extends RPCHost {
         return result;
     }
 
-    *iterProviders(preference?: string, variant?: string) {
+    *iterProviders(preference?: string, _variant?: string) {
         if (preference === 'bing') {
             yield this.serperBing;
-            yield this.serperGoogle;
             yield this.googleSerp;
+            yield this.jinaSerp;
+            yield this.serperGoogle;
+            yield this.commonGoogleSerp;
+            yield this.googleSerpOld;
 
             return;
         }
 
         if (preference === 'google') {
-            yield this.googleSerp;
             yield this.googleSerp;
             yield this.serperGoogle;
+            yield this.commonGoogleSerp;
+            yield this.googleSerpOld;
 
             return;
         }
 
-        // yield variant === 'web' ? this.jinaSerp : this.serperGoogle;
-        yield this.serperGoogle
-        yield this.serperGoogle;
         yield this.googleSerp;
+        yield this.jinaSerp;
+        yield this.serperGoogle;
+        yield this.commonGoogleSerp;
+        yield this.googleSerpOld;
     }
 
     async cachedSearch(variant: 'web' | 'news' | 'images', query: Record<string, any>, opts: CrawlerOptions) {
@@ -506,22 +515,27 @@ export class SerpHost extends RPCHost {
         outerLoop:
         for (const client of this.iterProviders(provider, variant)) {
             const t0 = Date.now();
-            try {
+            let func;
             switch (variant) {
                 case 'images': {
-                    r = await Reflect.apply(client.imageSearch, client, [query, scrappingOptions]);
+                    func = Reflect.get(client, 'imageSearch');
                     break;
-                }
-                case 'news': {
-                    r = await Reflect.apply(client.newsSearch, client, [query, scrappingOptions]);
-                    break;
-                }
-                case 'web':
-                default: {
-                    r = await Reflect.apply(client.webSearch, client, [query, scrappingOptions]);
-                    break;
-                }
                 }
+                case 'news': {
+                    func = Reflect.get(client, 'newsSearch');
+                    break;
+                }
+                case 'web':
+                default: {
+                    func = Reflect.get(client, 'webSearch');
+                    break;
+                }
+            }
+            if (!func) {
+                continue;
+            }
+            try {
+                r = await Reflect.apply(func, client, [query, scrappingOptions]);
             const dt = Date.now() - t0;
             this.logger.info(`Search took ${dt}ms, ${client.constructor.name}(${variant})`, { searchDt: dt, variant, client: client.constructor.name });
             break outerLoop;
@@ -544,6 +558,8 @@ export class SerpHost extends RPCHost {
             this.batchedCaches.push(record);
         } else if (lastError) {
             throw lastError;
+        } else if (!r) {
+            throw new AssertionFailureError(`No provider can do ${variant} search atm.`);
         }
 
         return r;
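Both hosts express provider preference as a generator: iterProviders yields candidates in priority order and cachedSearch pulls them one at a time until a provider succeeds, so lower-priority providers are never touched on the happy path. A reduced sketch of that shape, with placeholder provider names standing in for serperBing, googleSERP, jinaSerp and the rest:

```ts
// Placeholder provider type for illustration.
interface Provider { name: string; }

class ProviderPicker {
    constructor(
        private bing: Provider,
        private google: Provider,
        private internal: Provider,
        private fallback: Provider,
    ) {}

    // Yield candidates lazily, in priority order for the given preference.
    *iterProviders(preference?: string) {
        if (preference === 'bing') {
            yield this.bing;
        }
        yield this.google;
        yield this.internal;
        yield this.fallback;
    }
}

// Usage: the consumer stops pulling as soon as one provider works.
const picker = new ProviderPicker({ name: 'bing' }, { name: 'google' }, { name: 'internal' }, { name: 'serper' });
for (const p of picker.iterProviders('bing')) {
    console.log(p.name); // bing, google, internal, serper
}
```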
@@ -16,7 +16,7 @@ import { Readable } from 'stream';
 import { AsyncLocalContext } from './async-context';
 import { BlackHoleDetector } from './blackhole-detector';
 
-export interface CURLScrappingOptions extends ScrappingOptions {
+export interface CURLScrappingOptions<T = any> extends ScrappingOptions<T> {
     method?: string;
     body?: string | Buffer;
 }
@@ -75,7 +75,7 @@ export class CurlControl extends AsyncService {
         }
 
         const mixinHeaders: Record<string, string> = {
-            'Sec-Ch-Ua': `Not A(Brand";v="8", "Chromium";v="${this.chromeVersion}", "Google Chrome";v="${this.chromeVersion}"`,
+            'Sec-Ch-Ua': `"Google Chrome";v="${this.chromeVersion}", "Not-A.Brand";v="8", "Chromium";v="${this.chromeVersion}"`,
             'Sec-Ch-Ua-Mobile': '?0',
             'Sec-Ch-Ua-Platform': `"${uaPlatform}"`,
             'Upgrade-Insecure-Requests': '1',
@@ -108,11 +108,11 @@ export class CurlControl extends AsyncService {
         return curl;
     }
 
-    urlToFile1Shot(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
+    urlToStream(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
         return new Promise<{
             statusCode: number,
             statusText?: string,
-            data?: FancyFile,
+            data?: Readable,
             headers: HeaderInfo[],
         }>((resolve, reject) => {
             let contentType = '';
@@ -193,7 +193,7 @@ export class CurlControl extends AsyncService {
             });
             curl.setOpt(Curl.option.MAXFILESIZE, 4 * 1024 * 1024 * 1024); // 4GB
             let status = -1;
-            let statusText: string|undefined;
+            let statusText: string | undefined;
             let contentEncoding = '';
             curl.once('end', () => {
                 if (curlStream) {
@@ -302,13 +302,10 @@ export class CurlControl extends AsyncService {
                     }
                 }
 
-                const fpath = this.tempFileManager.alloc();
-                const fancyFile = FancyFile.auto(stream, fpath);
-                this.tempFileManager.bindPathTo(fancyFile, fpath);
                 resolve({
                     statusCode: status,
                     statusText,
-                    data: fancyFile,
+                    data: stream,
                     headers: headers as HeaderInfo[],
                 });
             });
@@ -324,7 +321,19 @@ export class CurlControl extends AsyncService {
         let nextHopUrl = urlToCrawl;
         const fakeHeaderInfos: HeaderInfo[] = [];
         do {
-            const r = await this.urlToFile1Shot(nextHopUrl, opts);
+            const s = await this.urlToStream(nextHopUrl, opts);
+            const r = { ...s } as {
+                statusCode: number,
+                statusText?: string,
+                data?: FancyFile,
+                headers: HeaderInfo[],
+            };
+            if (r.data) {
+                const fpath = this.tempFileManager.alloc();
+                const fancyFile = FancyFile.auto(r.data, fpath);
+                this.tempFileManager.bindPathTo(fancyFile, fpath);
+                r.data = fancyFile;
+            }
+
             if ([301, 302, 303, 307, 308].includes(r.statusCode)) {
                 fakeHeaderInfos.push(...r.headers);
@@ -375,7 +384,7 @@ export class CurlControl extends AsyncService {
         const curlResult = await this.urlToFile(targetUrl, crawlOpts);
         this.blackHoleDetector.itWorked();
         let finalURL = targetUrl;
-        const sideLoadOpts: CURLScrappingOptions['sideLoad'] = {
+        const sideLoadOpts: CURLScrappingOptions<FancyFile>['sideLoad'] = {
             impersonate: {},
             proxyOrigin: {},
         };
@@ -421,6 +430,140 @@
         };
     }
 
+    async urlToBlob(urlToCrawl: URL, crawlOpts?: CURLScrappingOptions) {
+        let leftRedirection = 6;
+        let cookieRedirects = 0;
+        let opts = { ...crawlOpts };
+        let nextHopUrl = urlToCrawl;
+        const fakeHeaderInfos: HeaderInfo[] = [];
+        do {
+            const s = await this.urlToStream(nextHopUrl, opts);
+            const r = { ...s } as {
+                statusCode: number,
+                statusText?: string,
+                data?: Blob,
+                headers: HeaderInfo[],
+            };
+
+            const headers = r.headers[r.headers.length - 1];
+            if ([301, 302, 303, 307, 308].includes(r.statusCode)) {
+                fakeHeaderInfos.push(...r.headers);
+                const location: string | undefined = headers.Location || headers.location;
+
+                const setCookieHeader = headers['Set-Cookie'] || headers['set-cookie'];
+                if (setCookieHeader) {
+                    const cookieAssignments = Array.isArray(setCookieHeader) ? setCookieHeader : [setCookieHeader];
+                    const parsed = cookieAssignments.filter(Boolean).map((x) => parseSetCookieString(x, { decodeValues: true }));
+                    if (parsed.length) {
+                        opts.cookies = [...(opts.cookies || []), ...parsed];
+                    }
+                    if (!location) {
+                        cookieRedirects += 1;
+                    }
+                }
+
+                if (!location && !setCookieHeader) {
+                    // Follow curl behavior
+                    if (s.data) {
+                        const chunks: Buffer[] = [];
+                        s.data.on('data', (chunk) => {
+                            chunks.push(chunk);
+                        });
+                        await new Promise((resolve, reject) => {
+                            s.data!.once('end', resolve);
+                            s.data!.once('error', reject);
+                        });
+                        r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] });
+                    }
+                    return {
+                        statusCode: r.statusCode,
+                        data: r.data,
+                        headers: fakeHeaderInfos.concat(r.headers),
+                    };
+                }
+                if (!location && cookieRedirects > 1) {
+                    throw new ServiceBadApproachError(`Failed to access ${urlToCrawl}: Browser required to solve complex cookie preconditions.`);
+                }
+
+                nextHopUrl = new URL(location || '', nextHopUrl);
+                leftRedirection -= 1;
+                continue;
+            }
+
+            if (s.data) {
+                const chunks: Buffer[] = [];
+                s.data.on('data', (chunk) => {
+                    chunks.push(chunk);
+                });
+                await new Promise((resolve, reject) => {
+                    s.data!.once('end', resolve);
+                    s.data!.once('error', reject);
+                });
+                r.data = new Blob(chunks, { type: headers['Content-Type'] || headers['content-type'] });
+            }
+
+            return {
+                statusCode: r.statusCode,
+                statusText: r.statusText,
+                data: r.data,
+                headers: fakeHeaderInfos.concat(r.headers),
+            };
+        } while (leftRedirection > 0);
+
+        throw new ServiceBadAttemptError(`Failed to access ${urlToCrawl}: Too many redirections.`);
+    }
+
+    async sideLoadBlob(targetUrl: URL, crawlOpts?: CURLScrappingOptions) {
+        const curlResult = await this.urlToBlob(targetUrl, crawlOpts);
+        this.blackHoleDetector.itWorked();
+        let finalURL = targetUrl;
+        const sideLoadOpts: CURLScrappingOptions<Blob>['sideLoad'] = {
+            impersonate: {},
+            proxyOrigin: {},
+        };
+        for (const headers of curlResult.headers) {
+            sideLoadOpts.impersonate[finalURL.href] = {
+                status: headers.result?.code || -1,
+                headers: _.omit(headers, 'result'),
+                contentType: headers['Content-Type'] || headers['content-type'],
+            };
+            if (crawlOpts?.proxyUrl) {
+                sideLoadOpts.proxyOrigin[finalURL.origin] = crawlOpts.proxyUrl;
+            }
+            if (headers.result?.code && [301, 302, 307, 308].includes(headers.result.code)) {
+                const location = headers.Location || headers.location;
+                if (location) {
+                    finalURL = new URL(location, finalURL);
+                }
+            }
+        }
+        const lastHeaders = curlResult.headers[curlResult.headers.length - 1];
+        const contentType = (lastHeaders['Content-Type'] || lastHeaders['content-type'])?.toLowerCase() || (curlResult.data?.type) || 'application/octet-stream';
+        const contentDisposition = lastHeaders['Content-Disposition'] || lastHeaders['content-disposition'];
+        const fileName = contentDisposition?.match(/filename="([^"]+)"/i)?.[1] || finalURL.pathname.split('/').pop();
+
+        if (sideLoadOpts.impersonate[finalURL.href] && (curlResult.data?.size)) {
+            sideLoadOpts.impersonate[finalURL.href].body = curlResult.data;
+        }
+
+        // This should keep the file from being garbage collected and deleted until this asyncContext/request is done.
+        this.lifeCycleTrack.set(this.asyncLocalContext.ctx, curlResult.data);
+
+        return {
+            finalURL,
+            sideLoadOpts,
+            chain: curlResult.headers,
+            status: curlResult.statusCode,
+            statusText: curlResult.statusText,
+            headers: lastHeaders,
+            contentType,
+            contentDisposition,
+            fileName,
+            file: curlResult.data
+        };
+    }
+
     digestCurlCode(code: CurlCode, msg: string) {
         switch (code) {
             // 400 User errors
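The new urlToBlob above collects the curl response stream into an in-memory Blob instead of spilling it to a temp file, which is what lets the SERP side-load path stay file-system free. A minimal sketch of the core collection step, assuming Node 18+ where Blob is a global and a Readable is async-iterable:

```ts
import { Readable } from 'stream';

// Drain a Readable into a Blob, tagging it with the response content type.
async function streamToBlob(stream: Readable, contentType?: string): Promise<Blob> {
    const chunks: Buffer[] = [];
    for await (const chunk of stream) {
        chunks.push(chunk as Buffer);
    }
    return new Blob(chunks, { type: contentType });
}

// Usage sketch:
const blob = await streamToBlob(Readable.from([Buffer.from('<html></html>')]), 'text/html');
console.log(blob.size, blob.type); // 13 'text/html'
```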
@@ -79,7 +79,7 @@ export interface ExtendedSnapshot extends PageSnapshot {
     imgs: ImgBrief[];
 }
 
-export interface ScrappingOptions {
+export interface ScrappingOptions<T = FancyFile | Blob> {
     proxyUrl?: string;
     cookies?: Cookie[];
     favorScreenshot?: boolean;
@@ -101,7 +101,7 @@ export interface ScrappingOptions {
             status: number;
             headers: { [k: string]: string | string[]; };
             contentType?: string;
-            body?: FancyFile;
+            body?: T;
         };
     };
     proxyOrigin: { [origin: string]: string; };
@@ -912,7 +912,11 @@
             if (impersonate) {
                 let body;
                 if (impersonate.body) {
-                    body = await readFile(await impersonate.body.filePath);
+                    if (impersonate.body instanceof Blob) {
+                        body = new Uint8Array(await impersonate.body.arrayBuffer());
+                    } else {
+                        body = await readFile(await impersonate.body.filePath);
+                    }
                     if (req.isInterceptResolutionHandled()) {
                         return;
                     }
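With ScrappingOptions now generic over the side-loaded body type, the request-interception path must produce response bytes from either shape. A sketch of that branch in isolation, where `FancyFileLike` is an illustrative stand-in for the project's FancyFile:

```ts
import { readFile } from 'fs/promises';

interface FancyFileLike { filePath: Promise<string> | string; } // illustrative stand-in

// Produce raw bytes (e.g. for puppeteer request interception) from either body shape.
async function bodyToBytes(body: Blob | FancyFileLike): Promise<Uint8Array> {
    if (body instanceof Blob) {
        return new Uint8Array(await body.arrayBuffer()); // in-memory blob: no disk round trip
    }
    return readFile(await body.filePath); // file-backed body: read from disk
}
```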
src/services/serp/common-serp.ts (new file, 133 lines)
@@ -0,0 +1,133 @@
+import { singleton } from 'tsyringe';
+import { AsyncService } from 'civkit/async-service';
+import { GlobalLogger } from '../logger';
+import { JSDomControl } from '../jsdom';
+import _ from 'lodash';
+import { WebSearchEntry } from './compat';
+import { ServiceBadAttemptError } from '../errors';
+import commonSerpClients, { CommonSerpImageResponse, CommonSerpNewsResponse, CommonSerpWebResponse } from '../../shared/3rd-party/common-serp';
+import { AsyncLocalContext } from '../async-context';
+
+@singleton()
+export class CommonGoogleSERP extends AsyncService {
+    logger = this.globalLogger.child({ service: this.constructor.name });
+    googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';
+
+    protected ctxIteratorMap = new WeakMap<object, ReturnType<CommonGoogleSERP['iterClients']>>();
+
+    constructor(
+        protected globalLogger: GlobalLogger,
+        protected jsDomControl: JSDomControl,
+        protected asyncContext: AsyncLocalContext,
+    ) {
+        super(...arguments);
+    }
+
+    override async init() {
+        await this.dependencyReady();
+
+        this.emit('ready');
+    }
+
+    *iterClients() {
+        if (!commonSerpClients.length) {
+            return;
+        }
+        while (true) {
+            yield* commonSerpClients;
+        }
+    }
+
+    getClient() {
+        const ctx = this.asyncContext.ctx;
+        const it = this.ctxIteratorMap.get(ctx) || this.iterClients();
+        this.ctxIteratorMap.set(ctx, it);
+        const client = it.next().value;
+        if (!client) {
+            throw new ServiceBadAttemptError('No client available');
+        }
+        return client;
+    }
+
+    digestQuery(query: { [k: string]: any; }) {
+        const url = new URL(`https://${this.googleDomain}/search`);
+        const clone = { ...query };
+        const num = clone.num || 10;
+        if (clone.page) {
+            const page = parseInt(clone.page);
+            delete clone.page;
+            clone.start = (page - 1) * num;
+            if (clone.start === 0) {
+                delete clone.start;
+            }
+        }
+        if (clone.location) {
+            delete clone.location;
+        }
+
+        for (const [k, v] of Object.entries(clone)) {
+            if (v === undefined || v === null) {
+                continue;
+            }
+            url.searchParams.set(k, `${v}`);
+        }
+
+        return url;
+    }
+
+    async webSearch(query: { [k: string]: any; }) {
+        const url = this.digestQuery(query);
+
+        const client = this.getClient();
+
+        const r = await client.queryJSON(url.href) as CommonSerpWebResponse;
+
+        return r.organic.map((x)=> ({
+            link: x.link,
+            title: x.title,
+            snippet: x.description,
+            variant: 'web',
+        })) as WebSearchEntry[];
+    }
+
+    async newsSearch(query: { [k: string]: any; }) {
+        const url = this.digestQuery(query);
+
+        url.searchParams.set('tbm', 'nws');
+
+        const client = this.getClient();
+
+        const r = await client.queryJSON(url.href) as CommonSerpNewsResponse;
+
+        return r.news.map((x)=> ({
+            link: x.link,
+            title: x.title,
+            snippet: x.description,
+            source: x.source,
+            date: x.date,
+            imageUrl: x.image,
+            variant: 'news',
+        })) as WebSearchEntry[];
+    }
+
+    async imageSearch(query: { [k: string]: any; }) {
+        const url = this.digestQuery(query);
+
+        url.searchParams.set('tbm', 'isch');
+
+        const client = this.getClient();
+
+        const r = await client.queryJSON(url.href) as CommonSerpImageResponse;
+
+        return r.images.map((x)=> ({
+            link: x.link,
+            title: x.title,
+            snippet: x.image_alt,
+            source: x.source,
+            imageUrl: x.image,
+            variant: 'images',
+        })) as WebSearchEntry[];
+    }
+}
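CommonGoogleSERP hands each request its own infinite round-robin iterator over the configured clients, keyed by the async-context object in a WeakMap: retries within one request keep advancing to the next client, while separate requests do not interfere, and finished contexts can be garbage-collected. A self-contained sketch of the trick, with strings standing in for the real client objects:

```ts
// Stand-ins for commonSerpClients.
const clients = ['client-a', 'client-b', 'client-c'];

function* iterClients() {
    if (!clients.length) return;
    while (true) {
        yield* clients; // restart from the top forever
    }
}

// One iterator per request context; WeakMap lets dead contexts be collected.
const perRequest = new WeakMap<object, Generator<string>>();

function getClient(requestCtx: object): string {
    let it = perRequest.get(requestCtx);
    if (!it) {
        it = iterClients();
        perRequest.set(requestCtx, it);
    }
    const { value } = it.next();
    if (!value) throw new Error('No client available');
    return value;
}

// Usage: retries inside one request advance the rotation.
const ctx = {};
console.log(getClient(ctx), getClient(ctx)); // client-a client-b
```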
@@ -7,24 +7,74 @@ import _ from 'lodash';
 import { WebSearchEntry } from './compat';
 import { ScrappingOptions, SERPSpecializedPuppeteerControl } from './puppeteer';
 import { CurlControl } from '../curl';
-import { readFile } from 'fs/promises';
 import { ApplicationError } from 'civkit/civ-rpc';
 import { ServiceBadApproachError, ServiceBadAttemptError } from '../errors';
 import { parseJSONText } from 'civkit/vectorize';
-import { retryWith } from 'civkit/decorators';
-import { ProxyProviderService } from '../../shared/services/proxy-provider';
+import { retry, retryWith } from 'civkit/decorators';
+import { SERPProxyProviderService } from '../../shared/services/proxy-provider';
+import { readBlob } from '../../utils/encoding';
+import { createContext, Script } from 'vm';
+import { BrowserContext } from 'puppeteer';
+import { createPool } from 'generic-pool';
+import { randomBytes } from 'crypto';
+import { AsyncLocalContext } from '../async-context';
+
+interface SerpContext {
+    proxyUrl?: string;
+    browserContext?: BrowserContext;
+    validTill?: Date;
+    magicId?: string;
+}
 
 @singleton()
 export class GoogleSERP extends AsyncService {
     logger = this.globalLogger.child({ service: this.constructor.name });
     googleDomain = process.env.OVERRIDE_GOOGLE_DOMAIN || 'www.google.com';
+
+    nativeIPHealthy = true;
+
+    contextPool = createPool({
+        create: () => {
+            return this.createContext();
+        },
+        destroy: async (ctx: SerpContext) => {
+            if (ctx.browserContext) {
+                try {
+                    await ctx.browserContext.close();
+                } catch (err) {
+                    this.logger.warn('Error closing browser context', { err });
+                }
+            }
+        },
+        validate: async (ctx: SerpContext) => {
+            if (ctx.validTill && ctx.validTill > (new Date(Date.now() + 5 * 60 * 1000))) {
+                return true;
+            }
+
+            return !ctx.proxyUrl;
+        },
+    }, {
+        max: 3_000,
+        testOnBorrow: true,
+    });
+
+    protected async createContext() {
+        await this.serviceReady();
+        this.asyncLocalContext.ctx.ctxIsNew = true;
+
+        return {
+            magicId: randomBytes(17).toString('base64url'),
+            validTill: new Date(Date.now() + 30 * 60 * 1000),
+        } as SerpContext;
+    }
+
     constructor(
         protected globalLogger: GlobalLogger,
         protected puppeteerControl: SERPSpecializedPuppeteerControl,
         protected jsDomControl: JSDomControl,
         protected curlControl: CurlControl,
-        protected proxyProvider: ProxyProviderService,
+        protected proxyProvider: SERPProxyProviderService,
+        protected asyncLocalContext: AsyncLocalContext,
     ) {
         const filteredDeps = isMainThread ? arguments : _.without(arguments, puppeteerControl);
         super(...filteredDeps);
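The new SERP context pool leans on generic-pool: with validate plus testOnBorrow, a context whose proxy lease is about to lapse is quietly discarded on checkout rather than reused. A reduced sketch of the same configuration, with SerpContext trimmed to the relevant fields:

```ts
import { createPool } from 'generic-pool';
import { randomBytes } from 'crypto';

interface SerpContext {
    proxyUrl?: string;
    validTill?: Date;
    magicId?: string;
}

const contextPool = createPool<SerpContext>({
    create: async () => ({
        magicId: randomBytes(17).toString('base64url'),
        validTill: new Date(Date.now() + 30 * 60 * 1000), // 30 min lease
    }),
    destroy: async (_ctx) => { /* close proxies / browser contexts here */ },
    // Keep contexts that are still valid for >5 minutes; contexts that never
    // acquired a proxy are cheap, so they also pass.
    validate: async (ctx) =>
        (ctx.validTill !== undefined && ctx.validTill > new Date(Date.now() + 5 * 60 * 1000)) || !ctx.proxyUrl,
}, {
    max: 3_000,
    testOnBorrow: true, // run validate() on every acquire()
});

// Usage: acquire, use, then release on success (or destroy on failure).
const ctx = await contextPool.acquire();
try {
    // ... perform a SERP request bound to ctx.proxyUrl ...
} finally {
    await contextPool.release(ctx);
}
```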
@@ -36,6 +86,15 @@ export class GoogleSERP extends AsyncService {
         this.emit('ready');
     }
 
+    nativeIPBlocked() {
+        this.nativeIPHealthy = false;
+        this.logger.warn('Native IP is not healthy.');
+        setTimeout(() => {
+            this.nativeIPHealthy = true;
+            this.logger.debug('Presume native IP healthy again after timeout.');
+        }, 1000 * 60 * 60);
+    }
+
     @retryWith((err) => {
         if (err instanceof ServiceBadApproachError) {
             return false;
@@ -51,15 +110,13 @@
         return undefined;
     }, 3)
     async sideLoadWithAllocatedProxy(url: URL, opts?: ScrappingOptions) {
-        if (opts?.allocProxy === 'none') {
-            return this.curlControl.sideLoad(url, opts);
+        if (opts?.allocProxy === 'none' || opts?.proxyUrl) {
+            return this.curlControl.sideLoadBlob(url, opts);
         }
 
-        const proxy = await this.proxyProvider.alloc(
-            process.env.PREFERRED_PROXY_COUNTRY || 'auto'
-        );
+        const proxy = await this.proxyProvider.alloc();
         this.logger.debug(`Proxy allocated`, { proxy: proxy.href });
-        const r = await this.curlControl.sideLoad(url, {
+        const r = await this.curlControl.sideLoadBlob(url, {
             ...opts,
             proxyUrl: proxy.href,
         });
@@ -101,48 +158,126 @@
         return url;
     }
 
+    @retry(2)
     async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
         const url = this.digestQuery(query);
-        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
+        const origHref = url.href;
+        if (!url.searchParams.has('start')) {
+            url.searchParams.set('start', '0');
+        }
+        url.searchParams.set('asearch', 'arc');
+
+        const ctx = await this.contextPool.acquire();
+
+        url.searchParams.set('async', getMagicAsyncParam(query.start, ctx.magicId));
+
+        const t0 = performance.now();
+        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
+            ...opts,
+            allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'),
+            proxyUrl: ctx.proxyUrl,
+            timeoutMs: 3_700
+        }).catch((err) => {
+            this.contextPool.destroy(ctx);
+
+            return Promise.reject(err);
+        });
+        const dt = performance.now() - t0;
+
+        if ('proxy' in sideLoaded) {
+            ctx.proxyUrl = sideLoaded.proxy.href;
+            ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
+        }
+
+        if (sideLoaded.status === 200) {
+            if (dt < 1_700) {
+                this.contextPool.release(ctx);
+            } else {
+                this.contextPool.destroy(ctx);
+            }
+        } else {
+            if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) {
+                this.nativeIPBlocked();
+            }
+            this.contextPool.destroy(ctx);
+            throw new ServiceBadAttemptError({
+                message: 'Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.',
+            });
+        }
+
         if (opts && sideLoaded.sideLoadOpts) {
             opts.sideLoad = sideLoaded.sideLoadOpts;
         }
 
-        const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);
-
-        return snapshot;
-    }
-
-    async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
-        const url = this.digestQuery(query);
-
-        url.searchParams.set('tbm', 'nws');
-
-        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
-        if (opts && sideLoaded.sideLoadOpts) {
-            opts.sideLoad = sideLoaded.sideLoadOpts;
+        if (!sideLoaded.file) {
+            throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
+        }
+        const contentType = sideLoaded.contentType;
+        const encoding: string | undefined = contentType.includes('charset=') ? contentType.split('charset=')[1]?.trim().toLowerCase() : 'utf-8';
+
+        let html = await readBlob(sideLoaded.file, encoding);
+        let innerCharset;
+        const peek = html.slice(0, 1024);
+        innerCharset ??= peek.match(/<meta[^>]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase();
+        innerCharset ??= peek.match(/<meta[^>]+charset="([^>"]+)\"/i)?.[1]?.toLowerCase();
+        if (innerCharset && innerCharset !== encoding) {
+            html = await readBlob(sideLoaded.file, innerCharset);
         }
 
-        const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, opts);
+        const jsdom = this.jsDomControl.linkedom.parseHTML(html, { location: { href: origHref } });
+        try {
+            const r = runGetWebSearchResultsScript(createContext(jsdom));
+            if (!Array.isArray(r)) {
+                throw new Error('Failed to parse response as SERP results');
+            }
 
-        return snapshot;
+            return r;
+        } catch (err) {
+            throw new ServiceBadAttemptError({
+                message: 'Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.',
+                err
+            });
+        }
     }
 
+    @retry(2)
     async imageSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
         const url = this.digestQuery(query);
 
         url.searchParams.set('tbm', 'isch');
         url.searchParams.set('asearch', 'isch');
         url.searchParams.set('async', `_fmt:json,p:1,ijn:${query.start ? Math.floor(query.start / (query.num || 10)) : 0}`);
+        const ctx = await this.contextPool.acquire();
 
-        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, opts);
+        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
+            ...opts,
+            proxyUrl: ctx.proxyUrl,
+            allocProxy: opts?.allocProxy || (this.nativeIPHealthy ? 'none' : 'auto'),
+        }).catch((err) => {
+            this.contextPool.destroy(ctx);
+
+            return Promise.reject(err);
+        });
+
+        if ('proxy' in sideLoaded) {
+            ctx.proxyUrl = sideLoaded.proxy.href;
+            ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
+        }
+
+        if (sideLoaded.status === 200) {
+            this.contextPool.release(ctx);
+        } else {
+            this.contextPool.destroy(ctx);
+            if (this.nativeIPHealthy && this.asyncLocalContext.ctx.ctxIsNew) {
+                this.nativeIPBlocked();
+            }
+        }
+
         if (sideLoaded.status !== 200 || !sideLoaded.file) {
             throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
         }
 
-        const jsonTxt = (await readFile((await sideLoaded.file.filePath))).toString();
+        const jsonTxt = (await readBlob(sideLoaded.file)).toString();
         const rJSON = parseJSONText(jsonTxt.slice(jsonTxt.indexOf('{"ischj":')));
 
         return _.get(rJSON, 'ischj.metadata').map((x: any) => {
@@ -161,6 +296,95 @@
         }
     }
 }
 
+
+@singleton()
+export class GoogleSERPOldFashion extends GoogleSERP {
+    override async createContext() {
+        await this.serviceReady();
+        this.asyncLocalContext.ctx.ctxIsNew = true;
+
+        return {
+            browserContext: await this.puppeteerControl.browser.createBrowserContext(),
+            magicId: randomBytes(17).toString('base64url'),
+            validTill: new Date(Date.now() + 30 * 60 * 1000),
+        } as SerpContext;
+    }
+
+    override async webSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
+        const url = this.digestQuery(query);
+
+        const ctx = await this.contextPool.acquire();
+
+        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
+            ...opts,
+            proxyUrl: ctx.proxyUrl,
+        }).catch((err) => {
+            this.contextPool.destroy(ctx);
+
+            return Promise.reject(err);
+        });
+
+        if ('proxy' in sideLoaded) {
+            ctx.proxyUrl = sideLoaded.proxy.href;
+            ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
+        }
+
+        if (sideLoaded.status === 200) {
+            this.contextPool.release(ctx);
+        } else {
+            this.contextPool.destroy(ctx);
+        }
+
+        if (opts && sideLoaded.sideLoadOpts) {
+            opts.sideLoad = sideLoaded.sideLoadOpts;
+        }
+
+        const snapshot = await this.puppeteerControl.controlledScrap(url, getWebSearchResults, opts);
+
+        if (!Array.isArray(snapshot)) {
+            throw new ServiceBadAttemptError('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
+        }
+
+        return snapshot;
+    }
+
+    async newsSearch(query: { [k: string]: any; }, opts?: ScrappingOptions) {
+        const url = this.digestQuery(query);
+
+        url.searchParams.set('tbm', 'nws');
+
+        const ctx = await this.contextPool.acquire();
+
+        const sideLoaded = await this.sideLoadWithAllocatedProxy(url, {
+            ...opts,
+            proxyUrl: ctx.proxyUrl,
+        }).catch((err) => {
+            this.contextPool.destroy(ctx);
+
+            return Promise.reject(err);
+        });
+
+        if ('proxy' in sideLoaded) {
+            ctx.proxyUrl = sideLoaded.proxy.href;
+            ctx.validTill = new Date(Date.now() + 30 * 60 * 1000);
+        }
+
+        const snapshot = await this.puppeteerControl.controlledScrap(url, getNewsSearchResults, {
+            ...opts,
+            proxyUrl: ctx.proxyUrl,
+            browserContext: ctx.browserContext,
+        }).catch((err) => {
+            this.contextPool.destroy(ctx);
+
+            return Promise.reject(err);
+        });
+
+        this.contextPool.release(ctx);
+
+        return snapshot;
+    }
+}
+
 async function getWebSearchResults() {
     if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
         throw new Error('Google returned an error page. This may happen due to various reasons, including rate limiting or other issues.');
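The rewritten webSearch decodes the fetched page twice if needed: first with the charset from the Content-Type header (defaulting to utf-8), then, if a meta tag in the first kilobyte declares a different charset, again with that one. A sketch of the sniffing logic, using TextDecoder in place of the project's readBlob helper:

```ts
// Decode an HTML Blob, honoring an inner <meta charset> that disagrees with
// the transport-level charset. TextDecoder stands in for readBlob here.
async function decodeHtml(blob: Blob, headerCharset = 'utf-8'): Promise<string> {
    const bytes = new Uint8Array(await blob.arrayBuffer());
    let html = new TextDecoder(headerCharset).decode(bytes);

    const peek = html.slice(0, 1024);
    const innerCharset =
        peek.match(/<meta[^>]+text\/html;\s*?charset=([^>"]+)/i)?.[1]?.toLowerCase() ??
        peek.match(/<meta[^>]+charset="([^>"]+)"/i)?.[1]?.toLowerCase();

    if (innerCharset && innerCharset !== headerCharset) {
        html = new TextDecoder(innerCharset).decode(bytes); // re-decode with the page's own claim
    }
    return html;
}
```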
@@ -254,6 +478,96 @@ async function getWebSearchResults() {
         };
     }).filter(Boolean) as WebSearchEntry[];
 }
+
+function getWebSearchResultsSync() {
+    const wrapper1 = document.querySelector('div[data-async-context^="query"]');
+
+    if (!wrapper1) {
+        return undefined;
+    }
+
+    const query = decodeURIComponent(wrapper1.getAttribute('data-async-context')?.split('query:')[1] || '');
+
+    if (!query) {
+        return undefined;
+    }
+
+    const candidates = Array.from(wrapper1.querySelectorAll('div[lang],div[data-surl]'));
+
+    return candidates.map((x, pos) => {
+        const primaryLink = x.querySelector('a:not([href="#"])');
+        if (!primaryLink) {
+            return undefined;
+        }
+        const url = primaryLink.getAttribute('href');
+
+        if (primaryLink.querySelector('div[role="heading"]')) {
+            // const spans = primaryLink.querySelectorAll('span');
+            // const title = spans[0]?.textContent;
+            // const source = spans[1]?.textContent;
+            // const date = spans[spans.length - 1].textContent;
+
+            // return {
+            //     link: url,
+            //     title,
+            //     source,
+            //     date,
+            //     variant: 'video'
+            // };
+            return undefined;
+        }
+
+        const title = primaryLink.querySelector('h3')?.textContent;
+        const source = Array.from(primaryLink.querySelectorAll('span')).find((x) => x.textContent)?.textContent;
+        const cite = primaryLink.querySelector('cite[role=text]')?.textContent;
+        let date = cite?.split('·')[1]?.trim();
+        const snippets = Array.from(x.querySelectorAll('div[data-sncf*="1"] span'));
+        let snippet = snippets[snippets.length - 1]?.textContent;
+        if (!snippet) {
+            snippet = x.querySelector('div.IsZvec')?.textContent?.trim() || null;
+        }
+        date ??= snippets[snippets.length - 2]?.textContent?.trim();
+        const imageUrl = x.querySelector('div[data-sncf*="1"] img[src]:not(img[src^="data"])')?.getAttribute('src');
+        let siteLinks = Array.from(x.querySelectorAll('div[data-sncf*="3"] a[href]')).map((l) => {
+            return {
+                link: l.getAttribute('href'),
+                title: l.textContent,
+            };
+        });
+        const perhapsParent = x.parentElement?.closest('div[data-hveid]');
+        if (!siteLinks?.length && perhapsParent) {
+            const candidates = Array.from(perhapsParent.querySelectorAll('td h3'));
+            if (candidates.length) {
+                siteLinks = candidates.map((l) => {
+                    const link = l.querySelector('a');
+                    if (!link) {
+                        return undefined;
+                    }
+                    const snippet = l.nextElementSibling?.textContent;
+                    return {
+                        link: link.getAttribute('href'),
+                        title: link.textContent,
+                        snippet,
+                    };
+                }).filter(Boolean) as any;
+            }
+        }
+
+        return {
+            link: url,
+            title,
+            source,
+            date,
+            snippet: snippet ?? undefined,
+            imageUrl: imageUrl?.startsWith('data:') ? undefined : imageUrl,
+            siteLinks: siteLinks.length ? siteLinks : undefined,
+            variant: 'web',
+        };
+    }).filter(Boolean) as WebSearchEntry[];
+}
+
+const script = new Script(`(${getWebSearchResultsSync.toString()})()`);
+
+function runGetWebSearchResultsScript(ctx: object) {
+    return script.runInContext(ctx);
+}
 
 async function getNewsSearchResults() {
     if (location.pathname.startsWith('/sorry') || location.pathname.startsWith('/error')) {
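Note: the Script built above looks like the heart of this commit's "new trick" — getWebSearchResultsSync is written once as a plain DOM-walking function, serialized via toString(), and compiled into a reusable vm.Script, so the same extractor can run inside a real page or against any server-side context that exposes a document. A minimal sketch of the pattern, assuming Node's built-in vm module (the countLinks extractor and the jsdom mention are illustrative):

    import { Script, createContext } from 'vm';

    // Illustrative extractor: an ordinary function of the ambient `document`.
    function countLinks() {
        return document.querySelectorAll('a').length;
    }

    // Serialize once; `(${fn})()` turns the function source into an IIFE.
    const compiled = new Script(`(${countLinks.toString()})()`);

    // Run against any context object that provides a `document`, e.g. a
    // jsdom window (jsdom is an assumption here, not something this diff uses).
    function runAgainst(dom: { document: unknown; }) {
        return compiled.runInContext(createContext(dom));
    }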
@@ -305,4 +619,10 @@ async function getNewsSearchResults() {
             variant: 'news',
         };
     }).filter(Boolean) as WebSearchEntry[];
 }
+
+function getMagicAsyncParam(start: number = 0, inputArcid?: string) {
+    const arcid = inputArcid || randomBytes(17).toString('base64url');
+
+    return `arc_id:srp_${arcid}_1${start.toString().padStart(2, '0')},use_ac:true,_fmt:prog`;
+}
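Note: getMagicAsyncParam appears to emulate the async parameter Google's result pages use for progressive loading: arc_id:srp_ plus a random 17-byte base64url token (presumably Node's crypto.randomBytes, imported elsewhere in the file), then a literal 1 followed by the two-digit-padded result offset, with use_ac:true,_fmt:prog appended. A quick worked example (the token below is made up):

    // Made-up 23-char base64url token standing in for randomBytes(17).toString('base64url').
    const arcid = 'AbCdEfGhIjKlMnOpQrStUvW';

    // start = 0  ->  arc_id:srp_AbCdEfGhIjKlMnOpQrStUvW_100,use_ac:true,_fmt:prog
    // start = 10 ->  arc_id:srp_AbCdEfGhIjKlMnOpQrStUvW_110,use_ac:true,_fmt:prog
    for (const start of [0, 10]) {
        console.log(`arc_id:srp_${arcid}_1${start.toString().padStart(2, '0')},use_ac:true,_fmt:prog`);
    }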
@@ -67,11 +67,5 @@ export class InternalJinaSerpService extends AsyncService {
     async webSearch(query: SerperSearchQueryParams) {
         return this.doSearch('web', query);
     }
-    async imageSearch(query: SerperSearchQueryParams) {
-        return this.doSearch('images', query);
-    }
-    async newsSearch(query: SerperSearchQueryParams) {
-        return this.doSearch('news', query);
-    }
 
 }
@@ -2,7 +2,7 @@ import _ from 'lodash';
 import { readFile } from 'fs/promises';
 import { container, singleton } from 'tsyringe';
 
-import type { Browser, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
+import type { Browser, BrowserContext, CookieParam, GoToOptions, Page, Viewport } from 'puppeteer';
 import type { Cookie } from 'set-cookie-parser';
 import puppeteer, { TimeoutError } from 'puppeteer';
 
@@ -21,6 +21,7 @@ import { BlackHoleDetector } from '../blackhole-detector';
 
 
 export interface ScrappingOptions {
+    browserContext?: BrowserContext;
     proxyUrl?: string;
     cookies?: Cookie[];
     overrideUserAgent?: string;
@@ -38,7 +39,7 @@ export interface ScrappingOptions {
             status: number;
             headers: { [k: string]: string | string[]; };
             contentType?: string;
-            body?: FancyFile;
+            body?: FancyFile | Blob;
         };
     };
     proxyOrigin: { [origin: string]: string; };
@@ -226,8 +227,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
     browser!: Browser;
     logger = this.globalLogger.child({ service: this.constructor.name });
 
-    __loadedPage: Page[] = [];
-
     finalizerMap = new WeakMap<Page, ReturnType<typeof setTimeout>>();
     snMap = new WeakMap<Page, number>();
     livePages = new Set<Page>();
@@ -251,7 +250,6 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
         let crippledTimes = 0;
         this.on('crippled', () => {
             crippledTimes += 1;
-            this.__loadedPage.length = 0;
             this.livePages.clear();
             if (crippledTimes > 5) {
                 process.nextTick(() => {
@@ -303,20 +301,16 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
         this.effectiveUA = this.ua.replace(/Headless/i, '').replace('Mozilla/5.0 (X11; Linux x86_64)', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)');
         this.curlControl.impersonateChrome(this.effectiveUA);
 
-        await this.newPage('beware_deadlock').then((r) => this.__loadedPage.push(r));
-
         this.emit('ready');
     }
 
-    async newPage<T>(bewareDeadLock: any = false) {
-        if (!bewareDeadLock) {
-            await this.serviceReady();
-        }
+    async newPage<T>(context?: BrowserContext) {
+        await this.serviceReady();
         const sn = this._sn++;
         let page;
+        context ??= await this.browser.createBrowserContext();
         try {
-            const dedicatedContext = await this.browser.createBrowserContext();
-            page = await dedicatedContext.newPage();
+            page = await context.newPage();
         } catch (err: any) {
             this.logger.warn(`Failed to create page ${sn}`, { err });
             this.browser.process()?.kill('SIGKILL');
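Note: newPage accepting an optional BrowserContext is what lets the SERP layer thread one context (cookies, storage, session state) through several pages, instead of creating and tearing down a dedicated incognito context per page. A small sketch of the difference, assuming a locally launched puppeteer browser:

    import puppeteer from 'puppeteer';

    async function demo() {
        const browser = await puppeteer.launch();

        // Old behavior: a dedicated context per page, torn down with the page.
        const throwaway = await browser.createBrowserContext();
        const pageA = await throwaway.newPage();

        // New behavior: a caller-supplied context can host several pages that
        // share cookies and storage across navigations.
        const shared = await browser.createBrowserContext();
        const pageB = await shared.newPage();
        const pageC = await shared.newPage();

        void [pageA, pageB, pageC];
        await browser.close();
    }

    demo().catch(console.error);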
@@ -347,23 +341,9 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
         return page;
     }
 
-    async getNextPage() {
-        let thePage: Page | undefined;
-        if (this.__loadedPage.length) {
-            thePage = this.__loadedPage.shift();
-            if (this.__loadedPage.length <= 1) {
-                process.nextTick(() => {
-                    this.newPage()
-                        .then((r) => this.__loadedPage.push(r))
-                        .catch((err) => {
-                            this.logger.warn(`Failed to load new page ahead of time`, { err });
-                        });
-                });
-            }
-        }
-
+    async getNextPage(context?: BrowserContext) {
+        const thePage = await this.newPage(context);
         if (!thePage) {
-            thePage = await this.newPage();
         }
 
         const timer = setTimeout(() => {
@@ -387,14 +367,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
         const sn = this.snMap.get(page);
         this.logger.debug(`Closing page ${sn}`);
         await Promise.race([
-            (async () => {
-                const ctx = page.browserContext();
-                try {
-                    await page.close();
-                } finally {
-                    await ctx.close();
-                }
-            })(),
+            page.close(),
             delay(5000)
         ]).catch((err) => {
             this.logger.error(`Failed to destroy page ${sn}`, { err });
@@ -405,7 +378,7 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
     async controlledScrap<T>(parsedUrl: URL, func: (this: void) => Promise<T>, options: ScrappingOptions = {}): Promise<T> {
         // parsedUrl.search = '';
         const url = parsedUrl.toString();
-        const page = await this.getNextPage();
+        const page = await this.getNextPage(options.browserContext);
         this.lifeCycleTrack.set(page, this.asyncLocalContext.ctx);
         page.on('response', (_resp) => {
             this.blackHoleDetector.itWorked();
@@ -452,7 +425,11 @@ export class SERPSpecializedPuppeteerControl extends AsyncService {
             if (impersonate) {
                 let body;
                 if (impersonate.body) {
-                    body = await readFile(await impersonate.body.filePath);
+                    if (impersonate.body instanceof Blob) {
+                        body = new Uint8Array(await impersonate.body.arrayBuffer());
+                    } else {
+                        body = await readFile(await impersonate.body.filePath);
+                    }
                     if (req.isInterceptResolutionHandled()) {
                         return;
                     }
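Note: with body widened to FancyFile | Blob, the interceptor has to normalize both shapes into raw bytes before fulfilling the request. A standalone sketch of that normalization — the FileLike shape below only approximates the repo's FancyFile:

    import { readFile } from 'fs/promises';

    // Approximation of the FancyFile shape used in the repo: a disk-backed
    // file whose path may still be resolving.
    interface FileLike {
        filePath: Promise<string> | string;
    }

    async function bodyToBytes(body: Blob | FileLike): Promise<Uint8Array> {
        if (body instanceof Blob) {
            // In-memory body: Blob -> ArrayBuffer -> Uint8Array.
            return new Uint8Array(await body.arrayBuffer());
        }
        // Disk-backed body: await the path, then read the file.
        return new Uint8Array(await readFile(await body.filePath));
    }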
@@ -1,13 +1,12 @@
 import { createReadStream } from 'fs';
 import { Readable } from 'stream';
-import { TextDecoderStream } from 'stream/web';
 
 export async function decodeFileStream(
     fileStream: Readable,
     encoding: string = 'utf-8',
 ): Promise<string> {
     const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
-    Readable.toWeb(fileStream).pipeThrough(decodeStream);
+    (Readable.toWeb(fileStream) as ReadableStream).pipeThrough(decodeStream);
     const chunks = [];
 
     for await (const chunk of decodeStream.readable) {
@@ -23,7 +22,22 @@ export async function readFile(
     encoding: string = 'utf-8',
 ): Promise<string> {
     const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
-    Readable.toWeb(createReadStream(filePath)).pipeThrough(decodeStream);
+    (Readable.toWeb(createReadStream(filePath)) as ReadableStream).pipeThrough(decodeStream);
+    const chunks = [];
+
+    for await (const chunk of decodeStream.readable) {
+        chunks.push(chunk);
+    }
+
+    return chunks.join('');
+}
+
+export async function readBlob(
+    blob: Blob,
+    encoding: string = 'utf-8',
+): Promise<string> {
+    const decodeStream = new TextDecoderStream(encoding, { fatal: false, ignoreBOM: false });
+    blob.stream().pipeThrough(decodeStream);
     const chunks = [];
 
     for await (const chunk of decodeStream.readable) {
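Note: the new readBlob mirrors readFile but feeds the decoder from Blob.stream() instead of a file read stream; both funnel through a non-fatal TextDecoderStream, so undecodable bytes degrade to replacement characters rather than throwing. A quick usage check, assuming Node 18+ where Blob and TextDecoderStream are globals:

    // Decode a Blob to text through a lenient TextDecoderStream, as readBlob does.
    async function decodeBlobDemo() {
        const blob = new Blob(['héllo ', 'wörld'], { type: 'text/plain' });
        const decodeStream = new TextDecoderStream('utf-8', { fatal: false, ignoreBOM: false });
        blob.stream().pipeThrough(decodeStream);

        const chunks: string[] = [];
        for await (const chunk of decodeStream.readable) {
            chunks.push(chunk);
        }
        console.log(chunks.join('')); // héllo wörld
    }

    decodeBlobDemo().catch(console.error);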
@@ -1 +1 @@
-Subproject commit 02279d88bc3940a08a92cb18cf8877d57cb49b82
+Subproject commit 8b78eab54d78868d44065bfc59c413fe6bd4929d