mirror of
https://git.mirrors.martin98.com/https://github.com/jina-ai/reader
synced 2025-08-20 02:29:05 +08:00
refactor: options dto
This commit is contained in:
parent
f0668a96b4
commit
165cce6c91
@ -470,90 +470,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
concurrency: 22,
|
concurrency: 22,
|
||||||
maxInstances: 455,
|
maxInstances: 455,
|
||||||
},
|
},
|
||||||
openapi: {
|
|
||||||
operation: {
|
|
||||||
parameters: {
|
|
||||||
'Accept': {
|
|
||||||
description: `Specifies your preference for the response format.\n\n` +
|
|
||||||
`Supported formats: \n` +
|
|
||||||
`- text/event-stream\n` +
|
|
||||||
`- application/json or text/json\n` +
|
|
||||||
`- text/plain`
|
|
||||||
,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Cache-Tolerance': {
|
|
||||||
description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-No-Cache': {
|
|
||||||
description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Respond-With': {
|
|
||||||
description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
|
|
||||||
`Supported formats: \n` +
|
|
||||||
`- markdown\n` +
|
|
||||||
`- html\n` +
|
|
||||||
`- text\n` +
|
|
||||||
`- screenshot\n`
|
|
||||||
,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Wait-For-Selector': {
|
|
||||||
description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
|
|
||||||
'Example: `X-Wait-For-Selector: .content-block`\n'
|
|
||||||
,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Target-Selector': {
|
|
||||||
description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
|
|
||||||
'Implies `X-Wait-For-Selector: (same selector)`'
|
|
||||||
,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Proxy-Url': {
|
|
||||||
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
|
||||||
`Supported protocols: \n` +
|
|
||||||
`- http\n` +
|
|
||||||
`- https\n` +
|
|
||||||
`- socks4\n` +
|
|
||||||
`- socks5\n\n` +
|
|
||||||
`For authentication, https://user:pass@host:port`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Set-Cookie': {
|
|
||||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
|
||||||
`Syntax is the same with standard Set-Cookie`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-With-Generated-Alt': {
|
|
||||||
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
|
|
||||||
`Note: Does not work when \`X-Respond-With\` is specified`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-With-Images-Summary': {
|
|
||||||
description: `Enable dedicated summary section for images on the page.`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-With-links-Summary': {
|
|
||||||
description: `Enable dedicated summary section for hyper links on the page.`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
tags: ['Crawler'],
|
tags: ['Crawler'],
|
||||||
httpMethod: ['get', 'post'],
|
httpMethod: ['get', 'post'],
|
||||||
returnType: [String, OutputServerEventStream],
|
returnType: [String, OutputServerEventStream],
|
||||||
@ -953,6 +869,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
this.threadLocal.set('withLinksSummary', opts.withLinksSummary);
|
||||||
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
this.threadLocal.set('withImagesSummary', opts.withImagesSummary);
|
||||||
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
this.threadLocal.set('cacheTolerance', opts.cacheTolerance);
|
||||||
|
this.threadLocal.set('userAgent', opts.userAgent);
|
||||||
|
|
||||||
const crawlOpts: ExtraScrappingOptions = {
|
const crawlOpts: ExtraScrappingOptions = {
|
||||||
proxyUrl: opts.proxyUrl,
|
proxyUrl: opts.proxyUrl,
|
||||||
@ -960,6 +877,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
|
|||||||
favorScreenshot: opts.respondWith === 'screenshot',
|
favorScreenshot: opts.respondWith === 'screenshot',
|
||||||
waitForSelector: opts.waitForSelector,
|
waitForSelector: opts.waitForSelector,
|
||||||
targetSelector: opts.targetSelector,
|
targetSelector: opts.targetSelector,
|
||||||
|
overrideUserAgent: opts.userAgent,
|
||||||
};
|
};
|
||||||
|
|
||||||
return crawlOpts;
|
return crawlOpts;
|
||||||
|
@ -71,71 +71,6 @@ export class SearcherHost extends RPCHost {
|
|||||||
concurrency: 6,
|
concurrency: 6,
|
||||||
maxInstances: 200,
|
maxInstances: 200,
|
||||||
},
|
},
|
||||||
openapi: {
|
|
||||||
operation: {
|
|
||||||
parameters: {
|
|
||||||
'Accept': {
|
|
||||||
description: `Specifies your preference for the response format. \n\n` +
|
|
||||||
`Supported formats:\n` +
|
|
||||||
`- text/event-stream\n` +
|
|
||||||
`- application/json or text/json\n` +
|
|
||||||
`- text/plain`
|
|
||||||
,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-No-Cache': {
|
|
||||||
description: `Ignores internal cache if this header is specified with a value.`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Respond-With': {
|
|
||||||
description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` +
|
|
||||||
`Supported formats:\n` +
|
|
||||||
`- markdown\n` +
|
|
||||||
`- html\n` +
|
|
||||||
`- text\n` +
|
|
||||||
`- screenshot\n`
|
|
||||||
,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Proxy-Url': {
|
|
||||||
description: `Specifies your custom proxy if you prefer to use one. \n\n` +
|
|
||||||
`Supported protocols:\n` +
|
|
||||||
`- http\n` +
|
|
||||||
`- https\n` +
|
|
||||||
`- socks4\n` +
|
|
||||||
`- socks5\n\n` +
|
|
||||||
`For authentication, https://user:pass@host:port`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-Set-Cookie': {
|
|
||||||
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
|
||||||
`Syntax is the same with standard Set-Cookie`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-With-Generated-Alt': {
|
|
||||||
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
|
|
||||||
`Note: Does not work when \`X-Respond-With\` is specified`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-With-Images-Summary': {
|
|
||||||
description: `Enable dedicated summary section for images on the page.`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
'X-With-links-Summary': {
|
|
||||||
description: `Enable dedicated summary section for hyper links on the page.`,
|
|
||||||
in: 'header',
|
|
||||||
schema: { type: 'string' }
|
|
||||||
},
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
tags: ['Searcher'],
|
tags: ['Searcher'],
|
||||||
httpMethod: ['get', 'post'],
|
httpMethod: ['get', 'post'],
|
||||||
returnType: [String, OutputServerEventStream],
|
returnType: [String, OutputServerEventStream],
|
||||||
|
@ -1,8 +1,100 @@
|
|||||||
import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined
|
||||||
import type { Request, Response } from 'express';
|
import type { Request, Response } from 'express';
|
||||||
import type { CookieParam } from 'puppeteer';
|
import type { CookieParam } from 'puppeteer';
|
||||||
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
import { parseString as parseSetCookieString } from 'set-cookie-parser';
|
||||||
|
|
||||||
|
|
||||||
|
@Also({
|
||||||
|
openapi: {
|
||||||
|
operation: {
|
||||||
|
parameters: {
|
||||||
|
'Accept': {
|
||||||
|
description: `Specifies your preference for the response format.\n\n` +
|
||||||
|
`Supported formats: \n` +
|
||||||
|
`- text/event-stream\n` +
|
||||||
|
`- application/json or text/json\n` +
|
||||||
|
`- text/plain`
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Cache-Tolerance': {
|
||||||
|
description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-No-Cache': {
|
||||||
|
description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Respond-With': {
|
||||||
|
description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` +
|
||||||
|
`Supported formats: \n` +
|
||||||
|
`- markdown\n` +
|
||||||
|
`- html\n` +
|
||||||
|
`- text\n` +
|
||||||
|
`- screenshot\n`
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Wait-For-Selector': {
|
||||||
|
description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` +
|
||||||
|
'Example: `X-Wait-For-Selector: .content-block`\n'
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Target-Selector': {
|
||||||
|
description: `Specifies a CSS selector for return target instead of the full html.\n\n` +
|
||||||
|
'Implies `X-Wait-For-Selector: (same selector)`'
|
||||||
|
,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Proxy-Url': {
|
||||||
|
description: `Specifies your custom proxy if you prefer to use one.\n\n` +
|
||||||
|
`Supported protocols: \n` +
|
||||||
|
`- http\n` +
|
||||||
|
`- https\n` +
|
||||||
|
`- socks4\n` +
|
||||||
|
`- socks5\n\n` +
|
||||||
|
`For authentication, https://user:pass@host:port`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-Set-Cookie': {
|
||||||
|
description: `Sets cookie(s) to the headless browser for your request. \n\n` +
|
||||||
|
`Syntax is the same with standard Set-Cookie`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-With-Generated-Alt': {
|
||||||
|
description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` +
|
||||||
|
`Note: Does not work when \`X-Respond-With\` is specified`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-With-Images-Summary': {
|
||||||
|
description: `Enable dedicated summary section for images on the page.`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-With-links-Summary': {
|
||||||
|
description: `Enable dedicated summary section for hyper links on the page.`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
'X-User-Agent': {
|
||||||
|
description: `Override User-Agent.`,
|
||||||
|
in: 'header',
|
||||||
|
schema: { type: 'string' }
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
})
|
||||||
export class CrawlerOptions extends AutoCastable {
|
export class CrawlerOptions extends AutoCastable {
|
||||||
|
|
||||||
@Prop({
|
@Prop({
|
||||||
@ -47,6 +139,9 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
@Prop()
|
@Prop()
|
||||||
proxyUrl?: string;
|
proxyUrl?: string;
|
||||||
|
|
||||||
|
@Prop()
|
||||||
|
userAgent?: string;
|
||||||
|
|
||||||
static override from(input: any) {
|
static override from(input: any) {
|
||||||
const instance = super.from(input) as CrawlerOptions;
|
const instance = super.from(input) as CrawlerOptions;
|
||||||
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
|
||||||
@ -87,6 +182,8 @@ export class CrawlerOptions extends AutoCastable {
|
|||||||
instance.targetSelector ??= targetSelector;
|
instance.targetSelector ??= targetSelector;
|
||||||
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
const waitForSelector = ctx?.req.get('x-wait-for-selector');
|
||||||
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
instance.waitForSelector ??= waitForSelector || instance.targetSelector;
|
||||||
|
const overrideUserAgent = ctx?.req.get('x-user-agent');
|
||||||
|
instance.userAgent ??= overrideUserAgent;
|
||||||
|
|
||||||
const cookies: CookieParam[] = [];
|
const cookies: CookieParam[] = [];
|
||||||
const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
|
const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]);
|
||||||
|
@ -65,6 +65,7 @@ export interface ScrappingOptions {
|
|||||||
favorScreenshot?: boolean;
|
favorScreenshot?: boolean;
|
||||||
waitForSelector?: string;
|
waitForSelector?: string;
|
||||||
minIntervalMs?: number;
|
minIntervalMs?: number;
|
||||||
|
overrideUserAgent?: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -417,6 +418,9 @@ document.addEventListener('load', handlePageLoad);
|
|||||||
if (options?.cookies) {
|
if (options?.cookies) {
|
||||||
await page.setCookie(...options.cookies);
|
await page.setCookie(...options.cookies);
|
||||||
}
|
}
|
||||||
|
if (options?.overrideUserAgent) {
|
||||||
|
await page.setUserAgent(options.overrideUserAgent);
|
||||||
|
}
|
||||||
|
|
||||||
let nextSnapshotDeferred = Defer();
|
let nextSnapshotDeferred = Defer();
|
||||||
const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
|
const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` }));
|
||||||
|
@ -1 +1 @@
|
|||||||
Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1
|
Subproject commit a6116b73e99e3d335b0cd4cfcae8f4f0c7e72f6d
|
Loading…
x
Reference in New Issue
Block a user