diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index c1fecdc..a611ed7 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -470,90 +470,6 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; concurrency: 22, maxInstances: 455, }, - openapi: { - operation: { - parameters: { - 'Accept': { - description: `Specifies your preference for the response format.\n\n` + - `Supported formats: \n` + - `- text/event-stream\n` + - `- application/json or text/json\n` + - `- text/plain` - , - in: 'header', - schema: { type: 'string' } - }, - 'X-Cache-Tolerance': { - description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`, - in: 'header', - schema: { type: 'string' } - }, - 'X-No-Cache': { - description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`, - in: 'header', - schema: { type: 'string' } - }, - 'X-Respond-With': { - description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` + - `Supported formats: \n` + - `- markdown\n` + - `- html\n` + - `- text\n` + - `- screenshot\n` - , - in: 'header', - schema: { type: 'string' } - }, - 'X-Wait-For-Selector': { - description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` + - 'Example: `X-Wait-For-Selector: .content-block`\n' - , - in: 'header', - schema: { type: 'string' } - }, - 'X-Target-Selector': { - description: `Specifies a CSS selector for return target instead of the full html.\n\n` + - 'Implies `X-Wait-For-Selector: (same selector)`' - , - in: 'header', - schema: { type: 'string' } - }, - 'X-Proxy-Url': { - description: `Specifies your custom proxy if you prefer to use one.\n\n` + - `Supported protocols: \n` + - `- http\n` + - `- https\n` + - `- socks4\n` + - `- socks5\n\n` + - `For authentication, https://user:pass@host:port`, - in: 'header', - schema: { type: 'string' } - }, - 'X-Set-Cookie': { - description: `Sets cookie(s) to the headless browser for your request. \n\n` + - `Syntax is the same with standard Set-Cookie`, - in: 'header', - schema: { type: 'string' } - }, - 'X-With-Generated-Alt': { - description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` + - `Note: Does not work when \`X-Respond-With\` is specified`, - in: 'header', - schema: { type: 'string' } - }, - 'X-With-Images-Summary': { - description: `Enable dedicated summary section for images on the page.`, - in: 'header', - schema: { type: 'string' } - }, - 'X-With-links-Summary': { - description: `Enable dedicated summary section for hyper links on the page.`, - in: 'header', - schema: { type: 'string' } - }, - } - } - }, tags: ['Crawler'], httpMethod: ['get', 'post'], returnType: [String, OutputServerEventStream], @@ -953,6 +869,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; this.threadLocal.set('withLinksSummary', opts.withLinksSummary); this.threadLocal.set('withImagesSummary', opts.withImagesSummary); this.threadLocal.set('cacheTolerance', opts.cacheTolerance); + this.threadLocal.set('userAgent', opts.userAgent); const crawlOpts: ExtraScrappingOptions = { proxyUrl: opts.proxyUrl, @@ -960,6 +877,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; favorScreenshot: opts.respondWith === 'screenshot', waitForSelector: opts.waitForSelector, targetSelector: opts.targetSelector, + overrideUserAgent: opts.userAgent, }; return crawlOpts; diff --git a/backend/functions/src/cloud-functions/searcher.ts b/backend/functions/src/cloud-functions/searcher.ts index a5d394a..b219160 100644 --- a/backend/functions/src/cloud-functions/searcher.ts +++ b/backend/functions/src/cloud-functions/searcher.ts @@ -71,71 +71,6 @@ export class SearcherHost extends RPCHost { concurrency: 6, maxInstances: 200, }, - openapi: { - operation: { - parameters: { - 'Accept': { - description: `Specifies your preference for the response format. \n\n` + - `Supported formats:\n` + - `- text/event-stream\n` + - `- application/json or text/json\n` + - `- text/plain` - , - in: 'header', - schema: { type: 'string' } - }, - 'X-No-Cache': { - description: `Ignores internal cache if this header is specified with a value.`, - in: 'header', - schema: { type: 'string' } - }, - 'X-Respond-With': { - description: `Specifies the (non-default) form factor of the crawled data you prefer. \n\n` + - `Supported formats:\n` + - `- markdown\n` + - `- html\n` + - `- text\n` + - `- screenshot\n` - , - in: 'header', - schema: { type: 'string' } - }, - 'X-Proxy-Url': { - description: `Specifies your custom proxy if you prefer to use one. \n\n` + - `Supported protocols:\n` + - `- http\n` + - `- https\n` + - `- socks4\n` + - `- socks5\n\n` + - `For authentication, https://user:pass@host:port`, - in: 'header', - schema: { type: 'string' } - }, - 'X-Set-Cookie': { - description: `Sets cookie(s) to the headless browser for your request. \n\n` + - `Syntax is the same with standard Set-Cookie`, - in: 'header', - schema: { type: 'string' } - }, - 'X-With-Generated-Alt': { - description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` + - `Note: Does not work when \`X-Respond-With\` is specified`, - in: 'header', - schema: { type: 'string' } - }, - 'X-With-Images-Summary': { - description: `Enable dedicated summary section for images on the page.`, - in: 'header', - schema: { type: 'string' } - }, - 'X-With-links-Summary': { - description: `Enable dedicated summary section for hyper links on the page.`, - in: 'header', - schema: { type: 'string' } - }, - } - } - }, tags: ['Searcher'], httpMethod: ['get', 'post'], returnType: [String, OutputServerEventStream], diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 2433f9f..2db7cd1 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -1,8 +1,100 @@ -import { AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined +import { Also, AutoCastable, Prop, RPC_CALL_ENVIRONMENT } from 'civkit'; // Adjust the import based on where your decorators are defined import type { Request, Response } from 'express'; import type { CookieParam } from 'puppeteer'; import { parseString as parseSetCookieString } from 'set-cookie-parser'; + +@Also({ + openapi: { + operation: { + parameters: { + 'Accept': { + description: `Specifies your preference for the response format.\n\n` + + `Supported formats: \n` + + `- text/event-stream\n` + + `- application/json or text/json\n` + + `- text/plain` + , + in: 'header', + schema: { type: 'string' } + }, + 'X-Cache-Tolerance': { + description: `Sets internal cache tolerance in seconds if this header is specified with a integer.`, + in: 'header', + schema: { type: 'string' } + }, + 'X-No-Cache': { + description: `Ignores internal cache if this header is specified with a value.\n\nEquivalent to X-Cache-Tolerance: 0`, + in: 'header', + schema: { type: 'string' } + }, + 'X-Respond-With': { + description: `Specifies the (non-default) form factor of the crawled data you prefer.\n\n` + + `Supported formats: \n` + + `- markdown\n` + + `- html\n` + + `- text\n` + + `- screenshot\n` + , + in: 'header', + schema: { type: 'string' } + }, + 'X-Wait-For-Selector': { + description: `Specifies a CSS selector to wait for the appearance of such an element before returning.\n\n` + + 'Example: `X-Wait-For-Selector: .content-block`\n' + , + in: 'header', + schema: { type: 'string' } + }, + 'X-Target-Selector': { + description: `Specifies a CSS selector for return target instead of the full html.\n\n` + + 'Implies `X-Wait-For-Selector: (same selector)`' + , + in: 'header', + schema: { type: 'string' } + }, + 'X-Proxy-Url': { + description: `Specifies your custom proxy if you prefer to use one.\n\n` + + `Supported protocols: \n` + + `- http\n` + + `- https\n` + + `- socks4\n` + + `- socks5\n\n` + + `For authentication, https://user:pass@host:port`, + in: 'header', + schema: { type: 'string' } + }, + 'X-Set-Cookie': { + description: `Sets cookie(s) to the headless browser for your request. \n\n` + + `Syntax is the same with standard Set-Cookie`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-Generated-Alt': { + description: `Enable automatic alt-text generating for images without an meaningful alt-text.\n\n` + + `Note: Does not work when \`X-Respond-With\` is specified`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-Images-Summary': { + description: `Enable dedicated summary section for images on the page.`, + in: 'header', + schema: { type: 'string' } + }, + 'X-With-links-Summary': { + description: `Enable dedicated summary section for hyper links on the page.`, + in: 'header', + schema: { type: 'string' } + }, + 'X-User-Agent': { + description: `Override User-Agent.`, + in: 'header', + schema: { type: 'string' } + }, + } + } + } +}) export class CrawlerOptions extends AutoCastable { @Prop({ @@ -47,6 +139,9 @@ export class CrawlerOptions extends AutoCastable { @Prop() proxyUrl?: string; + @Prop() + userAgent?: string; + static override from(input: any) { const instance = super.from(input) as CrawlerOptions; const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { @@ -87,6 +182,8 @@ export class CrawlerOptions extends AutoCastable { instance.targetSelector ??= targetSelector; const waitForSelector = ctx?.req.get('x-wait-for-selector'); instance.waitForSelector ??= waitForSelector || instance.targetSelector; + const overrideUserAgent = ctx?.req.get('x-user-agent'); + instance.userAgent ??= overrideUserAgent; const cookies: CookieParam[] = []; const setCookieHeaders = ctx?.req.headers['x-set-cookie'] || (instance.setCookies as any as string[]); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 620ee82..5e23427 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -65,6 +65,7 @@ export interface ScrappingOptions { favorScreenshot?: boolean; waitForSelector?: string; minIntervalMs?: number; + overrideUserAgent?: string; } @@ -417,6 +418,9 @@ document.addEventListener('load', handlePageLoad); if (options?.cookies) { await page.setCookie(...options.cookies); } + if (options?.overrideUserAgent) { + await page.setUserAgent(options.overrideUserAgent); + } let nextSnapshotDeferred = Defer(); const crippleListener = () => nextSnapshotDeferred.reject(new ServiceCrashedError({ message: `Browser crashed, try again` })); diff --git a/thinapps-shared b/thinapps-shared index d360d01..a6116b7 160000 --- a/thinapps-shared +++ b/thinapps-shared @@ -1 +1 @@ -Subproject commit d360d01c19b34499e564315b5b5935df17c62cc1 +Subproject commit a6116b73e99e3d335b0cd4cfcae8f4f0c7e72f6d