feat: add referer param

This commit is contained in:
Zhaofeng Miao 2024-08-22 16:48:47 +08:00
parent 080056e889
commit 7e6c2fcf48
3 changed files with 24 additions and 4 deletions

View File

@ -1123,6 +1123,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`;
timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined,
withIframe: opts.withIframe, withIframe: opts.withIframe,
locale: opts.locale, locale: opts.locale,
referer: opts.referer,
}; };
return crawlOpts; return crawlOpts;

View File

@ -115,6 +115,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser';
description: 'Specify browser locale for the page.', description: 'Specify browser locale for the page.',
in: 'header', in: 'header',
schema: { type: 'string' } schema: { type: 'string' }
},
'X-Referer': {
description: 'Specify referer for the page.',
in: 'header',
schema: { type: 'string' }
} }
} }
} }
@ -201,6 +206,9 @@ export class CrawlerOptions extends AutoCastable {
@Prop() @Prop()
locale?: string; locale?: string;
@Prop()
referer?: string;
static override from(input: any) { static override from(input: any) {
const instance = super.from(input) as CrawlerOptions; const instance = super.from(input) as CrawlerOptions;
const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as {
@ -218,6 +226,11 @@ export class CrawlerOptions extends AutoCastable {
instance.locale = locale; instance.locale = locale;
} }
const referer = ctx?.req.get('x-referer');
if (referer !== undefined) {
instance.referer = referer;
}
const withGeneratedAlt = ctx?.req.get('x-with-generated-alt'); const withGeneratedAlt = ctx?.req.get('x-with-generated-alt');
if (withGeneratedAlt !== undefined) { if (withGeneratedAlt !== undefined) {
instance.withGeneratedAlt = Boolean(withGeneratedAlt); instance.withGeneratedAlt = Boolean(withGeneratedAlt);

View File

@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe';
import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit';
import { Logger } from '../shared/services/logger'; import { Logger } from '../shared/services/logger';
import type { Browser, CookieParam, Page } from 'puppeteer'; import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer';
import puppeteer from 'puppeteer-extra'; import puppeteer from 'puppeteer-extra';
import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources';
@ -69,6 +69,7 @@ export interface ScrappingOptions {
overrideUserAgent?: string; overrideUserAgent?: string;
timeoutMs?: number; timeoutMs?: number;
locale?: string; locale?: string;
referer?: string;
} }
@ -545,11 +546,16 @@ document.addEventListener('load', handlePageLoad);
}); });
const timeout = options?.timeoutMs || 30_000; const timeout = options?.timeoutMs || 30_000;
const goToOptions: GoToOptions = {
const gotoPromise = page.goto(url, {
waitUntil: ['load', 'domcontentloaded', 'networkidle0'], waitUntil: ['load', 'domcontentloaded', 'networkidle0'],
timeout, timeout,
}) };
if (options?.referer) {
goToOptions.referer = options.referer;
}
const gotoPromise = page.goto(url, goToOptions)
.catch((err) => { .catch((err) => {
if (err instanceof TimeoutError) { if (err instanceof TimeoutError) {
this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) }); this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });