From 7e6c2fcf485ed7b69c44ebd44ca7bd20077929f5 Mon Sep 17 00:00:00 2001 From: Zhaofeng Miao <522856232@qq.com> Date: Thu, 22 Aug 2024 16:48:47 +0800 Subject: [PATCH] feat: add referer param --- backend/functions/src/cloud-functions/crawler.ts | 1 + backend/functions/src/dto/scrapping-options.ts | 13 +++++++++++++ backend/functions/src/services/puppeteer.ts | 14 ++++++++++---- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index f1a8fae..2e02b22 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -1123,6 +1123,7 @@ ${suffixMixins.length ? `\n${suffixMixins.join('\n\n')}\n` : ''}`; timeoutMs: opts.timeout ? opts.timeout * 1000 : undefined, withIframe: opts.withIframe, locale: opts.locale, + referer: opts.referer, }; return crawlOpts; diff --git a/backend/functions/src/dto/scrapping-options.ts b/backend/functions/src/dto/scrapping-options.ts index 5a2d52a..e8a6de4 100644 --- a/backend/functions/src/dto/scrapping-options.ts +++ b/backend/functions/src/dto/scrapping-options.ts @@ -115,6 +115,11 @@ import { parseString as parseSetCookieString } from 'set-cookie-parser'; description: 'Specify browser locale for the page.', in: 'header', schema: { type: 'string' } + }, + 'X-Referer': { + description: 'Specify referer for the page.', + in: 'header', + schema: { type: 'string' } } } } @@ -201,6 +206,9 @@ export class CrawlerOptions extends AutoCastable { @Prop() locale?: string; + @Prop() + referer?: string; + static override from(input: any) { const instance = super.from(input) as CrawlerOptions; const ctx = Reflect.get(input, RPC_CALL_ENVIRONMENT) as { @@ -218,6 +226,11 @@ export class CrawlerOptions extends AutoCastable { instance.locale = locale; } + const referer = ctx?.req.get('x-referer'); + if (referer !== undefined) { + instance.referer = referer; + } + const withGeneratedAlt = ctx?.req.get('x-with-generated-alt'); if (withGeneratedAlt !== undefined) { instance.withGeneratedAlt = Boolean(withGeneratedAlt); diff --git a/backend/functions/src/services/puppeteer.ts b/backend/functions/src/services/puppeteer.ts index 3ab0dec..21568d2 100644 --- a/backend/functions/src/services/puppeteer.ts +++ b/backend/functions/src/services/puppeteer.ts @@ -4,7 +4,7 @@ import { container, singleton } from 'tsyringe'; import { AsyncService, Defer, marshalErrorLike, AssertionFailureError, delay, maxConcurrency } from 'civkit'; import { Logger } from '../shared/services/logger'; -import type { Browser, CookieParam, Page } from 'puppeteer'; +import type { Browser, CookieParam, GoToOptions, Page } from 'puppeteer'; import puppeteer from 'puppeteer-extra'; import puppeteerBlockResources from 'puppeteer-extra-plugin-block-resources'; @@ -69,6 +69,7 @@ export interface ScrappingOptions { overrideUserAgent?: string; timeoutMs?: number; locale?: string; + referer?: string; } @@ -545,11 +546,16 @@ document.addEventListener('load', handlePageLoad); }); const timeout = options?.timeoutMs || 30_000; - - const gotoPromise = page.goto(url, { + const goToOptions: GoToOptions = { waitUntil: ['load', 'domcontentloaded', 'networkidle0'], timeout, - }) + }; + + if (options?.referer) { + goToOptions.referer = options.referer; + } + + const gotoPromise = page.goto(url, goToOptions) .catch((err) => { if (err instanceof TimeoutError) { this.logger.warn(`Page ${sn}: Browsing of ${url} timed out`, { err: marshalErrorLike(err) });