From 0e8308e6275c74e6de03c902d95f8d82870991bc Mon Sep 17 00:00:00 2001 From: "yanlong.wang" Date: Mon, 17 Feb 2025 12:27:02 +0800 Subject: [PATCH] fix: some invalid uriComponent case --- backend/functions/src/cloud-functions/crawler.ts | 4 +++- backend/functions/src/utils/misc.ts | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 66c2a9e..8c23c60 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -24,6 +24,7 @@ import { JSDomControl } from '../services/jsdom'; import { FormattedPage, md5Hasher, SnapshotFormatter } from '../services/snapshot-formatter'; import { CurlControl } from '../services/curl'; import { LmControl } from '../services/lm'; +import { tryDecodeURIComponent } from '../utils/misc'; export interface ExtraScrappingOptions extends ScrappingOptions { withIframe?: boolean | 'quoted'; @@ -169,7 +170,8 @@ export class CrawlerHost extends RPCHost { let chargeAmount = 0; const crawlerOptions = ctx.req.method === 'GET' ? crawlerOptionsHeaderOnly : crawlerOptionsParamsAllowed; - const targetUrl = await this.getTargetUrl(decodeURIComponent(ctx.req.url), crawlerOptions); + // Note: req.url in Express is actually the unparsed `path` (e.g. `/some-path?abc`), not a full URL. + const targetUrl = await this.getTargetUrl(tryDecodeURIComponent(ctx.req.url), crawlerOptions); if (!targetUrl) { const latestUser = uid ? 
await auth.assertUser() : undefined; if (!ctx.req.accepts('text/plain') && (ctx.req.accepts('text/json') || ctx.req.accepts('application/json'))) { diff --git a/backend/functions/src/utils/misc.ts b/backend/functions/src/utils/misc.ts index 20947e2..645933e 100644 --- a/backend/functions/src/utils/misc.ts +++ b/backend/functions/src/utils/misc.ts @@ -1,3 +1,18 @@ +import { ParamValidationError } from 'civkit'; + export function cleanAttribute(attribute: string | null) { return attribute ? attribute.replace(/(\n+\s*)+/g, '\n') : ''; } + + +export function tryDecodeURIComponent(input: string) { + try { + return decodeURIComponent(input); + } catch (err) { + if (URL.canParse(input, 'http://localhost:3000')) { + return input; + } + + throw new ParamValidationError(`Invalid URIComponent: ${input}`); + } +}