From c1743db305f4b18c70eb477081621723c16b711c Mon Sep 17 00:00:00 2001 From: Han Xiao Date: Thu, 11 Apr 2024 15:29:57 -0700 Subject: [PATCH] chore: clean code --- backend/functions/package.json | 1 + backend/functions/src/cloud-functions/crawler.ts | 10 ++++------ 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/backend/functions/package.json b/backend/functions/package.json index ef9d8fd..172937d 100644 --- a/backend/functions/package.json +++ b/backend/functions/package.json @@ -26,6 +26,7 @@ }, "main": "build/index.js", "dependencies": { + "@esm2cjs/normalize-url": "^8.0.0", "@google-cloud/translate": "^8.2.0", "@mozilla/readability": "^0.5.0", "@napi-rs/canvas": "^0.1.44", diff --git a/backend/functions/src/cloud-functions/crawler.ts b/backend/functions/src/cloud-functions/crawler.ts index 7d24590..f58f8ee 100644 --- a/backend/functions/src/cloud-functions/crawler.ts +++ b/backend/functions/src/cloud-functions/crawler.ts @@ -5,6 +5,7 @@ import _ from 'lodash'; import { PageSnapshot, PuppeteerControl } from '../services/puppeteer'; import TurnDownService from 'turndown'; import { Request, Response } from 'express'; +import normalizeUrl from "@esm2cjs/normalize-url"; @singleton() @@ -57,11 +58,8 @@ ${contentText.trim()} res: Response, }, ) { - const url = new URL(ctx.req.url, `${ctx.req.protocol}://${ctx.req.headers.host}`); - const rawPath = url.pathname.split('/').filter(Boolean); - const host = rawPath.shift(); - const urlToCrawl = new URL(`${ctx.req.protocol}://${host}/${rawPath.join('/')}`); - urlToCrawl.search = url.search; + const noSlashURL = ctx.req.url.slice(1); + const urlToCrawl = new URL(normalizeUrl(noSlashURL)); if (!ctx.req.accepts('text/plain') && ctx.req.accepts('text/event-stream')) { const sseStream = new OutputServerEventStream(); @@ -88,7 +86,7 @@ ${contentText.trim()} }); } } catch (err: any) { - this.logger.error(`Failed to crawl ${url}`, { err: marshalErrorLike(err) }); + this.logger.error(`Failed to crawl ${urlToCrawl.toString()}`, { err: marshalErrorLike(err) }); sseStream.write({ event: 'error', data: marshalErrorLike(err),