/*
 * URL validation helpers (apps/api/src/lib/validateUrl.ts).
 *
 * Provenance, kept from the patch envelope this module was delivered in:
 *
 *   From dad9d353d98ea84a9f2d0b147f598b657b6bbd0f Mon Sep 17 00:00:00 2001
 *   From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?=
 *   Date: Thu, 15 Aug 2024 19:19:02 +0200
 *   Subject: [PATCH] use thomas's url validation
 *
 *   apps/api/src/controllers/crawl.ts | 10 +++++++-
 *   apps/api/src/lib/validateUrl.ts   | 38 +++++++++++++++++++++++++++++++
 *   2 files changed, 47 insertions(+), 1 deletion(-)
 *   create mode 100644 apps/api/src/lib/validateUrl.ts
 *
 * Companion change from the same patch (indentation reconstructed, the
 * original whitespace was lost):
 *
 *   diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
 *   index 5c1e0428..54eb1f40 100644
 *   --- a/apps/api/src/controllers/crawl.ts
 *   +++ b/apps/api/src/controllers/crawl.ts
 *   @@ -12,6 +12,7 @@ import { v4 as uuidv4 } from "uuid";
 *    import { Logger } from "../../src/lib/logger";
 *    import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
 *    import { getScrapeQueue } from "../../src/services/queue-service";
 *   +import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 *
 *    export async function crawlController(req: Request, res: Response) {
 *      try {
 *   @@ -43,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
 *          return res.status(402).json({ error: "Insufficient credits" });
 *        }
 *
 *   -    const url = req.body.url;
 *   +    let url = req.body.url;
 *        if (!url) {
 *          return res.status(400).json({ error: "Url is required" });
 *        }
 *   +    try {
 *   +      url = checkAndUpdateURL(url).url;
 *   +    } catch (e) {
 *   +      return res
 *   +        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
 *   +        .json({ error: e.message ?? e });
 *   +    }
 *
 *        if (isUrlBlocked(url)) {
 *          return res
 */

/**
 * Message of every error thrown by this module. The crawl controller compares
 * `e.message` against this exact text to answer with HTTP 400, so it must not
 * be reworded.
 */
const INVALID_URL_MESSAGE = "Invalid URL";

/**
 * Matches a leading `something://` where `something` contains none of
 * `. : / ? #`. Excluding `/`, `?` and `#` (in addition to `.` and `:`) keeps a
 * `://` that only occurs later in the path, query or fragment from being
 * mistaken for the URL's own scheme.
 */
const SCHEME_PREFIX = /^[^.:\/?#]+:\/\//;

/**
 * Tells whether `url` already starts with a `scheme://` prefix.
 *
 * @param url - Raw URL text.
 * @returns `true` when a scheme prefix is present, `false` otherwise.
 */
const protocolIncluded = (url: string): boolean => SCHEME_PREFIX.test(url);

/**
 * Parses `s` with the `URL` constructor without throwing.
 *
 * The constructor rejects input that has no scheme (for example `google.com`),
 * so callers are expected to add one first.
 *
 * @param s - Absolute URL text.
 * @returns The parsed `URL`, or `null` when the constructor rejects the input.
 */
const getURLobj = (s: string): URL | null => {
  try {
    return new URL(s);
  } catch {
    return null;
  }
};

/**
 * Validates a user-supplied URL and makes sure it carries a scheme.
 *
 * Surrounding whitespace is removed, `http://` is prepended when no scheme
 * prefix is present, and the result must parse as an `http:` or `https:` URL.
 *
 * @param url - Raw URL text, typically taken straight from a request body.
 * @returns `urlObj`, the parsed URL, and `url`, the trimmed text with the
 *   scheme prefix added when it was missing (not re-serialised from `urlObj`).
 * @throws Error with the message `"Invalid URL"` when the input is not a
 *   string, cannot be parsed, or uses a scheme other than http/https.
 */
export const checkAndUpdateURL = (url: string): { urlObj: URL; url: string } => {
  // The value originates from untyped JSON, so the static type cannot be
  // trusted; numbers or arrays would otherwise be coerced by the template
  // string below and accepted.
  if (typeof url !== "string") {
    throw new Error(INVALID_URL_MESSAGE);
  }

  const trimmed = url.trim();
  const candidate = protocolIncluded(trimmed) ? trimmed : `http://${trimmed}`;

  const urlObj = getURLobj(candidate);
  if (urlObj === null) {
    throw new Error(INVALID_URL_MESSAGE);
  }

  if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") {
    throw new Error(INVALID_URL_MESSAGE);
  }

  return { urlObj, url: candidate };
};