diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 5c1e0428..54eb1f40 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -12,6 +12,7 @@ import { v4 as uuidv4 } from "uuid";
 import { Logger } from "../../src/lib/logger";
 import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
 import { getScrapeQueue } from "../../src/services/queue-service";
+import { checkAndUpdateURL } from "../../src/lib/validateUrl";
 
 export async function crawlController(req: Request, res: Response) {
   try {
@@ -43,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
       return res.status(402).json({ error: "Insufficient credits" });
     }
 
-    const url = req.body.url;
+    let url = req.body.url;
     if (!url) {
       return res.status(400).json({ error: "Url is required" });
     }
+    try {
+      url = checkAndUpdateURL(url).url;
+    } catch (e) {
+      return res
+        .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
+        .json({ error: e.message ?? e });
+    }
 
     if (isUrlBlocked(url)) {
       return res
diff --git a/apps/api/src/lib/validateUrl.ts b/apps/api/src/lib/validateUrl.ts
new file mode 100644
index 00000000..2d2111c8
--- /dev/null
+++ b/apps/api/src/lib/validateUrl.ts
@@ -0,0 +1,54 @@
+/** Matches a "scheme://" prefix whose scheme part contains no "." or ":". */
+const PROTOCOL_PREFIX = /^[^.:]+:\/\//;
+
+/**
+ * Reports whether `url` already starts with a "scheme://" prefix.
+ * Protocol-less input such as "google.com" or "localhost:3000" yields false.
+ */
+const protocolIncluded = (url: string): boolean => PROTOCOL_PREFIX.test(url);
+
+/**
+ * Parses `s` with the `URL` constructor, which throws on protocol-less input
+ * such as "google.com". Returns `null` instead of throwing.
+ */
+const getURLobj = (s: string): URL | null => {
+  try {
+    return new URL(s);
+  } catch {
+    return null;
+  }
+};
+
+/**
+ * Validates a URL and adds a default protocol when none is present.
+ *
+ * Surrounding whitespace is trimmed and `http://` is prepended when the input
+ * has no "scheme://" prefix. Only http and https URLs are accepted.
+ *
+ * @param url Raw URL string to validate.
+ * @returns The parsed `URL` object and the trimmed, possibly prefixed string.
+ * @throws Error with message "Invalid URL" when the input is not a string,
+ *   cannot be parsed, or uses a protocol other than http/https.
+ */
+export const checkAndUpdateURL = (
+  url: string
+): { urlObj: URL; url: string } => {
+  // Callers pass request-body values, so the runtime type is not guaranteed.
+  if (typeof url !== "string") {
+    throw new Error("Invalid URL");
+  }
+
+  const trimmed = url.trim();
+  const candidate = protocolIncluded(trimmed) ? trimmed : `http://${trimmed}`;
+
+  const urlObj = getURLobj(candidate);
+  if (urlObj === null) {
+    throw new Error("Invalid URL");
+  }
+
+  if (urlObj.protocol !== "http:" && urlObj.protocol !== "https:") {
+    throw new Error("Invalid URL");
+  }
+
+  return { urlObj, url: candidate };
+};