use thomas's url validation

This commit is contained in:
Gergő Móricz 2024-08-15 19:19:02 +02:00
parent e3279274f1
commit dad9d353d9
2 changed files with 47 additions and 1 deletions

View File

@ -12,6 +12,7 @@ import { v4 as uuidv4 } from "uuid";
import { Logger } from "../../src/lib/logger";
import { addCrawlJob, addCrawlJobs, crawlToCrawler, lockURL, lockURLs, saveCrawl, StoredCrawl } from "../../src/lib/crawl-redis";
import { getScrapeQueue } from "../../src/services/queue-service";
import { checkAndUpdateURL } from "../../src/lib/validateUrl";
export async function crawlController(req: Request, res: Response) {
try {
@ -43,10 +44,17 @@ export async function crawlController(req: Request, res: Response) {
return res.status(402).json({ error: "Insufficient credits" });
}
const url = req.body.url;
let url = req.body.url;
if (!url) {
return res.status(400).json({ error: "Url is required" });
}
try {
url = checkAndUpdateURL(url).url;
} catch (e) {
return res
.status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500)
.json({ error: e.message ?? e });
}
if (isUrlBlocked(url)) {
return res

View File

@ -0,0 +1,38 @@
/**
 * Reports whether `url` starts with an explicit scheme followed by `://`.
 *
 * The scheme portion may not contain `.` or `:`, so a `://` that only shows up
 * after a dot or colon (for example inside a query string such as
 * `a.com/?next=http://b.com`) is not mistaken for the URL's own protocol.
 */
const protocolIncluded = (url: string): boolean => /^[^.:]+:\/\//.test(url);

/** Outcome of a non-throwing URL parse: either a parsed `URL` or an error flag. */
type ParsedURL =
  | { error: false; urlObj: URL }
  | { error: true; urlObj: null };

/**
 * Parses `s` with the `URL` constructor without letting it throw.
 *
 * `new URL` rejects inputs that lack a protocol (e.g. `google.com`), so callers
 * are expected to add one beforehand.
 *
 * @param s - Candidate absolute URL.
 * @returns `{ error: false, urlObj }` on success, `{ error: true, urlObj: null }` otherwise.
 */
const getURLobj = (s: string): ParsedURL => {
  try {
    return { error: false, urlObj: new URL(s) };
  } catch {
    return { error: true, urlObj: null };
  }
};

/**
 * Validates a user-supplied URL and normalises it into an absolute http(s) URL.
 *
 * Surrounding whitespace is removed, and `http://` is prepended when the input
 * carries no `scheme://` prefix. The result must parse as a URL whose protocol
 * is `http:` or `https:`.
 *
 * @param url - Raw URL text, with or without a protocol.
 * @returns The parsed `URL` object together with the (trimmed, possibly
 *   `http://`-prefixed) string that was parsed.
 * @throws Error with message `"Invalid URL"` when the input is not a string,
 *   cannot be parsed, or uses a protocol other than http/https.
 */
export const checkAndUpdateURL = (url: string): { urlObj: URL; url: string } => {
  // The static `string` type is not a runtime guarantee when the value comes
  // from untyped input (e.g. a JSON request body). Without this guard `null`
  // would be coerced to "http://null" and accepted.
  if (typeof url !== "string") {
    throw new Error("Invalid URL");
  }

  // Trim before prefixing: otherwise " example.com" would become
  // "http:// example.com" and be rejected, while " http://example.com" passes.
  let candidate = url.trim();
  if (!protocolIncluded(candidate)) {
    // No explicit scheme: assume plain http.
    candidate = `http://${candidate}`;
  }

  const parsed = getURLobj(candidate);
  if (parsed.error) {
    throw new Error("Invalid URL");
  }

  const { protocol } = parsed.urlObj;
  if (protocol !== "http:" && protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  return { urlObj: parsed.urlObj, url: candidate };
};