firecrawl/apps/api/src/lib/validateUrl.ts
2024-08-26 19:22:03 -03:00

170 lines
4.5 KiB
TypeScript

export const protocolIncluded = (url: string) => {
// if :// not in the start of the url assume http (maybe https?)
// regex checks if :// appears before any .
return /^([^.:]+:\/\/)/.test(url);
};
const getURLobj = (s: string) => {
// URL fails if we dont include the protocol ie google.com
let error = false;
let urlObj = {};
try {
urlObj = new URL(s);
} catch (err) {
error = true;
}
return { error, urlObj };
};
export const checkAndUpdateURL = (url: string) => {
if (!protocolIncluded(url)) {
url = `http://${url}`;
}
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
return { urlObj: typedUrlObj, url: url };
};
export const checkUrl = (url: string) => {
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
if ((url.split(".")[0].match(/:/g) || []).length !== 1) {
throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
}
return url;
};
/**
* Same domain check
* It checks if the domain of the url is the same as the base url
* It accounts true for subdomains and www.subdomains
* @param url
* @param baseUrl
* @returns
*/
export function isSameDomain(url: string, baseUrl: string) {
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
if (error1 || error2) {
return false;
}
const typedUrlObj1 = urlObj1 as URL;
const typedUrlObj2 = urlObj2 as URL;
const cleanHostname = (hostname: string) => {
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
};
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
return domain1 === domain2;
}
export function isSameSubdomain(url: string, baseUrl: string) {
const { urlObj: urlObj1, error: error1 } = getURLobj(url);
const { urlObj: urlObj2, error: error2 } = getURLobj(baseUrl);
if (error1 || error2) {
return false;
}
const typedUrlObj1 = urlObj1 as URL;
const typedUrlObj2 = urlObj2 as URL;
const cleanHostname = (hostname: string) => {
return hostname.startsWith('www.') ? hostname.slice(4) : hostname;
};
const domain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(-2).join('.');
const domain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(-2).join('.');
const subdomain1 = cleanHostname(typedUrlObj1.hostname).split('.').slice(0, -2).join('.');
const subdomain2 = cleanHostname(typedUrlObj2.hostname).split('.').slice(0, -2).join('.');
// Check if the domains are the same and the subdomains are the same
return domain1 === domain2 && subdomain1 === subdomain2;
}
export const checkAndUpdateURLForMap = (url: string) => {
if (!protocolIncluded(url)) {
url = `http://${url}`;
}
// remove last slash if present
if (url.endsWith("/")) {
url = url.slice(0, -1);
}
const { error, urlObj } = getURLobj(url);
if (error) {
throw new Error("Invalid URL");
}
const typedUrlObj = urlObj as URL;
if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
throw new Error("Invalid URL");
}
// remove any query params
url = url.split("?")[0].trim();
return { urlObj: typedUrlObj, url: url };
};
export function removeDuplicateUrls(urls: string[]): string[] {
const urlMap = new Map<string, string>();
for (const url of urls) {
const parsedUrl = new URL(url);
const protocol = parsedUrl.protocol;
const hostname = parsedUrl.hostname.replace(/^www\./, '');
const path = parsedUrl.pathname + parsedUrl.search + parsedUrl.hash;
const key = `${hostname}${path}`;
if (!urlMap.has(key)) {
urlMap.set(key, url);
} else {
const existingUrl = new URL(urlMap.get(key)!);
const existingProtocol = existingUrl.protocol;
if (protocol === 'https:' && existingProtocol === 'http:') {
urlMap.set(key, url);
} else if (protocol === existingProtocol && !parsedUrl.hostname.startsWith('www.') && existingUrl.hostname.startsWith('www.')) {
urlMap.set(key, url);
}
}
}
return [...new Set(Array.from(urlMap.values()))];
}