firecrawl/apps/api/src/lib/validateUrl.ts
Gergő Móricz 7ecbff3b20
fix(map): do not remove query parameters from results (FIR-1015) (#1191)
* fix(map): do not remove query parameters from results

* feat(map): add tests for query parameter handling
2025-02-17 12:34:58 -03:00

186 lines
4.6 KiB
TypeScript

/**
 * Reports whether `url` begins with an explicit `scheme://` prefix.
 *
 * The scheme must be at least one character long and may not contain `.` or
 * `:`, so a `://` that only shows up after the first dot (for example inside
 * a query string) does not count.
 *
 * @param url Raw URL text as supplied by the caller.
 * @returns `true` when the text starts with `scheme://`, otherwise `false`.
 */
export const protocolIncluded = (url: string) => {
  const leadingScheme = /^([^.:]+:\/\/)/;
  return leadingScheme.exec(url) !== null;
};
/**
 * Non-throwing wrapper around the `URL` constructor.
 *
 * The constructor rejects input that lacks a scheme (e.g. `google.com`), so
 * callers receive an `error` flag instead of an exception.
 *
 * @param s Text to parse.
 * @returns `{ error: false, urlObj }` holding the parsed `URL` on success, or
 *          `{ error: true, urlObj: {} }` when parsing failed.
 */
const getURLobj = (s: string) => {
  // Starts as an empty placeholder; it is what callers get back on failure.
  let urlObj = {};
  try {
    urlObj = new URL(s);
  } catch (err) {
    return { error: true, urlObj };
  }
  return { error: false, urlObj };
};
/**
 * Normalises a user-supplied URL and validates it.
 *
 * Input without a leading `scheme://` is prefixed with `http://` before it
 * is parsed. Only `http:` and `https:` URLs are accepted.
 *
 * @param url Raw URL text, with or without a scheme.
 * @returns The parsed `URL` together with the (possibly prefixed) URL string.
 * @throws Error with message "Invalid URL" when the text cannot be parsed or
 *         uses a protocol other than http/https.
 */
export const checkAndUpdateURL = (url: string) => {
  const candidate = protocolIncluded(url) ? url : `http://${url}`;

  const parsed = getURLobj(candidate);
  if (parsed.error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = parsed.urlObj as URL;
  const isWebProtocol =
    typedUrlObj.protocol === "http:" || typedUrlObj.protocol === "https:";
  if (!isWebProtocol) {
    throw new Error("Invalid URL");
  }

  return { urlObj: typedUrlObj, url: candidate };
};
/**
 * Strictly validates a URL without modifying it.
 *
 * Unlike the normalising helpers, no scheme is added: the text must parse as
 * is, use http or https, and contain exactly one `:` before its first `.`.
 *
 * @param url URL text to validate.
 * @returns The same `url` string when it passes every check.
 * @throws Error "Invalid URL" when the text cannot be parsed or the protocol
 *         is not http/https.
 * @throws Error "Invalid URL. Invalid protocol." when the colon count before
 *         the first dot is not exactly one.
 */
export const checkUrl = (url: string) => {
  const parsed = getURLobj(url);
  if (parsed.error) {
    throw new Error("Invalid URL");
  }

  const { protocol } = parsed.urlObj as URL;
  if (!(protocol === "http:" || protocol === "https:")) {
    throw new Error("Invalid URL");
  }

  // Rejects doubled schemes such as http://http://example.com, which the URL
  // parser itself accepts.
  // NOTE(review): when the text contains no "." the whole string is scanned,
  // so a host like http://localhost:3000 is rejected too — confirm intended.
  const colonsBeforeFirstDot = url.split(".")[0].match(/:/g) || [];
  if (colonsBeforeFirstDot.length !== 1) {
    throw new Error("Invalid URL. Invalid protocol.");
  }

  return url;
};
/**
 * Same-domain check.
 *
 * Decides whether `url` belongs to the same domain as `baseUrl`. A leading
 * `www.` is ignored and subdomains count as the same domain, because only
 * the last two dot-separated labels of each hostname are compared.
 *
 * IPv4 literals are compared as whole hostnames; comparing just their last
 * two octets would make unrelated addresses such as 192.168.1.1 and 10.0.1.1
 * look like the same domain.
 *
 * Limitation: no public-suffix list is consulted, so hosts under a
 * multi-label suffix (e.g. `a.co.uk` and `b.co.uk`) compare as equal.
 *
 * @param url URL to test; must include a scheme.
 * @param baseUrl URL to compare against; must include a scheme.
 * @returns `true` when both parse and share a domain; `false` otherwise,
 *          including when either value cannot be parsed as a URL.
 */
export function isSameDomain(url: string, baseUrl: string) {
  const parseHostname = (value: string): string | undefined => {
    try {
      return new URL(value).hostname;
    } catch (err) {
      return undefined;
    }
  };

  const hostname1 = parseHostname(url);
  const hostname2 = parseHostname(baseUrl);
  if (hostname1 === undefined || hostname2 === undefined) {
    return false;
  }

  // The URL parser serialises IPv4 hosts as dotted decimal, so this pattern
  // is enough to recognise them.
  const ipv4Literal = /^\d{1,3}(\.\d{1,3}){3}$/;
  if (ipv4Literal.test(hostname1) || ipv4Literal.test(hostname2)) {
    return hostname1 === hostname2;
  }

  const lastTwoLabels = (hostname: string) => {
    const bare = hostname.startsWith("www.") ? hostname.slice(4) : hostname;
    return bare.split(".").slice(-2).join(".");
  };

  return lastTwoLabels(hostname1) === lastTwoLabels(hostname2);
}
/**
 * Same-subdomain check.
 *
 * After dropping a leading `www.`, each hostname is split into its last two
 * labels (the domain) and everything before them (the subdomain). Both parts
 * must match for the URLs to be considered the same.
 *
 * @param url URL to test; must include a scheme.
 * @param baseUrl URL to compare against; must include a scheme.
 * @returns `true` when domain and subdomain both match; `false` otherwise,
 *          including when either value cannot be parsed as a URL.
 */
export function isSameSubdomain(url: string, baseUrl: string) {
  const first = getURLobj(url);
  const second = getURLobj(baseUrl);
  if (first.error || second.error) {
    return false;
  }

  const splitHostname = (hostname: string) => {
    const bare = hostname.startsWith("www.") ? hostname.slice(4) : hostname;
    const labels = bare.split(".");
    return {
      domain: labels.slice(-2).join("."),
      subdomain: labels.slice(0, -2).join("."),
    };
  };

  const left = splitHostname((first.urlObj as URL).hostname);
  const right = splitHostname((second.urlObj as URL).hostname);

  return left.domain === right.domain && left.subdomain === right.subdomain;
}
/**
 * Normalises and validates a URL for the map feature.
 *
 * Input without a leading `scheme://` is prefixed with `http://`, then one
 * trailing `/` is removed before parsing. Only `http:` and `https:` URLs are
 * accepted. Query parameters are left in place.
 *
 * @param url Raw URL text, with or without a scheme.
 * @returns The parsed `URL` together with the normalised URL string.
 * @throws Error with message "Invalid URL" when the text cannot be parsed or
 *         uses a protocol other than http/https.
 */
export const checkAndUpdateURLForMap = (url: string) => {
  let normalized = protocolIncluded(url) ? url : `http://${url}`;

  // Drop a single trailing slash, if there is one.
  if (normalized.endsWith("/")) {
    normalized = normalized.slice(0, -1);
  }

  const parsed = getURLobj(normalized);
  if (parsed.error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = parsed.urlObj as URL;
  const isWebProtocol =
    typedUrlObj.protocol === "http:" || typedUrlObj.protocol === "https:";
  if (!isWebProtocol) {
    throw new Error("Invalid URL");
  }

  // The query string is deliberately kept on the returned URL.
  return { urlObj: typedUrlObj, url: normalized };
};
/**
 * Collapses URLs that point at the same resource and differ only by scheme
 * (http vs https) or by a leading `www.` on the host.
 *
 * Two URLs are duplicates when their host (including any non-default port,
 * with a leading `www.` removed), path, query string and fragment all match.
 * Among duplicates the kept URL is chosen as follows:
 *   - an https URL replaces an http one;
 *   - for the same protocol, a host without `www.` replaces one with it;
 *   - otherwise the URL seen first is kept.
 * Output order follows the first appearance of each distinct resource.
 *
 * @param urls Absolute URLs to de-duplicate.
 * @returns The preferred URL string for each distinct resource.
 * @throws TypeError when an entry cannot be parsed by the `URL` constructor.
 */
export function removeDuplicateUrls(urls: string[]): string[] {
  // Per key, the currently preferred URL plus the facts needed to compare it
  // with later candidates, so stored URLs never have to be parsed again.
  const preferred = new Map<
    string,
    { url: string; protocol: string; hasWww: boolean }
  >();

  for (const url of urls) {
    const parsedUrl = new URL(url);
    const candidate = {
      url,
      protocol: parsedUrl.protocol,
      hasWww: parsedUrl.hostname.startsWith("www."),
    };

    // `host` (unlike `hostname`) keeps a non-default port, so the same path
    // served on different ports is not treated as one resource.
    const host = parsedUrl.host.replace(/^www\./, "");
    const key = `${host}${parsedUrl.pathname}${parsedUrl.search}${parsedUrl.hash}`;

    const existing = preferred.get(key);
    if (existing === undefined) {
      preferred.set(key, candidate);
      continue;
    }

    const upgradesToHttps =
      candidate.protocol === "https:" && existing.protocol === "http:";
    const dropsWww =
      candidate.protocol === existing.protocol &&
      !candidate.hasWww &&
      existing.hasWww;

    if (upgradesToHttps || dropsWww) {
      // Re-setting an existing key keeps its original position in the Map.
      preferred.set(key, candidate);
    }
  }

  return Array.from(preferred.values(), (entry) => entry.url);
}