mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-06-04 11:24:40 +08:00

* fix(map): do not remove query parameters from results * feat(map): add tests for query parameter handling
186 lines
4.6 KiB
TypeScript
186 lines
4.6 KiB
TypeScript
/**
 * Tells whether `url` begins with an explicit `scheme://` prefix.
 *
 * The scheme part may not contain `.` or `:`, so a `://` that only shows up
 * after the first dot (for example inside a query string of a bare host such
 * as `example.com/?to=http://x.y`) does not count.
 *
 * @param url - Raw URL string as supplied by the caller.
 * @returns `true` when the string starts with `<scheme>://`, otherwise `false`.
 */
export const protocolIncluded = (url: string) => {
  // Anchored at the start: one or more non-dot, non-colon characters, then "://".
  const leadingScheme = /^([^.:]+:\/\/)/;
  const hasScheme = leadingScheme.test(url);
  return hasScheme;
};
|
|
|
|
/**
 * Parses `s` with the `URL` constructor and reports failure as a flag instead
 * of letting the constructor's exception escape.
 *
 * The constructor rejects strings without a protocol (e.g. `google.com`), so
 * callers are expected to add one beforehand if they want such input accepted.
 *
 * @param s - String to parse.
 * @returns `{ error, urlObj }`: on success `error` is `false` and `urlObj` is
 *   the parsed `URL`; on failure `error` is `true` and `urlObj` is an empty object.
 */
const getURLobj = (s: string) => {
  let parsed = {};
  let failed = true;

  try {
    parsed = new URL(s);
    failed = false;
  } catch (err) {
    // Parsing failed: keep the empty placeholder and leave the flag raised.
  }

  return { error: failed, urlObj: parsed };
};
|
|
|
|
/**
 * Normalizes a user-supplied URL and validates that it is an http(s) URL.
 *
 * When the input has no leading `scheme://`, `http://` is prepended before
 * parsing.
 *
 * @param url - URL string, with or without a protocol.
 * @returns The parsed `URL` (`urlObj`) together with the possibly prefixed
 *   string (`url`).
 * @throws Error with message "Invalid URL" when the string cannot be parsed
 *   or its protocol is neither `http:` nor `https:`.
 */
export const checkAndUpdateURL = (url: string) => {
  const withProtocol = protocolIncluded(url) ? url : `http://${url}`;

  const parsed = getURLobj(withProtocol);
  if (parsed.error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = parsed.urlObj as URL;
  const isWebProtocol =
    typedUrlObj.protocol === "http:" || typedUrlObj.protocol === "https:";
  if (!isWebProtocol) {
    throw new Error("Invalid URL");
  }

  return { urlObj: typedUrlObj, url: withProtocol };
};
|
|
|
|
/**
 * Validates that `url` is a parseable http(s) URL and returns it unchanged.
 *
 * Unlike `checkAndUpdateURL`, no protocol is added: the input must already
 * carry one.
 *
 * @param url - URL string to validate.
 * @returns The same `url` string that was passed in.
 * @throws Error "Invalid URL" when the string cannot be parsed or its protocol
 *   is neither `http:` nor `https:`.
 * @throws Error "Invalid URL. Invalid protocol." when a second `scheme://`
 *   directly follows the first, e.g. `http://http://example.com`.
 */
export const checkUrl = (url: string) => {
  const { error, urlObj } = getURLobj(url);
  if (error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = urlObj as URL;

  if (typedUrlObj.protocol !== "http:" && typedUrlObj.protocol !== "https:") {
    throw new Error("Invalid URL");
  }

  // Detect a doubled protocol by looking for a second "scheme://" immediately
  // after the leading http(s) scheme. Counting colons before the first "."
  // (the previous approach) also rejected valid URLs whose authority contains
  // a colon before any dot: a port on a dotless host (http://localhost:3000),
  // credentials (http://user:pw@example.com) or an IPv6 literal (http://[::1]/).
  const doubledProtocol = /^\s*https?:\/*[a-z][a-z\d+-]*:\/\//i;
  if (doubledProtocol.test(url)) {
    throw new Error("Invalid URL. Invalid protocol."); // for this one: http://http://example.com
  }

  return url;
};
|
|
|
|
/**
 * Same-domain check.
 *
 * Compares the last two dot-separated labels of each hostname after dropping a
 * leading `www.`, so subdomains (and `www.`-prefixed subdomains) of the same
 * domain are treated as matching. Because only the last two labels are
 * compared, hosts under a multi-label suffix such as `a.co.uk` and `b.co.uk`
 * also compare equal.
 *
 * @param url - URL to test; must include a protocol to be parseable.
 * @param baseUrl - URL to compare against; must include a protocol.
 * @returns `true` when both URLs parse and share the same last two hostname
 *   labels; `false` otherwise, including when either URL fails to parse.
 */
export function isSameDomain(url: string, baseUrl: string) {
  const candidate = getURLobj(url);
  const reference = getURLobj(baseUrl);

  if (candidate.error || reference.error) {
    return false;
  }

  // Strip a leading "www." and keep only the final two labels of the host.
  const lastTwoLabels = (parsed: URL) => {
    const host = parsed.hostname.startsWith("www.")
      ? parsed.hostname.slice(4)
      : parsed.hostname;
    const labels = host.split(".");
    return labels.slice(-2).join(".");
  };

  return (
    lastTwoLabels(candidate.urlObj as URL) ===
    lastTwoLabels(reference.urlObj as URL)
  );
}
|
|
|
|
/**
 * Same-subdomain check.
 *
 * After dropping a leading `www.` from each hostname, the host is split into
 * its last two labels (the "domain") and everything before them (the
 * "subdomain"). Both parts must match for the URLs to be considered equal.
 *
 * @param url - URL to test; must include a protocol to be parseable.
 * @param baseUrl - URL to compare against; must include a protocol.
 * @returns `true` when both URLs parse and agree on domain and subdomain;
 *   `false` otherwise, including when either URL fails to parse.
 */
export function isSameSubdomain(url: string, baseUrl: string) {
  const candidate = getURLobj(url);
  const reference = getURLobj(baseUrl);

  if (candidate.error || reference.error) {
    return false;
  }

  // Break a hostname (minus any leading "www.") into domain and subdomain parts.
  const splitHost = (parsed: URL) => {
    const host = parsed.hostname.startsWith("www.")
      ? parsed.hostname.slice(4)
      : parsed.hostname;
    const labels = host.split(".");
    return {
      domain: labels.slice(-2).join("."),
      subdomain: labels.slice(0, -2).join("."),
    };
  };

  const left = splitHost(candidate.urlObj as URL);
  const right = splitHost(reference.urlObj as URL);

  // Check if the domains are the same and the subdomains are the same
  return left.domain === right.domain && left.subdomain === right.subdomain;
}
|
|
|
|
/**
 * Variant of `checkAndUpdateURL` used for map requests.
 *
 * Prepends `http://` when no leading `scheme://` is present, drops a single
 * trailing `/`, then validates the result as an http(s) URL. Query parameters
 * are left in place.
 *
 * @param url - URL string, with or without a protocol.
 * @returns The parsed `URL` (`urlObj`) together with the normalized string
 *   (`url`).
 * @throws Error with message "Invalid URL" when the normalized string cannot
 *   be parsed or its protocol is neither `http:` nor `https:`.
 */
export const checkAndUpdateURLForMap = (url: string) => {
  const withProtocol = protocolIncluded(url) ? url : `http://${url}`;

  // Drop one trailing slash, if any, before parsing.
  const normalized = withProtocol.endsWith("/")
    ? withProtocol.slice(0, -1)
    : withProtocol;

  const parsed = getURLobj(normalized);
  if (parsed.error) {
    throw new Error("Invalid URL");
  }

  const typedUrlObj = parsed.urlObj as URL;
  const isWebProtocol =
    typedUrlObj.protocol === "http:" || typedUrlObj.protocol === "https:";
  if (!isWebProtocol) {
    throw new Error("Invalid URL");
  }

  return { urlObj: typedUrlObj, url: normalized };
};
|
|
|
|
/**
 * Collapses URLs that point at the same resource but differ only in protocol
 * (`http:` vs `https:`) or in a leading `www.` on the host.
 *
 * Two URLs are duplicates when their host (without a leading `www.`, but
 * including any non-default port), path, query string and fragment are all
 * equal. Among duplicates the kept spelling is chosen as follows:
 * - an `https:` URL replaces a previously seen `http:` one;
 * - with equal protocols, a host without `www.` replaces one with `www.`;
 * - otherwise the first URL seen is kept.
 *
 * The result preserves the order in which each distinct resource first
 * appeared in `urls`.
 *
 * @param urls - Absolute URL strings.
 * @returns The de-duplicated URL strings, exactly as they were spelled in the input.
 * @throws TypeError if any entry cannot be parsed by the `URL` constructor.
 */
export function removeDuplicateUrls(urls: string[]): string[] {
  // Keep the parsed facts next to the chosen string so a collision does not
  // have to re-parse the stored URL.
  const preferred = new Map<
    string,
    { url: string; protocol: string; hasWww: boolean }
  >();

  for (const url of urls) {
    const parsedUrl = new URL(url);
    const candidate = {
      url,
      protocol: parsedUrl.protocol,
      hasWww: parsedUrl.hostname.startsWith("www."),
    };

    // `host` (unlike `hostname`) includes a non-default port, so the same
    // path served on different ports is not mistaken for a duplicate.
    const host = parsedUrl.host.replace(/^www\./, "");
    const key = `${host}${parsedUrl.pathname}${parsedUrl.search}${parsedUrl.hash}`;

    const current = preferred.get(key);
    if (current === undefined) {
      preferred.set(key, candidate);
      continue;
    }

    const upgradesToHttps =
      candidate.protocol === "https:" && current.protocol === "http:";
    const dropsWww =
      candidate.protocol === current.protocol &&
      !candidate.hasWww &&
      current.hasWww;

    if (upgradesToHttps || dropsWww) {
      // Overwriting an existing key keeps its original position in the Map.
      preferred.set(key, candidate);
    }
  }

  return Array.from(preferred.values(), (entry) => entry.url);
}
|