mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-16 12:05:59 +08:00
Nick: url normalization
This commit is contained in:
parent
f25c0c6d21
commit
f2e0bfbfe3
@ -1,8 +1,19 @@
|
||||
/**
 * Reduces a URL to a scheme-less comparison key.
 *
 * Removes, in order: a leading `http://` or `https://`, a leading `www.`,
 * and a single trailing slash. Scheme and `www.` are matched
 * case-insensitively (URI schemes and host names are case-insensitive), so
 * `HTTPS://WWW.example.com/` and `https://www.example.com` yield the same key.
 * Everything else (path, query, fragment, letter case of the remainder) is
 * returned untouched.
 *
 * @param url - URL or URL-like string; it is not validated.
 * @returns The input with the prefixes and one trailing slash removed.
 */
export function normalizeUrl(url: string): string {
  const withoutPrefix = url
    .replace(/^https?:\/\//i, "")
    .replace(/^www\./i, "");

  // Only one trailing slash is dropped; "a.com//" keeps its first slash.
  return withoutPrefix.endsWith("/")
    ? withoutPrefix.slice(0, -1)
    : withoutPrefix;
}
|
||||
|
||||
/**
 * Extracts the host name of a URL, without a leading `www.`.
 *
 * The input is first parsed with the WHATWG `URL` class. When that is not
 * possible (no scheme), or when the parser mistook a scheme-less
 * `host:port` for a `scheme:path` pair, the host is cut out of the string
 * by hand: the `http(s)://` and `www.` prefixes are dropped, everything from
 * the first `/`, `?` or `#` onwards is discarded, and a trailing `:port` is
 * removed so the result matches what `URL.hostname` would have produced.
 *
 * @param url - Absolute URL or scheme-less host string.
 * @returns The host name without `www.` and without a port; an empty string
 *   when the input parses as a URL that has no host (e.g. `file:///x`).
 */
export function normalizeUrlOnlyHostname(url: string): string {
  try {
    const hostname = new URL(url).hostname;

    // `new URL("example.com:8080/x")` does not throw: "example.com" is read
    // as the scheme and the hostname comes back empty. Detect that shape and
    // use the string fallback instead of returning "".
    const isSchemelessHostPort =
      hostname === "" && /^[^/?#:]+:\d+(?:[/?#]|$)/.test(url);

    if (!isSchemelessHostPort) {
      return hostname.replace(/^www\./, "");
    }
  } catch {
    // Not an absolute URL (e.g. "example.com/path"); use the string fallback.
  }

  const [authority = ""] = url
    .replace(/^https?:\/\//, "")
    .replace(/^www\./, "")
    .split(/[/?#]/);

  // `URL.hostname` never includes the port, so the fallback must not either.
  return authority.replace(/:\d+$/, "");
}
|
||||
}
|
||||
|
@ -51,7 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
||||
import { indexPage } from "../lib/extract/index/pinecone";
|
||||
import { Document } from "../controllers/v1/types";
|
||||
import { supabase_service } from "../services/supabase";
|
||||
import { normalizeUrl } from "../lib/canonical-url";
|
||||
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
|
||||
|
||||
configDotenv();
|
||||
|
||||
@ -80,7 +80,7 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
|
||||
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
||||
if (await finishCrawl(job.data.crawl_id)) {
|
||||
(async () => {
|
||||
const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
|
||||
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
|
||||
// Get all visited URLs from Redis
|
||||
const visitedUrls = await redisConnection.smembers(
|
||||
"crawl:" + job.data.crawl_id + ":visited",
|
||||
|
Loading…
x
Reference in New Issue
Block a user