mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-18 05:56:02 +08:00
Nick: url normalization
This commit is contained in:
parent
f25c0c6d21
commit
f2e0bfbfe3
@ -1,8 +1,19 @@
|
|||||||
/**
 * Reduces a URL to a scheme-less, `www.`-less form with at most one trailing
 * slash removed.
 *
 * Only a leading `http://` or `https://` and a leading `www.` are stripped;
 * path, query and fragment are left exactly as given. The scheme and `www.`
 * prefixes are matched case-insensitively because URL schemes and host names
 * are case-insensitive, so `HTTPS://WWW.example.com/` and
 * `https://www.example.com/` normalize to the same value.
 *
 * @param url - The URL (or scheme-less URL-like string) to normalize.
 * @returns The normalized string; an empty input yields an empty string.
 */
export function normalizeUrl(url: string): string {
  const withoutPrefix = url
    .replace(/^https?:\/\//i, "")
    .replace(/^www\./i, "");

  // Drop a single trailing slash so "example.com/" and "example.com" match.
  return withoutPrefix.endsWith("/")
    ? withoutPrefix.slice(0, -1)
    : withoutPrefix;
}
|
||||||
|
|
||||||
|
/**
 * Extracts the host name of a URL with any leading `www.` removed.
 *
 * The input is first parsed with the `URL` constructor. When that fails
 * (typically because the input has no scheme), a textual fallback strips a
 * leading `http(s)://` and `www.`, keeps everything before the first `/`,
 * `?` or `#`, and drops a trailing `:port`.
 *
 * @param url - An absolute URL or a scheme-less string such as
 *   `www.example.com/path` or `example.com:8080`.
 * @returns The host name without `www.`. Parsed URLs that have no host
 *   (for example `mailto:` addresses) yield an empty string.
 */
export function normalizeUrlOnlyHostname(url: string): string {
  // Scheme-less "host:port" input (e.g. "localhost:3000") does not throw in
  // the URL constructor: the host is taken as the scheme and the hostname
  // comes back empty. This pattern detects that shape so it can be routed to
  // the textual fallback.
  const hostPortShape = /^[^/?#:]+:\d+(?:[/?#]|$)/;

  try {
    const hostname = new URL(url).hostname;
    if (hostname !== "" || !hostPortShape.test(url)) {
      return hostname.replace(/^www\./, "");
    }
  } catch (error) {
    // Not parseable as an absolute URL; use the textual fallback below.
  }

  const [authority = ""] = url
    .replace(/^https?:\/\//, "")
    .replace(/^www\./, "")
    .split(/[/?#]/);

  // The parsed path never includes the port, so strip it here as well.
  return authority.replace(/:\d+$/, "");
}
|
@ -51,7 +51,7 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings";
|
|||||||
import { indexPage } from "../lib/extract/index/pinecone";
|
import { indexPage } from "../lib/extract/index/pinecone";
|
||||||
import { Document } from "../controllers/v1/types";
|
import { Document } from "../controllers/v1/types";
|
||||||
import { supabase_service } from "../services/supabase";
|
import { supabase_service } from "../services/supabase";
|
||||||
import { normalizeUrl } from "../lib/canonical-url";
|
import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url";
|
||||||
|
|
||||||
configDotenv();
|
configDotenv();
|
||||||
|
|
||||||
@ -80,7 +80,7 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20;
|
|||||||
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) {
|
||||||
if (await finishCrawl(job.data.crawl_id)) {
|
if (await finishCrawl(job.data.crawl_id)) {
|
||||||
(async () => {
|
(async () => {
|
||||||
const originUrl = sc.originUrl ? normalizeUrl(sc.originUrl) : undefined;
|
const originUrl = sc.originUrl ? normalizeUrlOnlyHostname(sc.originUrl) : undefined;
|
||||||
// Get all visited URLs from Redis
|
// Get all visited URLs from Redis
|
||||||
const visitedUrls = await redisConnection.smembers(
|
const visitedUrls = await redisConnection.smembers(
|
||||||
"crawl:" + job.data.crawl_id + ":visited",
|
"crawl:" + job.data.crawl_id + ":visited",
|
||||||
|
Loading…
x
Reference in New Issue
Block a user