From fe721fffbef391d2be2f078445c8ca56c4e648ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 1 Oct 2024 19:00:33 +0200 Subject: [PATCH] fix(crawl-redis): normalize URL before locking --- apps/api/src/lib/crawl-redis.ts | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/apps/api/src/lib/crawl-redis.ts b/apps/api/src/lib/crawl-redis.ts index 9240018e..6d578e5e 100644 --- a/apps/api/src/lib/crawl-redis.ts +++ b/apps/api/src/lib/crawl-redis.ts @@ -1,5 +1,6 @@ import { WebCrawler } from "../scraper/WebScraper/crawler"; import { redisConnection } from "../services/queue-service"; +import { Logger } from "./logger"; export type StoredCrawl = { originUrl: string; @@ -88,6 +89,16 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise return false; } } + + try { + const urlO = new URL(url); + urlO.search = ""; + urlO.hash = ""; + url = urlO.href; + } catch (error) { + Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); + } + const res = (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0 await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); return res; @@ -95,6 +106,19 @@ export async function lockURL(id: string, sc: StoredCrawl, url: string): Promise /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap export async function lockURLs(id: string, urls: string[]): Promise { + urls = urls.map(url => { + try { + const urlO = new URL(url); + urlO.search = ""; + urlO.hash = ""; + return urlO.href; + } catch (error) { + Logger.warn("Failed to normalize URL " + JSON.stringify(url) + ": " + error); + } + + return url; + }); + const res = (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0 await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); return res;