From a7eb2f7c6afe1e4777444bce7b5a3950bfa56e81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?= Date: Thu, 30 Jan 2025 08:16:51 +0100 Subject: [PATCH] fix(crawler/rust): dedupe --- apps/api/src/scraper/WebScraper/crawler.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index ea606f44..14ae5d71 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -401,13 +401,13 @@ export class WebCrawler { public async extractLinksFromHTML(html: string, url: string) { try { - return (await this.extractLinksFromHTMLRust(html, url)).map(x => { + return [...new Set((await this.extractLinksFromHTMLRust(html, url)).map(x => { try { return new URL(x, url).href } catch (e) { return null; } - }).filter(x => x !== null) as string[]; + }).filter(x => x !== null) as string[])]; } catch (error) { this.logger.error("Failed to call html-transformer! Falling back to cheerio...", { error,