From 173028295b7c1a2fb1cddfb3ce245309c6599eb3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=B3ricz=20Gerg=C5=91?=
Date: Tue, 28 Jan 2025 09:41:37 +0100
Subject: [PATCH] fix(crawl): relative URL page discovery issues

---
 apps/api/sharedLibs/html-transformer/src/lib.rs | 3 ++-
 apps/api/src/scraper/WebScraper/crawler.ts      | 8 +++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/apps/api/sharedLibs/html-transformer/src/lib.rs b/apps/api/sharedLibs/html-transformer/src/lib.rs
index f5eb86e5..d4a29934 100644
--- a/apps/api/sharedLibs/html-transformer/src/lib.rs
+++ b/apps/api/sharedLibs/html-transformer/src/lib.rs
@@ -17,7 +17,8 @@ pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
 
     let mut out: Vec<String> = Vec::new();
 
-    for anchor in document.select("a[href]").unwrap() {
+    let anchors: Vec<_> = document.select("a[href]").unwrap().collect();
+    for anchor in anchors {
         let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
 
         if href.starts_with("http:/") && !href.starts_with("http://") {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index ceee94ef..ea606f44 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -401,7 +401,13 @@ export class WebCrawler {
 
   public async extractLinksFromHTML(html: string, url: string) {
     try {
-      return await this.extractLinksFromHTMLRust(html, url);
+      return (await this.extractLinksFromHTMLRust(html, url)).map(x => {
+        try {
+          return new URL(x, url).href
+        } catch (e) {
+          return null;
+        }
+      }).filter(x => x !== null) as string[];
     } catch (error) {
       this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
         error,