fix(crawl): relative URL page discovery issues

This commit is contained in:
Móricz Gergő 2025-01-28 09:41:37 +01:00
parent b8c4e198d1
commit 173028295b
2 changed files with 9 additions and 2 deletions

View File

@@ -17,7 +17,8 @@ pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
let mut out: Vec<String> = Vec::new();
-for anchor in document.select("a[href]").unwrap() {
+let anchors: Vec<_> = document.select("a[href]").unwrap().collect();
+for anchor in anchors {
let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
if href.starts_with("http:/") && !href.starts_with("http://") {

View File

@@ -401,7 +401,13 @@ export class WebCrawler {
public async extractLinksFromHTML(html: string, url: string) {
try {
-return await this.extractLinksFromHTMLRust(html, url);
+return (await this.extractLinksFromHTMLRust(html, url)).map(x => {
+try {
+return new URL(x, url).href
+} catch (e) {
+return null;
+}
+}).filter(x => x !== null) as string[];
} catch (error) {
this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
error,