mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 07:29:04 +08:00
fix(crawl): relative URL page discovery issues
This commit is contained in:
parent
b8c4e198d1
commit
173028295b
@@ -17,7 +17,8 @@ pub unsafe extern "C" fn extract_links(html: *const libc::c_char) -> *mut i8 {
|
||||
|
||||
let mut out: Vec<String> = Vec::new();
|
||||
|
||||
for anchor in document.select("a[href]").unwrap() {
|
||||
let anchors: Vec<_> = document.select("a[href]").unwrap().collect();
|
||||
for anchor in anchors {
|
||||
let mut href = anchor.attributes.borrow().get("href").unwrap().to_string();
|
||||
|
||||
if href.starts_with("http:/") && !href.starts_with("http://") {
|
||||
|
@@ -401,7 +401,13 @@ export class WebCrawler {
|
||||
|
||||
public async extractLinksFromHTML(html: string, url: string) {
|
||||
try {
|
||||
return await this.extractLinksFromHTMLRust(html, url);
|
||||
return (await this.extractLinksFromHTMLRust(html, url)).map(x => {
|
||||
try {
|
||||
return new URL(x, url).href
|
||||
} catch (e) {
|
||||
return null;
|
||||
}
|
||||
}).filter(x => x !== null) as string[];
|
||||
} catch (error) {
|
||||
this.logger.error("Failed to call html-transformer! Falling back to cheerio...", {
|
||||
error,
|
||||
|
Loading…
x
Reference in New Issue
Block a user