mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-05 20:16:03 +08:00
added regex for links in sitemap
This commit is contained in:
parent
252bc09ee2
commit
a684bd3c5d
@ -64,6 +64,14 @@ export class WebCrawler {
|
||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||
return sitemapLinks
|
||||
.filter((link) => {
|
||||
|
||||
// If the link is not a complete URL (it has no leading `scheme://` or `//`), prepend the base URL.
|
||||
link = link.trim();
|
||||
const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i');
|
||||
if (!isCompleteUrl.test(link)){
|
||||
link = this.baseUrl + link;
|
||||
}
|
||||
|
||||
const url = new URL(link);
|
||||
const path = url.pathname;
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user