mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-06 08:06:01 +08:00
Update apps/api/src/scraper/WebScraper/crawler.ts
No need for regex.

Co-authored-by: Gergő Móricz <mo.geryy@gmail.com>
This commit is contained in:
parent
a684bd3c5d
commit
5e728c1a4d
@ -64,15 +64,7 @@ export class WebCrawler {
|
|||||||
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
private filterLinks(sitemapLinks: string[], limit: number, maxDepth: number): string[] {
|
||||||
return sitemapLinks
|
return sitemapLinks
|
||||||
.filter((link) => {
|
.filter((link) => {
|
||||||
|
const url = new URL(link.trim(), this.baseUrl);
|
||||||
// if link is not a complete url, add the base url
|
|
||||||
link = link.trim();
|
|
||||||
const isCompleteUrl = new RegExp('^(?:[a-z+]+:)?//', 'i');
|
|
||||||
if (!isCompleteUrl.test(link)){
|
|
||||||
link = this.baseUrl + link;
|
|
||||||
}
|
|
||||||
|
|
||||||
const url = new URL(link);
|
|
||||||
const path = url.pathname;
|
const path = url.pathname;
|
||||||
|
|
||||||
const depth = getURLDepth(url.toString());
|
const depth = getURLDepth(url.toString());
|
||||||
|
Loading…
x
Reference in New Issue
Block a user