mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-19 01:35:55 +08:00
minor spacing and comment stuff
This commit is contained in:
parent
a5fb45988c
commit
4d6e25619b
@ -283,24 +283,22 @@ export class WebCrawler {
|
|||||||
const path = urlObj.pathname;
|
const path = urlObj.pathname;
|
||||||
|
|
||||||
|
|
||||||
if(this.isInternalLink(fullUrl)){ // INTERNAL LINKS
|
if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
|
||||||
if (this.isInternalLink(fullUrl) &&
|
if (this.isInternalLink(fullUrl) &&
|
||||||
this.noSections(fullUrl) &&
|
this.noSections(fullUrl) &&
|
||||||
// The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
|
|
||||||
// this.matchesIncludes(path) &&
|
|
||||||
!this.matchesExcludes(path) &&
|
!this.matchesExcludes(path) &&
|
||||||
this.isRobotsAllowed(fullUrl)
|
this.isRobotsAllowed(fullUrl)
|
||||||
) {
|
) {
|
||||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||||
}
|
}
|
||||||
}else{ // EXTERNAL LINKS
|
} else { // EXTERNAL LINKS
|
||||||
if(
|
if (
|
||||||
this.isInternalLink(url) && //its avoid to add links from external pages on the queue
|
this.isInternalLink(url) &&
|
||||||
this.allowExternalContentLinks &&
|
this.allowExternalContentLinks &&
|
||||||
!this.isSocialMediaOrEmail(fullUrl) &&
|
!this.isSocialMediaOrEmail(fullUrl) &&
|
||||||
!this.matchesExcludes(fullUrl, true) &&
|
!this.matchesExcludes(fullUrl, true) &&
|
||||||
!this.isExternalMainPage(fullUrl)
|
!this.isExternalMainPage(fullUrl)
|
||||||
){
|
) {
|
||||||
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user