From 4d6e25619b5c8aec8698ab90ea5532a8b924096d Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 1 Jul 2024 16:05:34 -0300
Subject: [PATCH] minor spacing and comment stuff

---
 apps/api/src/scraper/WebScraper/crawler.ts | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 29baa1ce..831970ea 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -283,24 +283,22 @@ export class WebCrawler {
         const path = urlObj.pathname;
 
-        if(this.isInternalLink(fullUrl)){ // INTERNAL LINKS
+        if (this.isInternalLink(fullUrl)) { // INTERNAL LINKS
           if (this.isInternalLink(fullUrl) &&
             this.noSections(fullUrl) &&
-            // The idea here to comment this out is to allow wider website coverage as we filter this anyway afterwards
-            // this.matchesIncludes(path) &&
             !this.matchesExcludes(path) &&
             this.isRobotsAllowed(fullUrl)
           ) {
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
-        }else{ // EXTERNAL LINKS
-          if(
-            this.isInternalLink(url) && //its avoid to add links from external pages on the queue
+        } else { // EXTERNAL LINKS
+          if (
+            this.isInternalLink(url) &&
             this.allowExternalContentLinks &&
             !this.isSocialMediaOrEmail(fullUrl) &&
             !this.matchesExcludes(fullUrl, true) &&
             !this.isExternalMainPage(fullUrl)
-          ){
+          ) {
             links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
           }
         }