diff --git a/apps/api/src/lib/timeout.ts b/apps/api/src/lib/timeout.ts
new file mode 100644
index 00000000..fd0e5ade
--- /dev/null
+++ b/apps/api/src/lib/timeout.ts
@@ -0,0 +1 @@
+export const axiosTimeout = 3000;
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 3a3ca052..d1e93cdf 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";
 
 export class WebCrawler {
   private initialUrl: string;
@@ -131,7 +132,7 @@ export class WebCrawler {
     try {
       console.log('3.1 here OK')
       console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
-      const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 });
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
       console.log('????', {response})
       console.log('3.2 here OK')
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
@@ -274,7 +275,7 @@ export class WebCrawler {
         pageError = page.metadata?.pageError || undefined;
       } else {
         // console.log('crawl - else')
-        const response = await axios.get(url, { timeout: 3000 });
+        const response = await axios.get(url, { timeout: axiosTimeout });
         console.log('crawl - else - response ok')
         content = response.data ?? "";
         pageStatusCode = response.status;
@@ -312,6 +313,7 @@ export class WebCrawler {
           !this.matchesExcludes(path) &&
           this.isRobotsAllowed(fullUrl)
         ) {
+          console.log(fullUrl)
           links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
         }
 
@@ -428,7 +430,7 @@ export class WebCrawler {
      console.log("4.1.3 - Fetching sitemap from constructed URL");
 
      try {
-        const response = await axios.get(sitemapUrl, { timeout: 3000 });
+        const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
        if (response.status === 200) {
          console.log("4.1.4 - Extracting links from sitemap");
          sitemapLinks = await getLinksFromSitemap(sitemapUrl);
@@ -441,7 +443,7 @@
      console.log("4.1.5 - Trying base URL sitemap as fallback");
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
      try {
-        const response = await axios.get(baseUrlSitemap, { timeout: 3000 });
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
        if (response.status === 200) {
          console.log("4.1.6 - Extracting links from base URL sitemap");
          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index 84e926e5..3f563471 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 
 export async function getLinksFromSitemap(
@@ -8,7 +9,7 @@
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl, { timeout: 3000 });
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +43,7 @@
 export const fetchSitemapData = async (url: string): Promise => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: 3000 });
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 30db27c9..3e01571e 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";
 
 dotenv.config();
 
@@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
 
   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {
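
For context, a minimal sketch of the pattern this patch introduces, assuming a caller directly under apps/api/src; fetchWithTimeout is a hypothetical helper used only for illustration and is not part of the diff:

// Hypothetical consumer of the shared timeout constant (not in this patch).
// New call sites import axiosTimeout instead of hard-coding 3000 ms, so the
// request timeout stays tunable in one place across crawler, sitemap, and
// PDF-processing code.
import axios from "axios";
import { axiosTimeout } from "./lib/timeout"; // path assumes the caller sits in apps/api/src

export async function fetchWithTimeout(url: string): Promise<string> {
  const response = await axios.get(url, { timeout: axiosTimeout });
  return response.data ?? "";
}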