commit 56d42d9c9b
parent 21d29de819
Author: Nicolas
Date:   2024-06-24 16:33:07 -03:00

4 changed files with 12 additions and 7 deletions


@@ -0,0 +1 @@
+export const axiosTimeout = 3000;
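
The new module above centralizes the request timeout; every hardcoded `3000` in the hunks below becomes a reference to it. A minimal sketch of a consumer, assuming a module layout like the imports in this commit (the function name and import path are illustrative):

```ts
import axios from "axios";
import { axiosTimeout } from "./lib/timeout"; // path is an assumption

// Any call that previously hardcoded `{ timeout: 3000 }` can reference the
// shared constant, so a single edit retunes every request's timeout.
async function fetchWithSharedTimeout(url: string): Promise<string> {
  const response = await axios.get(url, { timeout: axiosTimeout });
  return response.data;
}
```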


@@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";
 
 export class WebCrawler {
   private initialUrl: string;
@@ -131,7 +132,7 @@ export class WebCrawler {
     try {
       console.log('3.1 here OK')
       console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
-      const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 });
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
       console.log('????', {response})
       console.log('3.2 here OK')
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
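
For context, the hunk above bounds the robots.txt fetch with the shared timeout before handing the body to `robots-parser`. A self-contained sketch of that step, assuming the package's documented `isAllowed` API (the user agent string is illustrative):

```ts
import axios from "axios";
import robotsParser from "robots-parser";

const axiosTimeout = 3000; // mirrors the constant this commit introduces

// Fetch robots.txt with a bounded timeout, parse it, and ask whether a
// target URL may be crawled. Treats a missing rule as "allowed".
async function isCrawlAllowed(robotsTxtUrl: string, targetUrl: string): Promise<boolean> {
  const response = await axios.get(robotsTxtUrl, { timeout: axiosTimeout });
  const robots = robotsParser(robotsTxtUrl, response.data);
  return robots.isAllowed(targetUrl, "ExampleBot/1.0") ?? true;
}
```
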
@@ -274,7 +275,7 @@ export class WebCrawler {
         pageError = page.metadata?.pageError || undefined;
       } else {
         // console.log('crawl - else')
-        const response = await axios.get(url, { timeout: 3000 });
+        const response = await axios.get(url, { timeout: axiosTimeout });
         console.log('crawl - else - response ok')
         content = response.data ?? "";
         pageStatusCode = response.status;
@@ -312,6 +313,7 @@ export class WebCrawler {
         !this.matchesExcludes(path) &&
         this.isRobotsAllowed(fullUrl)
       ) {
+        console.log(fullUrl)
         links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
       }
@@ -428,7 +430,7 @@ export class WebCrawler {
     console.log("4.1.3 - Fetching sitemap from constructed URL");
     try {
-      const response = await axios.get(sitemapUrl, { timeout: 3000 });
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
         console.log("4.1.4 - Extracting links from sitemap");
         sitemapLinks = await getLinksFromSitemap(sitemapUrl);
@@ -441,7 +443,7 @@ export class WebCrawler {
       console.log("4.1.5 - Trying base URL sitemap as fallback");
       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
-        const response = await axios.get(baseUrlSitemap, { timeout: 3000 });
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
           console.log("4.1.6 - Extracting links from base URL sitemap");
           sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
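
The two sitemap hunks above form a fallback chain: try the constructed sitemap URL first, then retry against `<baseUrl>/sitemap.xml`. A condensed sketch of that control flow (the function name and return shape are assumptions):

```ts
import axios from "axios";

const axiosTimeout = 3000; // mirrors the shared constant

// Probe candidate sitemap URLs in order and return the first one that
// answers with a 200, or null when neither does.
async function resolveSitemapUrl(sitemapUrl: string, baseUrl: string): Promise<string | null> {
  for (const candidate of [sitemapUrl, `${baseUrl}/sitemap.xml`]) {
    try {
      const response = await axios.get(candidate, { timeout: axiosTimeout });
      if (response.status === 200) {
        return candidate;
      }
    } catch {
      // request failed or timed out; fall through to the next candidate
    }
  }
  return null;
}
```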


@@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 
 export async function getLinksFromSitemap(
@@ -8,7 +9,7 @@ export async function getLinksFromSitemap(
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl, { timeout: 3000 });
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +43,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: 3000 });
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
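
`fetchSitemapData` above pairs the timed request with `xml2js`. A minimal sketch of that fetch-and-parse step, assuming the standard `<urlset><url><loc>` sitemap layout (the function name is illustrative):

```ts
import axios from "axios";
import { parseStringPromise } from "xml2js";

const axiosTimeout = 3000; // mirrors the shared constant

// Fetch sitemap.xml and collect its <loc> entries. xml2js wraps repeated
// elements in arrays by default, hence the [0] indexing.
async function getSitemapLocs(sitemapUrl: string): Promise<string[]> {
  const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
  const parsed = await parseStringPromise(response.data);
  const entries: Array<{ loc: string[] }> = parsed.urlset?.url ?? [];
  return entries.map((entry) => entry.loc[0]);
}
```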


@@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";
 
 dotenv.config();
@@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {
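
The doubled timeout above sits inside a polling loop that retries until the PDF result is ready. A self-contained sketch of that pattern, with the per-request timeout set to twice the shared constant (names, attempt budget, and delay are assumptions):

```ts
import axios from "axios";

const axiosTimeout = 3000; // mirrors the shared constant

// Poll a result URL until it answers 200 or the attempt budget runs out,
// giving each request twice the usual timeout, as in the hunk above.
async function pollForResult(
  resultUrl: string,
  headers: Record<string, string>,
  maxAttempts = 10,
): Promise<unknown> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      const response = await axios.get(resultUrl, { headers, timeout: axiosTimeout * 2 });
      if (response.status === 200) {
        return response.data; // exit condition met
      }
    } catch {
      // timed out or failed; wait briefly, then retry
    }
    await new Promise((resolve) => setTimeout(resolve, 1000));
  }
  throw new Error(`No result from ${resultUrl} after ${maxAttempts} attempts`);
}
```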