Nicolas 2024-06-24 16:33:07 -03:00
parent 21d29de819
commit 56d42d9c9b
4 changed files with 12 additions and 7 deletions

New file (imported elsewhere in this commit as lib/timeout):

@@ -0,0 +1 @@
+export const axiosTimeout = 3000;
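This one-line module is the heart of the commit: every hard-coded 3000 ms axios timeout below is swapped for this shared constant, so the value can be tuned in one place. A minimal sketch of a consumer, assuming the module lives where the imports below suggest (lib/timeout); fetchWithSharedTimeout is illustrative, not part of the commit:

import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";

// Illustrative helper, not in the commit: consumers import the shared
// constant instead of hard-coding 3000 at every call site.
export async function fetchWithSharedTimeout(url: string): Promise<string> {
  const response = await axios.get(url, { timeout: axiosTimeout });
  return response.data;
}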

Changed file (the WebCrawler class):

@@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";

 export class WebCrawler {
   private initialUrl: string;
@@ -131,7 +132,7 @@ export class WebCrawler {
     try {
       console.log('3.1 here OK')
       console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
-      const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 });
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
       console.log('????', {response})
       console.log('3.2 here OK')
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
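For context, the pattern this hunk touches: fetch /robots.txt with the shared timeout and hand the body to robots-parser. A self-contained sketch under those assumptions; isCrawlAllowed is a hypothetical name, and the real class stores the parser on this.robots rather than checking a single URL:

import axios from "axios";
import robotsParser from "robots-parser";
import { axiosTimeout } from "../../lib/timeout";

// Hypothetical standalone version of the robots.txt step above.
export async function isCrawlAllowed(targetUrl: string): Promise<boolean> {
  const robotsTxtUrl = `${new URL(targetUrl).origin}/robots.txt`;
  const response = await axios.get(robotsTxtUrl, { timeout: axiosTimeout });
  const robots = robotsParser(robotsTxtUrl, response.data);
  // robots-parser returns undefined when no rule matches; treat that as allowed.
  return robots.isAllowed(targetUrl) ?? true;
}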
@@ -274,7 +275,7 @@ export class WebCrawler {
         pageError = page.metadata?.pageError || undefined;
       } else {
         // console.log('crawl - else')
-        const response = await axios.get(url, { timeout: 3000 });
+        const response = await axios.get(url, { timeout: axiosTimeout });
         console.log('crawl - else - response ok')
         content = response.data ?? "";
         pageStatusCode = response.status;
@@ -312,6 +313,7 @@ export class WebCrawler {
         !this.matchesExcludes(path) &&
         this.isRobotsAllowed(fullUrl)
       ) {
+        console.log(fullUrl)
         links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
       }
@@ -428,7 +430,7 @@ export class WebCrawler {
     console.log("4.1.3 - Fetching sitemap from constructed URL");
     try {
-      const response = await axios.get(sitemapUrl, { timeout: 3000 });
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
         console.log("4.1.4 - Extracting links from sitemap");
         sitemapLinks = await getLinksFromSitemap(sitemapUrl);
@@ -441,7 +443,7 @@ export class WebCrawler {
       console.log("4.1.5 - Trying base URL sitemap as fallback");
       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
-        const response = await axios.get(baseUrlSitemap, { timeout: 3000 });
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
           console.log("4.1.6 - Extracting links from base URL sitemap");
           sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
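Both hunks above implement the same two-step lookup: try the constructed sitemap URL first, then fall back to ${baseUrl}/sitemap.xml. A condensed sketch of that control flow; resolveSitemapLinks is a hypothetical name, and it assumes getLinksFromSitemap resolves to string[], which this diff does not show:

import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";
import { getLinksFromSitemap } from "./sitemap";

// Hypothetical condensation of the fallback logic above.
export async function resolveSitemapLinks(sitemapUrl: string, baseUrl: string): Promise<string[]> {
  for (const candidate of [sitemapUrl, `${baseUrl}/sitemap.xml`]) {
    try {
      const response = await axios.get(candidate, { timeout: axiosTimeout });
      if (response.status === 200) {
        return await getLinksFromSitemap(candidate);
      }
    } catch {
      // Timed out or unreachable: try the next candidate.
    }
  }
  return [];
}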

Changed file (sitemap helpers: getLinksFromSitemap, fetchSitemapData):

@@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";

 export async function getLinksFromSitemap(
@@ -8,7 +9,7 @@ export async function getLinksFromSitemap(
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl, { timeout: 3000 });
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +43,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl, { timeout: 3000 });
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
    if (response.status === 200) {
      const xml = response.data;
      const parsedXml = await parseStringPromise(xml);
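The fetch-and-parse step in fetchSitemapData, sketched standalone. The <urlset><url><loc> layout is the standard sitemaps.org shape, not something this diff shows, and fetchSitemapLocs is a hypothetical name (the real SitemapEntry fields are not visible here):

import axios from "axios";
import { parseStringPromise } from "xml2js";
import { axiosTimeout } from "../../lib/timeout";

// Sketch of fetching a sitemap and pulling out its <loc> URLs.
export async function fetchSitemapLocs(sitemapUrl: string): Promise<string[]> {
  const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
  if (response.status !== 200) return [];
  const parsedXml = await parseStringPromise(response.data);
  // xml2js wraps repeated elements in arrays: urlset.url[i].loc[0].
  const urls: Array<{ loc: string[] }> = parsedXml.urlset?.url ?? [];
  return urls.map((entry) => entry.loc[0]);
}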

Changed file (the PDF processor, processPdfToText):

@@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";

 dotenv.config();
@@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {
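The PDF result endpoint is polled in a retry loop, and this is the one place the commit doubles the shared timeout (axiosTimeout * 2) instead of using it directly. A self-contained sketch of such a loop; maxAttempts and the poll interval are assumed values, since the diff shows they exist but not what they are:

import axios from "axios";
import { axiosTimeout } from "../../lib/timeout";

// Assumed values: the diff references maxAttempts but does not show it.
const maxAttempts = 10;
const pollIntervalMs = 500;

export async function pollForResult(resultUrl: string, headers: Record<string, string>): Promise<unknown> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    try {
      // The result endpoint can be slow, so it gets twice the shared timeout.
      const resultResponse = await axios.get(resultUrl, { headers, timeout: axiosTimeout * 2 });
      if (resultResponse.status === 200) return resultResponse.data;
    } catch {
      // Timeout or transient error: keep polling until attempts run out.
    }
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  throw new Error(`No result from ${resultUrl} after ${maxAttempts} attempts`);
}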