Nick: revision

Nicolas 2024-07-03 18:06:53 -03:00
parent 2d30cc6117
commit f5b2fbd7e8
4 changed files with 24 additions and 25 deletions

View File

@@ -3,7 +3,6 @@ import { logScrape } from "../../../services/logging/scrape_log";
 import { fetchAndProcessPdf } from "../utils/pdfProcessor";
 import { universalTimeout } from "../global";
 /**
  * Scrapes a URL with Axios
  * @param url The URL to scrape
@@ -50,15 +49,16 @@ export async function scrapWithFetch(
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       logParams.success = true;
-      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
+      return { content, pageStatusCode, pageError };
     } else {
       const text = response.data;
-      const result = { content: text, pageStatusCode: 200 };
       logParams.success = true;
       logParams.html = text;
-      logParams.response_code = 200;
-      logParams.error_message = null;
-      return result;
+      logParams.response_code = response.status;
+      return { content: text, pageStatusCode: response.status, pageError: null };
     }
   } catch (error) {
     if (error.code === "ECONNABORTED") {
@@ -68,7 +68,7 @@ export async function scrapWithFetch(
       logParams.error_message = error.message || error;
       console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
     }
-    return { content: "" };
+    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
   } finally {
     const endTime = Date.now();
     logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
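
All four files in this commit converge on the same return shape. A minimal sketch of that shape in TypeScript, assuming the repo does not already export one (the name ScrapeResult is illustrative, not taken from this codebase):

// Illustrative type only; the project may define this shape elsewhere under
// another name. Every path above now returns content plus a status and error,
// so a transport failure (pageStatusCode: null) is distinguishable from an
// empty page served with a real status code.
type ScrapeResult = {
  content: string;
  pageStatusCode: number | null;
  pageError?: string | null;
};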

View File

@@ -87,7 +87,8 @@ export async function scrapWithFireEngine({
         pageOptions?.parsePDF
       );
       logParams.success = true;
-      // We shouldnt care about the pdf logging here I believe
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
       return { html: content, screenshot: "", pageStatusCode, pageError };
     } else {
       const data = response.data;
@@ -112,18 +113,12 @@
       console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
       logParams.error_message = error.message || error;
     }
-    return { html: "", screenshot: "" };
+    return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
   } finally {
     const endTime = Date.now();
-    const time_taken_seconds = (endTime - logParams.startTime) / 1000;
-    await logScrape({
-      url: logParams.url,
-      scraper: logParams.scraper,
-      success: logParams.success,
-      response_code: logParams.response_code,
-      time_taken_seconds,
-      error_message: logParams.error_message,
-      html: logParams.html,
-    });
+    logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
+    await logScrape(logParams);
   }
 }
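
The finally block above now writes time_taken_seconds onto logParams and passes the whole object to logScrape instead of re-listing each field. A sketch of the shape that implies, inferred from the fields the old call spelled out (the actual interface in scrape_log may differ):

// Inferred from the removed call site; names match the fields that were
// passed individually before this revision.
interface ScrapeLogParams {
  url: string;
  scraper: string;
  success: boolean;
  response_code: number | null;
  time_taken_seconds: number | null;
  error_message: string | null;
  html: string;
  startTime: number; // consumed in finally to compute time_taken_seconds
}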

View File

@@ -66,7 +66,10 @@ export async function scrapWithPlaywright(
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       logParams.success = true;
-      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
+      return { content, pageStatusCode, pageError };
     } else {
       const textData = response.data;
       try {
@@ -86,7 +89,7 @@ export async function scrapWithPlaywright(
         console.error(
           `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
         );
-        return { content: "" };
+        return { content: "", pageStatusCode: null, pageError: logParams.error_message };
       }
     }
   } catch (error) {
@@ -97,7 +100,7 @@ export async function scrapWithPlaywright(
       logParams.error_message = error.message || error;
       console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
     }
-    return { content: "" };
+    return { content: "", pageStatusCode: null, pageError: logParams.error_message };
   } finally {
     const endTime = Date.now();
     logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
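
Both Playwright error paths now report pageStatusCode: null alongside whatever was last written to logParams.error_message. A hypothetical caller, not taken from this repo and assuming the scraper's remaining parameters are optional, shows why that distinction is useful:

// Hypothetical usage sketch: pageStatusCode === null means the scraper never
// got a usable response (timeout, network error, JSON parse failure), which a
// caller can treat differently from an HTTP error status like 500.
async function scrapeOrEmpty(url: string): Promise<string> {
  const { content, pageStatusCode, pageError } = await scrapWithPlaywright(url);
  if (pageStatusCode === null) {
    console.warn(`[Playwright] no response for ${url}: ${pageError}`);
    return ""; // candidate for retrying with another scraper
  }
  return content;
}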

View File

@@ -46,6 +46,8 @@ export async function scrapWithScrapingBee(
     if (contentType && contentType.includes("application/pdf")) {
       logParams.success = true;
       const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      logParams.response_code = pageStatusCode;
+      logParams.error_message = pageError;
       return { content, pageStatusCode, pageError };
     } else {
       let text = "";
@@ -62,12 +64,11 @@ export async function scrapWithScrapingBee(
       logParams.response_code = response.status;
       logParams.html = text;
       logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
-      logParams.error_message = response.statusText != "OK" ? response.statusText : undefined;
+      logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined;
       return {
         content: text,
         pageStatusCode: response.status,
-        pageError:
-          response.statusText != "OK" ? response.statusText : undefined,
+        pageError: response.statusText !== "OK" ? response.statusText : undefined,
       };
     }
   } catch (error) {
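
One subtlety in the hunk above: the success expression relies on && binding tighter than ||, so it groups as (2xx) || 404. An equivalent helper with the grouping made explicit (the function name is illustrative):

// Equivalent to the inline expression: any 2xx counts as success, and so does
// a 404, where the fetch itself worked but the target page reports not-found.
function isScrapeSuccess(status: number): boolean {
  return (status >= 200 && status < 300) || status === 404;
}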