Update single_url.ts

This commit is contained in:
Nicolas 2024-07-03 17:56:21 -03:00
parent b36406e465
commit 90cf799a3c

View File

@ -121,13 +121,11 @@ export async function scrapWithFireEngine({
pageOptions?.parsePDF pageOptions?.parsePDF
); );
logParams.success = true; logParams.success = true;
logParams.html = content; // We shouldn't care about the PDF logging here, I believe
logParams.response_code = pageStatusCode;
logParams.error_message = pageError;
return { html: content, screenshot: "", pageStatusCode, pageError }; return { html: content, screenshot: "", pageStatusCode, pageError };
} else { } else {
const data = response.data; const data = response.data;
logParams.success = true; logParams.success = data.pageStatusCode >= 200 && data.pageStatusCode < 300 || data.pageStatusCode === 404;
logParams.html = data.content ?? ""; logParams.html = data.content ?? "";
logParams.response_code = data.pageStatusCode; logParams.response_code = data.pageStatusCode;
logParams.error_message = data.pageError; logParams.error_message = data.pageError;
@ -144,7 +142,7 @@ export async function scrapWithFireEngine({
logParams.error_message = "Request timed out"; logParams.error_message = "Request timed out";
} else { } else {
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`); console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
logParams.error_message = error.message; logParams.error_message = error.message || error;
} }
return { html: "", screenshot: "" }; return { html: "", screenshot: "" };
} finally { } finally {
@ -195,7 +193,8 @@ export async function scrapWithScrapingBee(
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
logParams.success = true; logParams.success = true;
return await fetchAndProcessPdf(url, pageOptions?.parsePDF); const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
return { content, pageStatusCode, pageError };
} else { } else {
let text = ""; let text = "";
try { try {
@ -206,10 +205,12 @@ export async function scrapWithScrapingBee(
console.error( console.error(
`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}` `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
); );
logParams.error_message = decodeError.message; logParams.error_message = decodeError.message || decodeError;
} }
logParams.response_code = response.status; logParams.response_code = response.status;
logParams.html = text; logParams.html = text;
logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
logParams.error_message = response.statusText != "OK" ? response.statusText : undefined;
return { return {
content: text, content: text,
pageStatusCode: response.status, pageStatusCode: response.status,
@ -219,7 +220,7 @@ export async function scrapWithScrapingBee(
} }
} catch (error) { } catch (error) {
console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`); console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
logParams.error_message = error.message; logParams.error_message = error.message || error;
logParams.response_code = error.response?.status; logParams.response_code = error.response?.status;
return { return {
content: "", content: "",
@ -304,7 +305,7 @@ export async function scrapWithPlaywright(
pageError: data.pageError, pageError: data.pageError,
}; };
} catch (jsonError) { } catch (jsonError) {
logParams.error_message = jsonError.message; logParams.error_message = jsonError.message || jsonError;
console.error( console.error(
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}` `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
); );
@ -316,7 +317,7 @@ export async function scrapWithPlaywright(
logParams.error_message = "Request timed out"; logParams.error_message = "Request timed out";
console.log(`[Playwright] Request timed out for ${url}`); console.log(`[Playwright] Request timed out for ${url}`);
} else { } else {
logParams.error_message = error.message; logParams.error_message = error.message || error;
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`); console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
} }
return { content: "" }; return { content: "" };
@ -374,6 +375,8 @@ export async function scrapWithFetch(
const result = { content: text, pageStatusCode: 200 }; const result = { content: text, pageStatusCode: 200 };
logParams.success = true; logParams.success = true;
logParams.html = text; logParams.html = text;
logParams.response_code = 200;
logParams.error_message = null;
return result; return result;
} }
} catch (error) { } catch (error) {
@ -381,7 +384,7 @@ export async function scrapWithFetch(
logParams.error_message = "Request timed out"; logParams.error_message = "Request timed out";
console.log(`[Axios] Request timed out for ${url}`); console.log(`[Axios] Request timed out for ${url}`);
} else { } else {
logParams.error_message = error.message; logParams.error_message = error.message || error;
console.error(`[Axios] Error fetching url: ${url} -> ${error}`); console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
} }
return { content: "" }; return { content: "" };