Reapply "Merge pull request #561 from mendableai/bug/dealing-with-dns-error"

This reverts commit ffe11a5bf73e3c57657972cd36c3af1d0b9a432c.
This commit is contained in:
Nicolas 2024-08-20 19:22:09 -03:00
parent 441628998f
commit 1b3ad60a2c
2 changed files with 12 additions and 8 deletions

View File

@@ -91,7 +91,7 @@ export async function scrapWithFireEngine({
}); });
const startTime = Date.now(); const startTime = Date.now();
const response = await axiosInstance.post( const _response = await axiosInstance.post(
process.env.FIRE_ENGINE_BETA_URL + endpoint, process.env.FIRE_ENGINE_BETA_URL + endpoint,
{ {
url: url, url: url,
@@ -113,20 +113,20 @@ export async function scrapWithFireEngine({
} }
); );
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${response.data.jobId}`); let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) { while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${response.data.jobId}`); checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
} }
if (checkStatusResponse.data.processing) { if (checkStatusResponse.data.processing) {
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${response.data.jobId}`); Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
try { try {
axiosInstance.delete( axiosInstance.delete(
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${response.data.jobId}`, process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`,
); );
} catch (error) { } catch (error) {
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${response.data.jobId} | error: ${error}`); Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
logParams.error_message = "Failed to delete request"; logParams.error_message = "Failed to delete request";
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" }; return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
} }
@@ -145,7 +145,7 @@ export async function scrapWithFireEngine({
logParams.response_code = checkStatusResponse.data?.pageStatusCode; logParams.response_code = checkStatusResponse.data?.pageStatusCode;
if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) { if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`); Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
} }
const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined; const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
@@ -158,7 +158,7 @@ export async function scrapWithFireEngine({
}; };
} }
const contentType = checkStatusResponse.headers["content-type"]; const contentType = checkStatusResponse.data.responseHeaders["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
url, url,
@@ -170,6 +170,7 @@ export async function scrapWithFireEngine({
return { html: content, screenshot: "", pageStatusCode, pageError }; return { html: content, screenshot: "", pageStatusCode, pageError };
} else { } else {
const data = checkStatusResponse.data; const data = checkStatusResponse.data;
logParams.success = logParams.success =
(data.pageStatusCode >= 200 && data.pageStatusCode < 300) || (data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
data.pageStatusCode === 404; data.pageStatusCode === 404;

View File

@@ -43,6 +43,9 @@ export async function scrapWithScrapingBee(
transparent_status_code: "True", transparent_status_code: "True",
}, },
}); });
Logger.info(
`⛏️ ScrapingBee: Scraping ${url}`
);
const contentType = response.headers["content-type"]; const contentType = response.headers["content-type"];
if (contentType && contentType.includes("application/pdf")) { if (contentType && contentType.includes("application/pdf")) {
logParams.success = true; logParams.success = true;