mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-12 02:28:59 +08:00
Reapply "Merge pull request #561 from mendableai/bug/dealing-with-dns-error"
This reverts commit ffe11a5bf73e3c57657972cd36c3af1d0b9a432c.
This commit is contained in:
parent
441628998f
commit
1b3ad60a2c
@ -91,7 +91,7 @@ export async function scrapWithFireEngine({
|
|||||||
});
|
});
|
||||||
|
|
||||||
const startTime = Date.now();
|
const startTime = Date.now();
|
||||||
const response = await axiosInstance.post(
|
const _response = await axiosInstance.post(
|
||||||
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
process.env.FIRE_ENGINE_BETA_URL + endpoint,
|
||||||
{
|
{
|
||||||
url: url,
|
url: url,
|
||||||
@ -113,20 +113,20 @@ export async function scrapWithFireEngine({
|
|||||||
}
|
}
|
||||||
);
|
);
|
||||||
|
|
||||||
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${response.data.jobId}`);
|
let checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||||
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
|
while (checkStatusResponse.data.processing && Date.now() - startTime < universalTimeout + waitParam) {
|
||||||
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
await new Promise(resolve => setTimeout(resolve, 1000)); // wait 1 second
|
||||||
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${response.data.jobId}`);
|
checkStatusResponse = await axiosInstance.get(`${process.env.FIRE_ENGINE_BETA_URL}/scrape/${_response.data.jobId}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (checkStatusResponse.data.processing) {
|
if (checkStatusResponse.data.processing) {
|
||||||
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${response.data.jobId}`);
|
Logger.debug(`⛏️ Fire-Engine (${engine}): deleting request - jobId: ${_response.data.jobId}`);
|
||||||
try {
|
try {
|
||||||
axiosInstance.delete(
|
axiosInstance.delete(
|
||||||
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${response.data.jobId}`,
|
process.env.FIRE_ENGINE_BETA_URL + `/scrape/${_response.data.jobId}`,
|
||||||
);
|
);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${response.data.jobId} | error: ${error}`);
|
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to delete request - jobId: ${_response.data.jobId} | error: ${error}`);
|
||||||
logParams.error_message = "Failed to delete request";
|
logParams.error_message = "Failed to delete request";
|
||||||
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
|
return { html: "", screenshot: "", pageStatusCode: null, pageError: "" };
|
||||||
}
|
}
|
||||||
@ -145,7 +145,7 @@ export async function scrapWithFireEngine({
|
|||||||
logParams.response_code = checkStatusResponse.data?.pageStatusCode;
|
logParams.response_code = checkStatusResponse.data?.pageStatusCode;
|
||||||
|
|
||||||
if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
|
if(checkStatusResponse.data && checkStatusResponse.data?.pageStatusCode !== 200) {
|
||||||
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${response.status}`);
|
Logger.debug(`⛏️ Fire-Engine (${engine}): Failed to fetch url: ${url} \t status: ${checkStatusResponse.data?.pageStatusCode}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
|
const pageStatusCode = checkStatusResponse.data?.pageStatusCode ? checkStatusResponse.data?.pageStatusCode : checkStatusResponse.data?.error && checkStatusResponse.data?.error.includes("Dns resolution error for hostname") ? 404 : undefined;
|
||||||
@ -158,7 +158,7 @@ export async function scrapWithFireEngine({
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const contentType = checkStatusResponse.headers["content-type"];
|
const contentType = checkStatusResponse.data.responseHeaders["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
|
||||||
url,
|
url,
|
||||||
@ -170,6 +170,7 @@ export async function scrapWithFireEngine({
|
|||||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
return { html: content, screenshot: "", pageStatusCode, pageError };
|
||||||
} else {
|
} else {
|
||||||
const data = checkStatusResponse.data;
|
const data = checkStatusResponse.data;
|
||||||
|
|
||||||
logParams.success =
|
logParams.success =
|
||||||
(data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
|
(data.pageStatusCode >= 200 && data.pageStatusCode < 300) ||
|
||||||
data.pageStatusCode === 404;
|
data.pageStatusCode === 404;
|
||||||
|
@ -43,6 +43,9 @@ export async function scrapWithScrapingBee(
|
|||||||
transparent_status_code: "True",
|
transparent_status_code: "True",
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
Logger.info(
|
||||||
|
`⛏️ ScrapingBee: Scraping ${url}`
|
||||||
|
);
|
||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user