mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-14 21:57:21 +08:00
Nick: revision
This commit is contained in:
parent
2d30cc6117
commit
f5b2fbd7e8
@ -3,7 +3,6 @@ import { logScrape } from "../../../services/logging/scrape_log";
|
|||||||
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
import { fetchAndProcessPdf } from "../utils/pdfProcessor";
|
||||||
import { universalTimeout } from "../global";
|
import { universalTimeout } from "../global";
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scrapes a URL with Axios
|
* Scrapes a URL with Axios
|
||||||
* @param url The URL to scrape
|
* @param url The URL to scrape
|
||||||
@ -50,15 +49,16 @@ export async function scrapWithFetch(
|
|||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||||
|
logParams.response_code = pageStatusCode;
|
||||||
|
logParams.error_message = pageError;
|
||||||
|
return { content, pageStatusCode, pageError };
|
||||||
} else {
|
} else {
|
||||||
const text = response.data;
|
const text = response.data;
|
||||||
const result = { content: text, pageStatusCode: 200 };
|
|
||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
logParams.html = text;
|
logParams.html = text;
|
||||||
logParams.response_code = 200;
|
logParams.response_code = response.status;
|
||||||
logParams.error_message = null;
|
return { content: text, pageStatusCode: response.status, pageError: null };
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error.code === "ECONNABORTED") {
|
if (error.code === "ECONNABORTED") {
|
||||||
@ -68,7 +68,7 @@ export async function scrapWithFetch(
|
|||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
|
console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
|
||||||
}
|
}
|
||||||
return { content: "" };
|
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
} finally {
|
} finally {
|
||||||
const endTime = Date.now();
|
const endTime = Date.now();
|
||||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||||
|
@ -87,7 +87,8 @@ export async function scrapWithFireEngine({
|
|||||||
pageOptions?.parsePDF
|
pageOptions?.parsePDF
|
||||||
);
|
);
|
||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
// We shouldnt care about the pdf logging here I believe
|
logParams.response_code = pageStatusCode;
|
||||||
|
logParams.error_message = pageError;
|
||||||
return { html: content, screenshot: "", pageStatusCode, pageError };
|
return { html: content, screenshot: "", pageStatusCode, pageError };
|
||||||
} else {
|
} else {
|
||||||
const data = response.data;
|
const data = response.data;
|
||||||
@ -112,18 +113,12 @@ export async function scrapWithFireEngine({
|
|||||||
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
|
||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
}
|
}
|
||||||
return { html: "", screenshot: "" };
|
return { html: "", screenshot: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
} finally {
|
} finally {
|
||||||
const endTime = Date.now();
|
const endTime = Date.now();
|
||||||
const time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||||
await logScrape({
|
await logScrape(logParams);
|
||||||
url: logParams.url,
|
|
||||||
scraper: logParams.scraper,
|
|
||||||
success: logParams.success,
|
|
||||||
response_code: logParams.response_code,
|
|
||||||
time_taken_seconds,
|
|
||||||
error_message: logParams.error_message,
|
|
||||||
html: logParams.html,
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -66,7 +66,10 @@ export async function scrapWithPlaywright(
|
|||||||
const contentType = response.headers["content-type"];
|
const contentType = response.headers["content-type"];
|
||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||||
|
logParams.response_code = pageStatusCode;
|
||||||
|
logParams.error_message = pageError;
|
||||||
|
return { content, pageStatusCode, pageError };
|
||||||
} else {
|
} else {
|
||||||
const textData = response.data;
|
const textData = response.data;
|
||||||
try {
|
try {
|
||||||
@ -86,7 +89,7 @@ export async function scrapWithPlaywright(
|
|||||||
console.error(
|
console.error(
|
||||||
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
|
`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
|
||||||
);
|
);
|
||||||
return { content: "" };
|
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
@ -97,7 +100,7 @@ export async function scrapWithPlaywright(
|
|||||||
logParams.error_message = error.message || error;
|
logParams.error_message = error.message || error;
|
||||||
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
|
||||||
}
|
}
|
||||||
return { content: "" };
|
return { content: "", pageStatusCode: null, pageError: logParams.error_message };
|
||||||
} finally {
|
} finally {
|
||||||
const endTime = Date.now();
|
const endTime = Date.now();
|
||||||
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000;
|
||||||
|
@ -46,6 +46,8 @@ export async function scrapWithScrapingBee(
|
|||||||
if (contentType && contentType.includes("application/pdf")) {
|
if (contentType && contentType.includes("application/pdf")) {
|
||||||
logParams.success = true;
|
logParams.success = true;
|
||||||
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
|
||||||
|
logParams.response_code = pageStatusCode;
|
||||||
|
logParams.error_message = pageError;
|
||||||
return { content, pageStatusCode, pageError };
|
return { content, pageStatusCode, pageError };
|
||||||
} else {
|
} else {
|
||||||
let text = "";
|
let text = "";
|
||||||
@ -62,12 +64,11 @@ export async function scrapWithScrapingBee(
|
|||||||
logParams.response_code = response.status;
|
logParams.response_code = response.status;
|
||||||
logParams.html = text;
|
logParams.html = text;
|
||||||
logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
|
logParams.success = response.status >= 200 && response.status < 300 || response.status === 404;
|
||||||
logParams.error_message = response.statusText != "OK" ? response.statusText : undefined;
|
logParams.error_message = response.statusText !== "OK" ? response.statusText : undefined;
|
||||||
return {
|
return {
|
||||||
content: text,
|
content: text,
|
||||||
pageStatusCode: response.status,
|
pageStatusCode: response.status,
|
||||||
pageError:
|
pageError: response.statusText !== "OK" ? response.statusText : undefined,
|
||||||
response.statusText != "OK" ? response.statusText : undefined,
|
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user