Update single_url.ts

Nicolas 2024-06-28 15:51:18 -03:00
parent 7e17498bcf
commit 9bf74bc774

@@ -45,10 +45,21 @@ export async function generateRequestParams(
     return defaultParams;
   }
 }
-export async function scrapWithFireEngine(
-  { url, waitFor = 0, screenshot = false, pageOptions = { parsePDF: true }, headers, options }:
-  { url: string, waitFor?: number, screenshot?: boolean, pageOptions?: { scrollXPaths?: string[], parsePDF?: boolean }, headers?: Record<string, string>, options?: any }
-): Promise<FireEngineResponse> {
+export async function scrapWithFireEngine({
+  url,
+  waitFor = 0,
+  screenshot = false,
+  pageOptions = { parsePDF: true },
+  headers,
+  options,
+}: {
+  url: string;
+  waitFor?: number;
+  screenshot?: boolean;
+  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
+  headers?: Record<string, string>;
+  options?: any;
+}): Promise<FireEngineResponse> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
@@ -71,7 +82,7 @@ export async function scrapWithFireEngine(
         headers: {
           "Content-Type": "application/json",
         },
-        timeout: universalTimeout + waitParam
+        timeout: universalTimeout + waitParam,
       }
     );
 
@@ -79,21 +90,34 @@
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
+      return {
+        html: "",
+        screenshot: "",
+        pageStatusCode: response.data?.pageStatusCode,
+        pageError: response.data?.pageError,
+      };
     }
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+        url,
+        pageOptions?.parsePDF
+      );
       return { html: content, screenshot: "", pageStatusCode, pageError };
     } else {
       const data = response.data;
       const html = data.content;
       const screenshot = data.screenshot;
-      return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
+      return {
+        html: html ?? "",
+        screenshot: screenshot ?? "",
+        pageStatusCode: data.pageStatusCode,
+        pageError: data.pageError,
+      };
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Fire-Engine] Request timed out for ${url}`);
     } else {
       console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
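
Note: the hunks above reshape scrapWithFireEngine's single destructured options parameter into one property per line. A minimal sketch of a call site under this signature (the URL and wait value are illustrative placeholders, not from this commit):

    const result = await scrapWithFireEngine({
      url: "https://example.com", // hypothetical target
      waitFor: 1000, // overrides the default of 0
      screenshot: false,
      pageOptions: { parsePDF: true },
    });
    // FireEngineResponse fields used throughout this file:
    console.log(result.pageStatusCode, result.pageError, result.html.length);
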
@@ -107,38 +131,48 @@ export async function scrapWithScrapingBee(
   wait_browser: string = "domcontentloaded",
   timeout: number = universalTimeout,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
     const clientParams = await generateRequestParams(
       url,
       wait_browser,
-      timeout,
+      timeout
     );
     const response = await client.get({
       ...clientParams,
       params: {
         ...clientParams.params,
-        'transparent_status_code': 'True'
-      }
+        transparent_status_code: "True",
+      },
     });
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       let text = "";
       try {
         const decoder = new TextDecoder();
         text = decoder.decode(response.data);
       } catch (decodeError) {
-        console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
+        console.error(
+          `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
+        );
       }
-      return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
+      return {
+        content: text,
+        pageStatusCode: response.status,
+        pageError:
+          response.statusText != "OK" ? response.statusText : undefined,
+      };
     }
   } catch (error) {
     console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
-    return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
+    return {
+      content: "",
+      pageStatusCode: error.response.status,
+      pageError: error.response.statusText,
+    };
   }
 }
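
Note: besides reflowing scrapWithScrapingBee, the hunk above rewrites the quoted 'transparent_status_code' key as a plain identifier with a trailing comma. If I read ScrapingBee's semantics correctly, that parameter makes the proxy pass through the target page's real HTTP status instead of its own, which is why the function can then report pageStatusCode: response.status and derive pageError from a non-"OK" statusText. A self-contained sketch of the object-spread merge used there (not ScrapingBee's API, just the pattern):

    function withTransparentStatus(clientParams: { params?: Record<string, unknown> }) {
      return {
        ...clientParams,
        params: {
          ...clientParams.params,
          transparent_status_code: "True", // surface the target's own status code
        },
      };
    }
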
@@ -147,29 +181,37 @@ export async function scrapWithPlaywright(
   waitFor: number = 0,
   headers?: Record<string, string>,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
-    const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
-      url: url,
-      wait_after_load: waitParam,
-      headers: headers,
-    }, {
-      headers: {
-        "Content-Type": "application/json",
-      },
-      timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
-      transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
-    });
+    const response = await axios.post(
+      process.env.PLAYWRIGHT_MICROSERVICE_URL,
+      {
+        url: url,
+        wait_after_load: waitParam,
+        headers: headers,
+      },
+      {
+        headers: {
+          "Content-Type": "application/json",
+        },
+        timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
+        transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
+      }
+    );
 
     if (response.status !== 200) {
       console.error(
         `[Playwright] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
+      return {
+        content: "",
+        pageStatusCode: response.data?.pageStatusCode,
+        pageError: response.data?.pageError,
+      };
     }
 
     const contentType = response.headers["content-type"];
@@ -180,14 +222,20 @@ export async function scrapWithPlaywright(
       try {
         const data = JSON.parse(textData);
         const html = data.content;
-        return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
+        return {
+          content: html ?? "",
+          pageStatusCode: data.pageStatusCode,
+          pageError: data.pageError,
+        };
       } catch (jsonError) {
-        console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
+        console.error(
+          `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
+        );
         return { content: "" };
       }
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Playwright] Request timed out for ${url}`);
     } else {
       console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
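
Note: the first Playwright hunk reflows axios.post so its three arguments (endpoint, request body, axios config) each sit on their own line. A standalone sketch of that call shape (endpoint, payload, and timeout are placeholders; the diff itself uses process.env.PLAYWRIGHT_MICROSERVICE_URL and universalTimeout + waitParam):

    import axios from "axios";

    const response = await axios.post(
      "https://playwright-service.example/scrape", // hypothetical endpoint
      { url: "https://example.com", wait_after_load: 0 }, // request body
      {
        headers: { "Content-Type": "application/json" },
        timeout: 15000, // placeholder timeout
        transformResponse: [(data) => data], // keep the raw body as a string
      }
    );
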
@@ -199,21 +247,25 @@ export async function scrapWithPlaywright(
 export async function scrapWithFetch(
   url: string,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const response = await axios.get(url, {
       headers: {
         "Content-Type": "application/json",
       },
       timeout: universalTimeout,
-      transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
+      transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
     });
 
     if (response.status !== 200) {
       console.error(
         `[Axios] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { content: "", pageStatusCode: response.status, pageError: response.statusText };
+      return {
+        content: "",
+        pageStatusCode: response.status,
+        pageError: response.statusText,
+      };
     }
 
     const contentType = response.headers["content-type"];
@@ -224,7 +276,7 @@ export async function scrapWithFetch(
       return { content: text, pageStatusCode: 200 };
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Axios] Request timed out for ${url}`);
     } else {
       console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
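
Note: scrapWithFetch keeps the same identity transformResponse the Playwright path uses. Axios would otherwise JSON.parse an application/json body eagerly; the identity transform leaves response.data as the raw string so the function can branch on content-type (PDF vs. text) before parsing. A small sketch of the difference (the URL is a placeholder):

    import axios from "axios";

    const res = await axios.get("https://example.com/data", {
      transformResponse: [(data) => data], // res.data stays a raw string
    });
    const body = res.headers["content-type"]?.includes("application/json")
      ? JSON.parse(res.data) // parse manually, only when it really is JSON
      : res.data;
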
@@ -291,9 +343,6 @@ function getScrapingFallbackOrder(
   return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = {
@@ -301,7 +350,7 @@ export async function scrapSingleUrl(
     includeHtml: false,
     waitFor: 0,
     screenshot: false,
-    headers: undefined
+    headers: undefined,
   },
   existingHtml: string = ""
 ): Promise<Document> {
@@ -311,7 +360,11 @@ export async function scrapSingleUrl(
     url: string,
     method: (typeof baseScrapers)[number]
   ) => {
-    let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
+    let scraperResponse: {
+      text: string;
+      screenshot: string;
+      metadata: { pageStatusCode?: number; pageError?: string | null };
+    } = { text: "", screenshot: "", metadata: {} };
     let screenshot = "";
     switch (method) {
       case "fire-engine":
@@ -322,9 +375,8 @@ export async function scrapSingleUrl(
             waitFor: pageOptions.waitFor,
             screenshot: pageOptions.screenshot,
             pageOptions: pageOptions,
-            headers: pageOptions.headers
-          }
-          );
+            headers: pageOptions.headers,
+          });
           scraperResponse.text = response.html;
           scraperResponse.screenshot = response.screenshot;
           scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
@@ -345,7 +397,11 @@ export async function scrapSingleUrl(
         break;
       case "playwright":
         if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          const response = await scrapWithPlaywright(
+            url,
+            pageOptions.waitFor,
+            pageOptions.headers
+          );
           scraperResponse.text = response.content;
           scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
           scraperResponse.metadata.pageError = response.pageError;
@@ -367,22 +423,39 @@ export async function scrapSingleUrl(
         break;
     }
 
-    let customScrapedContent : FireEngineResponse | null = null;
+    let customScrapedContent: FireEngineResponse | null = null;
 
     // Check for custom scraping conditions
-    const customScraperResult = await handleCustomScraping(scraperResponse.text, url);
+    const customScraperResult = await handleCustomScraping(
+      scraperResponse.text,
+      url
+    );
 
-    if (customScraperResult){
+    if (customScraperResult) {
       switch (customScraperResult.scraper) {
         case "fire-engine":
-          customScrapedContent = await scrapWithFireEngine({url: customScraperResult.url, waitFor: customScraperResult.waitAfterLoad, screenshot: false, pageOptions: customScraperResult.pageOptions})
+          customScrapedContent = await scrapWithFireEngine({
+            url: customScraperResult.url,
+            waitFor: customScraperResult.waitAfterLoad,
+            screenshot: false,
+            pageOptions: customScraperResult.pageOptions,
+          });
           if (screenshot) {
             customScrapedContent.screenshot = screenshot;
           }
           break;
         case "pdf":
-          const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
-          customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
+          const { content, pageStatusCode, pageError } =
+            await fetchAndProcessPdf(
+              customScraperResult.url,
+              pageOptions?.parsePDF
+            );
+          customScrapedContent = {
+            html: content,
+            screenshot,
+            pageStatusCode,
+            pageError,
+          };
           break;
       }
     }
@@ -400,11 +473,18 @@ export async function scrapSingleUrl(
       rawHtml: scraperResponse.text,
       screenshot: scraperResponse.screenshot,
       pageStatusCode: scraperResponse.metadata.pageStatusCode,
-      pageError: scraperResponse.metadata.pageError || undefined
+      pageError: scraperResponse.metadata.pageError || undefined,
     };
   };
 
-  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
+  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
+    text: "",
+    html: "",
+    rawHtml: "",
+    screenshot: "",
+    pageStatusCode: 200,
+    pageError: undefined,
+  };
   try {
     let urlKey = urlToScrap;
     try {
@@ -430,10 +510,10 @@ export async function scrapSingleUrl(
       }
 
       const attempt = await attemptScraping(urlToScrap, scraper);
-      text = attempt.text ?? '';
-      html = attempt.html ?? '';
-      rawHtml = attempt.rawHtml ?? '';
-      screenshot = attempt.screenshot ?? '';
+      text = attempt.text ?? "";
+      html = attempt.html ?? "";
+      rawHtml = attempt.rawHtml ?? "";
+      screenshot = attempt.screenshot ?? "";
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
       }
@@ -441,7 +521,6 @@ export async function scrapSingleUrl(
         pageError = attempt.pageError;
       }
 
       if (text && text.trim().length >= 100) break;
-
       if (pageStatusCode && pageStatusCode == 404) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
@@ -468,7 +547,7 @@ export async function scrapSingleUrl(
           screenshot: screenshot,
           sourceURL: urlToScrap,
           pageStatusCode: pageStatusCode,
-          pageError: pageError
+          pageError: pageError,
         },
       };
     } else {
@@ -480,7 +559,7 @@ export async function scrapSingleUrl(
           ...metadata,
           sourceURL: urlToScrap,
           pageStatusCode: pageStatusCode,
-          pageError: pageError
+          pageError: pageError,
         },
       };
     }
@@ -495,7 +574,7 @@ export async function scrapSingleUrl(
       metadata: {
         sourceURL: urlToScrap,
         pageStatusCode: pageStatusCode,
-        pageError: pageError
+        pageError: pageError,
       },
     } as Document;
   }
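
Note: the later hunks only add trailing commas and reflow the fallback loop scrapSingleUrl already had: each scraper in scrapersInOrder is attempted until one returns substantial text or a hard 404. A self-contained sketch of that pattern (names mirror the diff; the scraper functions here are stand-ins):

    type Attempt = { text?: string; pageStatusCode?: number };

    async function scrapeWithFallback(
      url: string,
      scrapers: Array<(url: string) => Promise<Attempt>>
    ): Promise<Attempt> {
      let last: Attempt = { text: "" };
      for (const scrape of scrapers) {
        last = await scrape(url);
        if ((last.text ?? "").trim().length >= 100) break; // substantial content: stop
        if (last.pageStatusCode === 404) break; // page is gone: retrying won't help
      }
      return last;
    }
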