diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 5ef1bdf2..2d66315c 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -45,10 +45,21 @@ export async function generateRequestParams(
     return defaultParams;
   }
 }
-export async function scrapWithFireEngine(
-  { url, waitFor = 0, screenshot = false, pageOptions = { parsePDF: true }, headers, options }:
-  { url: string, waitFor?: number, screenshot?: boolean, pageOptions?: { scrollXPaths?: string[], parsePDF?: boolean }, headers?: Record<string, string>, options?: any }
-): Promise<FireEngineResponse> {
+export async function scrapWithFireEngine({
+  url,
+  waitFor = 0,
+  screenshot = false,
+  pageOptions = { parsePDF: true },
+  headers,
+  options,
+}: {
+  url: string;
+  waitFor?: number;
+  screenshot?: boolean;
+  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
+  headers?: Record<string, string>;
+  options?: any;
+}): Promise<FireEngineResponse> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
@@ -71,7 +82,7 @@ export async function scrapWithFireEngine(
         headers: {
           "Content-Type": "application/json",
         },
-        timeout: universalTimeout + waitParam
+        timeout: universalTimeout + waitParam,
       }
     );
 
@@ -79,21 +90,34 @@ export async function scrapWithFireEngine(
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
+      return {
+        html: "",
+        screenshot: "",
+        pageStatusCode: response.data?.pageStatusCode,
+        pageError: response.data?.pageError,
+      };
     }
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+        url,
+        pageOptions?.parsePDF
+      );
       return { html: content, screenshot: "", pageStatusCode, pageError };
     } else {
       const data = response.data;
       const html = data.content;
       const screenshot = data.screenshot;
-      return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
+      return {
+        html: html ?? "",
+        screenshot: screenshot ?? "",
+        pageStatusCode: data.pageStatusCode,
+        pageError: data.pageError,
+      };
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Fire-Engine] Request timed out for ${url}`);
     } else {
       console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
@@ -107,38 +131,48 @@ export async function scrapWithScrapingBee(
   wait_browser: string = "domcontentloaded",
   timeout: number = universalTimeout,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
     const clientParams = await generateRequestParams(
       url,
       wait_browser,
-      timeout,
+      timeout
     );
     const response = await client.get({
       ...clientParams,
       params: {
         ...clientParams.params,
-        'transparent_status_code': 'True'
-      }
+        transparent_status_code: "True",
+      },
     });
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
-
     } else {
       let text = "";
       try {
         const decoder = new TextDecoder();
         text = decoder.decode(response.data);
       } catch (decodeError) {
-        console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
+        console.error(
+          `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
+        );
       }
-      return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
+      return {
+        content: text,
+        pageStatusCode: response.status,
+        pageError:
+          response.statusText != "OK" ? response.statusText : undefined,
+      };
     }
   } catch (error) {
     console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
-    return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
+    return {
+      content: "",
+      pageStatusCode: error.response.status,
+      pageError: error.response.statusText,
+    };
   }
 }
@@ -147,29 +181,37 @@ export async function scrapWithPlaywright(
   waitFor: number = 0,
   headers?: Record<string, string>,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
 
-    const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
-      url: url,
-      wait_after_load: waitParam,
-      headers: headers,
-    }, {
-      headers: {
-        "Content-Type": "application/json",
+    const response = await axios.post(
+      process.env.PLAYWRIGHT_MICROSERVICE_URL,
+      {
+        url: url,
+        wait_after_load: waitParam,
+        headers: headers,
       },
-      timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
-      transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
-    });
+      {
+        headers: {
+          "Content-Type": "application/json",
+        },
+        timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
+        transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
+      }
+    );
 
     if (response.status !== 200) {
       console.error(
         `[Playwright] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
+      return {
+        content: "",
+        pageStatusCode: response.data?.pageStatusCode,
+        pageError: response.data?.pageError,
+      };
     }
 
     const contentType = response.headers["content-type"];
@@ -180,14 +222,20 @@ export async function scrapWithPlaywright(
       try {
         const data = JSON.parse(textData);
         const html = data.content;
-        return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
+        return {
+          content: html ?? "",
+          pageStatusCode: data.pageStatusCode,
+          pageError: data.pageError,
+        };
       } catch (jsonError) {
-        console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
+        console.error(
+          `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
+        );
         return { content: "" };
       }
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Playwright] Request timed out for ${url}`);
     } else {
       console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
@@ -199,21 +247,25 @@
 export async function scrapWithFetch(
   url: string,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const response = await axios.get(url, {
       headers: {
         "Content-Type": "application/json",
       },
       timeout: universalTimeout,
-      transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
+      transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
     });
 
     if (response.status !== 200) {
       console.error(
         `[Axios] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { content: "", pageStatusCode: response.status, pageError: response.statusText };
+      return {
+        content: "",
+        pageStatusCode: response.status,
+        pageError: response.statusText,
+      };
     }
 
     const contentType = response.headers["content-type"];
@@ -224,7 +276,7 @@ export async function scrapWithFetch(
       return { content: text, pageStatusCode: 200 };
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Axios] Request timed out for ${url}`);
     } else {
       console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
@@ -291,9 +343,6 @@ function getScrapingFallbackOrder(
   return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
-
-
-
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = {
@@ -301,7 +350,7 @@ export async function scrapSingleUrl(
     includeHtml: false,
     waitFor: 0,
     screenshot: false,
-    headers: undefined
+    headers: undefined,
   },
   existingHtml: string = ""
 ): Promise<Document> {
@@ -310,8 +359,12 @@ export async function scrapSingleUrl(
   const attemptScraping = async (
     url: string,
     method: (typeof baseScrapers)[number]
-  ) => {
-    let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
+  ) => {
+    let scraperResponse: {
+      text: string;
+      screenshot: string;
+      metadata: { pageStatusCode?: number; pageError?: string | null };
+    } = { text: "", screenshot: "", metadata: {} };
     let screenshot = "";
     switch (method) {
       case "fire-engine":
@@ -322,9 +375,8 @@ export async function scrapSingleUrl(
             waitFor: pageOptions.waitFor,
             screenshot: pageOptions.screenshot,
             pageOptions: pageOptions,
-            headers: pageOptions.headers
-          }
-          );
+            headers: pageOptions.headers,
+          });
           scraperResponse.text = response.html;
           scraperResponse.screenshot = response.screenshot;
           scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
@@ -345,7 +397,11 @@ export async function scrapSingleUrl(
         break;
       case "playwright":
         if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          const response = await scrapWithPlaywright(
+            url,
+            pageOptions.waitFor,
+            pageOptions.headers
+          );
           scraperResponse.text = response.content;
           scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
           scraperResponse.metadata.pageError = response.pageError;
@@ -367,22 +423,39 @@ export async function scrapSingleUrl(
         break;
     }
 
-    let customScrapedContent : FireEngineResponse | null = null;
+    let customScrapedContent: FireEngineResponse | null = null;
 
     // Check for custom scraping conditions
-    const customScraperResult = await handleCustomScraping(scraperResponse.text, url);
+    const customScraperResult = await handleCustomScraping(
+      scraperResponse.text,
+      url
+    );
 
-    if (customScraperResult){
+    if (customScraperResult) {
       switch (customScraperResult.scraper) {
         case "fire-engine":
-          customScrapedContent = await scrapWithFireEngine({url: customScraperResult.url, waitFor: customScraperResult.waitAfterLoad, screenshot: false, pageOptions: customScraperResult.pageOptions})
+          customScrapedContent = await scrapWithFireEngine({
+            url: customScraperResult.url,
+            waitFor: customScraperResult.waitAfterLoad,
+            screenshot: false,
+            pageOptions: customScraperResult.pageOptions,
+          });
           if (screenshot) {
             customScrapedContent.screenshot = screenshot;
           }
           break;
         case "pdf":
-          const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
-          customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
+          const { content, pageStatusCode, pageError } =
+            await fetchAndProcessPdf(
+              customScraperResult.url,
+              pageOptions?.parsePDF
+            );
+          customScrapedContent = {
+            html: content,
+            screenshot,
+            pageStatusCode,
+            pageError,
+          };
           break;
       }
     }
@@ -400,11 +473,18 @@ export async function scrapSingleUrl(
       rawHtml: scraperResponse.text,
       screenshot: scraperResponse.screenshot,
       pageStatusCode: scraperResponse.metadata.pageStatusCode,
-      pageError: scraperResponse.metadata.pageError || undefined
+      pageError: scraperResponse.metadata.pageError || undefined,
     };
   };
 
-  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
+  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
+    text: "",
+    html: "",
+    rawHtml: "",
+    screenshot: "",
+    pageStatusCode: 200,
+    pageError: undefined,
+  };
   try {
     let urlKey = urlToScrap;
     try {
@@ -430,18 +510,17 @@ export async function scrapSingleUrl(
       }
 
       const attempt = await attemptScraping(urlToScrap, scraper);
-      text = attempt.text ?? '';
-      html = attempt.html ?? '';
-      rawHtml = attempt.rawHtml ?? '';
-      screenshot = attempt.screenshot ?? '';
+      text = attempt.text ?? "";
+      html = attempt.html ?? "";
+      rawHtml = attempt.rawHtml ?? "";
+      screenshot = attempt.screenshot ?? "";
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
       }
       if (attempt.pageError) {
         pageError = attempt.pageError;
       }
-
-
+
       if (text && text.trim().length >= 100) break;
       if (pageStatusCode && pageStatusCode == 404) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
@@ -468,7 +547,7 @@ export async function scrapSingleUrl(
         screenshot: screenshot,
         sourceURL: urlToScrap,
         pageStatusCode: pageStatusCode,
-        pageError: pageError
+        pageError: pageError,
       },
     };
   } else {
@@ -480,7 +559,7 @@ export async function scrapSingleUrl(
         ...metadata,
         sourceURL: urlToScrap,
         pageStatusCode: pageStatusCode,
-        pageError: pageError
+        pageError: pageError,
       },
     };
   }
@@ -495,7 +574,7 @@ export async function scrapSingleUrl(
       metadata: {
         sourceURL: urlToScrap,
         pageStatusCode: pageStatusCode,
-        pageError: pageError
+        pageError: pageError,
      },
    } as Document;
  }
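
For context on the signature change in the first hunk: scrapWithFireEngine now takes a single destructured options object instead of an inline-annotated parameter, so call sites name every argument. A minimal sketch of a caller against the new signature follows; the import path, URL, and header values are illustrative assumptions, not taken from the patch:

    import { scrapWithFireEngine } from "./single_url"; // path assumed

    async function example(): Promise<void> {
      // Only `url` is required; omitted fields fall back to the defaults
      // declared in the signature (waitFor = 0, screenshot = false,
      // pageOptions = { parsePDF: true }).
      const result = await scrapWithFireEngine({
        url: "https://example.com",             // hypothetical target
        waitFor: 1000,                          // extra ms to wait after load
        headers: { "User-Agent": "firecrawl" }, // hypothetical header
      });
      console.log(result.pageStatusCode, result.html.length);
    }

Named options keep the growing parameter list readable and let future fields be added without touching existing call sites.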