Update single_url.ts

Nicolas 2024-06-28 15:51:18 -03:00
parent 7e17498bcf
commit 9bf74bc774

@@ -45,10 +45,21 @@ export async function generateRequestParams(
     return defaultParams;
   }
 }
-export async function scrapWithFireEngine(
-  { url, waitFor = 0, screenshot = false, pageOptions = { parsePDF: true }, headers, options }:
-  { url: string, waitFor?: number, screenshot?: boolean, pageOptions?: { scrollXPaths?: string[], parsePDF?: boolean }, headers?: Record<string, string>, options?: any }
-): Promise<FireEngineResponse> {
+export async function scrapWithFireEngine({
+  url,
+  waitFor = 0,
+  screenshot = false,
+  pageOptions = { parsePDF: true },
+  headers,
+  options,
+}: {
+  url: string;
+  waitFor?: number;
+  screenshot?: boolean;
+  pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
+  headers?: Record<string, string>;
+  options?: any;
+}): Promise<FireEngineResponse> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
@@ -71,7 +82,7 @@ export async function scrapWithFireEngine(
         headers: {
           "Content-Type": "application/json",
         },
-        timeout: universalTimeout + waitParam
+        timeout: universalTimeout + waitParam,
       }
     );
 
@@ -79,21 +90,34 @@
       console.error(
         `[Fire-Engine] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { html: "", screenshot: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
+      return {
+        html: "",
+        screenshot: "",
+        pageStatusCode: response.data?.pageStatusCode,
+        pageError: response.data?.pageError,
+      };
     }
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
-      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(url, pageOptions?.parsePDF);
+      const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(
+        url,
+        pageOptions?.parsePDF
+      );
       return { html: content, screenshot: "", pageStatusCode, pageError };
     } else {
       const data = response.data;
       const html = data.content;
       const screenshot = data.screenshot;
-      return { html: html ?? "", screenshot: screenshot ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
+      return {
+        html: html ?? "",
+        screenshot: screenshot ?? "",
+        pageStatusCode: data.pageStatusCode,
+        pageError: data.pageError,
+      };
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Fire-Engine] Request timed out for ${url}`);
     } else {
       console.error(`[Fire-Engine][c] Error fetching url: ${url} -> ${error}`);
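
Note: the hunks above reshape scrapWithFireEngine's single destructured options parameter into one property per line. A minimal sketch of a call site under this signature (the URL and wait value are illustrative placeholders, not from this commit):

    const result = await scrapWithFireEngine({
      url: "https://example.com", // hypothetical target
      waitFor: 1000, // overrides the default of 0
      screenshot: false,
      pageOptions: { parsePDF: true },
    });
    // FireEngineResponse fields used throughout this file:
    console.log(result.pageStatusCode, result.pageError, result.html.length);
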
@@ -107,38 +131,48 @@ export async function scrapWithScrapingBee(
   wait_browser: string = "domcontentloaded",
   timeout: number = universalTimeout,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
     const clientParams = await generateRequestParams(
       url,
       wait_browser,
-      timeout,
+      timeout
     );
     const response = await client.get({
       ...clientParams,
       params: {
         ...clientParams.params,
-        'transparent_status_code': 'True'
-      }
+        transparent_status_code: "True",
+      },
     });
 
     const contentType = response.headers["content-type"];
     if (contentType && contentType.includes("application/pdf")) {
       return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
     } else {
       let text = "";
       try {
         const decoder = new TextDecoder();
         text = decoder.decode(response.data);
       } catch (decodeError) {
-        console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
+        console.error(
+          `[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`
+        );
       }
-      return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
+      return {
+        content: text,
+        pageStatusCode: response.status,
+        pageError:
+          response.statusText != "OK" ? response.statusText : undefined,
+      };
     }
   } catch (error) {
     console.error(`[ScrapingBee][c] Error fetching url: ${url} -> ${error}`);
-    return { content: "", pageStatusCode: error.response.status, pageError: error.response.statusText };
+    return {
+      content: "",
+      pageStatusCode: error.response.status,
+      pageError: error.response.statusText,
+    };
   }
 }
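
Note: besides reflowing scrapWithScrapingBee, the hunk above rewrites the quoted 'transparent_status_code' key as a plain identifier with a trailing comma. If I read ScrapingBee's semantics correctly, that parameter makes the proxy pass through the target page's real HTTP status instead of its own, which is why the function can then report pageStatusCode: response.status and derive pageError from a non-"OK" statusText. A self-contained sketch of the object-spread merge used there (not ScrapingBee's API, just the pattern):

    function withTransparentStatus(clientParams: { params?: Record<string, unknown> }) {
      return {
        ...clientParams,
        params: {
          ...clientParams.params,
          transparent_status_code: "True", // surface the target's own status code
        },
      };
    }
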
@@ -147,29 +181,37 @@ export async function scrapWithPlaywright(
   waitFor: number = 0,
   headers?: Record<string, string>,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const reqParams = await generateRequestParams(url);
     // If the user has passed a wait parameter in the request, use that
     const waitParam = reqParams["params"]?.wait ?? waitFor;
-    const response = await axios.post(process.env.PLAYWRIGHT_MICROSERVICE_URL, {
-      url: url,
-      wait_after_load: waitParam,
-      headers: headers,
-    }, {
-      headers: {
-        "Content-Type": "application/json",
-      },
-      timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
-      transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
-    });
+    const response = await axios.post(
+      process.env.PLAYWRIGHT_MICROSERVICE_URL,
+      {
+        url: url,
+        wait_after_load: waitParam,
+        headers: headers,
+      },
+      {
+        headers: {
+          "Content-Type": "application/json",
+        },
+        timeout: universalTimeout + waitParam, // Add waitParam to timeout to account for the wait time
+        transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
+      }
+    );
 
     if (response.status !== 200) {
       console.error(
         `[Playwright] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { content: "", pageStatusCode: response.data?.pageStatusCode, pageError: response.data?.pageError };
+      return {
+        content: "",
+        pageStatusCode: response.data?.pageStatusCode,
+        pageError: response.data?.pageError,
+      };
     }
 
     const contentType = response.headers["content-type"];
@@ -180,14 +222,20 @@ export async function scrapWithPlaywright(
       try {
         const data = JSON.parse(textData);
         const html = data.content;
-        return { content: html ?? "", pageStatusCode: data.pageStatusCode, pageError: data.pageError };
+        return {
+          content: html ?? "",
+          pageStatusCode: data.pageStatusCode,
+          pageError: data.pageError,
+        };
       } catch (jsonError) {
-        console.error(`[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`);
+        console.error(
+          `[Playwright] Error parsing JSON response for url: ${url} -> ${jsonError}`
+        );
         return { content: "" };
       }
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Playwright] Request timed out for ${url}`);
     } else {
       console.error(`[Playwright] Error fetching url: ${url} -> ${error}`);
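
Note: the first Playwright hunk reflows axios.post so its three arguments (endpoint, request body, axios config) each sit on their own line. A standalone sketch of that call shape (endpoint, payload, and timeout are placeholders; the diff itself uses process.env.PLAYWRIGHT_MICROSERVICE_URL and universalTimeout + waitParam):

    import axios from "axios";

    const response = await axios.post(
      "https://playwright-service.example/scrape", // hypothetical endpoint
      { url: "https://example.com", wait_after_load: 0 }, // request body
      {
        headers: { "Content-Type": "application/json" },
        timeout: 15000, // placeholder timeout
        transformResponse: [(data) => data], // keep the raw body as a string
      }
    );
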
@@ -199,21 +247,25 @@ export async function scrapWithPlaywright(
 export async function scrapWithFetch(
   url: string,
   pageOptions: { parsePDF?: boolean } = { parsePDF: true }
-): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
+): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> {
   try {
     const response = await axios.get(url, {
       headers: {
         "Content-Type": "application/json",
       },
       timeout: universalTimeout,
-      transformResponse: [(data) => data] // Prevent axios from parsing JSON automatically
+      transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically
     });
 
     if (response.status !== 200) {
       console.error(
         `[Axios] Error fetching url: ${url} with status: ${response.status}`
       );
-      return { content: "", pageStatusCode: response.status, pageError: response.statusText };
+      return {
+        content: "",
+        pageStatusCode: response.status,
+        pageError: response.statusText,
+      };
     }
 
     const contentType = response.headers["content-type"];
@@ -224,7 +276,7 @@ export async function scrapWithFetch(
       return { content: text, pageStatusCode: 200 };
     }
   } catch (error) {
-    if (error.code === 'ECONNABORTED') {
+    if (error.code === "ECONNABORTED") {
       console.log(`[Axios] Request timed out for ${url}`);
     } else {
       console.error(`[Axios] Error fetching url: ${url} -> ${error}`);
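
Note: scrapWithFetch keeps the same identity transformResponse the Playwright path uses. Axios would otherwise JSON.parse an application/json body eagerly; the identity transform leaves response.data as the raw string so the function can branch on content-type (PDF vs. text) before parsing. A small sketch of the difference (the URL is a placeholder):

    import axios from "axios";

    const res = await axios.get("https://example.com/data", {
      transformResponse: [(data) => data], // res.data stays a raw string
    });
    const body = res.headers["content-type"]?.includes("application/json")
      ? JSON.parse(res.data) // parse manually, only when it really is JSON
      : res.data;
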
@@ -291,9 +343,6 @@ function getScrapingFallbackOrder(
   return scrapersInOrder as (typeof baseScrapers)[number][];
 }
 
 export async function scrapSingleUrl(
   urlToScrap: string,
   pageOptions: PageOptions = {
@@ -301,7 +350,7 @@ export async function scrapSingleUrl(
     includeHtml: false,
     waitFor: 0,
     screenshot: false,
-    headers: undefined
+    headers: undefined,
   },
   existingHtml: string = ""
 ): Promise<Document> {
@@ -311,7 +360,11 @@ export async function scrapSingleUrl(
     url: string,
     method: (typeof baseScrapers)[number]
   ) => {
-    let scraperResponse: { text: string, screenshot: string, metadata: { pageStatusCode?: number, pageError?: string | null } } = { text: "", screenshot: "", metadata: {} };
+    let scraperResponse: {
+      text: string;
+      screenshot: string;
+      metadata: { pageStatusCode?: number; pageError?: string | null };
+    } = { text: "", screenshot: "", metadata: {} };
     let screenshot = "";
     switch (method) {
       case "fire-engine":
@@ -322,9 +375,8 @@ export async function scrapSingleUrl(
             waitFor: pageOptions.waitFor,
             screenshot: pageOptions.screenshot,
             pageOptions: pageOptions,
-            headers: pageOptions.headers
-          }
-          );
+            headers: pageOptions.headers,
+          });
           scraperResponse.text = response.html;
           scraperResponse.screenshot = response.screenshot;
           scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
@@ -345,7 +397,11 @@ export async function scrapSingleUrl(
         break;
       case "playwright":
         if (process.env.PLAYWRIGHT_MICROSERVICE_URL) {
-          const response = await scrapWithPlaywright(url, pageOptions.waitFor, pageOptions.headers);
+          const response = await scrapWithPlaywright(
+            url,
+            pageOptions.waitFor,
+            pageOptions.headers
+          );
           scraperResponse.text = response.content;
           scraperResponse.metadata.pageStatusCode = response.pageStatusCode;
           scraperResponse.metadata.pageError = response.pageError;
@@ -367,22 +423,39 @@ export async function scrapSingleUrl(
         break;
     }
 
-    let customScrapedContent : FireEngineResponse | null = null;
+    let customScrapedContent: FireEngineResponse | null = null;
 
     // Check for custom scraping conditions
-    const customScraperResult = await handleCustomScraping(scraperResponse.text, url);
+    const customScraperResult = await handleCustomScraping(
+      scraperResponse.text,
+      url
+    );
 
-    if (customScraperResult){
+    if (customScraperResult) {
       switch (customScraperResult.scraper) {
         case "fire-engine":
-          customScrapedContent = await scrapWithFireEngine({url: customScraperResult.url, waitFor: customScraperResult.waitAfterLoad, screenshot: false, pageOptions: customScraperResult.pageOptions})
+          customScrapedContent = await scrapWithFireEngine({
+            url: customScraperResult.url,
+            waitFor: customScraperResult.waitAfterLoad,
+            screenshot: false,
+            pageOptions: customScraperResult.pageOptions,
+          });
           if (screenshot) {
             customScrapedContent.screenshot = screenshot;
           }
           break;
         case "pdf":
-          const { content, pageStatusCode, pageError } = await fetchAndProcessPdf(customScraperResult.url, pageOptions?.parsePDF);
-          customScrapedContent = { html: content, screenshot, pageStatusCode, pageError }
+          const { content, pageStatusCode, pageError } =
+            await fetchAndProcessPdf(
+              customScraperResult.url,
+              pageOptions?.parsePDF
+            );
+          customScrapedContent = {
+            html: content,
+            screenshot,
+            pageStatusCode,
+            pageError,
+          };
           break;
       }
     }
@@ -400,11 +473,18 @@ export async function scrapSingleUrl(
       rawHtml: scraperResponse.text,
       screenshot: scraperResponse.screenshot,
       pageStatusCode: scraperResponse.metadata.pageStatusCode,
-      pageError: scraperResponse.metadata.pageError || undefined
+      pageError: scraperResponse.metadata.pageError || undefined,
     };
   };
 
-  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = { text: "", html: "", rawHtml: "", screenshot: "", pageStatusCode: 200, pageError: undefined };
+  let { text, html, rawHtml, screenshot, pageStatusCode, pageError } = {
+    text: "",
+    html: "",
+    rawHtml: "",
+    screenshot: "",
+    pageStatusCode: 200,
+    pageError: undefined,
+  };
   try {
     let urlKey = urlToScrap;
     try {
@@ -430,10 +510,10 @@ export async function scrapSingleUrl(
       }
 
       const attempt = await attemptScraping(urlToScrap, scraper);
-      text = attempt.text ?? '';
-      html = attempt.html ?? '';
-      rawHtml = attempt.rawHtml ?? '';
-      screenshot = attempt.screenshot ?? '';
+      text = attempt.text ?? "";
+      html = attempt.html ?? "";
+      rawHtml = attempt.rawHtml ?? "";
+      screenshot = attempt.screenshot ?? "";
       if (attempt.pageStatusCode) {
         pageStatusCode = attempt.pageStatusCode;
       }
@@ -441,7 +521,6 @@ export async function scrapSingleUrl(
         pageError = attempt.pageError;
       }
 
       if (text && text.trim().length >= 100) break;
-
       if (pageStatusCode && pageStatusCode == 404) break;
       const nextScraperIndex = scrapersInOrder.indexOf(scraper) + 1;
@@ -468,7 +547,7 @@ export async function scrapSingleUrl(
           screenshot: screenshot,
           sourceURL: urlToScrap,
           pageStatusCode: pageStatusCode,
-          pageError: pageError
+          pageError: pageError,
         },
       };
     } else {
@@ -480,7 +559,7 @@ export async function scrapSingleUrl(
           ...metadata,
           sourceURL: urlToScrap,
           pageStatusCode: pageStatusCode,
-          pageError: pageError
+          pageError: pageError,
         },
       };
     }
@@ -495,7 +574,7 @@ export async function scrapSingleUrl(
       metadata: {
         sourceURL: urlToScrap,
         pageStatusCode: pageStatusCode,
-        pageError: pageError
+        pageError: pageError,
       },
     } as Document;
   }
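
Note: the later hunks only add trailing commas and reflow the fallback loop scrapSingleUrl already had: each scraper in scrapersInOrder is attempted until one returns substantial text or a hard 404. A self-contained sketch of that pattern (names mirror the diff; the scraper functions here are stand-ins):

    type Attempt = { text?: string; pageStatusCode?: number };

    async function scrapeWithFallback(
      url: string,
      scrapers: Array<(url: string) => Promise<Attempt>>
    ): Promise<Attempt> {
      let last: Attempt = { text: "" };
      for (const scrape of scrapers) {
        last = await scrape(url);
        if ((last.text ?? "").trim().length >= 100) break; // substantial content: stop
        if (last.pageStatusCode === 404) break; // page is gone: retrying won't help
      }
      return last;
    }
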