Merge pull request #312 from mendableai/rafa/investigating-crawl-bugs

[Bug] Fixed axios bug that was making jobs get stuck in the active queue
2025-08-10 22:29:00 +08:00 · 2024-06-24 16:52:34 -03:00 · 2024-06-24 16:52:34 -03:00 · e5314ee8e7
commit e5314ee8e7
parent 3c7b7e7242 90b7fff366
11 changed files with 38 additions and 39 deletions
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer 
+Authorization: Bearer fc
 content-type: application/json

 {
-    "url":"https://docs.mendable.ai"
+    "url":"firecrawl.dev"
 }


@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1

 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer 
+Authorization: Bearer fc-
 content-type: application/json

 {
-    "url":"https://www.mendable.ai",
-    "crawlerOptions": {
-        "returnOnlyUrls": true
-    }
+  "url": "firecrawl.dev"
 }

+## "removeTags": [],
+  # "mode": "crawl",
+  # "crawlerOptions": {
+  #     "allowBackwardCrawling": false
+  # },
+  # "pageOptions": {
+  #   "onlyMainContent": false,
+  #   "includeHtml": false,
+  #   "parsePDF": true
+  # }
+



--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@ -50,6 +50,5 @@ export function parseMarkdown(html: string) {
    /\[Skip to Content\]\(#[^\)]*\)/gi,
    ""
  );
-
  return markdownContent;
 }
--- a/apps/api/src/lib/timeout.ts
+++ b/apps/api/src/lib/timeout.ts
@ -0,0 +1 @@
+export const axiosTimeout = 3000;
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";

 export class WebCrawler {
  private initialUrl: string;
@ -129,20 +130,16 @@ export class WebCrawler {
  ): Promise<{ url: string, html: string }[]> {
    // Fetch and parse robots.txt
    try {
-      const response = await axios.get(this.robotsTxtUrl);
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
      this.robots = robotsParser(this.robotsTxtUrl, response.data);
    } catch (error) {
      console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
-
    }

-
    if(!crawlerOptions?.ignoreSitemap){
      const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-    
      if (sitemapLinks.length > 0) {
        let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-       
        return filteredLinks.map(link => ({ url: link, html: "" }));
      }
    }
@ -154,7 +151,6 @@ export class WebCrawler {
      inProgress
    );
   
-    
    if (
      urls.length === 0 &&
      this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@ -192,7 +188,6 @@ export class WebCrawler {
      //   }
      // }

-
      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
      
      if (inProgress && newUrls.length > 0) {
@ -258,11 +253,12 @@ export class WebCrawler {
        pageStatusCode = page.metadata?.pageStatusCode;
        pageError = page.metadata?.pageError || undefined;
      } else {
-        const response = await axios.get(url);
+        const response = await axios.get(url, { timeout: axiosTimeout });
        content = response.data ?? "";
        pageStatusCode = response.status;
        pageError = response.statusText != "OK" ? response.statusText : undefined;
      }
+
      const $ = load(content);
      let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];

@ -290,15 +286,15 @@ export class WebCrawler {
            !this.matchesExcludes(path) &&
            this.isRobotsAllowed(fullUrl)
          ) {
-
            links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
          }
        }
      });
-
+      
      if (this.visited.size === 1) {
        return links;
      }
+
      // Create a new list to return to avoid modifying the visited list
      return links.filter((link) => !this.visited.has(link.url));
    } catch (error) {
@ -400,39 +396,32 @@ export class WebCrawler {
    let sitemapLinks: string[] = [];

    try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
      if (response.status === 200) {
        sitemapLinks = await getLinksFromSitemap(sitemapUrl);
      }
    } catch (error) {
-      // Error handling for failed sitemap fetch
-      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
    }

    if (sitemapLinks.length === 0) {
-      // If the first one doesn't work, try the base URL
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
      try {
-        const response = await axios.get(baseUrlSitemap);
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
        if (response.status === 200) {
          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
        }
      } catch (error) {
-        // Error handling for failed base URL sitemap fetch
-        // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
      }
    }

-    // Normalize and check if the URL is present in any of the sitemaps
    const normalizedUrl = normalizeUrl(url);
    const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
-
    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
    if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-      // do not push the normalized url
      sitemapLinks.push(url);
    }
-
    return sitemapLinks;
  }
 }
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@ -106,7 +106,6 @@ export class WebScraperDataProvider {
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    this.validateInitialUrl();
-
    if (!useCaching) {
      return this.processDocumentsWithoutCache(inProgress);
    }
@ -264,8 +263,8 @@ export class WebScraperDataProvider {
      inProgress,
      allHtmls
    );
-    documents = await this.getSitemapData(this.urls[0], documents);

+    documents = await this.getSitemapData(this.urls[0], documents);
    documents = this.applyPathReplacements(documents);
    // documents = await this.applyImgAltText(documents);

--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@ -119,7 +119,6 @@ export async function scrapWithScrapingBee(
      wait_browser,
      timeout,
    );
-
    const response = await client.get({
      ...clientParams,
      params: {
@ -127,7 +126,6 @@ export async function scrapWithScrapingBee(
        'transparent_status_code': 'True'
      }
    });
-
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
@ -398,7 +396,6 @@ export async function scrapSingleUrl(

    //* TODO: add an optional to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
-
    return {
      text: await parseMarkdown(cleanedHtml),
      html: cleanedHtml,
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";

 export async function getLinksFromSitemap(
@ -8,7 +9,7 @@ export async function getLinksFromSitemap(
  try {
    let content: string;
    try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
      content = response.data;
    } catch (error) {
      console.error(`Request failed for ${sitemapUrl}: ${error}`);
@ -42,7 +43,7 @@ export async function getLinksFromSitemap(
 export const fetchSitemapData = async (url: string): Promise<SitemapEntry[] | null> => {
  const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
  try {
-    const response = await axios.get(sitemapUrl);
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
    if (response.status === 200) {
      const xml = response.data;
      const parsedXml = await parseStringPromise(xml);
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean {
  }

  try {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      url = 'https://' + url;
+    }
+    
    const urlObj = new URL(url);
    const hostname = urlObj.hostname.toLowerCase();

--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";

 dotenv.config();

@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro

      while (attempt < maxAttempts && !resultAvailable) {
        try {
-          resultResponse = await axios.get(resultUrl, { headers });
+          resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
          if (resultResponse.status === 200) {
            resultAvailable = true; // Exit condition met
          } else {
--- a/apps/api/src/scraper/WebScraper/utils/utils.ts
+++ b/apps/api/src/scraper/WebScraper/utils/utils.ts
@ -4,7 +4,7 @@ export async function attemptScrapWithRequests(
  urlToScrap: string
 ): Promise<string | null> {
  try {
-    const response = await axios.get(urlToScrap);
+    const response = await axios.get(urlToScrap, { timeout: 15000 });

    if (!response.data) {
      console.log("Failed normal requests as well");
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@ -14,6 +14,7 @@ if(process.env.ENV === 'production') {
 getWebScraperQueue().process(
  Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
  async function (job, done) {
+
    try {
      job.progress({
        current: 1,
@ -22,7 +23,6 @@ getWebScraperQueue().process(
        current_url: "",
      });
      const start = Date.now();
-
      const { success, message, docs } = await startWebScraperPipeline({ job });
      const end = Date.now();
      const timeTakenInSeconds = (end - start) / 1000;