diff --git a/apps/api/requests.http b/apps/api/requests.http
index 495df975..3a1a9902 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc
 content-type: application/json
 
 {
-    "url":"https://docs.mendable.ai"
+    "url":"firecrawl.dev"
 }
 
 
@@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json
 
 {
-    "url":"https://www.mendable.ai",
-    "crawlerOptions": {
-        "returnOnlyUrls": true
-    }
+    "url": "firecrawl.dev"
 }
+## "reoveTags": [],
+  # "mode": "crawl",
+  # "crawlerOptions": {
+  #   "allowBackwardCrawling": false
+  # },
+  # "pageOptions": {
+  #   "onlyMainContent": false,
+  #   "includeHtml": false,
+  #   "parsePDF": true
+  # }
+
diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts
index e084f5ef..233da921 100644
--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@@ -50,6 +50,5 @@ export function parseMarkdown(html: string) {
     /\[Skip to Content\]\(#[^\)]*\)/gi,
     ""
   );
-
   return markdownContent;
 }
diff --git a/apps/api/src/lib/timeout.ts b/apps/api/src/lib/timeout.ts
new file mode 100644
index 00000000..fd0e5ade
--- /dev/null
+++ b/apps/api/src/lib/timeout.ts
@@ -0,0 +1 @@
+export const axiosTimeout = 3000;
\ No newline at end of file
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 0793d711..5003845e 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities";
 import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url";
 import robotsParser from "robots-parser";
 import { getURLDepth } from "./utils/maxDepthUtils";
+import { axiosTimeout } from "../../../src/lib/timeout";
 
 export class WebCrawler {
   private initialUrl: string;
@@ -129,20 +130,16 @@ export class WebCrawler {
   ): Promise<{ url: string, html: string }[]> {
     // Fetch and parse robots.txt
     try {
-      const response = await axios.get(this.robotsTxtUrl);
+      const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout });
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
-
     }
-
     if(!crawlerOptions?.ignoreSitemap){
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-
       if (sitemapLinks.length > 0) {
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
@@ -154,7 +151,6 @@ export class WebCrawler {
       inProgress
     );
-
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -192,7 +188,6 @@ export class WebCrawler {
     //   }
     // }
-
     newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
 
     if (inProgress && newUrls.length > 0) {
@@ -258,11 +253,12 @@ export class WebCrawler {
         pageStatusCode = page.metadata?.pageStatusCode;
         pageError = page.metadata?.pageError || undefined;
       } else {
-        const response = await axios.get(url);
+        const response = await axios.get(url, { timeout: axiosTimeout });
         content = response.data ?? "";
         pageStatusCode = response.status;
         pageError = response.statusText != "OK" ? response.statusText : undefined;
       }
+
       const $ = load(content);
       let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
@@ -290,15 +286,15 @@ export class WebCrawler {
           !this.matchesExcludes(path) &&
           this.isRobotsAllowed(fullUrl)
         ) {
-
           links.push({ url: fullUrl, html: content, pageStatusCode, pageError });
         }
       }
     });
-
+
     if (this.visited.size === 1) {
       return links;
     }
+
     // Create a new list to return to avoid modifying the visited list
     return links.filter((link) => !this.visited.has(link.url));
   } catch (error) {
@@ -400,39 +396,32 @@ export class WebCrawler {
     let sitemapLinks: string[] = [];
 
     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       if (response.status === 200) {
         sitemapLinks = await getLinksFromSitemap(sitemapUrl);
       }
     } catch (error) {
-      // Error handling for failed sitemap fetch
-      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
     }
 
     if (sitemapLinks.length === 0) {
-      // If the first one doesn't work, try the base URL
       const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
       try {
-        const response = await axios.get(baseUrlSitemap);
+        const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout });
         if (response.status === 200) {
           sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
         }
       } catch (error) {
-        // Error handling for failed base URL sitemap fetch
-        // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
       }
     }
 
-    // Normalize and check if the URL is present in any of the sitemaps
     const normalizedUrl = normalizeUrl(url);
     const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
-
     // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
     if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-      // do not push the normalized url
       sitemapLinks.push(url);
     }
-
     return sitemapLinks;
   }
 }
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 21301af3..9e318505 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -106,7 +106,6 @@ export class WebScraperDataProvider {
     inProgress?: (progress: Progress) => void
   ): Promise {
     this.validateInitialUrl();
-
     if (!useCaching) {
       return this.processDocumentsWithoutCache(inProgress);
     }
@@ -264,8 +263,8 @@ export class WebScraperDataProvider {
       inProgress,
       allHtmls
     );
-    documents = await this.getSitemapData(this.urls[0], documents);
 
+    documents = await this.getSitemapData(this.urls[0], documents);
     documents = this.applyPathReplacements(documents);
     // documents = await this.applyImgAltText(documents);
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 3c7222c4..9f8d563d 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -119,7 +119,6 @@ export async function scrapWithScrapingBee(
     wait_browser,
     timeout,
   );
-
   const response = await client.get({
     ...clientParams,
     params: {
@@ -127,7 +126,6 @@
       'transparent_status_code': 'True'
     }
   });
-
   const contentType = response.headers["content-type"];
   if (contentType && contentType.includes("application/pdf")) {
     return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
@@ -398,7 +396,6 @@ export async function scrapSingleUrl(
   //* TODO: add an optional to return markdown or structured/extracted content
   let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
-
   return {
     text: await parseMarkdown(cleanedHtml),
     html: cleanedHtml,
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index c6dbf110..3f563471 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -1,4 +1,5 @@
 import axios from "axios";
+import { axiosTimeout } from "../../lib/timeout";
 import { parseStringPromise } from "xml2js";
 
 export async function getLinksFromSitemap(
@@ -8,7 +9,7 @@
   try {
     let content: string;
     try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
       content = response.data;
     } catch (error) {
       console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +43,7 @@
 export const fetchSitemapData = async (url: string): Promise => {
   const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
   try {
-    const response = await axios.get(sitemapUrl);
+    const response = await axios.get(sitemapUrl, { timeout: axiosTimeout });
     if (response.status === 200) {
       const xml = response.data;
       const parsedXml = await parseStringPromise(xml);
diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index 7116963e..05958232 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean {
   }
 
   try {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      url = 'https://' + url;
+    }
+
     const urlObj = new URL(url);
     const hostname = urlObj.hostname.toLowerCase();
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 1a67d60b..3e01571e 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -6,6 +6,7 @@ import dotenv from "dotenv";
 import pdf from "pdf-parse";
 import path from "path";
 import os from "os";
+import { axiosTimeout } from "../../../lib/timeout";
 
 dotenv.config();
 
@@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
   while (attempt < maxAttempts && !resultAvailable) {
     try {
-      resultResponse = await axios.get(resultUrl, { headers });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) });
       if (resultResponse.status === 200) {
         resultAvailable = true; // Exit condition met
       } else {
diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts
index 540f0bee..f9ce9b3c 100644
--- a/apps/api/src/scraper/WebScraper/utils/utils.ts
+++ b/apps/api/src/scraper/WebScraper/utils/utils.ts
@@ -4,7 +4,7 @@ export async function attemptScrapWithRequests(
   urlToScrap: string
 ): Promise {
   try {
-    const response = await axios.get(urlToScrap);
+    const response = await axios.get(urlToScrap, { timeout: 15000 });
 
     if (!response.data) {
       console.log("Failed normal requests as well");
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index a42b3e8c..f6328cf3 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -14,6 +14,7 @@ if(process.env.ENV === 'production') {
 getWebScraperQueue().process(
   Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
   async function (job, done) {
+    try {
     job.progress({
       current: 1,
@@ -22,7 +23,6 @@ getWebScraperQueue().process(
       current_url: "",
     });
     const start = Date.now();
-
     const { success, message, docs } = await startWebScraperPipeline({ job });
     const end = Date.now();
     const timeTakenInSeconds = (end - start) / 1000;
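
The change set above reduces to one pattern: a shared axiosTimeout constant (3000 ms, exported from apps/api/src/lib/timeout.ts) is passed as the timeout option to each axios.get call, the pdfProcessor polling request gets twice that budget, and attemptScrapWithRequests uses a larger fixed 15000 ms value. The sketch below is illustrative only and not part of the diff; the helper name fetchWithTimeout and its error-handling shape are assumptions, though the timeout values mirror the patch.

import axios from "axios";

// Shared constant, mirroring apps/api/src/lib/timeout.ts as added by the diff.
export const axiosTimeout = 3000;

// Hypothetical helper showing how the patch applies the budget: ordinary
// fetches use axiosTimeout, slow endpoints (e.g. PDF result polling) use
// double it, and failures are logged and swallowed rather than thrown.
export async function fetchWithTimeout(url: string, slow = false): Promise<string | null> {
  try {
    const response = await axios.get<string>(url, {
      timeout: slow ? axiosTimeout * 2 : axiosTimeout,
    });
    return response.data ?? "";
  } catch (error) {
    console.error(`Request failed for ${url}: ${error}`);
    return null;
  }
}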