From 21d29de819f070343fd1825e6dd2ecb0172b4934 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 24 Jun 2024 16:25:07 -0300 Subject: [PATCH 1/5] testing crawl with new.abb.com case many unnecessary console.logs for tracing the code execution --- apps/api/requests.http | 22 +++++-- apps/api/src/controllers/crawl.ts | 2 + apps/api/src/lib/html-to-markdown.ts | 3 +- apps/api/src/main/runWebScraper.ts | 2 + apps/api/src/scraper/WebScraper/crawler.ts | 64 +++++++++++++------ apps/api/src/scraper/WebScraper/index.ts | 27 +++++++- apps/api/src/scraper/WebScraper/single_url.ts | 26 +++++++- apps/api/src/scraper/WebScraper/sitemap.ts | 4 +- .../src/scraper/WebScraper/utils/blocklist.ts | 4 ++ .../scraper/WebScraper/utils/pdfProcessor.ts | 2 +- .../api/src/scraper/WebScraper/utils/utils.ts | 2 +- apps/api/src/services/queue-worker.ts | 5 +- 12 files changed, 128 insertions(+), 35 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 495df975..32f9f60e 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,10 +1,10 @@ ### Crawl Website POST http://localhost:3002/v0/scrape HTTP/1.1 -Authorization: Bearer +Authorization: Bearer fc- content-type: application/json { - "url":"https://docs.mendable.ai" + "url":"new.abb.com/sustainability/foundation" } @@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1 ### Scrape Website POST http://localhost:3002/v0/crawl HTTP/1.1 -Authorization: Bearer +Authorization: Bearer fc- content-type: application/json { - "url":"https://www.mendable.ai", - "crawlerOptions": { - "returnOnlyUrls": true - } + "url": "new.abb.com/sustainability/foundation" } +## "reoveTags": [], + # "mode": "crawl", + # "crawlerOptions": { + # "allowBackwardCrawling": false + # }, + # "pageOptions": { + # "onlyMainContent": false, + # "includeHtml": false, + # "parsePDF": true + # } + diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 8fd876d3..43edc579 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -66,6 +66,7 @@ export async function crawlController(req: Request, res: Response) { parsePDF: true }; + console.log('1. 
here OK!') if (mode === "single_urls" && !url.includes(",")) { try { const a = new WebScraperDataProvider(); @@ -84,6 +85,7 @@ export async function crawlController(req: Request, res: Response) { current_url: progress.currentDocumentUrl, }); }); + console.log('crawlController - return res.json...') return res.json({ success: true, documents: docs, diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index e084f5ef..86899598 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,5 +1,6 @@ export function parseMarkdown(html: string) { + console.log('parseMarkdown - start!') var TurndownService = require("turndown"); var turndownPluginGfm = require('joplin-turndown-plugin-gfm') @@ -50,6 +51,6 @@ export function parseMarkdown(html: string) { /\[Skip to Content\]\(#[^\)]*\)/gi, "" ); - + console.log('parseMarkdown - return') return markdownContent; } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index dee89bc4..a8c4e84a 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -78,9 +78,11 @@ export async function runWebScraper({ pageOptions: pageOptions, }); } + console.log('runWebScraper - getDocuments') const docs = (await provider.getDocuments(false, (progress: Progress) => { inProgress(progress); })) as Document[]; + console.log('runWebScraper - getDocuments - done - docs.length:', docs.length) if (docs.length === 0) { return { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 0793d711..3a3ca052 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -129,24 +129,31 @@ export class WebCrawler { ): Promise<{ url: string, html: string }[]> { // Fetch and parse robots.txt try { - const response = await axios.get(this.robotsTxtUrl); + console.log('3.1 here OK') + console.log('this.robotsTxtUrl:', this.robotsTxtUrl) + const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 }); + console.log('????', {response}) + console.log('3.2 here OK') this.robots = robotsParser(this.robotsTxtUrl, response.data); } catch (error) { console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); } - + console.log('4. here OK!') if(!crawlerOptions?.ignoreSitemap){ + console.log('4.1') const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - + console.log('4.2') if (sitemapLinks.length > 0) { + console.log('4.3') let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - + console.log('4.4') return filteredLinks.map(link => ({ url: link, html: "" })); } } + console.log('5. here OK!') const urls = await this.crawlUrls( [this.initialUrl], pageOptions, @@ -154,7 +161,7 @@ export class WebCrawler { inProgress ); - + console.log('6. here OK!') if ( urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 @@ -164,6 +171,7 @@ export class WebCrawler { // make sure to run include exclude here again const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); + console.log('7. 
here OK!') return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } @@ -180,6 +188,7 @@ export class WebCrawler { } return; } + console.log('crawlUrls - crawl') const newUrls = await this.crawl(task, pageOptions); // add the initial url if not already added // if (this.visited.size === 1) { @@ -192,7 +201,7 @@ export class WebCrawler { // } // } - + console.log('---??---') newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); if (inProgress && newUrls.length > 0) { @@ -210,12 +219,14 @@ export class WebCrawler { currentDocumentUrl: task, }); } + console.log('----???----') await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } }, concurrencyLimit); + console.log('crawlUrls - queue.push') queue.push( urls.filter( (url) => @@ -225,7 +236,9 @@ export class WebCrawler { if (err) console.error(err); } ); + console.log('crawlUrls - queue.drain') await queue.drain(); + console.log('crawlUrls - return') return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } @@ -253,16 +266,22 @@ export class WebCrawler { // If it is the first link, fetch with single url if (this.visited.size === 1) { + console.log('crawl scrapSingleUrl...') const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); + console.log('got a page! lets continue...') content = page.html ?? ""; pageStatusCode = page.metadata?.pageStatusCode; pageError = page.metadata?.pageError || undefined; } else { - const response = await axios.get(url); + // console.log('crawl - else') + const response = await axios.get(url, { timeout: 3000 }); + console.log('crawl - else - response ok') content = response.data ?? ""; pageStatusCode = response.status; pageError = response.statusText != "OK" ? response.statusText : undefined; } + + console.log('crawl... keep going') const $ = load(content); let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = []; @@ -271,14 +290,17 @@ export class WebCrawler { links.push({ url, html: content, pageStatusCode, pageError }); } + console.log('crawl... keep going 2') $("a").each((_, element) => { const href = $(element).attr("href"); if (href) { + console.log('href:', href) let fullUrl = href; if (!href.startsWith("http")) { fullUrl = new URL(href, this.baseUrl).toString(); } const urlObj = new URL(fullUrl); + console.log('urlObj:', urlObj) const path = urlObj.pathname; @@ -295,10 +317,13 @@ export class WebCrawler { } } }); + console.log('crawl... keep going 3') if (this.visited.size === 1) { return links; } + + console.log('returning crawl...') // Create a new list to return to avoid modifying the visited list return links.filter((link) => !this.visited.has(link.url)); } catch (error) { @@ -385,6 +410,7 @@ export class WebCrawler { // private async tryFetchSitemapLinks(url: string): Promise { + console.log("4.1.1 - Normalizing URL"); const normalizeUrl = (url: string) => { url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); if (url.endsWith("/")) { @@ -393,46 +419,48 @@ export class WebCrawler { return url; }; + console.log("4.1.2 - Constructing sitemap URL"); const sitemapUrl = url.endsWith("/sitemap.xml") ? 
url : `${url}/sitemap.xml`; let sitemapLinks: string[] = []; + console.log("4.1.3 - Fetching sitemap from constructed URL"); try { - const response = await axios.get(sitemapUrl); + const response = await axios.get(sitemapUrl, { timeout: 3000 }); if (response.status === 200) { + console.log("4.1.4 - Extracting links from sitemap"); sitemapLinks = await getLinksFromSitemap(sitemapUrl); } } catch (error) { - // Error handling for failed sitemap fetch - // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`); + console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`); } if (sitemapLinks.length === 0) { - // If the first one doesn't work, try the base URL + console.log("4.1.5 - Trying base URL sitemap as fallback"); const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; try { - const response = await axios.get(baseUrlSitemap); + const response = await axios.get(baseUrlSitemap, { timeout: 3000 }); if (response.status === 200) { + console.log("4.1.6 - Extracting links from base URL sitemap"); sitemapLinks = await getLinksFromSitemap(baseUrlSitemap); } } catch (error) { - // Error handling for failed base URL sitemap fetch - // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); + console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); } } - // Normalize and check if the URL is present in any of the sitemaps + console.log("4.1.7 - Normalizing sitemap links"); const normalizedUrl = normalizeUrl(url); const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); - // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl + console.log("4.1.8 - Checking if normalized URL is already included"); if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { - // do not push the normalized url + console.log("4.1.9 - Adding initial URL to sitemap links"); sitemapLinks.push(url); } - + console.log("4.1.10 - Returning sitemap links"); return sitemapLinks; } } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 21301af3..037b6898 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -63,11 +63,13 @@ export class WebScraperDataProvider { await Promise.all( batchUrls.map(async (url, index) => { const existingHTML = allHtmls ? allHtmls[i + index] : ""; + console.log('convertUrlsToDocuments - scrapSingleUrl') const result = await scrapSingleUrl( url, this.pageOptions, existingHTML ); + console.log('convertUrlsToDocuments - result ok') processedUrls++; if (inProgress) { inProgress({ @@ -98,6 +100,7 @@ export class WebScraperDataProvider { return [] as Document[]; } } + console.log('returning results from convertUrlsToDocuments...') return results.filter((result) => result !== null) as Document[]; } @@ -106,7 +109,7 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { this.validateInitialUrl(); - + console.log('2. here OK!') if (!useCaching) { return this.processDocumentsWithoutCache(inProgress); } @@ -175,6 +178,7 @@ export class WebScraperDataProvider { allowBackwardCrawling: this.allowBackwardCrawling, }); + console.log('3. 
here OK!') let links = await crawler.start( inProgress, this.pageOptions, @@ -186,21 +190,28 @@ export class WebScraperDataProvider { this.maxCrawledDepth ); + console.log("8 - Mapping URLs from links"); let allLinks = links.map((e) => e.url); + console.log("9 - Mapping HTML content from links"); const allHtmls = links.map((e) => e.html); + console.log("10 - Checking if only URLs should be returned"); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(allLinks, inProgress); } let documents = []; + console.log("11 - Checking if crawler is in fast mode and HTML content is present"); // check if fast mode is enabled and there is html inside the links if (this.crawlerMode === "fast" && links.some((link) => link.html)) { + console.log("12 - Processing links with HTML content in fast mode"); documents = await this.processLinks(allLinks, inProgress, allHtmls); } else { + console.log("13 - Processing links in normal mode"); documents = await this.processLinks(allLinks, inProgress); } + console.log("14 - Caching and finalizing documents"); return this.cacheAndFinalizeDocuments(documents, allLinks); } @@ -259,14 +270,22 @@ export class WebScraperDataProvider { links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link)); + console.log('processLinks - convertUrlsToDocuments...') let documents = await this.convertUrlsToDocuments( links, inProgress, allHtmls ); - documents = await this.getSitemapData(this.urls[0], documents); + console.log('processLinks - convertUrlsToDocuments - done') + console.log('processLinks - getSitemapData...') + documents = await this.getSitemapData(this.urls[0], documents); + console.log('processLinks - getSitemapData - done') + + console.log('processLinks - applyPathReplacements...') documents = this.applyPathReplacements(documents); + console.log('processLinks - applyPathReplacements - done') + // documents = await this.applyImgAltText(documents); if ( @@ -275,6 +294,7 @@ export class WebScraperDataProvider { ) { documents = await generateCompletions(documents, this.extractorOptions); } + console.log('processLinks - returning...') return documents.concat(pdfDocuments).concat(docxDocuments); } @@ -320,8 +340,11 @@ export class WebScraperDataProvider { documents: Document[], links: string[] ): Promise { + console.log('cacheAndFinalizeDocuments - 1') await this.setCachedDocuments(documents, links); + console.log('cacheAndFinalizeDocuments - 2') documents = this.removeChildLinks(documents); + console.log('cacheAndFinalizeDocuments - 3') return documents.splice(0, this.limit); } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 3c7222c4..05a24c29 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -113,13 +113,25 @@ export async function scrapWithScrapingBee( pageOptions: { parsePDF?: boolean } = { parsePDF: true } ): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { + console.log("13. scrapWithScrapingBee - 1") const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); + console.log("13. scrapWithScrapingBee - 2") const clientParams = await generateRequestParams( url, wait_browser, timeout, ); - + console.log({ url, + wait_browser, + timeout }) + console.log({ + ...clientParams, + params: { + ...clientParams.params, + 'transparent_status_code': 'True' + } + }) + console.log("13. 
scrapWithScrapingBee - 3") const response = await client.get({ ...clientParams, params: { @@ -127,7 +139,7 @@ export async function scrapWithScrapingBee( 'transparent_status_code': 'True' } }); - + console.log("13. scrapWithScrapingBee - 4") const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { return await fetchAndProcessPdf(url, pageOptions?.parsePDF); @@ -140,6 +152,7 @@ export async function scrapWithScrapingBee( } catch (decodeError) { console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`); } + console.log("13. scrapWithScrapingBee - 5 - returning ok") return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }; } } catch (error) { @@ -396,8 +409,13 @@ export async function scrapSingleUrl( screenshot = customScrapedContent.screenshot; } + console.log( + 'chegou aqui' + ) + //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); + console.log('cleanedHtml') return { text: await parseMarkdown(cleanedHtml), @@ -432,7 +450,9 @@ export async function scrapSingleUrl( break; } + console.log('attemptScraping - 1') const attempt = await attemptScraping(urlToScrap, scraper); + console.log('attemptScraping - 2 - return ok') text = attempt.text ?? ''; html = attempt.html ?? ''; screenshot = attempt.screenshot ?? ''; @@ -451,6 +471,7 @@ export async function scrapSingleUrl( console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); } } + console.log('ok... here we are...') if (!text) { throw new Error(`All scraping methods failed for URL: ${urlToScrap}`); @@ -487,6 +508,7 @@ export async function scrapSingleUrl( }; } + console.log('returning document...') return document; } catch (error) { console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index c6dbf110..84e926e5 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -8,7 +8,7 @@ export async function getLinksFromSitemap( try { let content: string; try { - const response = await axios.get(sitemapUrl); + const response = await axios.get(sitemapUrl, { timeout: 3000 }); content = response.data; } catch (error) { console.error(`Request failed for ${sitemapUrl}: ${error}`); @@ -42,7 +42,7 @@ export async function getLinksFromSitemap( export const fetchSitemapData = async (url: string): Promise => { const sitemapUrl = url.endsWith("/sitemap.xml") ? 
url : `${url}/sitemap.xml`; try { - const response = await axios.get(sitemapUrl); + const response = await axios.get(sitemapUrl, { timeout: 3000 }); if (response.status === 200) { const xml = response.data; const parsedXml = await parseStringPromise(xml); diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts index 7116963e..05958232 100644 --- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts +++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts @@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean { } try { + if (!url.startsWith('http://') && !url.startsWith('https://')) { + url = 'https://' + url; + } + const urlObj = new URL(url); const hostname = urlObj.hostname.toLowerCase(); diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 1a67d60b..30db27c9 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -71,7 +71,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro while (attempt < maxAttempts && !resultAvailable) { try { - resultResponse = await axios.get(resultUrl, { headers }); + resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 }); if (resultResponse.status === 200) { resultAvailable = true; // Exit condition met } else { diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts index 540f0bee..f9ce9b3c 100644 --- a/apps/api/src/scraper/WebScraper/utils/utils.ts +++ b/apps/api/src/scraper/WebScraper/utils/utils.ts @@ -4,7 +4,7 @@ export async function attemptScrapWithRequests( urlToScrap: string ): Promise { try { - const response = await axios.get(urlToScrap); + const response = await axios.get(urlToScrap, { timeout: 15000 }); if (!response.data) { console.log("Failed normal requests as well"); diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a42b3e8c..7bb39a68 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -14,6 +14,7 @@ if(process.env.ENV === 'production') { getWebScraperQueue().process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 
8)), async function (job, done) { + console.log('getWebScraperQueue - start') try { job.progress({ current: 1, @@ -22,11 +23,13 @@ getWebScraperQueue().process( current_url: "", }); const start = Date.now(); - + console.log('getWebScraperQueue - startWebScraperPipeline') const { success, message, docs } = await startWebScraperPipeline({ job }); + console.log('getWebScraperQueue - startWebScraperPipeline - done') const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; + console.log('docs.length:', docs.length) const data = { success: success, result: { From 56d42d9c9badccf0d10c2d5f31c6c201f944e527 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 24 Jun 2024 16:33:07 -0300 Subject: [PATCH 2/5] Nick: --- apps/api/src/lib/timeout.ts | 1 + apps/api/src/scraper/WebScraper/crawler.ts | 10 ++++++---- apps/api/src/scraper/WebScraper/sitemap.ts | 5 +++-- apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts | 3 ++- 4 files changed, 12 insertions(+), 7 deletions(-) create mode 100644 apps/api/src/lib/timeout.ts diff --git a/apps/api/src/lib/timeout.ts b/apps/api/src/lib/timeout.ts new file mode 100644 index 00000000..fd0e5ade --- /dev/null +++ b/apps/api/src/lib/timeout.ts @@ -0,0 +1 @@ +export const axiosTimeout = 3000; \ No newline at end of file diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 3a3ca052..d1e93cdf 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -7,6 +7,7 @@ import { CrawlerOptions, PageOptions, Progress } from "../../lib/entities"; import { scrapSingleUrl, scrapWithScrapingBee } from "./single_url"; import robotsParser from "robots-parser"; import { getURLDepth } from "./utils/maxDepthUtils"; +import { axiosTimeout } from "../../../src/lib/timeout"; export class WebCrawler { private initialUrl: string; @@ -131,7 +132,7 @@ export class WebCrawler { try { console.log('3.1 here OK') console.log('this.robotsTxtUrl:', this.robotsTxtUrl) - const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 }); + const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); console.log('????', {response}) console.log('3.2 here OK') this.robots = robotsParser(this.robotsTxtUrl, response.data); @@ -274,7 +275,7 @@ export class WebCrawler { pageError = page.metadata?.pageError || undefined; } else { // console.log('crawl - else') - const response = await axios.get(url, { timeout: 3000 }); + const response = await axios.get(url, { timeout: axiosTimeout }); console.log('crawl - else - response ok') content = response.data ?? 
""; pageStatusCode = response.status; @@ -312,6 +313,7 @@ export class WebCrawler { !this.matchesExcludes(path) && this.isRobotsAllowed(fullUrl) ) { + console.log(fullUrl) links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); } @@ -428,7 +430,7 @@ export class WebCrawler { console.log("4.1.3 - Fetching sitemap from constructed URL"); try { - const response = await axios.get(sitemapUrl, { timeout: 3000 }); + const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); if (response.status === 200) { console.log("4.1.4 - Extracting links from sitemap"); sitemapLinks = await getLinksFromSitemap(sitemapUrl); @@ -441,7 +443,7 @@ export class WebCrawler { console.log("4.1.5 - Trying base URL sitemap as fallback"); const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; try { - const response = await axios.get(baseUrlSitemap, { timeout: 3000 }); + const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); if (response.status === 200) { console.log("4.1.6 - Extracting links from base URL sitemap"); sitemapLinks = await getLinksFromSitemap(baseUrlSitemap); diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 84e926e5..3f563471 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -1,4 +1,5 @@ import axios from "axios"; +import { axiosTimeout } from "../../lib/timeout"; import { parseStringPromise } from "xml2js"; export async function getLinksFromSitemap( @@ -8,7 +9,7 @@ export async function getLinksFromSitemap( try { let content: string; try { - const response = await axios.get(sitemapUrl, { timeout: 3000 }); + const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } catch (error) { console.error(`Request failed for ${sitemapUrl}: ${error}`); @@ -42,7 +43,7 @@ export async function getLinksFromSitemap( export const fetchSitemapData = async (url: string): Promise => { const sitemapUrl = url.endsWith("/sitemap.xml") ? 
url : `${url}/sitemap.xml`; try { - const response = await axios.get(sitemapUrl, { timeout: 3000 }); + const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); if (response.status === 200) { const xml = response.data; const parsedXml = await parseStringPromise(xml); diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 30db27c9..3e01571e 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -6,6 +6,7 @@ import dotenv from "dotenv"; import pdf from "pdf-parse"; import path from "path"; import os from "os"; +import { axiosTimeout } from "../../../lib/timeout"; dotenv.config(); @@ -71,7 +72,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro while (attempt < maxAttempts && !resultAvailable) { try { - resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 }); + resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) }); if (resultResponse.status === 200) { resultAvailable = true; // Exit condition met } else { From 3ebdf93342baa1ebaf039341156ab06449e37651 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 24 Jun 2024 16:43:12 -0300 Subject: [PATCH 3/5] removed console.logs --- apps/api/requests.http | 6 +-- apps/api/src/controllers/crawl.ts | 2 - apps/api/src/lib/html-to-markdown.ts | 2 - apps/api/src/main/runWebScraper.ts | 2 - apps/api/src/scraper/WebScraper/crawler.ts | 43 +------------------ apps/api/src/scraper/WebScraper/index.ts | 24 ----------- apps/api/src/scraper/WebScraper/single_url.ts | 25 ----------- 7 files changed, 4 insertions(+), 100 deletions(-) diff --git a/apps/api/requests.http b/apps/api/requests.http index 32f9f60e..3a1a9902 100644 --- a/apps/api/requests.http +++ b/apps/api/requests.http @@ -1,10 +1,10 @@ ### Crawl Website POST http://localhost:3002/v0/scrape HTTP/1.1 -Authorization: Bearer fc- +Authorization: Bearer fc content-type: application/json { - "url":"new.abb.com/sustainability/foundation" + "url":"firecrawl.dev" } @@ -18,7 +18,7 @@ Authorization: Bearer fc- content-type: application/json { - "url": "new.abb.com/sustainability/foundation" + "url": "firecrawl.dev" } ## "reoveTags": [], diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts index 43edc579..8fd876d3 100644 --- a/apps/api/src/controllers/crawl.ts +++ b/apps/api/src/controllers/crawl.ts @@ -66,7 +66,6 @@ export async function crawlController(req: Request, res: Response) { parsePDF: true }; - console.log('1. 
here OK!') if (mode === "single_urls" && !url.includes(",")) { try { const a = new WebScraperDataProvider(); @@ -85,7 +84,6 @@ export async function crawlController(req: Request, res: Response) { current_url: progress.currentDocumentUrl, }); }); - console.log('crawlController - return res.json...') return res.json({ success: true, documents: docs, diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts index 86899598..233da921 100644 --- a/apps/api/src/lib/html-to-markdown.ts +++ b/apps/api/src/lib/html-to-markdown.ts @@ -1,6 +1,5 @@ export function parseMarkdown(html: string) { - console.log('parseMarkdown - start!') var TurndownService = require("turndown"); var turndownPluginGfm = require('joplin-turndown-plugin-gfm') @@ -51,6 +50,5 @@ export function parseMarkdown(html: string) { /\[Skip to Content\]\(#[^\)]*\)/gi, "" ); - console.log('parseMarkdown - return') return markdownContent; } diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index a8c4e84a..dee89bc4 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -78,11 +78,9 @@ export async function runWebScraper({ pageOptions: pageOptions, }); } - console.log('runWebScraper - getDocuments') const docs = (await provider.getDocuments(false, (progress: Progress) => { inProgress(progress); })) as Document[]; - console.log('runWebScraper - getDocuments - done - docs.length:', docs.length) if (docs.length === 0) { return { diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index d1e93cdf..2e2dec2b 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -130,31 +130,20 @@ export class WebCrawler { ): Promise<{ url: string, html: string }[]> { // Fetch and parse robots.txt try { - console.log('3.1 here OK') - console.log('this.robotsTxtUrl:', this.robotsTxtUrl) const response = await axios.get(this.robotsTxtUrl, { timeout: axiosTimeout }); - console.log('????', {response}) - console.log('3.2 here OK') this.robots = robotsParser(this.robotsTxtUrl, response.data); } catch (error) { console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`); - } - console.log('4. here OK!') if(!crawlerOptions?.ignoreSitemap){ - console.log('4.1') const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl); - console.log('4.2') if (sitemapLinks.length > 0) { - console.log('4.3') let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth); - console.log('4.4') return filteredLinks.map(link => ({ url: link, html: "" })); } } - console.log('5. here OK!') const urls = await this.crawlUrls( [this.initialUrl], pageOptions, @@ -162,7 +151,6 @@ export class WebCrawler { inProgress ); - console.log('6. here OK!') if ( urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 @@ -172,7 +160,6 @@ export class WebCrawler { // make sure to run include exclude here again const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth); - console.log('7. 
here OK!') return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" })); } @@ -189,7 +176,6 @@ export class WebCrawler { } return; } - console.log('crawlUrls - crawl') const newUrls = await this.crawl(task, pageOptions); // add the initial url if not already added // if (this.visited.size === 1) { @@ -202,7 +188,6 @@ export class WebCrawler { // } // } - console.log('---??---') newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html)); if (inProgress && newUrls.length > 0) { @@ -220,14 +205,12 @@ export class WebCrawler { currentDocumentUrl: task, }); } - console.log('----???----') await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress); if (callback && typeof callback === "function") { callback(); } }, concurrencyLimit); - console.log('crawlUrls - queue.push') queue.push( urls.filter( (url) => @@ -237,9 +220,7 @@ export class WebCrawler { if (err) console.error(err); } ); - console.log('crawlUrls - queue.drain') await queue.drain(); - console.log('crawlUrls - return') return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html })); } @@ -267,22 +248,17 @@ export class WebCrawler { // If it is the first link, fetch with single url if (this.visited.size === 1) { - console.log('crawl scrapSingleUrl...') const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true }); - console.log('got a page! lets continue...') content = page.html ?? ""; pageStatusCode = page.metadata?.pageStatusCode; pageError = page.metadata?.pageError || undefined; } else { - // console.log('crawl - else') const response = await axios.get(url, { timeout: axiosTimeout }); - console.log('crawl - else - response ok') content = response.data ?? ""; pageStatusCode = response.status; pageError = response.statusText != "OK" ? response.statusText : undefined; } - console.log('crawl... keep going') const $ = load(content); let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = []; @@ -291,17 +267,14 @@ export class WebCrawler { links.push({ url, html: content, pageStatusCode, pageError }); } - console.log('crawl... keep going 2') $("a").each((_, element) => { const href = $(element).attr("href"); if (href) { - console.log('href:', href) let fullUrl = href; if (!href.startsWith("http")) { fullUrl = new URL(href, this.baseUrl).toString(); } const urlObj = new URL(fullUrl); - console.log('urlObj:', urlObj) const path = urlObj.pathname; @@ -313,19 +286,15 @@ export class WebCrawler { !this.matchesExcludes(path) && this.isRobotsAllowed(fullUrl) ) { - console.log(fullUrl) - links.push({ url: fullUrl, html: content, pageStatusCode, pageError }); } } }); - console.log('crawl... keep going 3') - + if (this.visited.size === 1) { return links; } - console.log('returning crawl...') // Create a new list to return to avoid modifying the visited list return links.filter((link) => !this.visited.has(link.url)); } catch (error) { @@ -412,7 +381,6 @@ export class WebCrawler { // private async tryFetchSitemapLinks(url: string): Promise { - console.log("4.1.1 - Normalizing URL"); const normalizeUrl = (url: string) => { url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); if (url.endsWith("/")) { @@ -421,18 +389,15 @@ export class WebCrawler { return url; }; - console.log("4.1.2 - Constructing sitemap URL"); const sitemapUrl = url.endsWith("/sitemap.xml") ? 
url : `${url}/sitemap.xml`; let sitemapLinks: string[] = []; - console.log("4.1.3 - Fetching sitemap from constructed URL"); try { const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); if (response.status === 200) { - console.log("4.1.4 - Extracting links from sitemap"); sitemapLinks = await getLinksFromSitemap(sitemapUrl); } } catch (error) { @@ -440,12 +405,10 @@ export class WebCrawler { } if (sitemapLinks.length === 0) { - console.log("4.1.5 - Trying base URL sitemap as fallback"); const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`; try { const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); if (response.status === 200) { - console.log("4.1.6 - Extracting links from base URL sitemap"); sitemapLinks = await getLinksFromSitemap(baseUrlSitemap); } } catch (error) { @@ -453,16 +416,12 @@ export class WebCrawler { } } - console.log("4.1.7 - Normalizing sitemap links"); const normalizedUrl = normalizeUrl(url); const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); - console.log("4.1.8 - Checking if normalized URL is already included"); if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { - console.log("4.1.9 - Adding initial URL to sitemap links"); sitemapLinks.push(url); } - console.log("4.1.10 - Returning sitemap links"); return sitemapLinks; } } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 037b6898..9e318505 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -63,13 +63,11 @@ export class WebScraperDataProvider { await Promise.all( batchUrls.map(async (url, index) => { const existingHTML = allHtmls ? allHtmls[i + index] : ""; - console.log('convertUrlsToDocuments - scrapSingleUrl') const result = await scrapSingleUrl( url, this.pageOptions, existingHTML ); - console.log('convertUrlsToDocuments - result ok') processedUrls++; if (inProgress) { inProgress({ @@ -100,7 +98,6 @@ export class WebScraperDataProvider { return [] as Document[]; } } - console.log('returning results from convertUrlsToDocuments...') return results.filter((result) => result !== null) as Document[]; } @@ -109,7 +106,6 @@ export class WebScraperDataProvider { inProgress?: (progress: Progress) => void ): Promise { this.validateInitialUrl(); - console.log('2. here OK!') if (!useCaching) { return this.processDocumentsWithoutCache(inProgress); } @@ -178,7 +174,6 @@ export class WebScraperDataProvider { allowBackwardCrawling: this.allowBackwardCrawling, }); - console.log('3. 
here OK!') let links = await crawler.start( inProgress, this.pageOptions, @@ -190,28 +185,21 @@ export class WebScraperDataProvider { this.maxCrawledDepth ); - console.log("8 - Mapping URLs from links"); let allLinks = links.map((e) => e.url); - console.log("9 - Mapping HTML content from links"); const allHtmls = links.map((e) => e.html); - console.log("10 - Checking if only URLs should be returned"); if (this.returnOnlyUrls) { return this.returnOnlyUrlsResponse(allLinks, inProgress); } let documents = []; - console.log("11 - Checking if crawler is in fast mode and HTML content is present"); // check if fast mode is enabled and there is html inside the links if (this.crawlerMode === "fast" && links.some((link) => link.html)) { - console.log("12 - Processing links with HTML content in fast mode"); documents = await this.processLinks(allLinks, inProgress, allHtmls); } else { - console.log("13 - Processing links in normal mode"); documents = await this.processLinks(allLinks, inProgress); } - console.log("14 - Caching and finalizing documents"); return this.cacheAndFinalizeDocuments(documents, allLinks); } @@ -270,22 +258,14 @@ export class WebScraperDataProvider { links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link)); - console.log('processLinks - convertUrlsToDocuments...') let documents = await this.convertUrlsToDocuments( links, inProgress, allHtmls ); - console.log('processLinks - convertUrlsToDocuments - done') - console.log('processLinks - getSitemapData...') documents = await this.getSitemapData(this.urls[0], documents); - console.log('processLinks - getSitemapData - done') - - console.log('processLinks - applyPathReplacements...') documents = this.applyPathReplacements(documents); - console.log('processLinks - applyPathReplacements - done') - // documents = await this.applyImgAltText(documents); if ( @@ -294,7 +274,6 @@ export class WebScraperDataProvider { ) { documents = await generateCompletions(documents, this.extractorOptions); } - console.log('processLinks - returning...') return documents.concat(pdfDocuments).concat(docxDocuments); } @@ -340,11 +319,8 @@ export class WebScraperDataProvider { documents: Document[], links: string[] ): Promise { - console.log('cacheAndFinalizeDocuments - 1') await this.setCachedDocuments(documents, links); - console.log('cacheAndFinalizeDocuments - 2') documents = this.removeChildLinks(documents); - console.log('cacheAndFinalizeDocuments - 3') return documents.splice(0, this.limit); } diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 05a24c29..9f8d563d 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -113,25 +113,12 @@ export async function scrapWithScrapingBee( pageOptions: { parsePDF?: boolean } = { parsePDF: true } ): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> { try { - console.log("13. scrapWithScrapingBee - 1") const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY); - console.log("13. scrapWithScrapingBee - 2") const clientParams = await generateRequestParams( url, wait_browser, timeout, ); - console.log({ url, - wait_browser, - timeout }) - console.log({ - ...clientParams, - params: { - ...clientParams.params, - 'transparent_status_code': 'True' - } - }) - console.log("13. 
scrapWithScrapingBee - 3") const response = await client.get({ ...clientParams, params: { @@ -139,7 +126,6 @@ export async function scrapWithScrapingBee( 'transparent_status_code': 'True' } }); - console.log("13. scrapWithScrapingBee - 4") const contentType = response.headers["content-type"]; if (contentType && contentType.includes("application/pdf")) { return await fetchAndProcessPdf(url, pageOptions?.parsePDF); @@ -152,7 +138,6 @@ export async function scrapWithScrapingBee( } catch (decodeError) { console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`); } - console.log("13. scrapWithScrapingBee - 5 - returning ok") return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined }; } } catch (error) { @@ -409,14 +394,8 @@ export async function scrapSingleUrl( screenshot = customScrapedContent.screenshot; } - console.log( - 'chegou aqui' - ) - //* TODO: add an optional to return markdown or structured/extracted content let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions); - console.log('cleanedHtml') - return { text: await parseMarkdown(cleanedHtml), html: cleanedHtml, @@ -450,9 +429,7 @@ export async function scrapSingleUrl( break; } - console.log('attemptScraping - 1') const attempt = await attemptScraping(urlToScrap, scraper); - console.log('attemptScraping - 2 - return ok') text = attempt.text ?? ''; html = attempt.html ?? ''; screenshot = attempt.screenshot ?? ''; @@ -471,7 +448,6 @@ export async function scrapSingleUrl( console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`); } } - console.log('ok... here we are...') if (!text) { throw new Error(`All scraping methods failed for URL: ${urlToScrap}`); @@ -508,7 +484,6 @@ export async function scrapSingleUrl( }; } - console.log('returning document...') return document; } catch (error) { console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`); From 08c1fa799bce357f17afd10a79708a4113bfb363 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 24 Jun 2024 16:51:32 -0300 Subject: [PATCH 4/5] Update queue-worker.ts --- apps/api/src/services/queue-worker.ts | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index 7bb39a68..f6328cf3 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -14,7 +14,7 @@ if(process.env.ENV === 'production') { getWebScraperQueue().process( Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 
8)), async function (job, done) { - console.log('getWebScraperQueue - start') + try { job.progress({ current: 1, @@ -23,13 +23,10 @@ getWebScraperQueue().process( current_url: "", }); const start = Date.now(); - console.log('getWebScraperQueue - startWebScraperPipeline') const { success, message, docs } = await startWebScraperPipeline({ job }); - console.log('getWebScraperQueue - startWebScraperPipeline - done') const end = Date.now(); const timeTakenInSeconds = (end - start) / 1000; - console.log('docs.length:', docs.length) const data = { success: success, result: { From 90b7fff3665649c4714c5d0195b8121e524308cb Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 24 Jun 2024 16:52:01 -0300 Subject: [PATCH 5/5] Update crawler.ts --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 2e2dec2b..5003845e 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -418,7 +418,7 @@ export class WebCrawler { const normalizedUrl = normalizeUrl(url); const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link)); - + // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) { sitemapLinks.push(url); }
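Note on the recurring change in this series: bare axios.get calls are replaced with a shared timeout so that a single unresponsive host (the new.abb.com test case from PATCH 1/5) cannot stall the whole crawl, and PATCH 2/5 centralizes that value in apps/api/src/lib/timeout.ts. The sketch below shows the call-site pattern under stated assumptions: the fetchRobotsTxt helper name and the relative import path are illustrative only, while axiosTimeout and the axios options mirror the diffs above.

import axios from "axios";
// PATCH 2/5 adds: export const axiosTimeout = 3000; in apps/api/src/lib/timeout.ts
// (import path shown here is an assumption and depends on the caller's location)
import { axiosTimeout } from "./lib/timeout";

// Hypothetical helper illustrating the pattern used in crawler.ts, sitemap.ts
// and utils/utils.ts: every outbound fetch carries the shared timeout.
async function fetchRobotsTxt(robotsTxtUrl: string): Promise<string> {
  try {
    const response = await axios.get(robotsTxtUrl, { timeout: axiosTimeout });
    return response.data ?? "";
  } catch (error) {
    // Same behaviour as WebCrawler.start in the diffs: log and continue
    // without robots rules rather than failing the crawl.
    console.log(`Failed to fetch robots.txt from ${robotsTxtUrl}`);
    return "";
  }
}

// pdfProcessor.ts scales the same constant instead of hard-coding 6000 ms:
//   resultResponse = await axios.get(resultUrl, { headers, timeout: axiosTimeout * 2 });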