diff --git a/apps/api/requests.http b/apps/api/requests.http
index 495df975..32f9f60e 100644
--- a/apps/api/requests.http
+++ b/apps/api/requests.http
@@ -1,10 +1,10 @@
 ### Crawl Website
 POST http://localhost:3002/v0/scrape HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json
 
 {
-    "url":"https://docs.mendable.ai"
+    "url":"new.abb.com/sustainability/foundation"
 }
 
@@ -14,16 +14,24 @@ GET http://localhost:3002/v0/jobs/active HTTP/1.1
 ### Scrape Website
 POST http://localhost:3002/v0/crawl HTTP/1.1
-Authorization: Bearer
+Authorization: Bearer fc-
 content-type: application/json
 
 {
-    "url":"https://www.mendable.ai",
-    "crawlerOptions": {
-        "returnOnlyUrls": true
-    }
+    "url": "new.abb.com/sustainability/foundation"
 }
+## "removeTags": [],
+    # "mode": "crawl",
+    # "crawlerOptions": {
+    #     "allowBackwardCrawling": false
+    # },
+    # "pageOptions": {
+    #     "onlyMainContent": false,
+    #     "includeHtml": false,
+    #     "parsePDF": true
+    # }
+
diff --git a/apps/api/src/controllers/crawl.ts b/apps/api/src/controllers/crawl.ts
index 8fd876d3..43edc579 100644
--- a/apps/api/src/controllers/crawl.ts
+++ b/apps/api/src/controllers/crawl.ts
@@ -66,6 +66,7 @@ export async function crawlController(req: Request, res: Response) {
       parsePDF: true
     };
 
+    console.log('1. here OK!')
     if (mode === "single_urls" && !url.includes(",")) {
       try {
         const a = new WebScraperDataProvider();
@@ -84,6 +85,7 @@ export async function crawlController(req: Request, res: Response) {
             current_url: progress.currentDocumentUrl,
           });
         });
+        console.log('crawlController - return res.json...')
        return res.json({
          success: true,
          documents: docs,
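For reference, the commented-out block in the requests.http hunk above mirrors the body that crawlController parses. Below is a minimal TypeScript sketch of the same request against the local API; the endpoint, header shape, and option names come from this diff, while the token and the option values are placeholders and not part of the patch:

```ts
// Hypothetical client call for local testing; mirrors apps/api/requests.http.
// "fc-YOUR_KEY" is a placeholder API key and the option values are examples only.
async function startCrawl(): Promise<void> {
  const res = await fetch("http://localhost:3002/v0/crawl", {
    method: "POST",
    headers: {
      Authorization: "Bearer fc-YOUR_KEY",
      "Content-Type": "application/json",
    },
    body: JSON.stringify({
      url: "new.abb.com/sustainability/foundation",
      mode: "crawl",
      crawlerOptions: { allowBackwardCrawling: false },
      pageOptions: { onlyMainContent: false, includeHtml: false, parsePDF: true },
    }),
  });
  console.log(await res.json());
}

startCrawl().catch(console.error);
```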
diff --git a/apps/api/src/lib/html-to-markdown.ts b/apps/api/src/lib/html-to-markdown.ts
index e084f5ef..86899598 100644
--- a/apps/api/src/lib/html-to-markdown.ts
+++ b/apps/api/src/lib/html-to-markdown.ts
@@ -1,5 +1,6 @@
 
 export function parseMarkdown(html: string) {
+  console.log('parseMarkdown - start!')
   var TurndownService = require("turndown");
   var turndownPluginGfm = require('joplin-turndown-plugin-gfm')
 
@@ -50,6 +51,6 @@ export function parseMarkdown(html: string) {
     /\[Skip to Content\]\(#[^\)]*\)/gi,
     ""
   );
-
+  console.log('parseMarkdown - return')
   return markdownContent;
 }
diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts
index dee89bc4..a8c4e84a 100644
--- a/apps/api/src/main/runWebScraper.ts
+++ b/apps/api/src/main/runWebScraper.ts
@@ -78,9 +78,11 @@ export async function runWebScraper({
       pageOptions: pageOptions,
     });
   }
+  console.log('runWebScraper - getDocuments')
  const docs = (await provider.getDocuments(false, (progress: Progress) => {
    inProgress(progress);
  })) as Document[];
+  console.log('runWebScraper - getDocuments - done - docs.length:', docs.length)
 
  if (docs.length === 0) {
    return {
diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts
index 0793d711..3a3ca052 100644
--- a/apps/api/src/scraper/WebScraper/crawler.ts
+++ b/apps/api/src/scraper/WebScraper/crawler.ts
@@ -129,24 +129,31 @@
   ): Promise<{ url: string, html: string }[]> {
     // Fetch and parse robots.txt
     try {
-      const response = await axios.get(this.robotsTxtUrl);
+      console.log('3.1 here OK')
+      console.log('this.robotsTxtUrl:', this.robotsTxtUrl)
+      const response = await axios.get(this.robotsTxtUrl, { timeout: 3000 });
+      console.log('????', {response})
+      console.log('3.2 here OK')
       this.robots = robotsParser(this.robotsTxtUrl, response.data);
     } catch (error) {
       console.log(`Failed to fetch robots.txt from ${this.robotsTxtUrl}`);
     }
-
+    console.log('4. here OK!')
     if(!crawlerOptions?.ignoreSitemap){
+      console.log('4.1')
       const sitemapLinks = await this.tryFetchSitemapLinks(this.initialUrl);
-
+      console.log('4.2')
       if (sitemapLinks.length > 0) {
+        console.log('4.3')
         let filteredLinks = this.filterLinks(sitemapLinks, limit, maxDepth);
-
+        console.log('4.4')
         return filteredLinks.map(link => ({ url: link, html: "" }));
       }
     }
 
+    console.log('5. here OK!')
     const urls = await this.crawlUrls(
       [this.initialUrl],
       pageOptions,
@@ -154,7 +161,7 @@ export class WebCrawler {
       inProgress
     );
-
+    console.log('6. here OK!')
     if (
       urls.length === 0 &&
       this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0
@@ -164,6 +171,7 @@ export class WebCrawler {
     // make sure to run include exclude here again
     const filteredUrls = this.filterLinks(urls.map(urlObj => urlObj.url), limit, this.maxCrawledDepth);
+    console.log('7. here OK!')
     return filteredUrls.map(url => ({ url, html: urls.find(urlObj => urlObj.url === url)?.html || "" }));
   }
 
@@ -180,6 +188,7 @@ export class WebCrawler {
         }
         return;
       }
+      console.log('crawlUrls - crawl')
      const newUrls = await this.crawl(task, pageOptions);
      // add the initial url if not already added
      // if (this.visited.size === 1) {
@@ -192,7 +201,7 @@ export class WebCrawler {
      // }
      // }
-
+      console.log('---??---')
      newUrls.forEach((page) => this.crawledUrls.set(page.url, page.html));
 
      if (inProgress && newUrls.length > 0) {
@@ -210,12 +219,14 @@ export class WebCrawler {
          currentDocumentUrl: task,
        });
      }
+      console.log('----???----')
      await this.crawlUrls(newUrls.map((p) => p.url), pageOptions, concurrencyLimit, inProgress);
      if (callback && typeof callback === "function") {
        callback();
      }
    }, concurrencyLimit);
 
+    console.log('crawlUrls - queue.push')
    queue.push(
      urls.filter(
        (url) =>
@@ -225,7 +236,9 @@ export class WebCrawler {
        if (err) console.error(err);
      }
    );
+    console.log('crawlUrls - queue.drain')
    await queue.drain();
+    console.log('crawlUrls - return')
    return Array.from(this.crawledUrls.entries()).map(([url, html]) => ({ url, html }));
  }
 
@@ -253,16 +266,22 @@ export class WebCrawler {
      // If it is the first link, fetch with single url
      if (this.visited.size === 1) {
+        console.log('crawl scrapSingleUrl...')
        const page = await scrapSingleUrl(url, { ...pageOptions, includeHtml: true });
+        console.log('got a page! lets continue...')
        content = page.html ?? "";
        pageStatusCode = page.metadata?.pageStatusCode;
        pageError = page.metadata?.pageError || undefined;
      } else {
-        const response = await axios.get(url);
+        // console.log('crawl - else')
+        const response = await axios.get(url, { timeout: 3000 });
+        console.log('crawl - else - response ok')
        content = response.data ?? "";
        pageStatusCode = response.status;
        pageError = response.statusText != "OK" ? response.statusText : undefined;
      }
+
+      console.log('crawl... keep going')
      const $ = load(content);
      let links: { url: string, html: string, pageStatusCode?: number, pageError?: string }[] = [];
 
@@ -271,14 +290,17 @@ export class WebCrawler {
        links.push({ url, html: content, pageStatusCode, pageError });
      }
 
+      console.log('crawl... keep going 2')
      $("a").each((_, element) => {
        const href = $(element).attr("href");
        if (href) {
+          console.log('href:', href)
          let fullUrl = href;
          if (!href.startsWith("http")) {
            fullUrl = new URL(href, this.baseUrl).toString();
          }
          const urlObj = new URL(fullUrl);
+          console.log('urlObj:', urlObj)
          const path = urlObj.pathname;
 
@@ -295,10 +317,13 @@ export class WebCrawler {
          }
        }
      });
+      console.log('crawl... keep going 3')
 
      if (this.visited.size === 1) {
        return links;
      }
+
+      console.log('returning crawl...')
      // Create a new list to return to avoid modifying the visited list
      return links.filter((link) => !this.visited.has(link.url));
    } catch (error) {
@@ -385,6 +410,7 @@ export class WebCrawler {
  //
 
  private async tryFetchSitemapLinks(url: string): Promise<string[]> {
+    console.log("4.1.1 - Normalizing URL");
    const normalizeUrl = (url: string) => {
      url = url.replace(/^https?:\/\//, "").replace(/^www\./, "");
      if (url.endsWith("/")) {
@@ -393,46 +419,48 @@ export class WebCrawler {
      return url;
    };
 
+    console.log("4.1.2 - Constructing sitemap URL");
    const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
 
    let sitemapLinks: string[] = [];
 
+    console.log("4.1.3 - Fetching sitemap from constructed URL");
    try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: 3000 });
      if (response.status === 200) {
+        console.log("4.1.4 - Extracting links from sitemap");
        sitemapLinks = await getLinksFromSitemap(sitemapUrl);
      }
    } catch (error) {
-      // Error handling for failed sitemap fetch
-      // console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
+      console.error(`Failed to fetch sitemap from ${sitemapUrl}: ${error}`);
    }
 
    if (sitemapLinks.length === 0) {
-      // If the first one doesn't work, try the base URL
+      console.log("4.1.5 - Trying base URL sitemap as fallback");
      const baseUrlSitemap = `${this.baseUrl}/sitemap.xml`;
      try {
-        const response = await axios.get(baseUrlSitemap);
+        const response = await axios.get(baseUrlSitemap, { timeout: 3000 });
        if (response.status === 200) {
+          console.log("4.1.6 - Extracting links from base URL sitemap");
          sitemapLinks = await getLinksFromSitemap(baseUrlSitemap);
        }
      } catch (error) {
-        // Error handling for failed base URL sitemap fetch
-        // console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
+        console.error(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`);
      }
    }
 
-    // Normalize and check if the URL is present in any of the sitemaps
+    console.log("4.1.7 - Normalizing sitemap links");
    const normalizedUrl = normalizeUrl(url);
    const normalizedSitemapLinks = sitemapLinks.map(link => normalizeUrl(link));
 
-    // has to be greater than 0 to avoid adding the initial URL to the sitemap links, and preventing crawler to crawl
+    console.log("4.1.8 - Checking if normalized URL is already included");
    if (!normalizedSitemapLinks.includes(normalizedUrl) && sitemapLinks.length > 0) {
-      // do not push the normalized url
+      console.log("4.1.9 - Adding initial URL to sitemap links");
      sitemapLinks.push(url);
    }
-
+    console.log("4.1.10 - Returning sitemap links");
    return sitemapLinks;
  }
}
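A recurring change above is adding `{ timeout: 3000 }` to the crawler's `axios.get` calls (robots.txt, page fetches, sitemap probes) so a hung request cannot stall the queue. A minimal sketch of how those calls could share one helper follows; the `fetchWithTimeout` name and the 3000 ms default are assumptions for illustration, not part of this patch:

```ts
import axios, { AxiosResponse } from "axios";

// Hypothetical shared helper: one place to control the GET timeout used by the crawler.
// The 3000 ms default mirrors the value hard-coded in the hunks above.
export async function fetchWithTimeout(
  url: string,
  timeoutMs = 3000
): Promise<AxiosResponse<string>> {
  return axios.get<string>(url, { timeout: timeoutMs });
}

// Example usage, mirroring the robots.txt fetch in crawler.ts:
//   const response = await fetchWithTimeout(this.robotsTxtUrl);
//   this.robots = robotsParser(this.robotsTxtUrl, response.data);
```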
diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts
index 21301af3..037b6898 100644
--- a/apps/api/src/scraper/WebScraper/index.ts
+++ b/apps/api/src/scraper/WebScraper/index.ts
@@ -63,11 +63,13 @@ export class WebScraperDataProvider {
      await Promise.all(
        batchUrls.map(async (url, index) => {
          const existingHTML = allHtmls ? allHtmls[i + index] : "";
+          console.log('convertUrlsToDocuments - scrapSingleUrl')
          const result = await scrapSingleUrl(
            url,
            this.pageOptions,
            existingHTML
          );
+          console.log('convertUrlsToDocuments - result ok')
          processedUrls++;
          if (inProgress) {
            inProgress({
@@ -98,6 +100,7 @@ export class WebScraperDataProvider {
        return [] as Document[];
      }
    }
+    console.log('returning results from convertUrlsToDocuments...')
    return results.filter((result) => result !== null) as Document[];
  }
 
@@ -106,7 +109,7 @@ export class WebScraperDataProvider {
    inProgress?: (progress: Progress) => void
  ): Promise<Document[]> {
    this.validateInitialUrl();
-
+    console.log('2. here OK!')
    if (!useCaching) {
      return this.processDocumentsWithoutCache(inProgress);
    }
@@ -175,6 +178,7 @@ export class WebScraperDataProvider {
      allowBackwardCrawling: this.allowBackwardCrawling,
    });
 
+    console.log('3. here OK!')
    let links = await crawler.start(
      inProgress,
      this.pageOptions,
@@ -186,21 +190,28 @@ export class WebScraperDataProvider {
      this.maxCrawledDepth
    );
 
+    console.log("8 - Mapping URLs from links");
    let allLinks = links.map((e) => e.url);
+    console.log("9 - Mapping HTML content from links");
    const allHtmls = links.map((e) => e.html);
 
+    console.log("10 - Checking if only URLs should be returned");
    if (this.returnOnlyUrls) {
      return this.returnOnlyUrlsResponse(allLinks, inProgress);
    }
 
    let documents = [];
+    console.log("11 - Checking if crawler is in fast mode and HTML content is present");
    // check if fast mode is enabled and there is html inside the links
    if (this.crawlerMode === "fast" && links.some((link) => link.html)) {
+      console.log("12 - Processing links with HTML content in fast mode");
      documents = await this.processLinks(allLinks, inProgress, allHtmls);
    } else {
+      console.log("13 - Processing links in normal mode");
      documents = await this.processLinks(allLinks, inProgress);
    }
+    console.log("14 - Caching and finalizing documents");
    return this.cacheAndFinalizeDocuments(documents, allLinks);
  }
 
@@ -259,14 +270,22 @@ export class WebScraperDataProvider {
    links = links.filter(link => !pdfLinks.includes(link) && !docLinks.includes(link));
 
+    console.log('processLinks - convertUrlsToDocuments...')
    let documents = await this.convertUrlsToDocuments(
      links,
      inProgress,
      allHtmls
    );
-    documents = await this.getSitemapData(this.urls[0], documents);
+    console.log('processLinks - convertUrlsToDocuments - done')
+    console.log('processLinks - getSitemapData...')
+    documents = await this.getSitemapData(this.urls[0], documents);
+    console.log('processLinks - getSitemapData - done')
+
+    console.log('processLinks - applyPathReplacements...')
    documents = this.applyPathReplacements(documents);
+    console.log('processLinks - applyPathReplacements - done')
+
    // documents = await this.applyImgAltText(documents);
 
    if (
@@ -275,6 +294,7 @@
    ) {
      documents = await generateCompletions(documents, this.extractorOptions);
    }
+    console.log('processLinks - returning...')
    return documents.concat(pdfDocuments).concat(docxDocuments);
  }
 
@@ -320,8 +340,11 @@ export class WebScraperDataProvider {
    documents: Document[],
    links: string[]
  ): Promise<Document[]> {
+    console.log('cacheAndFinalizeDocuments - 1')
    await this.setCachedDocuments(documents, links);
+    console.log('cacheAndFinalizeDocuments - 2')
    documents = this.removeChildLinks(documents);
+    console.log('cacheAndFinalizeDocuments - 3')
    return documents.splice(0, this.limit);
  }
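The numbered checkpoints added in index.ts above ('2. here OK!', '8 - Mapping URLs from links', 'cacheAndFinalizeDocuments - 1', and so on) are temporary tracing. If they need to outlive this debugging session, an env-gated helper keeps them out of production output; this is only a sketch, and the `DEBUG_SCRAPER` variable name is an assumption:

```ts
// Hypothetical debug helper; enable it by setting DEBUG_SCRAPER=true in the environment.
const DEBUG_ENABLED = process.env.DEBUG_SCRAPER === "true";

export function debugLog(...args: unknown[]): void {
  if (DEBUG_ENABLED) {
    console.log("[web-scraper]", ...args);
  }
}

// Example usage in place of the raw checkpoints above:
//   debugLog("8 - Mapping URLs from links");
//   debugLog("docs.length:", docs.length);
```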
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 3c7222c4..05a24c29 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -113,13 +113,25 @@ export async function scrapWithScrapingBee(
  pageOptions: { parsePDF?: boolean } = { parsePDF: true }
): Promise<{ content: string, pageStatusCode?: number, pageError?: string }> {
  try {
+    console.log("13. scrapWithScrapingBee - 1")
    const client = new ScrapingBeeClient(process.env.SCRAPING_BEE_API_KEY);
+    console.log("13. scrapWithScrapingBee - 2")
    const clientParams = await generateRequestParams(
      url,
      wait_browser,
      timeout,
    );
-
+    console.log({ url,
+      wait_browser,
+      timeout })
+    console.log({
+      ...clientParams,
+      params: {
+        ...clientParams.params,
+        'transparent_status_code': 'True'
+      }
+    })
+    console.log("13. scrapWithScrapingBee - 3")
    const response = await client.get({
      ...clientParams,
      params: {
@@ -127,7 +139,7 @@ export async function scrapWithScrapingBee(
        'transparent_status_code': 'True'
      }
    });
-
+    console.log("13. scrapWithScrapingBee - 4")
    const contentType = response.headers["content-type"];
    if (contentType && contentType.includes("application/pdf")) {
      return await fetchAndProcessPdf(url, pageOptions?.parsePDF);
@@ -140,6 +152,7 @@ export async function scrapWithScrapingBee(
      } catch (decodeError) {
        console.error(`[ScrapingBee][c] Error decoding response data for url: ${url} -> ${decodeError}`);
      }
+      console.log("13. scrapWithScrapingBee - 5 - returning ok")
      return { content: text, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined };
    }
  } catch (error) {
@@ -396,8 +409,13 @@ export async function scrapSingleUrl(
      screenshot = customScrapedContent.screenshot;
    }
 
+    console.log(
+      'got here'
+    )
+
    //* TODO: add an optional to return markdown or structured/extracted content
    let cleanedHtml = removeUnwantedElements(scraperResponse.text, pageOptions);
+    console.log('cleanedHtml')
 
    return {
      text: await parseMarkdown(cleanedHtml),
@@ -432,7 +450,9 @@ export async function scrapSingleUrl(
        break;
      }
 
+      console.log('attemptScraping - 1')
      const attempt = await attemptScraping(urlToScrap, scraper);
+      console.log('attemptScraping - 2 - return ok')
      text = attempt.text ?? '';
      html = attempt.html ?? '';
      screenshot = attempt.screenshot ?? '';
@@ -451,6 +471,7 @@ export async function scrapSingleUrl(
        console.info(`Falling back to ${scrapersInOrder[nextScraperIndex]}`);
      }
    }
+    console.log('ok... here we are...')
 
    if (!text) {
      throw new Error(`All scraping methods failed for URL: ${urlToScrap}`);
@@ -487,6 +508,7 @@ export async function scrapSingleUrl(
      };
    }
 
+    console.log('returning document...')
    return document;
  } catch (error) {
    console.error(`Error: ${error} - Failed to fetch URL: ${urlToScrap}`);
diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts
index c6dbf110..84e926e5 100644
--- a/apps/api/src/scraper/WebScraper/sitemap.ts
+++ b/apps/api/src/scraper/WebScraper/sitemap.ts
@@ -8,7 +8,7 @@ export async function getLinksFromSitemap(
  try {
    let content: string;
    try {
-      const response = await axios.get(sitemapUrl);
+      const response = await axios.get(sitemapUrl, { timeout: 3000 });
      content = response.data;
    } catch (error) {
      console.error(`Request failed for ${sitemapUrl}: ${error}`);
@@ -42,7 +42,7 @@
export const fetchSitemapData = async (url: string): Promise => {
  const sitemapUrl = url.endsWith("/sitemap.xml") ? url : `${url}/sitemap.xml`;
  try {
-    const response = await axios.get(sitemapUrl);
+    const response = await axios.get(sitemapUrl, { timeout: 3000 });
    if (response.status === 200) {
      const xml = response.data;
      const parsedXml = await parseStringPromise(xml);
diff --git a/apps/api/src/scraper/WebScraper/utils/blocklist.ts b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
index 7116963e..05958232 100644
--- a/apps/api/src/scraper/WebScraper/utils/blocklist.ts
+++ b/apps/api/src/scraper/WebScraper/utils/blocklist.ts
@@ -43,6 +43,10 @@ export function isUrlBlocked(url: string): boolean {
  }
 
  try {
+    if (!url.startsWith('http://') && !url.startsWith('https://')) {
+      url = 'https://' + url;
+    }
+
    const urlObj = new URL(url);
    const hostname = urlObj.hostname.toLowerCase();
diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
index 1a67d60b..30db27c9 100644
--- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
+++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts
@@ -71,7 +71,7 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro
  while (attempt < maxAttempts && !resultAvailable) {
    try {
-      resultResponse = await axios.get(resultUrl, { headers });
+      resultResponse = await axios.get(resultUrl, { headers, timeout: 6000 });
      if (resultResponse.status === 200) {
        resultAvailable = true; // Exit condition met
      } else {
diff --git a/apps/api/src/scraper/WebScraper/utils/utils.ts b/apps/api/src/scraper/WebScraper/utils/utils.ts
index 540f0bee..f9ce9b3c 100644
--- a/apps/api/src/scraper/WebScraper/utils/utils.ts
+++ b/apps/api/src/scraper/WebScraper/utils/utils.ts
@@ -4,7 +4,7 @@ export async function attemptScrapWithRequests(
  urlToScrap: string
): Promise {
  try {
-    const response = await axios.get(urlToScrap);
+    const response = await axios.get(urlToScrap, { timeout: 15000 });
 
    if (!response.data) {
      console.log("Failed normal requests as well");
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index a42b3e8c..7bb39a68 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -14,6 +14,7 @@ if(process.env.ENV === 'production') {
getWebScraperQueue().process(
  Math.floor(Number(process.env.NUM_WORKERS_PER_QUEUE ?? 8)),
  async function (job, done) {
+    console.log('getWebScraperQueue - start')
    try {
      job.progress({
        current: 1,
@@ -22,11 +23,13 @@ getWebScraperQueue().process(
        current_url: "",
      });
      const start = Date.now();
-
+      console.log('getWebScraperQueue - startWebScraperPipeline')
      const { success, message, docs } = await startWebScraperPipeline({ job });
+      console.log('getWebScraperQueue - startWebScraperPipeline - done')
      const end = Date.now();
      const timeTakenInSeconds = (end - start) / 1000;
+      console.log('docs.length:', docs.length)
      const data = {
        success: success,
        result: {
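The blocklist.ts hunk exists because the WHATWG `URL` constructor throws on scheme-less input such as "new.abb.com/sustainability/foundation", which is exactly the form of URL this diff puts into requests.http. Below is a standalone sketch of that guard for illustration; the `ensureProtocol` name is an assumption, not an identifier from this patch:

```ts
// Prepend a scheme when one is missing so `new URL(...)` accepts host-only input.
// Mirrors the guard added to isUrlBlocked in blocklist.ts.
export function ensureProtocol(url: string): string {
  if (!url.startsWith("http://") && !url.startsWith("https://")) {
    return "https://" + url;
  }
  return url;
}

// new URL("new.abb.com/sustainability/foundation")                            // throws TypeError: Invalid URL
// new URL(ensureProtocol("new.abb.com/sustainability/foundation")).hostname   // "new.abb.com"
```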