diff --git a/apps/api/src/controllers/v1/batch-scrape.ts b/apps/api/src/controllers/v1/batch-scrape.ts index 19ce3ba0..21c9745c 100644 --- a/apps/api/src/controllers/v1/batch-scrape.ts +++ b/apps/api/src/controllers/v1/batch-scrape.ts @@ -61,7 +61,7 @@ export async function batchScrapeController( } logger.debug("Batch scrape " + id + " starting", { - urlsLength: urls, + urlsLength: urls.length, appendToId: req.body.appendToId, account: req.account, }); diff --git a/apps/api/src/controllers/v1/crawl-status.ts b/apps/api/src/controllers/v1/crawl-status.ts index 1aec86c8..ce3831f2 100644 --- a/apps/api/src/controllers/v1/crawl-status.ts +++ b/apps/api/src/controllers/v1/crawl-status.ts @@ -157,10 +157,10 @@ export async function crawlStatusController( continue; } - if (job.returnvalue === undefined) { + if (job.returnvalue === undefined || job.returnvalue === null) { logger.warn( "Job was considered done, but returnvalue is undefined!", - { jobId: job.id, state }, + { jobId: job.id, state, returnvalue: job.returnvalue }, ); continue; } diff --git a/apps/api/src/lib/canonical-url.test.ts b/apps/api/src/lib/canonical-url.test.ts new file mode 100644 index 00000000..65171642 --- /dev/null +++ b/apps/api/src/lib/canonical-url.test.ts @@ -0,0 +1,91 @@ +import { normalizeUrl, normalizeUrlOnlyHostname } from './canonical-url'; + +describe('normalizeUrlOnlyHostname', () => { + it('should remove protocol and www from URL', () => { + const url = 'https://www.example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should remove only protocol if www is not present', () => { + const url = 'https://example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle URLs without protocol', () => { + const url = 'www.example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle URLs without protocol and www', () => { + const url = 'example.com'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle URLs with paths', () => { + const url = 'https://www.example.com/path/to/resource'; + const expected = 'example.com'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); + + it('should handle invalid URLs gracefully', () => { + const url = 'not a valid url'; + const expected = 'not a valid url'; + expect(normalizeUrlOnlyHostname(url)).toBe(expected); + }); +}); + + + +describe('normalizeUrl', () => { + it('should remove protocol and www from URL', () => { + const url = 'https://www.example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should remove only protocol if www is not present', () => { + const url = 'https://example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs without protocol', () => { + const url = 'www.example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs without protocol and www', () => { + const url = 'example.com'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs with paths', () => { + const url = 'https://www.example.com/path/to/resource'; + const expected = 'example.com/path/to/resource'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs with trailing 
slash', () => { + const url = 'https://www.example.com/'; + const expected = 'example.com'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle URLs with trailing slash and path', () => { + const url = 'https://www.example.com/path/'; + const expected = 'example.com/path'; + expect(normalizeUrl(url)).toBe(expected); + }); + + it('should handle invalid URLs gracefully', () => { + const url = 'not a valid url'; + const expected = 'not a valid url'; + expect(normalizeUrl(url)).toBe(expected); + }); +}); diff --git a/apps/api/src/lib/canonical-url.ts b/apps/api/src/lib/canonical-url.ts new file mode 100644 index 00000000..50570293 --- /dev/null +++ b/apps/api/src/lib/canonical-url.ts @@ -0,0 +1,19 @@ +export function normalizeUrl(url: string) { + url = url.replace(/^https?:\/\//, "").replace(/^www\./, ""); + if (url.endsWith("/")) { + url = url.slice(0, -1); + } + return url; +} + +export function normalizeUrlOnlyHostname(url: string) { + try { + const hostname = new URL(url).hostname; + return hostname.replace(/^www\./, ""); + } catch (error) { + return url + .replace(/^https?:\/\//, "") + .replace(/^www\./, "") + .split("/")[0]; + } +} diff --git a/apps/api/src/lib/extract/document-scraper.ts b/apps/api/src/lib/extract/document-scraper.ts index 04194b0b..91d515df 100644 --- a/apps/api/src/lib/extract/document-scraper.ts +++ b/apps/api/src/lib/extract/document-scraper.ts @@ -14,10 +14,13 @@ interface ScrapeDocumentOptions { timeout: number; } -export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: URLTrace[]): Promise { +export async function scrapeDocument( + options: ScrapeDocumentOptions, + urlTraces: URLTrace[], +): Promise { const trace = urlTraces.find((t) => t.url === options.url); if (trace) { - trace.status = 'scraped'; + trace.status = "scraped"; trace.timing.scrapedAt = new Date().toISOString(); } @@ -35,7 +38,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: mode: "single_urls", team_id: options.teamId, scrapeOptions: scrapeOptions.parse({}), - internalOptions: {}, + internalOptions: { + useCache: true, + }, plan: options.plan, origin: options.origin, is_scrape: true, @@ -61,9 +66,9 @@ export async function scrapeDocument(options: ScrapeDocumentOptions, urlTraces: } catch (error) { logger.error(`Error in scrapeDocument: ${error}`); if (trace) { - trace.status = 'error'; + trace.status = "error"; trace.error = error.message; } return null; } -} \ No newline at end of file +} diff --git a/apps/api/src/lib/extract/url-processor.ts b/apps/api/src/lib/extract/url-processor.ts index af250fcd..a5027fa9 100644 --- a/apps/api/src/lib/extract/url-processor.ts +++ b/apps/api/src/lib/extract/url-processor.ts @@ -84,7 +84,7 @@ export async function processUrl(options: ProcessUrlOptions, urlTraces: URLTrace }); // retry if only one url is returned - if (uniqueUrls.length === 1) { + if (uniqueUrls.length <= 1) { const retryMapResults = await getMapResults({ url: baseUrl, teamId: options.teamId, diff --git a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts index f48806fd..c0451df4 100644 --- a/apps/api/src/scraper/scrapeURL/engines/cache/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/cache/index.ts @@ -10,6 +10,9 @@ export async function scrapeCache(meta: Meta): Promise { const entry = await getEntryFromCache(key); if (entry === null) throw new EngineError("Cache missed"); + // Set fromCache flag to indicate this document was retrieved from cache + 
meta.internalOptions.fromCache = true; + return { url: entry.url, html: entry.html, diff --git a/apps/api/src/scraper/scrapeURL/engines/index.ts b/apps/api/src/scraper/scrapeURL/engines/index.ts index bf51ac94..e452f7fa 100644 --- a/apps/api/src/scraper/scrapeURL/engines/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/index.ts @@ -298,6 +298,15 @@ export function buildFallbackList(meta: Meta): { engine: Engine; unsupportedFeatures: Set; }[] { + + if (meta.internalOptions.useCache !== true) { + const cacheIndex = engines.indexOf("cache"); + if (cacheIndex !== -1) { + engines.splice(cacheIndex, 1); + } + } else { + meta.logger.debug("Cache engine enabled by useCache option"); + } const prioritySum = [...meta.featureFlags].reduce( (a, x) => a + featureFlagOptions[x].priority, 0, diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 549ce9d1..7f4a76e4 100644 --- a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -151,9 +151,10 @@ export type InternalOptions = { v0CrawlOnlyUrls?: boolean; v0DisableJsDom?: boolean; - + useCache?: boolean; disableSmartWaitCache?: boolean; // Passed along to fire-engine isBackgroundIndex?: boolean; + fromCache?: boolean; // Indicates if the document was retrieved from cache }; export type EngineResultsTracker = { diff --git a/apps/api/src/scraper/scrapeURL/transformers/cache.ts b/apps/api/src/scraper/scrapeURL/transformers/cache.ts index 523a8419..f2d7bcf4 100644 --- a/apps/api/src/scraper/scrapeURL/transformers/cache.ts +++ b/apps/api/src/scraper/scrapeURL/transformers/cache.ts @@ -3,6 +3,10 @@ import { Meta } from ".."; import { CacheEntry, cacheKey, saveEntryToCache } from "../../../lib/cache"; export function saveToCache(meta: Meta, document: Document): Document { + if (meta.internalOptions.useCache !== true) { + return document; + } + if ( document.metadata.statusCode! < 200 || document.metadata.statusCode! >= 300 @@ -15,6 +19,12 @@ export function saveToCache(meta: Meta, document: Document): Document { ); } + // If the document was retrieved from cache, we don't need to save it + if (meta.internalOptions.fromCache) { + return document; + } + + const key = cacheKey(meta.url, meta.options, meta.internalOptions); if (key !== null) { diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index c235a87b..422a8160 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -53,6 +53,8 @@ import { BLOCKLISTED_URL_MESSAGE } from "../lib/strings"; import { indexPage } from "../lib/extract/index/pinecone"; import { Document } from "../controllers/v1/types"; import { performExtraction } from "../lib/extract/extraction-service"; +import { supabase_service } from "../services/supabase"; +import { normalizeUrl, normalizeUrlOnlyHostname } from "../lib/canonical-url"; configDotenv(); @@ -80,6 +82,69 @@ const gotJobInterval = Number(process.env.CONNECTION_MONITOR_INTERVAL) || 20; async function finishCrawlIfNeeded(job: Job & { id: string }, sc: StoredCrawl) { if (await finishCrawl(job.data.crawl_id)) { + (async () => { + const originUrl = sc.originUrl ? 
normalizeUrlOnlyHostname(sc.originUrl) : undefined; + // Get all visited URLs from Redis + const visitedUrls = await redisConnection.smembers( + "crawl:" + job.data.crawl_id + ":visited", + ); + // Upload to Supabase if we have URLs and this is a crawl (not a batch scrape) + if (visitedUrls.length > 0 && job.data.crawlerOptions !== null && originUrl) { + // Fire and forget the upload to Supabase + try { + // Standardize URLs to canonical form (https, no www) + const standardizedUrls = [ + ...new Set( + visitedUrls.map((url) => { + return normalizeUrl(url); + }), + ), + ]; + // First check if entry exists for this origin URL + const { data: existingMap } = await supabase_service + .from("crawl_maps") + .select("urls") + .eq("origin_url", originUrl) + .single(); + + if (existingMap) { + // Merge URLs, removing duplicates + const mergedUrls = [ + ...new Set([...existingMap.urls, ...standardizedUrls]), + ]; + + const { error } = await supabase_service + .from("crawl_maps") + .update({ + urls: mergedUrls, + num_urls: mergedUrls.length, + updated_at: new Date().toISOString(), + }) + .eq("origin_url", originUrl); + + if (error) { + _logger.error("Failed to update crawl map", { error }); + } + } else { + // Insert new entry if none exists + const { error } = await supabase_service.from("crawl_maps").insert({ + origin_url: originUrl, + urls: standardizedUrls, + num_urls: standardizedUrls.length, + created_at: new Date().toISOString(), + updated_at: new Date().toISOString(), + }); + + if (error) { + _logger.error("Failed to save crawl map", { error }); + } + } + } catch (error) { + _logger.error("Error saving crawl map", { error }); + } + } + })(); + if (!job.data.v1) { const jobIDs = await getCrawlJobs(job.data.crawl_id); @@ -582,16 +647,16 @@ async function indexJob(job: Job & { id: string }, document: Document) { document.markdown && job.data.team_id === process.env.BACKGROUND_INDEX_TEAM_ID! ) { - indexPage({ - document: document, - originUrl: job.data.crawl_id - ? (await getCrawl(job.data.crawl_id))?.originUrl! - : document.metadata.sourceURL!, - crawlId: job.data.crawl_id, - teamId: job.data.team_id, - }).catch((error) => { - _logger.error("Error indexing page", { error }); - }); + // indexPage({ + // document: document, + // originUrl: job.data.crawl_id + // ? (await getCrawl(job.data.crawl_id))?.originUrl! + // : document.metadata.sourceURL!, + // crawlId: job.data.crawl_id, + // teamId: job.data.team_id, + // }).catch((error) => { + // _logger.error("Error indexing page", { error }); + // }); } } @@ -696,7 +761,8 @@ async function processJob(job: Job & { id: string }, token: string) { doc.metadata.url !== undefined && doc.metadata.sourceURL !== undefined && normalizeURL(doc.metadata.url, sc) !== - normalizeURL(doc.metadata.sourceURL, sc) + normalizeURL(doc.metadata.sourceURL, sc) && + job.data.crawlerOptions !== null // only on crawls, don't care on batch scrape ) { const crawler = crawlToCrawler(job.data.crawl_id, sc); if ( @@ -828,9 +894,10 @@ async function processJob(job: Job & { id: string }, token: string) { newJobId: jobId, }); } else { - logger.debug("Could not lock URL " + JSON.stringify(link), { - url: link, - }); + // TODO: removed this, ok? too many 'not useful' logs (?) Mogery! 
+ // logger.debug("Could not lock URL " + JSON.stringify(link), { + // url: link, + // }); } } } diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 68140437..6fdf196a 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.11.2", + "version": "1.11.3", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 60f485d0..474eea83 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -565,23 +565,39 @@ export default class FirecrawlApp { if ("data" in statusData) { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { + if (data.length === 0) { + break + } statusData = (await this.getRequest(statusData.next, headers)).data; data = data.concat(statusData.data); } allData = data; } } - return ({ + + let resp: CrawlStatusResponse | ErrorResponse = { success: response.data.success, status: response.data.status, total: response.data.total, completed: response.data.completed, creditsUsed: response.data.creditsUsed, expiresAt: new Date(response.data.expiresAt), - next: response.data.next, - data: allData, - error: response.data.error, - }) + data: allData + } + + if (!response.data.success && response.data.error) { + resp = { + ...resp, + success: false, + error: response.data.error + } as ErrorResponse; + } + + if (response.data.next) { + (resp as CrawlStatusResponse).next = response.data.next; + } + + return resp; } else { this.handleError(response, "check crawl status"); } @@ -799,23 +815,39 @@ export default class FirecrawlApp { if ("data" in statusData) { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { + if (data.length === 0) { + break + } statusData = (await this.getRequest(statusData.next, headers)).data; data = data.concat(statusData.data); } allData = data; } } - return ({ + + let resp: BatchScrapeStatusResponse | ErrorResponse = { success: response.data.success, status: response.data.status, total: response.data.total, completed: response.data.completed, creditsUsed: response.data.creditsUsed, expiresAt: new Date(response.data.expiresAt), - next: response.data.next, - data: allData, - error: response.data.error, - }) + data: allData + } + + if (!response.data.success && response.data.error) { + resp = { + ...resp, + success: false, + error: response.data.error + } as ErrorResponse; + } + + if (response.data.next) { + (resp as BatchScrapeStatusResponse).next = response.data.next; + } + + return resp; } else { this.handleError(response, "check batch scrape status"); } @@ -971,6 +1003,9 @@ export default class FirecrawlApp { if ("data" in statusData) { let data = statusData.data; while (typeof statusData === 'object' && 'next' in statusData) { + if (data.length === 0) { + break + } statusResponse = await this.getRequest(statusData.next, headers); statusData = statusResponse.data; data = data.concat(statusData.data); diff --git a/apps/playwright-service-ts/api.ts b/apps/playwright-service-ts/api.ts index eacb35ff..3b024347 100644 --- a/apps/playwright-service-ts/api.ts +++ b/apps/playwright-service-ts/api.ts @@ -2,7 +2,7 @@ import express, { Request, Response } from 'express'; import bodyParser from 'body-parser'; import { chromium, Browser, BrowserContext, Route, Request as PlaywrightRequest } from 
'playwright'; import dotenv from 'dotenv'; -import randomUseragent from 'random-useragent'; +import UserAgent from 'user-agents'; import { getError } from './helpers/get_error'; dotenv.config(); @@ -60,7 +60,7 @@ const initializeBrowser = async () => { ] }); - const userAgent = randomUseragent.getRandom(); + const userAgent = new UserAgent().toString(); const viewport = { width: 1280, height: 800 }; const contextOptions: any = { diff --git a/apps/playwright-service-ts/package.json b/apps/playwright-service-ts/package.json index fe15209f..af1c10be 100644 --- a/apps/playwright-service-ts/package.json +++ b/apps/playwright-service-ts/package.json @@ -16,12 +16,12 @@ "dotenv": "^16.4.5", "express": "^4.19.2", "playwright": "^1.45.0", - "random-useragent": "^0.5.0" + "user-agents": "^1.1.410" }, "devDependencies": { "@types/express": "^4.17.21", "@types/node": "^20.14.9", - "@types/random-useragent": "^0.3.3", + "@types/user-agents": "^1.0.4", "ts-node": "^10.9.2", "typescript": "^5.5.2" } diff --git a/apps/python-sdk/firecrawl/__init__.py b/apps/python-sdk/firecrawl/__init__.py index d4d246e9..5528b3b2 100644 --- a/apps/python-sdk/firecrawl/__init__.py +++ b/apps/python-sdk/firecrawl/__init__.py @@ -13,7 +13,7 @@ import os from .firecrawl import FirecrawlApp # noqa -__version__ = "1.8.0" +__version__ = "1.8.1" # Define the logger for the Firecrawl project logger: logging.Logger = logging.getLogger("firecrawl") diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py index 271a13f0..d3216405 100644 --- a/apps/python-sdk/firecrawl/firecrawl.py +++ b/apps/python-sdk/firecrawl/firecrawl.py @@ -250,6 +250,8 @@ class FirecrawlApp: if 'data' in status_data: data = status_data['data'] while 'next' in status_data: + if len(status_data['data']) == 0: + break next_url = status_data.get('next') if not next_url: logger.warning("Expected 'next' URL is missing.") @@ -266,17 +268,25 @@ class FirecrawlApp: logger.error(f"Error during pagination request: {e}") break status_data['data'] = data - - return { - 'success': True, + + response = { 'status': status_data.get('status'), 'total': status_data.get('total'), 'completed': status_data.get('completed'), 'creditsUsed': status_data.get('creditsUsed'), 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data'), - 'error': status_data.get('error'), - 'next': status_data.get('next', None) + 'data': status_data.get('data') + } + + if 'error' in status_data: + response['error'] = status_data['error'] + + if 'next' in status_data: + response['next'] = status_data['next'] + + return { + 'success': False if 'error' in status_data else True, + **response } else: self._handle_error(response, 'check crawl status') @@ -459,6 +469,8 @@ class FirecrawlApp: if 'data' in status_data: data = status_data['data'] while 'next' in status_data: + if len(status_data['data']) == 0: + break next_url = status_data.get('next') if not next_url: logger.warning("Expected 'next' URL is missing.") @@ -476,16 +488,24 @@ class FirecrawlApp: break status_data['data'] = data - return { - 'success': True, + response = { 'status': status_data.get('status'), 'total': status_data.get('total'), 'completed': status_data.get('completed'), 'creditsUsed': status_data.get('creditsUsed'), 'expiresAt': status_data.get('expiresAt'), - 'data': status_data.get('data'), - 'error': status_data.get('error'), - 'next': status_data.get('next', None) + 'data': status_data.get('data') + } + + if 'error' in status_data: + response['error'] = status_data['error'] + + 
if 'next' in status_data: + response['next'] = status_data['next'] + + return { + 'success': False if 'error' in status_data else True, + **response } else: self._handle_error(response, 'check batch scrape status') @@ -669,6 +689,8 @@ class FirecrawlApp: if 'data' in status_data: data = status_data['data'] while 'next' in status_data: + if len(status_data['data']) == 0: + break status_response = self._get_request(status_data['next'], headers) status_data = status_response.json() data.extend(status_data.get('data', []))
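
The canonical-url helpers added in apps/api/src/lib/canonical-url.ts are what the queue worker leans on when it standardizes visited URLs for crawl_maps. A minimal usage sketch of their behavior as pinned down by canonical-url.test.ts (the relative import path is illustrative):

```ts
// Behavior of the new helpers, as exercised by canonical-url.test.ts.
import { normalizeUrl, normalizeUrlOnlyHostname } from "./lib/canonical-url";

// normalizeUrl strips the protocol, a leading "www." and a trailing slash,
// but keeps the path:
normalizeUrl("https://www.example.com/path/"); // "example.com/path"
normalizeUrl("https://www.example.com/");      // "example.com"

// normalizeUrlOnlyHostname reduces to the bare hostname, falling back to
// string munging when the input is not a parseable URL:
normalizeUrlOnlyHostname("https://www.example.com/path/to/resource"); // "example.com"
normalizeUrlOnlyHostname("not a valid url");                          // "not a valid url"
```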
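
Across engines/cache, engines/index.ts, scrapeURL/index.ts, and transformers/cache.ts, the scrape changes introduce two internal flags: useCache opts a request into the cache engine, and fromCache marks a document that came out of the cache so it is not immediately written back. A condensed, non-mutating sketch of that interaction (the real code also checks status codes and spans the files above; the types here are trimmed to the relevant fields):

```ts
// Trimmed types: the real Meta, InternalOptions and Document carry many more fields.
type InternalOptions = { useCache?: boolean; fromCache?: boolean };
type Meta = { internalOptions: InternalOptions };
type Document = { url: string; html: string };

// The cache engine only participates in the fallback list when useCache was requested.
function buildFallbackList(meta: Meta, engines: string[]): string[] {
  return meta.internalOptions.useCache === true
    ? engines
    : engines.filter((engine) => engine !== "cache");
}

// On a cache hit, flag the document so the save transformer can skip it.
function scrapeCache(meta: Meta, cached: Document | null): Document {
  if (cached === null) throw new Error("Cache missed");
  meta.internalOptions.fromCache = true;
  return cached;
}

// Only save when the caller opted in, and never re-save a document that came from cache.
function saveToCache(meta: Meta, document: Document): Document {
  if (meta.internalOptions.useCache !== true) return document;
  if (meta.internalOptions.fromCache) return document;
  // ...persist the cache entry here (elided)...
  return document;
}
```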
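
The finishCrawlIfNeeded change persists a crawl map once a crawl completes: visited URLs are normalized, deduplicated, and either merged into an existing crawl_maps row for the same origin hostname or inserted as a new row. The merge itself reduces to set arithmetic (Supabase calls elided; the normalizer is passed in to keep the sketch self-contained):

```ts
// Dedupe the freshly visited URLs and fold them into any previously stored set.
function mergeCrawlMapUrls(
  existingUrls: string[] | undefined,
  visitedUrls: string[],
  normalize: (url: string) => string,
): string[] {
  const standardized = [...new Set(visitedUrls.map(normalize))];
  return existingUrls
    ? [...new Set([...existingUrls, ...standardized])]
    : standardized;
}
```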
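
The JS and Python SDK changes add the same guard to their status-pagination loops: stop following next links once the collected data is empty, so a response that keeps advertising next can no longer spin forever. The shape of that loop, with the SDK's HTTP helper and response type reduced to stand-ins:

```ts
// Simplified stand-ins for the SDK's status response and HTTP helper.
type StatusPage = { data: unknown[]; next?: string };

async function collectAllPages(
  first: StatusPage,
  getRequest: (url: string) => Promise<StatusPage>,
): Promise<unknown[]> {
  let page = first;
  let all = page.data;
  while (page.next) {
    if (all.length === 0) break; // new guard: nothing collected, stop paginating
    page = await getRequest(page.next);
    all = all.concat(page.data);
  }
  return all;
}
```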