From bdbc05a4c7c3723e688fb63a191c1ebcff070618 Mon Sep 17 00:00:00 2001 From: rafaelmmiller <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 13 Dec 2024 18:33:39 -0300 Subject: [PATCH 01/13] added check for object and trycatch as workaround for 502s --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 72 ++++++++++++++++-------------- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 30277cc3..74dfcb02 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.3", + "version": "1.9.4", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 37fc5ef0..a54506ad 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -462,7 +462,7 @@ export default class FirecrawlApp { let statusData = response.data if ("data" in statusData) { let data = statusData.data; - while ('next' in statusData) { + while (typeof statusData === 'object' && 'next' in statusData) { statusData = (await this.getRequest(statusData.next, headers)).data; data = data.concat(statusData.data); } @@ -691,7 +691,7 @@ export default class FirecrawlApp { let statusData = response.data if ("data" in statusData) { let data = statusData.data; - while ('next' in statusData) { + while (typeof statusData === 'object' && 'next' in statusData) { statusData = (await this.getRequest(statusData.next, headers)).data; data = data.concat(statusData.data); } @@ -850,42 +850,46 @@ export default class FirecrawlApp { headers: AxiosRequestHeaders, checkInterval: number ): Promise { - while (true) { - let statusResponse: AxiosResponse = await this.getRequest( - `${this.apiUrl}/v1/crawl/${id}`, - headers - ); - if (statusResponse.status === 200) { - let statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - let data = statusData.data; - while ('next' in statusData) { - statusResponse = await this.getRequest(statusData.next, headers); - statusData = statusResponse.data; - data = data.concat(statusData.data); + try { + while (true) { + let statusResponse: AxiosResponse = await this.getRequest( + `${this.apiUrl}/v1/crawl/${id}`, + headers + ); + if (statusResponse.status === 200) { + let statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + let data = statusData.data; + while (typeof statusData === 'object' && 'next' in statusData) { + statusResponse = await this.getRequest(statusData.next, headers); + statusData = statusResponse.data; + data = data.concat(statusData.data); + } + statusData.data = data; + return statusData; + } else { + throw new FirecrawlError("Crawl job completed but no data was returned", 500); } - statusData.data = data; - return statusData; - } else { - throw new FirecrawlError("Crawl job completed but no data was returned", 500); - } - } else if ( - ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) - ) { - checkInterval = Math.max(checkInterval, 2); - await new Promise((resolve) => - setTimeout(resolve, checkInterval * 1000) - ); + } else if ( + ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) + ) { + checkInterval = 
Math.max(checkInterval, 2); + await new Promise((resolve) => + setTimeout(resolve, checkInterval * 1000) + ); + } else { + throw new FirecrawlError( + `Crawl job failed or was stopped. Status: ${statusData.status}`, + 500 + ); + } } else { - throw new FirecrawlError( - `Crawl job failed or was stopped. Status: ${statusData.status}`, - 500 - ); + this.handleError(statusResponse, "check crawl status"); } - } else { - this.handleError(statusResponse, "check crawl status"); } + } catch (error: any) { + throw new FirecrawlError(error, 500); } } From e776847c71a393d9fc49f6e1883d3911170a5ba7 Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Tue, 17 Dec 2024 11:00:13 -0800 Subject: [PATCH 02/13] feat(js-sdk): improve API key handling for cloud vs self-hosted services in FirecrawlApp --- .../firecrawl/src/__tests__/index.test.ts | 27 +++++++---- .../__tests__/v1/e2e_withAuth/index.test.ts | 46 ++++++++++++++----- apps/js-sdk/firecrawl/src/index.ts | 12 +++-- 3 files changed, 61 insertions(+), 24 deletions(-) diff --git a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts index 92951237..6958abf8 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts @@ -1,9 +1,9 @@ -import { describe, test, expect, jest } from '@jest/globals'; -import axios from 'axios'; -import FirecrawlApp from '../index'; +import { describe, expect, jest, test } from '@jest/globals'; -import { readFile } from 'fs/promises'; +import FirecrawlApp from '../index'; +import axios from 'axios'; import { join } from 'path'; +import { readFile } from 'fs/promises'; // Mock jest and set the type jest.mock('axios'); @@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise { return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8') } +const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev"; + describe('the firecrawl JS SDK', () => { - test('Should require an API key to instantiate FirecrawlApp', async () => { - const fn = () => { - new FirecrawlApp({ apiKey: undefined }); - }; - expect(fn).toThrow('No API key provided'); + test('Should require an API key only for cloud service', async () => { + if (API_URL.includes('api.firecrawl.dev')) { + // Should throw for cloud service + expect(() => { + new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL }); + }).toThrow('No API key provided'); + } else { + // Should not throw for self-hosted + expect(() => { + new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL }); + }).not.toThrow(); + } }); test('Should return scraped data from a /scrape API call', async () => { diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index dea55846..60d0b44f 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY; const API_URL = process.env.API_URL ?? 
"https://api.firecrawl.dev"; describe('FirecrawlApp E2E Tests', () => { - test.concurrent('should throw error for no API key', async () => { - expect(() => { - new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); - }).toThrow("No API key provided"); + test.concurrent('should throw error for no API key only for cloud service', async () => { + if (API_URL.includes('api.firecrawl.dev')) { + // Should throw for cloud service + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).toThrow("No API key provided"); + } else { + // Should not throw for self-hosted + expect(() => { + new FirecrawlApp({ apiKey: null, apiUrl: API_URL }); + }).not.toThrow(); + } }); test.concurrent('should throw error for invalid API key on scrape', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on scrape', async () => { @@ -155,8 +168,13 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should throw error for invalid API key on crawl', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on crawl', async () => { @@ -337,8 +355,13 @@ describe('FirecrawlApp E2E Tests', () => { }, 60000); // 60 seconds timeout test.concurrent('should throw error for invalid API key on map', async () => { - const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + if (API_URL.includes('api.firecrawl.dev')) { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404"); + } else { + const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); + await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow(); + } }); test.concurrent('should throw error for blocklisted URL on map', async () => { @@ -355,8 +378,7 @@ describe('FirecrawlApp E2E Tests', () => { }, 30000); // 30 seconds timeout test.concurrent('should return successful response for valid map', async () => { - const 
app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); - const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; + const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse; expect(response).not.toBeNull(); expect(response.links?.length).toBeGreaterThan(0); diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 020a2293..6d9a0a73 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -289,17 +289,23 @@ export default class FirecrawlApp { public apiKey: string; public apiUrl: string; + private isCloudService(url: string): boolean { + return url.includes('api.firecrawl.dev'); + } + /** * Initializes a new instance of the FirecrawlApp class. * @param config - Configuration options for the FirecrawlApp instance. */ constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) { - if (typeof apiKey !== "string") { + const baseUrl = apiUrl || "https://api.firecrawl.dev"; + + if (this.isCloudService(baseUrl) && typeof apiKey !== "string") { throw new FirecrawlError("No API key provided", 401); } - this.apiKey = apiKey; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + this.apiKey = apiKey || ''; + this.apiUrl = baseUrl; } /** From 6002bf322872f1ad849bbecc0c26636e3d22b10f Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Thu, 19 Dec 2024 14:52:43 +0200 Subject: [PATCH 03/13] feat: dynamically import WebSocket module with error handling --- apps/js-sdk/firecrawl/src/index.ts | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 0d19ab60..7eef05f8 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,7 +1,24 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; import type * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; -import { WebSocket } from "isows"; + +import type { WebSocket as IsowsWebSocket } from 'isows'; +/** + * Dynamically imports the WebSocket class from 'isows'. + * If the import fails, WebSocket is set to null. + * This approach is used because some environments, such as Firebase Functions, + * might not support WebSocket natively. + */ +const WebSocket: typeof IsowsWebSocket | null = await (async () => { + try { + const module = await import('isows'); + return module.WebSocket; + } catch (error) { + console.error("Failed to load 'isows' module:", error); + return null; + } +})(); + import { TypedEventTarget } from "typescript-event-target"; /** @@ -938,6 +955,8 @@ export class CrawlWatcher extends TypedEventTarget { constructor(id: string, app: FirecrawlApp) { super(); + if(!WebSocket) + throw new FirecrawlError("WebSocket module failed to load. 
Your system might not support WebSocket.", 500); this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; this.data = []; From c8cd0148dd86e8903a3b8cf16b87841262d3c1e6 Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Thu, 19 Dec 2024 20:39:30 +0200 Subject: [PATCH 04/13] refactor: remove error logging for 'isows' module import in WebSocket initialization --- apps/js-sdk/firecrawl/src/index.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 7eef05f8..9e3a849f 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -14,7 +14,6 @@ const WebSocket: typeof IsowsWebSocket | null = await (async () => { const module = await import('isows'); return module.WebSocket; } catch (error) { - console.error("Failed to load 'isows' module:", error); return null; } })(); From f043f5fd61d229f08dbba3f16079061a0f2cecbf Mon Sep 17 00:00:00 2001 From: Thomas Kosmas Date: Sat, 21 Dec 2024 02:27:22 +0200 Subject: [PATCH 05/13] Enhance error handling in E2E tests and introduce CrawlWatcher tests - Updated error messages in E2E tests to provide clearer feedback for blocked URLs and invalid API keys. - Added new test suite for CrawlWatcher to ensure proper instantiation and error handling when WebSocket is unavailable. - Improved test conditions for URL scraping and crawling to reflect updated error responses. --- .../__tests__/e2e_v1_withAuth/index.test.ts | 4 +-- .../src/__tests__/CrawlWatcher.test.ts | 35 +++++++++++++++++++ .../src/__tests__/e2e_withAuth/index.test.ts | 8 ++--- .../__tests__/v1/e2e_withAuth/index.test.ts | 12 +++---- 4 files changed, 47 insertions(+), 12 deletions(-) create mode 100644 apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts diff --git a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts index 35ee2d89..f5fc5d5d 100644 --- a/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts +++ b/apps/api/src/__tests__/e2e_v1_withAuth/index.test.ts @@ -58,7 +58,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(403); expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", + "Request failed with status code 403. Error: URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions. ", ); }); @@ -757,7 +757,7 @@ describe("E2E Tests for v1 API Routes", () => { expect(response.statusCode).toBe(403); expect(response.body.error).toBe( - "URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions.", + "Request failed with status code 403. Error: URL is blocked intentionally. Firecrawl currently does not support scraping this site due to policy restrictions. 
", ); }); diff --git a/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts b/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts new file mode 100644 index 00000000..7f53828d --- /dev/null +++ b/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts @@ -0,0 +1,35 @@ +import { jest } from '@jest/globals'; + +describe('CrawlWatcher', () => { + const mockApiUrl = 'https://api.firecrawl.dev'; + const mockApiKey = 'test-api-key'; + + beforeEach(() => { + jest.resetModules(); + }); + + test('should create a CrawlWatcher instance successfully when isows is available', async () => { + await jest.unstable_mockModule('isows', () => ({ + WebSocket: jest.fn(), + })); + + const { default: FirecrawlApp, CrawlWatcher } = await import('../index'); + const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); + + const watcher = new CrawlWatcher('test-id', app); + expect(watcher).toBeInstanceOf(CrawlWatcher); + }); + + test('should throw when WebSocket is not available (isows import fails)', async () => { + await jest.unstable_mockModule('isows', () => { + throw new Error('Module not found'); + }); + + const { default: FirecrawlApp, CrawlWatcher, FirecrawlError } = await import('../index'); + const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); + + expect(() => { + new CrawlWatcher('test-id', app); + }).toThrow(FirecrawlError); + }); +}); diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index 7d107afe..6db51775 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -32,7 +32,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); await expect( invalidApp.scrapeUrl("https://roastmywebsite.ai") - ).rejects.toThrow("Request failed with status code 401"); + ).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 401"); } ); @@ -46,7 +46,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://facebook.com/fake-test"; await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow( - "Request failed with status code 403" + "Unexpected error occurred while trying to scrape URL. Status code: 403" ); } ); @@ -169,7 +169,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://twitter.com/fake-test"; await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow( - "Request failed with status code 403" + "Unexpected error occurred while trying to scrape URL. 
Status code: 403" ); } ); @@ -242,7 +242,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { const maxChecks = 15; let checks = 0; - while (statusResponse.status === "active" && checks < maxChecks) { + while ((statusResponse.status === "active" || statusResponse.status === "scraping" ) && checks < maxChecks) { await new Promise((resolve) => setTimeout(resolve, 5000)); expect(statusResponse.partial_data).not.toBeNull(); // expect(statusResponse.current).toBeGreaterThanOrEqual(1); diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index dea55846..76dc7f73 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -17,13 +17,13 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for invalid API key on scrape', async () => { const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); - await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401"); + await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 401"); }); test.concurrent('should throw error for blocklisted URL on scrape', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://facebook.com/fake-test"; - await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 403"); }); test.concurrent('should return successful response with valid preview token', async () => { @@ -61,7 +61,7 @@ describe('FirecrawlApp E2E Tests', () => { 'https://roastmywebsite.ai', { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - includeTags: ['h1'], + // includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, timeout: 30000, @@ -162,7 +162,7 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for blocklisted URL on crawl', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://twitter.com/fake-test"; - await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("URL is blocked. Firecrawl currently does not support social media scraping due to policy restrictions."); + await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403. Error: This website is no longer supported, please reach out to help@firecrawl.com for more info on how to activate it on your account. 
"); }); test.concurrent('should return successful response for crawl and wait for completion', async () => { @@ -212,7 +212,7 @@ describe('FirecrawlApp E2E Tests', () => { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - includeTags: ['h1'], + // includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, waitFor: 1000 @@ -334,7 +334,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(statusResponse.data[0].metadata).not.toHaveProperty("error"); } } - }, 60000); // 60 seconds timeout + }, 120000); // 120 seconds timeout test.concurrent('should throw error for invalid API key on map', async () => { const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL }); From 7366f36e397669fcb4260617707f63aa38ced375 Mon Sep 17 00:00:00 2001 From: RutamBhagat Date: Sat, 21 Dec 2024 07:03:16 -0800 Subject: [PATCH 06/13] docs(CONTRIBUTING.md): Add Docker Compose setup instructions to CONTRIBUTING.md --- CONTRIBUTING.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 773454e5..ce82236d 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -111,6 +111,20 @@ curl -X POST http://localhost:3002/v1/crawl \ }' ``` +### Alternative: Using Docker Compose + +For a simpler setup, you can use Docker Compose to run all services: + +1. Prerequisites: Make sure you have Docker and Docker Compose installed +2. Copy the `.env.example` file to `.env` in the `/apps/api/` directory and configure as needed +3. From the root directory, run: + +```bash +docker compose up +``` + +This will start Redis, the API server, and workers automatically in the correct configuration. + ## Tests: The best way to do this is run the test with `npm run test:local-no-auth` if you'd like to run the tests without authentication. From 18ceaf10a5fa162ee66a33967bffd682be1746f8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 23 Dec 2024 18:42:05 -0300 Subject: [PATCH 07/13] Update .gitignore --- apps/api/.gitignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/api/.gitignore b/apps/api/.gitignore index d9639687..52345155 100644 --- a/apps/api/.gitignore +++ b/apps/api/.gitignore @@ -9,3 +9,5 @@ dump.rdb .rdb .sentryclirc + +.env.* \ No newline at end of file From b1a5625b2208ea34096096bfdd1685f9879a1d1b Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 23 Dec 2024 18:45:51 -0300 Subject: [PATCH 08/13] Revert "Merge pull request #997 from mendableai/feat/sdk-without-ws" This reverts commit 53cda5f81c53d3de35925c610ce083923ca09fbe, reversing changes made to 51f79b55efadc53243a8c22d86bb2d08d878d524. 
--- .../src/__tests__/CrawlWatcher.test.ts | 35 ------------------- .../src/__tests__/e2e_withAuth/index.test.ts | 8 ++--- .../__tests__/v1/e2e_withAuth/index.test.ts | 8 ++--- apps/js-sdk/firecrawl/src/index.ts | 20 +---------- 4 files changed, 9 insertions(+), 62 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts diff --git a/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts b/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts deleted file mode 100644 index 7f53828d..00000000 --- a/apps/js-sdk/firecrawl/src/__tests__/CrawlWatcher.test.ts +++ /dev/null @@ -1,35 +0,0 @@ -import { jest } from '@jest/globals'; - -describe('CrawlWatcher', () => { - const mockApiUrl = 'https://api.firecrawl.dev'; - const mockApiKey = 'test-api-key'; - - beforeEach(() => { - jest.resetModules(); - }); - - test('should create a CrawlWatcher instance successfully when isows is available', async () => { - await jest.unstable_mockModule('isows', () => ({ - WebSocket: jest.fn(), - })); - - const { default: FirecrawlApp, CrawlWatcher } = await import('../index'); - const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); - - const watcher = new CrawlWatcher('test-id', app); - expect(watcher).toBeInstanceOf(CrawlWatcher); - }); - - test('should throw when WebSocket is not available (isows import fails)', async () => { - await jest.unstable_mockModule('isows', () => { - throw new Error('Module not found'); - }); - - const { default: FirecrawlApp, CrawlWatcher, FirecrawlError } = await import('../index'); - const app = new FirecrawlApp({ apiKey: mockApiKey, apiUrl: mockApiUrl }); - - expect(() => { - new CrawlWatcher('test-id', app); - }).toThrow(FirecrawlError); - }); -}); diff --git a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts index 6db51775..7d107afe 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/e2e_withAuth/index.test.ts @@ -32,7 +32,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); await expect( invalidApp.scrapeUrl("https://roastmywebsite.ai") - ).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 401"); + ).rejects.toThrow("Request failed with status code 401"); } ); @@ -46,7 +46,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://facebook.com/fake-test"; await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow( - "Unexpected error occurred while trying to scrape URL. Status code: 403" + "Request failed with status code 403" ); } ); @@ -169,7 +169,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { }); const blocklistedUrl = "https://twitter.com/fake-test"; await expect(app.crawlUrl(blocklistedUrl)).rejects.toThrow( - "Unexpected error occurred while trying to scrape URL. 
Status code: 403" + "Request failed with status code 403" ); } ); @@ -242,7 +242,7 @@ describe('FirecrawlApp<"v0"> E2E Tests', () => { const maxChecks = 15; let checks = 0; - while ((statusResponse.status === "active" || statusResponse.status === "scraping" ) && checks < maxChecks) { + while (statusResponse.status === "active" && checks < maxChecks) { await new Promise((resolve) => setTimeout(resolve, 5000)); expect(statusResponse.partial_data).not.toBeNull(); // expect(statusResponse.current).toBeGreaterThanOrEqual(1); diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts index 4f3a9cb2..e5c04209 100644 --- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts +++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts @@ -36,7 +36,7 @@ describe('FirecrawlApp E2E Tests', () => { test.concurrent('should throw error for blocklisted URL on scrape', async () => { const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const blocklistedUrl = "https://facebook.com/fake-test"; - await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 403"); + await expect(app.scrapeUrl(blocklistedUrl)).rejects.toThrow("Request failed with status code 403"); }); test.concurrent('should return successful response with valid preview token', async () => { @@ -74,7 +74,7 @@ describe('FirecrawlApp E2E Tests', () => { 'https://roastmywebsite.ai', { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - // includeTags: ['h1'], + includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, timeout: 30000, @@ -224,7 +224,7 @@ describe('FirecrawlApp E2E Tests', () => { scrapeOptions: { formats: ['markdown', 'html', 'rawHtml', 'screenshot', 'links'], headers: { "x-key": "test" }, - // includeTags: ['h1'], + includeTags: ['h1'], excludeTags: ['h2'], onlyMainContent: true, waitFor: 1000 @@ -346,7 +346,7 @@ describe('FirecrawlApp E2E Tests', () => { expect(statusResponse.data[0].metadata).not.toHaveProperty("error"); } } - }, 120000); // 120 seconds timeout + }, 60000); // 60 seconds timeout test.concurrent('should throw error for invalid API key on map', async () => { if (API_URL.includes('api.firecrawl.dev')) { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index feb69f03..d3ae630b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -1,23 +1,7 @@ import axios, { type AxiosResponse, type AxiosRequestHeaders, AxiosError } from "axios"; import type * as zt from "zod"; import { zodToJsonSchema } from "zod-to-json-schema"; - -import type { WebSocket as IsowsWebSocket } from 'isows'; -/** - * Dynamically imports the WebSocket class from 'isows'. - * If the import fails, WebSocket is set to null. - * This approach is used because some environments, such as Firebase Functions, - * might not support WebSocket natively. 
- */ -const WebSocket: typeof IsowsWebSocket | null = await (async () => { - try { - const module = await import('isows'); - return module.WebSocket; - } catch (error) { - return null; - } -})(); - +import { WebSocket } from "isows"; import { TypedEventTarget } from "typescript-event-target"; /** @@ -961,8 +945,6 @@ export class CrawlWatcher extends TypedEventTarget { constructor(id: string, app: FirecrawlApp) { super(); - if(!WebSocket) - throw new FirecrawlError("WebSocket module failed to load. Your system might not support WebSocket.", 500); this.id = id; this.ws = new WebSocket(`${app.apiUrl}/v1/crawl/${id}`, app.apiKey); this.status = "scraping"; From c911aad228ebb76384833e8e95e2a074f9d78030 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Mon, 23 Dec 2024 18:48:03 -0300 Subject: [PATCH 09/13] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 3334abe6..8945f3fa 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.8", + "version": "1.10.0", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From 4f65d350a3f98211c18d1695a3473e5d27e6ee6f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Thu, 26 Dec 2024 12:52:52 -0300 Subject: [PATCH 10/13] Update package.json --- apps/js-sdk/firecrawl/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 74dfcb02..29679b8b 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.9.4", + "version": "1.10.1", "description": "JavaScript SDK for Firecrawl API", "main": "dist/index.js", "types": "dist/index.d.ts", From f15ef0e7582e39e6ed17ee9f01e10969f02b4c93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 26 Dec 2024 20:29:09 +0100 Subject: [PATCH 11/13] feat(scrapeURL/fire-engine/chrome-cdp): handle file downloads --- .../engines/fire-engine/checkStatus.ts | 13 ++++++- .../scrapeURL/engines/fire-engine/index.ts | 39 ++++++++----------- apps/api/src/scraper/scrapeURL/error.ts | 8 ++++ apps/api/src/scraper/scrapeURL/index.ts | 5 +++ 4 files changed, 41 insertions(+), 24 deletions(-) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts index 6f65db98..e02e9dbb 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts @@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node"; import { z } from "zod"; import { robustFetch } from "../../lib/fetch"; -import { ActionError, EngineError, SiteError } from "../../error"; +import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error"; const successSchema = z.object({ jobId: z.string(), @@ -35,6 +35,12 @@ const successSchema = z.object({ }) .array() .optional(), + + // chrome-cdp only -- file download handler + file: z.object({ + name: z.string(), + content: z.string(), + }).optional().or(z.null()), }); export type FireEngineCheckStatusSuccess = z.infer; @@ -111,6 +117,11 @@ export async function fireEngineCheckStatus( status.error.includes("Chrome error: ") ) { throw new 
SiteError(status.error.split("Chrome error: ")[1]); + } else if ( + typeof status.error === "string" && + status.error.includes("File size exceeds") + ) { + throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]); } else if ( typeof status.error === "string" && // TODO: improve this later diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index d753465d..aa869836 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -13,7 +13,7 @@ import { FireEngineCheckStatusSuccess, StillProcessingError, } from "./checkStatus"; -import { ActionError, EngineError, SiteError, TimeoutError } from "../../error"; +import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error"; import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; @@ -71,7 +71,8 @@ async function performFireEngineScrape< } else if ( error instanceof EngineError || error instanceof SiteError || - error instanceof ActionError + error instanceof ActionError || + error instanceof UnsupportedFileError ) { logger.debug("Fire-engine scrape job failed.", { error, @@ -91,6 +92,19 @@ async function performFireEngineScrape< await new Promise((resolve) => setTimeout(resolve, 250)); } + specialtyScrapeCheck( + logger.child({ + method: "performFireEngineScrape/specialtyScrapeCheck", + }), + status.responseHeaders, + ); + + if (status.file) { + const content = status.file.content; + delete status.file; + status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag + } + return status; } @@ -160,13 +174,6 @@ export async function scrapeURLWithFireEngineChromeCDP( timeout, ); - specialtyScrapeCheck( - meta.logger.child({ - method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck", - }), - response.responseHeaders, - ); - if ( meta.options.formats.includes("screenshot") || meta.options.formats.includes("screenshot@fullPage") @@ -241,13 +248,6 @@ export async function scrapeURLWithFireEnginePlaywright( timeout, ); - specialtyScrapeCheck( - meta.logger.child({ - method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck", - }), - response.responseHeaders, - ); - if (!response.url) { meta.logger.warn("Fire-engine did not return the response's URL", { response, @@ -301,13 +301,6 @@ export async function scrapeURLWithFireEngineTLSClient( timeout, ); - specialtyScrapeCheck( - meta.logger.child({ - method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck", - }), - response.responseHeaders, - ); - if (!response.url) { meta.logger.warn("Fire-engine did not return the response's URL", { response, diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts index 689f90c8..bff3a492 100644 --- a/apps/api/src/scraper/scrapeURL/error.ts +++ b/apps/api/src/scraper/scrapeURL/error.ts @@ -64,3 +64,11 @@ export class ActionError extends Error { this.code = code; } } + +export class UnsupportedFileError extends Error { + public reason: string; + constructor(reason: string) { + super("Scrape resulted in unsupported file: " + reason); + this.reason = reason; + } +} diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts index 1df812bd..130ef9ee 100644 --- 
a/apps/api/src/scraper/scrapeURL/index.ts +++ b/apps/api/src/scraper/scrapeURL/index.ts @@ -19,6 +19,7 @@ import { RemoveFeatureError, SiteError, TimeoutError, + UnsupportedFileError, } from "./error"; import { executeTransformers } from "./transformers"; import { LLMRefusalError } from "./transformers/llmExtract"; @@ -292,6 +293,8 @@ async function scrapeURLLoop(meta: Meta): Promise { throw error; } else if (error instanceof ActionError) { throw error; + } else if (error instanceof UnsupportedFileError) { + throw error; } else { Sentry.captureException(error); meta.logger.info( @@ -414,6 +417,8 @@ export async function scrapeURL( meta.logger.warn("scrapeURL: Site failed to load in browser", { error }); } else if (error instanceof ActionError) { meta.logger.warn("scrapeURL: Action(s) failed to complete", { error }); + } else if (error instanceof UnsupportedFileError) { + meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error }); } else { Sentry.captureException(error); meta.logger.error("scrapeURL: Unexpected error happened", { error }); From c543f4f76c0fd187be684b8571d1528dad84744e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Thu, 26 Dec 2024 20:31:51 +0100 Subject: [PATCH 12/13] feat(scrapeURL/pdf): update mock Blob implementation to pass TypeScript --- apps/api/src/scraper/scrapeURL/engines/pdf/index.ts | 3 +++ 1 file changed, 3 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts index 9d2f11b1..6bac2ba4 100644 --- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts @@ -32,6 +32,9 @@ async function scrapePDFWithLlamaParse( tempFilePath, ) as unknown as ReadableStream; }, + bytes() { + throw Error("Unimplemented in mock Blob: bytes"); + }, arrayBuffer() { throw Error("Unimplemented in mock Blob: arrayBuffer"); }, From 477295131372011ac9d0d5217925d924871985ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Fri, 27 Dec 2024 16:44:41 +0100 Subject: [PATCH 13/13] feat(scrapeURL/fire-engine): explicitly delete job after scrape --- .../scrapeURL/engines/fire-engine/index.ts | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts index aa869836..aeafebea 100644 --- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts +++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts @@ -17,6 +17,7 @@ import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError import * as Sentry from "@sentry/node"; import { Action } from "../../../../lib/entities"; import { specialtyScrapeCheck } from "../utils/specialtyHandler"; +import { fireEngineDelete } from "./delete"; // This function does not take `Meta` on purpose. It may not access any // meta values to construct the request -- that must be done by the @@ -44,6 +45,13 @@ async function performFireEngineScrape< while (status === undefined) { if (errors.length >= errorLimit) { logger.error("Error limit hit.", { errors }); + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + afterErrors: errors, + }), + scrape.jobId, + ); throw new Error("Error limit hit. 
See e.cause.errors for errors.", { cause: { errors }, }); @@ -74,6 +82,13 @@ async function performFireEngineScrape< error instanceof ActionError || error instanceof UnsupportedFileError ) { + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + afterError: error, + }), + scrape.jobId, + ); logger.debug("Fire-engine scrape job failed.", { error, jobId: scrape.jobId, @@ -105,6 +120,13 @@ async function performFireEngineScrape< status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag } + fireEngineDelete( + logger.child({ + method: "performFireEngineScrape/fireEngineDelete", + }), + scrape.jobId, + ); + return status; }
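
Note on patch 13: the new `fireEngineDelete` calls are not awaited in the hunks above, so job cleanup is best-effort and never blocks or fails the scrape itself. The module imported from `./delete` is not included in this series; the sketch below only illustrates the `(logger, jobId)` call shape used above, and the endpoint path, environment variable name, and minimal logger type are assumptions rather than the actual implementation.

```ts
// Hypothetical sketch of a fire-engine delete helper; the real module is imported
// from "./delete" in the fire-engine engine directory and is not part of this series.
type MinimalLogger = {
  warn: (message: string, meta?: Record<string, unknown>) => void;
};

export async function fireEngineDelete(logger: MinimalLogger, jobId: string): Promise<void> {
  try {
    // Endpoint and env var name are assumed for illustration only.
    await fetch(`${process.env.FIRE_ENGINE_BETA_URL ?? ""}/scrape/${jobId}`, {
      method: "DELETE",
    });
  } catch (error) {
    // Callers fire-and-forget this, so failures are logged rather than thrown.
    logger.warn("Failed to delete fire-engine job.", { error, jobId });
  }
}
```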