diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 773454e5..ce82236d 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -111,6 +111,20 @@ curl -X POST http://localhost:3002/v1/crawl \
     }'
 ```
 
+### Alternative: Using Docker Compose
+
+For a simpler setup, you can use Docker Compose to run all services:
+
+1. Prerequisites: Make sure you have Docker and Docker Compose installed
+2. Copy the `.env.example` file to `.env` in the `/apps/api/` directory and configure as needed
+3. From the root directory, run:
+
+```bash
+docker compose up
+```
+
+This will start Redis, the API server, and workers automatically in the correct configuration.
+
 ## Tests:
 
 The best way to do this is run the test with `npm run test:local-no-auth` if you'd like to run the tests without authentication.
diff --git a/apps/api/.gitignore b/apps/api/.gitignore
index d9639687..52345155 100644
--- a/apps/api/.gitignore
+++ b/apps/api/.gitignore
@@ -9,3 +9,5 @@ dump.rdb
 .rdb
 
 .sentryclirc
+
+.env.*
\ No newline at end of file
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts
index 6f65db98..e02e9dbb 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/checkStatus.ts
@@ -3,7 +3,7 @@ import * as Sentry from "@sentry/node";
 import { z } from "zod";
 
 import { robustFetch } from "../../lib/fetch";
-import { ActionError, EngineError, SiteError } from "../../error";
+import { ActionError, EngineError, SiteError, UnsupportedFileError } from "../../error";
 
 const successSchema = z.object({
   jobId: z.string(),
@@ -35,6 +35,12 @@
     })
       .array()
       .optional(),
+
+  // chrome-cdp only -- file download handler
+  file: z.object({
+    name: z.string(),
+    content: z.string(),
+  }).optional().or(z.null()),
 });
 
 export type FireEngineCheckStatusSuccess = z.infer<typeof successSchema>;
@@ -111,6 +117,11 @@ export async function fireEngineCheckStatus(
     status.error.includes("Chrome error: ")
   ) {
     throw new SiteError(status.error.split("Chrome error: ")[1]);
+  } else if (
+    typeof status.error === "string" &&
+    status.error.includes("File size exceeds")
+  ) {
+    throw new UnsupportedFileError("File size exceeds " + status.error.split("File size exceeds ")[1]);
   } else if (
     typeof status.error === "string" &&
     // TODO: improve this later
diff --git a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
index d753465d..aeafebea 100644
--- a/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/fire-engine/index.ts
@@ -13,10 +13,11 @@ import {
   FireEngineCheckStatusSuccess,
   StillProcessingError,
 } from "./checkStatus";
-import { ActionError, EngineError, SiteError, TimeoutError } from "../../error";
+import { ActionError, EngineError, SiteError, TimeoutError, UnsupportedFileError } from "../../error";
 import * as Sentry from "@sentry/node";
 import { Action } from "../../../../lib/entities";
 import { specialtyScrapeCheck } from "../utils/specialtyHandler";
+import { fireEngineDelete } from "./delete";
 
 // This function does not take `Meta` on purpose. It may not access any
 // meta values to construct the request -- that must be done by the
@@ -44,6 +45,13 @@ async function performFireEngineScrape<
   while (status === undefined) {
     if (errors.length >= errorLimit) {
       logger.error("Error limit hit.", { errors });
+      fireEngineDelete(
+        logger.child({
+          method: "performFireEngineScrape/fireEngineDelete",
+          afterErrors: errors,
+        }),
+        scrape.jobId,
+      );
       throw new Error("Error limit hit. See e.cause.errors for errors.", {
         cause: { errors },
       });
@@ -71,8 +79,16 @@
     } else if (
       error instanceof EngineError ||
       error instanceof SiteError ||
-      error instanceof ActionError
+      error instanceof ActionError ||
+      error instanceof UnsupportedFileError
     ) {
+      fireEngineDelete(
+        logger.child({
+          method: "performFireEngineScrape/fireEngineDelete",
+          afterError: error,
+        }),
+        scrape.jobId,
+      );
       logger.debug("Fire-engine scrape job failed.", {
         error,
         jobId: scrape.jobId,
@@ -91,6 +107,26 @@
     await new Promise((resolve) => setTimeout(resolve, 250));
   }
 
+  specialtyScrapeCheck(
+    logger.child({
+      method: "performFireEngineScrape/specialtyScrapeCheck",
+    }),
+    status.responseHeaders,
+  );
+
+  if (status.file) {
+    const content = status.file.content;
+    delete status.file;
+    status.content = Buffer.from(content, "base64").toString("utf8"); // TODO: handle other encodings via Content-Type tag
+  }
+
+  fireEngineDelete(
+    logger.child({
+      method: "performFireEngineScrape/fireEngineDelete",
+    }),
+    scrape.jobId,
+  );
+
   return status;
 }
 
@@ -160,13 +196,6 @@ export async function scrapeURLWithFireEngineChromeCDP(
     timeout,
   );
 
-  specialtyScrapeCheck(
-    meta.logger.child({
-      method: "scrapeURLWithFireEngineChromeCDP/specialtyScrapeCheck",
-    }),
-    response.responseHeaders,
-  );
-
   if (
     meta.options.formats.includes("screenshot") ||
     meta.options.formats.includes("screenshot@fullPage")
@@ -241,13 +270,6 @@ export async function scrapeURLWithFireEnginePlaywright(
     timeout,
   );
 
-  specialtyScrapeCheck(
-    meta.logger.child({
-      method: "scrapeURLWithFireEnginePlaywright/specialtyScrapeCheck",
-    }),
-    response.responseHeaders,
-  );
-
   if (!response.url) {
     meta.logger.warn("Fire-engine did not return the response's URL", {
       response,
@@ -301,13 +323,6 @@ export async function scrapeURLWithFireEngineTLSClient(
     timeout,
  );
 
-  specialtyScrapeCheck(
-    meta.logger.child({
-      method: "scrapeURLWithFireEngineTLSClient/specialtyScrapeCheck",
-    }),
-    response.responseHeaders,
-  );
-
   if (!response.url) {
     meta.logger.warn("Fire-engine did not return the response's URL", {
       response,
diff --git a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
index 9d2f11b1..6bac2ba4 100644
--- a/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
+++ b/apps/api/src/scraper/scrapeURL/engines/pdf/index.ts
@@ -32,6 +32,9 @@ async function scrapePDFWithLlamaParse(
         tempFilePath,
       ) as unknown as ReadableStream;
     },
+    bytes() {
+      throw Error("Unimplemented in mock Blob: bytes");
+    },
     arrayBuffer() {
       throw Error("Unimplemented in mock Blob: arrayBuffer");
     },
diff --git a/apps/api/src/scraper/scrapeURL/error.ts b/apps/api/src/scraper/scrapeURL/error.ts
index 689f90c8..bff3a492 100644
--- a/apps/api/src/scraper/scrapeURL/error.ts
+++ b/apps/api/src/scraper/scrapeURL/error.ts
@@ -64,3 +64,11 @@ export class ActionError extends Error {
     this.code = code;
   }
 }
+
+export class UnsupportedFileError extends Error {
+  public reason: string;
+  constructor(reason: string) {
+    super("Scrape resulted in unsupported file: " + reason);
+    this.reason = reason;
+  }
+}
diff --git a/apps/api/src/scraper/scrapeURL/index.ts b/apps/api/src/scraper/scrapeURL/index.ts
index 1df812bd..130ef9ee 100644
--- a/apps/api/src/scraper/scrapeURL/index.ts
+++ b/apps/api/src/scraper/scrapeURL/index.ts
@@ -19,6 +19,7 @@ import {
   RemoveFeatureError,
   SiteError,
   TimeoutError,
+  UnsupportedFileError,
 } from "./error";
 import { executeTransformers } from "./transformers";
 import { LLMRefusalError } from "./transformers/llmExtract";
@@ -292,6 +293,8 @@ async function scrapeURLLoop(meta: Meta): Promise {
         throw error;
       } else if (error instanceof ActionError) {
         throw error;
+      } else if (error instanceof UnsupportedFileError) {
+        throw error;
       } else {
         Sentry.captureException(error);
         meta.logger.info(
@@ -414,6 +417,8 @@
       meta.logger.warn("scrapeURL: Site failed to load in browser", { error });
     } else if (error instanceof ActionError) {
       meta.logger.warn("scrapeURL: Action(s) failed to complete", { error });
+    } else if (error instanceof UnsupportedFileError) {
+      meta.logger.warn("scrapeURL: Tried to scrape unsupported file", { error });
     } else {
       Sentry.captureException(error);
       meta.logger.error("scrapeURL: Unexpected error happened", { error });
diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json
index 3334abe6..29679b8b 100644
--- a/apps/js-sdk/firecrawl/package.json
+++ b/apps/js-sdk/firecrawl/package.json
@@ -1,6 +1,6 @@
 {
   "name": "@mendable/firecrawl-js",
-  "version": "1.9.8",
+  "version": "1.10.1",
   "description": "JavaScript SDK for Firecrawl API",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
diff --git a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts
index 92951237..6958abf8 100644
--- a/apps/js-sdk/firecrawl/src/__tests__/index.test.ts
+++ b/apps/js-sdk/firecrawl/src/__tests__/index.test.ts
@@ -1,9 +1,9 @@
-import { describe, test, expect, jest } from '@jest/globals';
-import axios from 'axios';
-import FirecrawlApp from '../index';
+import { describe, expect, jest, test } from '@jest/globals';
 
-import { readFile } from 'fs/promises';
+import FirecrawlApp from '../index';
+import axios from 'axios';
 import { join } from 'path';
+import { readFile } from 'fs/promises';
 
 // Mock jest and set the type
 jest.mock('axios');
@@ -14,13 +14,22 @@ async function loadFixture(name: string): Promise<string> {
   return await readFile(join(__dirname, 'fixtures', `${name}.json`), 'utf-8')
 }
 
+const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
+
 describe('the firecrawl JS SDK', () => {
-  test('Should require an API key to instantiate FirecrawlApp', async () => {
-    const fn = () => {
-      new FirecrawlApp({ apiKey: undefined });
-    };
-    expect(fn).toThrow('No API key provided');
+  test('Should require an API key only for cloud service', async () => {
+    if (API_URL.includes('api.firecrawl.dev')) {
+      // Should throw for cloud service
+      expect(() => {
+        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
+      }).toThrow('No API key provided');
+    } else {
+      // Should not throw for self-hosted
+      expect(() => {
+        new FirecrawlApp({ apiKey: undefined, apiUrl: API_URL });
+      }).not.toThrow();
+    }
   });
 
   test('Should return scraped data from a /scrape API call', async () => {
diff --git a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
index 81b0a523..e5c04209 100644
--- a/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
+++ b/apps/js-sdk/firecrawl/src/__tests__/v1/e2e_withAuth/index.test.ts
@@ -9,15 +9,28 @@ const TEST_API_KEY = process.env.TEST_API_KEY;
 const API_URL = process.env.API_URL ?? "https://api.firecrawl.dev";
 
 describe('FirecrawlApp E2E Tests', () => {
-  test.concurrent('should throw error for no API key', async () => {
-    expect(() => {
-      new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
-    }).toThrow("No API key provided");
+  test.concurrent('should throw error for no API key only for cloud service', async () => {
+    if (API_URL.includes('api.firecrawl.dev')) {
+      // Should throw for cloud service
+      expect(() => {
+        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      }).toThrow("No API key provided");
+    } else {
+      // Should not throw for self-hosted
+      expect(() => {
+        new FirecrawlApp({ apiKey: null, apiUrl: API_URL });
+      }).not.toThrow();
+    }
   });
 
   test.concurrent('should throw error for invalid API key on scrape', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-    await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).rejects.toThrow("Unexpected error occurred while trying to scrape URL. Status code: 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.scrapeUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });
 
   test.concurrent('should throw error for blocklisted URL on scrape', async () => {
@@ -155,8 +168,13 @@
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should throw error for invalid API key on crawl', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-    await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.crawlUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });
 
   test.concurrent('should return successful response for crawl and wait for completion', async () => {
@@ -331,8 +349,13 @@
   }, 60000); // 60 seconds timeout
 
   test.concurrent('should throw error for invalid API key on map', async () => {
-    const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
-    await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 401");
+    if (API_URL.includes('api.firecrawl.dev')) {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).rejects.toThrow("Request failed with status code 404");
+    } else {
+      const invalidApp = new FirecrawlApp({ apiKey: "invalid_api_key", apiUrl: API_URL });
+      await expect(invalidApp.mapUrl('https://roastmywebsite.ai')).resolves.not.toThrow();
+    }
   });
 
   test.concurrent('should throw error for blocklisted URL on map', async () => {
@@ -349,8 +372,7 @@
   }, 30000); // 30 seconds timeout
 
   test.concurrent('should return successful response for valid map', async () => {
-    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL });
-    const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
+    const app = new FirecrawlApp({ apiKey: TEST_API_KEY, apiUrl: API_URL }); const response = await app.mapUrl('https://roastmywebsite.ai') as MapResponse;
 
     expect(response).not.toBeNull();
     expect(response.links?.length).toBeGreaterThan(0);
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index 43b77825..29fabf5d 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -290,17 +290,23 @@
 export default class FirecrawlApp {
   public apiKey: string;
   public apiUrl: string;
 
+  private isCloudService(url: string): boolean {
+    return url.includes('api.firecrawl.dev');
+  }
+
   /**
    * Initializes a new instance of the FirecrawlApp class.
    * @param config - Configuration options for the FirecrawlApp instance.
   */
  constructor({ apiKey = null, apiUrl = null }: FirecrawlAppConfig) {
-    if (typeof apiKey !== "string") {
+    const baseUrl = apiUrl || "https://api.firecrawl.dev";
+
+    if (this.isCloudService(baseUrl) && typeof apiKey !== "string") {
       throw new FirecrawlError("No API key provided", 401);
     }
-    this.apiKey = apiKey;
-    this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+    this.apiKey = apiKey || '';
+    this.apiUrl = baseUrl;
   }
 
   /**
@@ -464,7 +470,7 @@
       let statusData = response.data
       if ("data" in statusData) {
         let data = statusData.data;
-        while ('next' in statusData) {
+        while (typeof statusData === 'object' && 'next' in statusData) {
          statusData = (await this.getRequest(statusData.next, headers)).data;
           data = data.concat(statusData.data);
         }
@@ -698,7 +704,7 @@
       let statusData = response.data
       if ("data" in statusData) {
         let data = statusData.data;
-        while ('next' in statusData) {
+        while (typeof statusData === 'object' && 'next' in statusData) {
          statusData = (await this.getRequest(statusData.next, headers)).data;
           data = data.concat(statusData.data);
         }
@@ -857,42 +863,46 @@
     headers: AxiosRequestHeaders,
     checkInterval: number
   ): Promise {
-    while (true) {
-      let statusResponse: AxiosResponse = await this.getRequest(
-        `${this.apiUrl}/v1/crawl/${id}`,
-        headers
-      );
-      if (statusResponse.status === 200) {
-        let statusData = statusResponse.data;
-        if (statusData.status === "completed") {
-          if ("data" in statusData) {
-            let data = statusData.data;
-            while ('next' in statusData) {
-              statusResponse = await this.getRequest(statusData.next, headers);
-              statusData = statusResponse.data;
-              data = data.concat(statusData.data);
+    try {
+      while (true) {
+        let statusResponse: AxiosResponse = await this.getRequest(
+          `${this.apiUrl}/v1/crawl/${id}`,
+          headers
+        );
+        if (statusResponse.status === 200) {
+          let statusData = statusResponse.data;
+          if (statusData.status === "completed") {
+            if ("data" in statusData) {
+              let data = statusData.data;
+              while (typeof statusData === 'object' && 'next' in statusData) {
+                statusResponse = await this.getRequest(statusData.next, headers);
+                statusData = statusResponse.data;
+                data = data.concat(statusData.data);
+              }
+              statusData.data = data;
+              return statusData;
+            } else {
+              throw new FirecrawlError("Crawl job completed but no data was returned", 500);
             }
-            statusData.data = data;
-            return statusData;
-          } else {
-            throw new FirecrawlError("Crawl job completed but no data was returned", 500);
-          }
-        } else if (
-          ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
-        ) {
-          checkInterval = Math.max(checkInterval, 2);
-          await new Promise((resolve) =>
-            setTimeout(resolve, checkInterval * 1000)
-          );
+          } else if (
+            ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status)
+          ) {
+            checkInterval = Math.max(checkInterval, 2);
+            await new Promise((resolve) =>
+              setTimeout(resolve, checkInterval * 1000)
+            );
+          } else {
+            throw new FirecrawlError(
+              `Crawl job failed or was stopped. Status: ${statusData.status}`,
+              500
+            );
+          }
         } else {
-          throw new FirecrawlError(
-            `Crawl job failed or was stopped. Status: ${statusData.status}`,
-            500
-          );
+          this.handleError(statusResponse, "check crawl status");
         }
-      } else {
-        this.handleError(statusResponse, "check crawl status");
       }
+    } catch (error: any) {
+      throw new FirecrawlError(error, 500);
     }
   }
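A minimal usage sketch of the relaxed SDK constructor check above. The behavior (key required only when `apiUrl` points at `api.firecrawl.dev`, otherwise optional) comes from this diff; the localhost URL matches the CONTRIBUTING example, while the env var name and placeholder key are illustrative only:

```ts
import FirecrawlApp from "@mendable/firecrawl-js";

// Self-hosted instance: no API key needed, constructor no longer throws.
const selfHosted = new FirecrawlApp({ apiKey: null, apiUrl: "http://localhost:3002" });

// Cloud service: omitting the key still throws FirecrawlError("No API key provided", 401).
try {
  new FirecrawlApp({ apiKey: null, apiUrl: "https://api.firecrawl.dev" });
} catch (err) {
  console.error(err);
}

// Normal cloud usage is unchanged (FIRECRAWL_API_KEY / "fc-YOUR-KEY" are placeholders).
const cloud = new FirecrawlApp({ apiKey: process.env.FIRECRAWL_API_KEY ?? "fc-YOUR-KEY" });

console.log(selfHosted.apiUrl, cloud.apiUrl);
```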
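For the fire-engine change, `performFireEngineScrape` now decodes the chrome-cdp file-download payload before returning the job status. A small standalone sketch of that decode step, under the assumption (stated in the schema above) that `file.content` arrives base64-encoded; the helper name is hypothetical and other encodings remain a TODO upstream:

```ts
// Decode a downloaded file body the way the scraper stores it as page content.
function decodeDownloadedFile(file: { name: string; content: string }): string {
  return Buffer.from(file.content, "base64").toString("utf8");
}

// Round-trip example: encode "hello", then decode it back.
const sample = { name: "report.txt", content: Buffer.from("hello").toString("base64") };
console.log(decodeDownloadedFile(sample)); // "hello"
```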