From a0b8a6cad3613cc2cecc1acacdf1a5dcfd50363f Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 13:43:33 +0200 Subject: [PATCH 01/33] feat(js-sdk): build both cjs and esm versions --- apps/js-sdk/firecrawl/build/cjs/index.js | 271 ++++++++++++++++++ apps/js-sdk/firecrawl/build/cjs/package.json | 1 + .../js-sdk/firecrawl/build/{ => esm}/index.js | 0 apps/js-sdk/firecrawl/build/esm/package.json | 1 + apps/js-sdk/firecrawl/package.json | 8 +- apps/js-sdk/firecrawl/types/index.d.ts | 22 +- apps/js-sdk/package-lock.json | 34 ++- apps/js-sdk/package.json | 1 + 8 files changed, 324 insertions(+), 14 deletions(-) create mode 100644 apps/js-sdk/firecrawl/build/cjs/index.js create mode 100644 apps/js-sdk/firecrawl/build/cjs/package.json rename apps/js-sdk/firecrawl/build/{ => esm}/index.js (100%) create mode 100644 apps/js-sdk/firecrawl/build/esm/package.json diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js new file mode 100644 index 00000000..dbc2d6b9 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/index.js @@ -0,0 +1,271 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __importDefault = (this && this.__importDefault) || function (mod) { + return (mod && mod.__esModule) ? mod : { "default": mod }; +}; +Object.defineProperty(exports, "__esModule", { value: true }); +const axios_1 = __importDefault(require("axios")); +const zod_1 = require("zod"); +const zod_to_json_schema_1 = require("zod-to-json-schema"); +/** + * Main class for interacting with the Firecrawl API. + */ +class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); + } + } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null) { + var _a; + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? 
void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof zod_1.z.ZodSchema) { + schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query_1) { + return __awaiter(this, arguments, void 0, function* (query, params = null) { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. 
+ * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios_1.default.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios_1.default.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. 
+ */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} +exports.default = FirecrawlApp; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json new file mode 100644 index 00000000..b731bd61 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/package.json @@ -0,0 +1 @@ +{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/esm/index.js similarity index 100% rename from apps/js-sdk/firecrawl/build/index.js rename to apps/js-sdk/firecrawl/build/esm/index.js diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json new file mode 100644 index 00000000..6990891f --- /dev/null +++ b/apps/js-sdk/firecrawl/build/esm/package.json @@ -0,0 +1 @@ +{"type": "module"} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 0fef67b0..178e5c66 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -2,11 +2,15 @@ "name": "@mendable/firecrawl-js", "version": "0.0.29", "description": "JavaScript SDK for Firecrawl API", - "main": "build/index.js", + "main": "build/cjs/index.js", "types": "types/index.d.ts", "type": "module", + "exports": { + "require": "./build/cjs/index.js", + "import": "./build/esm/index.js" + }, "scripts": { - "build": "tsc", + "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "jest src/__tests__/**/*.test.ts" diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index 91a58043..bd6cfc20 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -73,16 +73,16 @@ export interface ScrapeResponse { error?: string; } /** -* Response interface for searching operations. -*/ + * Response interface for searching operations. + */ export interface SearchResponse { success: boolean; data?: FirecrawlDocument[]; error?: string; } /** -* Response interface for crawling operations. -*/ + * Response interface for crawling operations. + */ export interface CrawlResponse { success: boolean; jobId?: string; @@ -90,24 +90,28 @@ export interface CrawlResponse { error?: string; } /** -* Response interface for job status checks. -*/ + * Response interface for job status checks. + */ export interface JobStatusResponse { success: boolean; status: string; + current?: number; + current_url?: string; + current_step?: string; + total?: number; jobId?: string; data?: FirecrawlDocument[]; partial_data?: FirecrawlDocument[]; error?: string; } /** - * Generic parameter interface. - */ + * Generic parameter interface. 
+ */ export interface Params { [key: string]: any; extractorOptions?: { extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction" | "llm-extraction-from-raw-html"; + mode?: "llm-extraction"; extractionPrompt?: string; }; } diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 2bf3f001..ca337062 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -13,6 +13,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { @@ -450,6 +451,15 @@ "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, + "node_modules/@types/node": { + "version": "20.14.11", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz", + "integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==", + "peer": true, + "dependencies": { + "undici-types": "~5.26.4" + } + }, "node_modules/acorn": { "version": "8.11.3", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", @@ -728,6 +738,24 @@ "node": ">=14.17" } }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "peer": true + }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", @@ -750,9 +778,9 @@ } }, "node_modules/zod-to-json-schema": { - "version": "3.23.0", - "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz", - "integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==", + "version": "3.23.1", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz", + "integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==", "peerDependencies": { "zod": "^3.23.3" } diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 0e93fe3c..2d2c36e8 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -15,6 +15,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { From 2e62de4f8b58e74113555ade9a6b89689a73f4a6 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 13:45:51 +0200 Subject: [PATCH 02/33] fix(js-sdk): remove built files from repo and add to gitignore --- apps/js-sdk/firecrawl/.gitignore | 2 + apps/js-sdk/firecrawl/build/cjs/index.js | 271 ------------------- apps/js-sdk/firecrawl/build/cjs/package.json | 1 - apps/js-sdk/firecrawl/build/esm/index.js | 265 ------------------ apps/js-sdk/firecrawl/build/esm/package.json | 1 - 5 files changed, 2 insertions(+), 538 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/build/cjs/index.js delete mode 100644 
apps/js-sdk/firecrawl/build/cjs/package.json delete mode 100644 apps/js-sdk/firecrawl/build/esm/index.js delete mode 100644 apps/js-sdk/firecrawl/build/esm/package.json diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore index c6bba591..96e545b3 100644 --- a/apps/js-sdk/firecrawl/.gitignore +++ b/apps/js-sdk/firecrawl/.gitignore @@ -128,3 +128,5 @@ dist .yarn/build-state.yml .yarn/install-state.gz .pnp.* + +build diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js deleted file mode 100644 index dbc2d6b9..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/index.js +++ /dev/null @@ -1,271 +0,0 @@ -"use strict"; -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -var __importDefault = (this && this.__importDefault) || function (mod) { - return (mod && mod.__esModule) ? mod : { "default": mod }; -}; -Object.defineProperty(exports, "__esModule", { value: true }); -const axios_1 = __importDefault(require("axios")); -const zod_1 = require("zod"); -const zod_to_json_schema_1 = require("zod-to-json-schema"); -/** - * Main class for interacting with the Firecrawl API. - */ -class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? 
void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof zod_1.z.ZodSchema) { - schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. 
- * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios_1.default.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios_1.default.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. 
- */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} -exports.default = FirecrawlApp; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json deleted file mode 100644 index b731bd61..00000000 --- a/apps/js-sdk/firecrawl/build/cjs/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js deleted file mode 100644 index 99de5e2b..00000000 --- a/apps/js-sdk/firecrawl/build/esm/index.js +++ /dev/null @@ -1,265 +0,0 @@ -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -import axios from "axios"; -import { z } from "zod"; -import { zodToJsonSchema } from "zod-to-json-schema"; -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { - schema = zodToJsonSchema(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. 
Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? 
response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. 
Status code: ${response.status}`); - } - } -} diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json deleted file mode 100644 index 6990891f..00000000 --- a/apps/js-sdk/firecrawl/build/esm/package.json +++ /dev/null @@ -1 +0,0 @@ -{"type": "module"} From 361269974ea6b73a33d03ce388f28f6ae030a16d Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Thu, 18 Jul 2024 13:48:39 +0200 Subject: [PATCH 03/33] fix(js-sdk): remove autogenerated index.d.ts from git and add to gitignore --- apps/js-sdk/firecrawl/.gitignore | 1 + apps/js-sdk/firecrawl/types/index.d.ts | 193 ------------------------- 2 files changed, 1 insertion(+), 193 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore index 96e545b3..1acd6303 100644 --- a/apps/js-sdk/firecrawl/.gitignore +++ b/apps/js-sdk/firecrawl/.gitignore @@ -130,3 +130,4 @@ dist .pnp.* build +types diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts deleted file mode 100644 index bd6cfc20..00000000 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ /dev/null @@ -1,193 +0,0 @@ -import { AxiosResponse, AxiosRequestHeaders } from "axios"; -import { z } from "zod"; -/** - * Configuration interface for FirecrawlApp. - */ -export interface FirecrawlAppConfig { - apiKey?: string | null; - apiUrl?: string | null; -} -/** - * Metadata for a Firecrawl document. - */ -export interface FirecrawlDocumentMetadata { - title?: string; - description?: string; - language?: string; - keywords?: string; - robots?: string; - ogTitle?: string; - ogDescription?: string; - ogUrl?: string; - ogImage?: string; - ogAudio?: string; - ogDeterminer?: string; - ogLocale?: string; - ogLocaleAlternate?: string[]; - ogSiteName?: string; - ogVideo?: string; - dctermsCreated?: string; - dcDateCreated?: string; - dcDate?: string; - dctermsType?: string; - dcType?: string; - dctermsAudience?: string; - dctermsSubject?: string; - dcSubject?: string; - dcDescription?: string; - dctermsKeywords?: string; - modifiedTime?: string; - publishedTime?: string; - articleTag?: string; - articleSection?: string; - sourceURL?: string; - pageStatusCode?: number; - pageError?: string; - [key: string]: any; -} -/** - * Document interface for Firecrawl. - */ -export interface FirecrawlDocument { - id?: string; - url?: string; - content: string; - markdown?: string; - html?: string; - llm_extraction?: Record; - createdAt?: Date; - updatedAt?: Date; - type?: string; - metadata: FirecrawlDocumentMetadata; - childrenLinks?: string[]; - provider?: string; - warning?: string; - index?: number; -} -/** - * Response interface for scraping operations. - */ -export interface ScrapeResponse { - success: boolean; - data?: FirecrawlDocument; - error?: string; -} -/** - * Response interface for searching operations. - */ -export interface SearchResponse { - success: boolean; - data?: FirecrawlDocument[]; - error?: string; -} -/** - * Response interface for crawling operations. - */ -export interface CrawlResponse { - success: boolean; - jobId?: string; - data?: FirecrawlDocument[]; - error?: string; -} -/** - * Response interface for job status checks. 
- */ -export interface JobStatusResponse { - success: boolean; - status: string; - current?: number; - current_url?: string; - current_step?: string; - total?: number; - jobId?: string; - data?: FirecrawlDocument[]; - partial_data?: FirecrawlDocument[]; - error?: string; -} -/** - * Generic parameter interface. - */ -export interface Params { - [key: string]: any; - extractorOptions?: { - extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction"; - extractionPrompt?: string; - }; -} -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - private apiKey; - private apiUrl; - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey, apiUrl }: FirecrawlAppConfig); - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url: string, params?: Params | null): Promise; - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query: string, params?: Params | null): Promise; - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. - * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, pollInterval?: number, idempotencyKey?: string): Promise; - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId: string): Promise; - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders; - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url: string, headers: AxiosRequestHeaders): Promise; - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. 
- */ - monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise; - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response: AxiosResponse, action: string): void; -} From 49e3e64787ab0080724b3cff82a715a630955451 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 29 Jul 2024 14:13:46 -0300 Subject: [PATCH 04/33] bugfix for pdfs and logging pdf events, also added trycatchs for docx --- apps/api/src/scraper/WebScraper/index.ts | 47 +++++++++++- .../scraper/WebScraper/utils/docxProcessor.ts | 76 ++++++++++++++----- .../scraper/WebScraper/utils/pdfProcessor.ts | 36 +++++++-- 3 files changed, 129 insertions(+), 30 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index eff709fa..9171b805 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -20,6 +20,7 @@ import { getWebScraperQueue } from "../../../src/services/queue-service"; import { fetchAndProcessDocx } from "./utils/docxProcessor"; import { getAdjustedMaxDepth, getURLDepth } from "./utils/maxDepthUtils"; import { Logger } from "../../lib/logger"; +import { ScrapeEvents } from "../../lib/scrape-events"; export class WebScraperDataProvider { private jobId: string; @@ -316,10 +317,28 @@ export class WebScraperDataProvider { private async fetchPdfDocuments(pdfLinks: string[]): Promise { return Promise.all( pdfLinks.map(async (pdfLink) => { + const timer = Date.now(); + const logInsertPromise = ScrapeEvents.insert(this.jobId, { + type: "scrape", + url: pdfLink, + worker: process.env.FLY_MACHINE_ID, + method: "pdf-scrape", + result: null, + }); + const { content, pageStatusCode, pageError } = await fetchAndProcessPdf( pdfLink, this.pageOptions.parsePDF ); + + const insertedLogId = await logInsertPromise; + ScrapeEvents.updateScrapeResult(insertedLogId, { + response_size: content.length, + success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100), + error: pageError, + response_code: pageStatusCode, + time_taken: Date.now() - timer, + }); return { content: content, metadata: { sourceURL: pdfLink, pageStatusCode, pageError }, @@ -330,12 +349,32 @@ export class WebScraperDataProvider { } private async fetchDocxDocuments(docxLinks: string[]): Promise { return Promise.all( - docxLinks.map(async (p) => { - const { content, pageStatusCode, pageError } = - await fetchAndProcessDocx(p); + docxLinks.map(async (docxLink) => { + const timer = Date.now(); + const logInsertPromise = ScrapeEvents.insert(this.jobId, { + type: "scrape", + url: docxLink, + worker: process.env.FLY_MACHINE_ID, + method: "docx-scrape", + result: null, + }); + + const { content, pageStatusCode, pageError } = await fetchAndProcessDocx( + docxLink + ); + + const insertedLogId = await logInsertPromise; + ScrapeEvents.updateScrapeResult(insertedLogId, { + response_size: content.length, + success: !(pageStatusCode && pageStatusCode >= 400) && !!content && (content.trim().length >= 100), + error: pageError, + response_code: pageStatusCode, + time_taken: Date.now() - timer, + }); + return { content, - metadata: { sourceURL: p, pageStatusCode, pageError }, + metadata: { sourceURL: docxLink, pageStatusCode, pageError }, provider: "web-scraper", }; }) diff --git 
a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts index a01b8a28..8f6dc97c 100644 --- a/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/docxProcessor.ts @@ -4,38 +4,76 @@ import { createWriteStream } from "node:fs"; import path from "path"; import os from "os"; import mammoth from "mammoth"; +import { Logger } from "../../../lib/logger"; export async function fetchAndProcessDocx(url: string): Promise<{ content: string; pageStatusCode: number; pageError: string }> { - const { tempFilePath, pageStatusCode, pageError } = await downloadDocx(url); - const content = await processDocxToText(tempFilePath); - fs.unlinkSync(tempFilePath); // Clean up the temporary file + let tempFilePath = ''; + let pageStatusCode = 200; + let pageError = ''; + let content = ''; + + try { + const downloadResult = await downloadDocx(url); + tempFilePath = downloadResult.tempFilePath; + pageStatusCode = downloadResult.pageStatusCode; + pageError = downloadResult.pageError; + content = await processDocxToText(tempFilePath); + } catch (error) { + Logger.error(`Failed to fetch and process DOCX: ${error.message}`); + pageStatusCode = 500; + pageError = error.message; + content = ''; + } finally { + if (tempFilePath) { + fs.unlinkSync(tempFilePath); // Clean up the temporary file + } + } + return { content, pageStatusCode, pageError }; } async function downloadDocx(url: string): Promise<{ tempFilePath: string; pageStatusCode: number; pageError: string }> { - const response = await axios({ - url, - method: "GET", - responseType: "stream", - }); + try { + const response = await axios({ + url, + method: "GET", + responseType: "stream", + }); - const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`); - const writer = createWriteStream(tempFilePath); + const tempFilePath = path.join(os.tmpdir(), `tempDocx-${Date.now()}.docx`); + const writer = createWriteStream(tempFilePath); - response.data.pipe(writer); + response.data.pipe(writer); - return new Promise((resolve, reject) => { - writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? response.statusText : undefined })); - writer.on("error", reject); - }); + return new Promise((resolve, reject) => { + writer.on("finish", () => resolve({ tempFilePath, pageStatusCode: response.status, pageError: response.statusText != "OK" ? 
response.statusText : undefined })); + writer.on("error", () => { + Logger.error('Failed to write DOCX file to disk'); + reject(new Error('Failed to write DOCX file to disk')); + }); + }); + } catch (error) { + Logger.error(`Failed to download DOCX: ${error.message}`); + return { tempFilePath: "", pageStatusCode: 500, pageError: error.message }; + } } export async function processDocxToText(filePath: string): Promise { - const content = await extractTextFromDocx(filePath); - return content; + try { + const content = await extractTextFromDocx(filePath); + return content; + } catch (error) { + Logger.error(`Failed to process DOCX to text: ${error.message}`); + return ""; + } } async function extractTextFromDocx(filePath: string): Promise { - const result = await mammoth.extractRawText({ path: filePath }); - return result.value; + try { + const result = await mammoth.extractRawText({ path: filePath }); + return result.value; + } catch (error) { + Logger.error(`Failed to extract text from DOCX: ${error.message}`); + return ""; + } } diff --git a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts index 660d27eb..b27db99a 100644 --- a/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts +++ b/apps/api/src/scraper/WebScraper/utils/pdfProcessor.ts @@ -76,7 +76,6 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro let attempt = 0; const maxAttempts = 10; // Maximum number of attempts let resultAvailable = false; - while (attempt < maxAttempts && !resultAvailable) { try { resultResponse = await axios.get(resultUrl, { headers, timeout: (axiosTimeout * 2) }); @@ -90,13 +89,22 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro } catch (error) { Logger.debug("Error fetching result w/ LlamaIndex"); attempt++; + if (attempt >= maxAttempts) { + Logger.error("Max attempts reached, unable to fetch result."); + break; // Exit the loop if max attempts are reached + } await new Promise((resolve) => setTimeout(resolve, 500)); // Wait for 0.5 seconds before retrying // You may want to handle specific errors differently } } if (!resultAvailable) { - content = await processPdf(filePath); + try { + content = await processPdf(filePath); + } catch (error) { + Logger.error(`Failed to process PDF: ${error}`); + content = ""; + } } content = resultResponse.data[resultType]; } catch (error) { @@ -104,15 +112,29 @@ export async function processPdfToText(filePath: string, parsePDF: boolean): Pro content = await processPdf(filePath); } } else if (parsePDF) { - content = await processPdf(filePath); + try { + content = await processPdf(filePath); + } catch (error) { + Logger.error(`Failed to process PDF: ${error}`); + content = ""; + } } else { - content = fs.readFileSync(filePath, "utf-8"); + try { + content = fs.readFileSync(filePath, "utf-8"); + } catch (error) { + Logger.error(`Failed to read PDF file: ${error}`); + content = ""; + } } return content; } async function processPdf(file: string) { - const fileContent = fs.readFileSync(file); - const data = await pdf(fileContent); - return data.text; + try { + const fileContent = fs.readFileSync(file); + const data = await pdf(fileContent); + return data.text; + } catch (error) { + throw error; + } } \ No newline at end of file From 2d1ab43c271b267d1bfd3ed13cada59ad63e50d5 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Tue, 30 Jul 2024 15:59:42 -0300 Subject: [PATCH 05/33] Update SELF_HOST.md --- 
SELF_HOST.md | 176 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 146 insertions(+), 30 deletions(-) diff --git a/SELF_HOST.md b/SELF_HOST.md index f3a4a4b1..1cb07af1 100644 --- a/SELF_HOST.md +++ b/SELF_HOST.md @@ -1,36 +1,77 @@ -## Self-hosting Firecrawl +# Self-hosting Firecrawl -_We're currently working on a more in-depth guide on how to self-host, but in the meantime, here is a simplified version._ +#### Contributor? -Refer to [CONTRIBUTING.md](https://github.com/mendableai/firecrawl/blob/main/CONTRIBUTING.md) for instructions on how to run it locally. +Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project locally so you can run it on your own and contribute. -## Getting Started +If you're contributing, note that the process is similar to other open-source repos, i.e., fork Firecrawl, make changes, run tests, PR. -First, clone this repository and copy the example env file from the API folder `.env.example` to `.env`. +If you have any questions or would like help getting on board, reach out to hello@mendable.ai for more information or submit an issue! -### Steps +## Why? -1. Clone the repository: - - ```bash - git clone https://github.com/mendableai/firecrawl.git - cd firecrawl - cp ./apps/api/.env.example ./.env - ``` - -2. For running the simplest version of FireCrawl, edit the `USE_DB_AUTHENTICATION` in `.env` to not use the database authentication: - - ```plaintext - USE_DB_AUTHENTICATION=false - ``` - -3. Update the Redis URL in the .env file to align with the Docker configuration: - - ```plaintext - REDIS_URL=redis://redis:6379 - ``` - -4. #### Option: Running with TypeScript Playwright Service +Self-hosting Firecrawl is particularly beneficial for organizations with stringent security policies that require data to remain within controlled environments. Here are some key reasons to consider self-hosting: + +- **Enhanced Security and Compliance:** By self-hosting, you ensure that all data handling and processing complies with internal and external regulations, keeping sensitive information within your secure infrastructure. Note that Firecrawl is a Mendable product and relies on SOC2 Type2 certification, which means that the platform adheres to high industry standards for managing data security. +- **Customizable Services:** Self-hosting allows you to tailor the services, such as the Playwright service, to meet specific needs or handle particular use cases that may not be supported by the standard cloud offering. +- **Learning and Community Contribution:** By setting up and maintaining your own instance, you gain a deeper understanding of how Firecrawl works, which can also lead to more meaningful contributions to the project. + +### Considerations + +However, there are some limitations and additional responsibilities to be aware of: + +1. **Limited Access to Fire-engine:** Currently, self-hosted instances of Firecrawl do not have access to Fire-engine, which includes advanced features for handling IP blocks, robot detection mechanisms, and more. This means that while you can manage basic scraping tasks, more complex scenarios might require additional configuration or might not be supported. +2. **Manual Configuration Required:** If you need to use scraping methods beyond the basic fetch and Playwright options, you will need to manually configure these in the `.env` file. This requires a deeper understanding of the technologies and might involve more setup time. 
+
+Self-hosting Firecrawl is ideal for those who need full control over their scraping and data processing environments but comes with the trade-off of additional maintenance and configuration efforts.
+
+## Steps
+
+1. First, install the dependencies:
+
+- Docker [instructions](https://docs.docker.com/get-docker/)
+
+
+2. Set environment variables
+
+Create an `.env` in the root directory; you can copy over the template from `apps/api/.env.example`.
+
+To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features).
+
+`.env:`
+```
+# ===== Required ENVS ======
+NUM_WORKERS_PER_QUEUE=8
+PORT=3002
+HOST=0.0.0.0
+REDIS_URL=redis://redis:6379
+REDIS_RATE_LIMIT_URL=redis://redis:6379
+
+## To turn on DB authentication, you need to set up Supabase.
+USE_DB_AUTHENTICATION=false
+
+# ===== Optional ENVS ======
+
+# Supabase Setup (used to support DB authentication, advanced logging, etc.)
+SUPABASE_ANON_TOKEN=
+SUPABASE_URL=
+SUPABASE_SERVICE_TOKEN=
+
+# Other Optionals
+TEST_API_KEY= # use if you've set up authentication and want to test with a real API key
+SCRAPING_BEE_API_KEY= # Set if you'd like to use ScrapingBee to handle JS blocking
+OPENAI_API_KEY= # add for LLM-dependent features (image alt generation, etc.)
+BULL_AUTH_KEY= @
+LOGTAIL_KEY= # Use if you're configuring basic logging with logtail
+PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback
+LLAMAPARSE_API_KEY= # Set if you have a LlamaParse key you'd like to use to parse PDFs
+SERPER_API_KEY= # Set if you have a Serper key you'd like to use as a search API
+SLACK_WEBHOOK_URL= # set if you'd like to send slack server health status messages
+POSTHOG_API_KEY= # set if you'd like to send posthog events like job logs
+POSTHOG_HOST= # set if you'd like to send posthog events like job logs
+```
+
+3. *(Optional) Running with TypeScript Playwright Service*
 
    * Update the `docker-compose.yml` file to change the Playwright service:
 
@@ -49,16 +90,91 @@ First, clone this repository and copy the example env file from the API folder `
      ```
 
    * Don't forget to set the proxy server in your `.env` file as needed.
-5. Build and run the Docker containers:
+
+4. Build and run the Docker containers:
    
    ```bash
    docker compose build
    docker compose up
    ```
 
-This will run a local instance of Firecrawl which can be accessed at `http://localhost:3002`.
+You should be able to see the Bull Queue Manager UI on `http://localhost:3002/admin/@/queues`.
+
+5. *(Optional)* Test the API
+
+If you’d like to test the crawl endpoint, you can run this:
+
+  ```bash
+  curl -X POST http://localhost:3002/v0/crawl \
+      -H 'Content-Type: application/json' \
+      -d '{
+        "url": "https://mendable.ai"
+      }'
+  ```
+
+## Troubleshooting
+
+This section provides solutions to common issues you might encounter while setting up or running your self-hosted instance of Firecrawl.
+
+### Supabase client is not configured
+
+**Symptom:**
+```bash
+[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Attempted to access Supabase client when it's not configured.
+[YYYY-MM-DDTHH:MM:SS.SSSz]ERROR - Error inserting scrape event: Error: Supabase client is not configured.
+```
+
+**Explanation:**
+This error occurs because the Supabase client is not configured. It is safe to ignore; you should be able to scrape and crawl with no problems. Right now it's not possible to configure Supabase in self-hosted instances.
+
+### You're bypassing authentication
+
+**Symptom:**
+```bash
+[YYYY-MM-DDTHH:MM:SS.SSSz]WARN - You're bypassing authentication
+```
+
+**Explanation:**
+This warning occurs because the Supabase client is not configured. It is safe to ignore; you should be able to scrape and crawl with no problems. Right now it's not possible to configure Supabase in self-hosted instances.
+
+### Docker containers fail to start
+
+**Symptom:**
+Docker containers exit unexpectedly or fail to start.
+
+**Solution:**
+Check the Docker logs for any error messages using the command:
+```bash
+docker logs [container_name]
+```
+
+- Ensure all required environment variables are set correctly in the .env file.
+- Verify that all Docker services defined in docker-compose.yml are correctly configured and the necessary images are available.
+
+### Connection issues with Redis
+
+**Symptom:**
+Errors related to connecting to Redis, such as timeouts or "Connection refused".
+
+**Solution:**
+- Ensure that the Redis service is up and running in your Docker environment.
+- Verify that the REDIS_URL and REDIS_RATE_LIMIT_URL in your .env file point to the correct Redis instance.
+- Check network settings and firewall rules that may block the connection to the Redis port.
+
+### API endpoint does not respond
+
+**Symptom:**
+API requests to the Firecrawl instance time out or return no response.
+
+**Solution:**
+- Ensure that the Firecrawl service is running by checking the Docker container status.
+- Verify that the PORT and HOST settings in your .env file are correct and that no other service is using the same port.
+- Check the network configuration to ensure that the host is accessible from the client making the API request.
+
+By addressing these common issues, you can ensure a smoother setup and operation of your self-hosted Firecrawl instance.
+
 ## Install Firecrawl on a Kubernetes Cluster (Simple Version)
 
-Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
+Read the [examples/kubernetes-cluster-install/README.md](https://github.com/mendableai/firecrawl/blob/main/examples/kubernetes-cluster-install/README.md) for instructions on how to install Firecrawl on a Kubernetes Cluster.
\ No newline at end of file

From c7a38a4ae2fad16e9990f71ae3ec5a345276b3f7 Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 30 Jul 2024 18:07:36 -0300
Subject: [PATCH 06/33] Update SELF_HOST.md

---
 SELF_HOST.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SELF_HOST.md b/SELF_HOST.md
index 1cb07af1..43bc3757 100644
--- a/SELF_HOST.md
+++ b/SELF_HOST.md
@@ -6,7 +6,7 @@ Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions o
 
 If you're contributing, note that the process is similar to other open-source repos, i.e., fork Firecrawl, make changes, run tests, PR.
 
-If you have any questions or would like help getting on board, reach out to hello@mendable.ai for more information or submit an issue!
+If you have any questions or would like help getting on board, join our Discord community [here](https://discord.gg/gSmWdAkdwd) for more information or submit an issue on GitHub [here](https://github.com/mendableai/firecrawl/issues/new/choose)!
 
 ## Why?
 
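The self-hosting guide above only smoke-tests the crawl endpoint. A quick equivalent check for the scrape endpoint (a sketch; it assumes the same local instance with the default `PORT=3002` and authentication bypassed, and uses the `/v0/scrape` route exposed by the API controllers in this series):

```bash
# Scrape a single page through the locally self-hosted Firecrawl instance.
# Assumes USE_DB_AUTHENTICATION=false, so no Authorization header is needed.
curl -X POST http://localhost:3002/v0/scrape \
    -H 'Content-Type: application/json' \
    -d '{
      "url": "https://mendable.ai"
    }'
```

A successful response returns `{ "success": true, "data": { ... } }` with the scraped content, mirroring the `responseData.success` check performed in the SDK code shown elsewhere in this series.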
From a28ecc1f61a8809a62ab5bedd86169d00c0f0744 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Jul 2024 18:59:35 -0400 Subject: [PATCH 07/33] Nick: caching --- apps/api/package.json | 1 + apps/api/pnpm-lock.yaml | 16 +++++ apps/api/src/controllers/scrape.ts | 10 +-- apps/api/src/lib/scrape-events.ts | 2 +- .../src/services/billing/credit_billing.ts | 72 +++++++++++++++---- 5 files changed, 79 insertions(+), 22 deletions(-) diff --git a/apps/api/package.json b/apps/api/package.json index 15e97377..72235176 100644 --- a/apps/api/package.json +++ b/apps/api/package.json @@ -94,6 +94,7 @@ "promptable": "^0.0.10", "puppeteer": "^22.12.1", "rate-limiter-flexible": "2.4.2", + "redlock": "5.0.0-beta.2", "resend": "^3.4.0", "robots-parser": "^3.0.1", "scrapingbee": "^1.7.4", diff --git a/apps/api/pnpm-lock.yaml b/apps/api/pnpm-lock.yaml index ec83e18b..ad0e577c 100644 --- a/apps/api/pnpm-lock.yaml +++ b/apps/api/pnpm-lock.yaml @@ -149,6 +149,9 @@ importers: rate-limiter-flexible: specifier: 2.4.2 version: 2.4.2 + redlock: + specifier: 5.0.0-beta.2 + version: 5.0.0-beta.2 resend: specifier: ^3.4.0 version: 3.4.0 @@ -3533,6 +3536,9 @@ packages: resolution: {integrity: sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==} engines: {node: '>= 0.4.0'} + node-abort-controller@3.1.1: + resolution: {integrity: sha512-AGK2yQKIjRuqnc6VkX2Xj5d+QW8xZ87pa1UK6yA6ouUyuxfHuMP6umE5QK7UmTeOAymo+Zx1Fxiuw9rVx8taHQ==} + node-domexception@1.0.0: resolution: {integrity: sha512-/jKZoMpw0F8GRwl4/eLROPA3cfcXtLApP0QzLmUT/HuPCZWyB7IY9ZrMeKw2O/nFIqPQB3PVM9aYm0F312AXDQ==} engines: {node: '>=10.5.0'} @@ -3956,6 +3962,10 @@ packages: redis@4.6.14: resolution: {integrity: sha512-GrNg/e33HtsQwNXL7kJT+iNFPSwE1IPmd7wzV3j4f2z0EYxZfZE7FVTmUysgAtqQQtg5NXF5SNLR9OdO/UHOfw==} + redlock@5.0.0-beta.2: + resolution: {integrity: sha512-2RDWXg5jgRptDrB1w9O/JgSZC0j7y4SlaXnor93H/UJm/QyDiFgBKNtrh0TI6oCXqYSaSoXxFh6Sd3VtYfhRXw==} + engines: {node: '>=12'} + regenerator-runtime@0.14.1: resolution: {integrity: sha512-dYnhHh0nJoMfnkZs6GmmhFknAGRrLznOu5nc9ML+EJxGvrx6H7teuevqVqCuPcPK//3eDrrjQhehXVx9cnkGdw==} @@ -8605,6 +8615,8 @@ snapshots: netmask@2.0.2: {} + node-abort-controller@3.1.1: {} + node-domexception@1.0.0: {} node-ensure@0.0.0: {} @@ -9108,6 +9120,10 @@ snapshots: '@redis/search': 1.1.6(@redis/client@1.5.16) '@redis/time-series': 1.0.5(@redis/client@1.5.16) + redlock@5.0.0-beta.2: + dependencies: + node-abort-controller: 3.1.1 + regenerator-runtime@0.14.1: {} require-directory@2.1.1: {} diff --git a/apps/api/src/controllers/scrape.ts b/apps/api/src/controllers/scrape.ts index f594eea8..6c94f4c0 100644 --- a/apps/api/src/controllers/scrape.ts +++ b/apps/api/src/controllers/scrape.ts @@ -118,18 +118,12 @@ export async function scrapeController(req: Request, res: Response) { } catch (error) { Logger.error(error); earlyReturn = true; - return res.status(402).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." }); + return res.status(500).json({ error: "Error checking team credits. Please contact hello@firecrawl.com for help." 
}); } }; - // Async check saves 500ms in average case - // Don't async check in llm extraction mode as it could be expensive - if (extractorOptions.mode.includes("llm-extraction")) { - await checkCredits(); - } else { - checkCredits(); - } + await checkCredits(); const jobId = uuidv4(); diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index ab4ef681..8d677279 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -46,7 +46,7 @@ export class ScrapeEvents { }).select().single(); return (result.data as any).id; } catch (error) { - Logger.error(`Error inserting scrape event: ${error}`); + // Logger.error(`Error inserting scrape event: ${error}`); return null; } } diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index 9369cdbb..b39c42c2 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -3,9 +3,37 @@ import { withAuth } from "../../lib/withAuth"; import { sendNotification } from "../notification/email_notification"; import { supabase_service } from "../supabase"; import { Logger } from "../../lib/logger"; - +import { getValue, setValue } from "../redis"; +import Redlock from "redlock"; +import Client from "ioredis"; const FREE_CREDITS = 500; +const redlock = new Redlock( + // You should have one client for each independent redis node + // or cluster. + [new Client(process.env.REDIS_RATE_LIMIT_URL)], + { + // The expected clock drift; for more details see: + // http://redis.io/topics/distlock + driftFactor: 0.01, // multiplied by lock ttl to determine drift time + + // The max number of times Redlock will attempt to lock a resource + // before erroring. + retryCount: 10, + + // the time in ms between attempts + retryDelay: 200, // time in ms + + // the max time in ms randomly added to retries + // to improve performance under high contention + // see https://www.awsarchitectureblog.com/2015/03/backoff.html + retryJitter: 200, // time in ms + + // The minimum remaining time on a lock before an extension is automatically + // attempted with the `using` API. 
+ automaticExtensionThreshold: 500, // time in ms + } +); export async function billTeam(team_id: string, credits: number) { return withAuth(supaBillTeam)(team_id, credits); } @@ -254,23 +282,41 @@ export async function supaCheckTeamCredits(team_id: string, credits: number) { } let totalCreditsUsed = 0; + const cacheKey = `credit_usage_${subscription.id}_${subscription.current_period_start}_${subscription.current_period_end}_lc`; + const redLockKey = `lock_${cacheKey}`; + const lockTTL = 10000; // 10 seconds + try { - const { data: creditUsages, error: creditUsageError } = - await supabase_service.rpc("get_credit_usage_2", { - sub_id: subscription.id, - start_time: subscription.current_period_start, - end_time: subscription.current_period_end, - }); + const lock = await redlock.acquire([redLockKey], lockTTL); - if (creditUsageError) { - Logger.error(`Error calculating credit usage: ${creditUsageError}`); - } + try { + const cachedCreditUsage = await getValue(cacheKey); - if (creditUsages && creditUsages.length > 0) { - totalCreditsUsed = creditUsages[0].total_credits_used; + if (cachedCreditUsage) { + totalCreditsUsed = parseInt(cachedCreditUsage); + } else { + const { data: creditUsages, error: creditUsageError } = + await supabase_service.rpc("get_credit_usage_2", { + sub_id: subscription.id, + start_time: subscription.current_period_start, + end_time: subscription.current_period_end, + }); + + if (creditUsageError) { + Logger.error(`Error calculating credit usage: ${creditUsageError}`); + } + + if (creditUsages && creditUsages.length > 0) { + totalCreditsUsed = creditUsages[0].total_credits_used; + await setValue(cacheKey, totalCreditsUsed.toString(), 1800); // Cache for 30 minutes + // Logger.info(`Cache set for credit usage: ${totalCreditsUsed}`); + } + } + } finally { + await lock.release(); } } catch (error) { - Logger.error(`Error calculating credit usage: ${error}`); + Logger.error(`Error acquiring lock or calculating credit usage: ${error}`); } // Adjust total credits used by subtracting coupon value From 6d99dedd3cec15d9543aaccba2a5a6b5d78559ec Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Jul 2024 19:11:01 -0400 Subject: [PATCH 08/33] Nick: fixed tests --- .../api/src/scraper/WebScraper/__tests__/single_url.test.ts | 6 +++--- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts index 4b720835..d555e030 100644 --- a/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts +++ b/apps/api/src/scraper/WebScraper/__tests__/single_url.test.ts @@ -23,8 +23,8 @@ describe('scrapSingleUrl', () => { }, 10000); }); -it('should return a list of links on the mendable.ai page', async () => { - const url = 'https://mendable.ai'; +it('should return a list of links on the firecrawl.ai page', async () => { + const url = 'https://example.com'; const pageOptions: PageOptions = { includeHtml: true }; const result = await scrapSingleUrl("TEST", url, pageOptions); @@ -33,5 +33,5 @@ it('should return a list of links on the mendable.ai page', async () => { expect(result.linksOnPage).toBeDefined(); expect(Array.isArray(result.linksOnPage)).toBe(true); expect(result.linksOnPage.length).toBeGreaterThan(0); - expect(result.linksOnPage).toContain('https://mendable.ai/blog') + expect(result.linksOnPage).toContain('https://www.iana.org/domains/example') }, 10000); diff --git 
a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 0f4c2320..461d8e16 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -81,6 +81,7 @@ export async function scrapWithFireEngine({ timeout: universalTimeout + waitParam, } ); + console.log(response.data?.pageStatusCode); if (response.status !== 200) { Logger.debug( From f9827b2151a698da1734834d2c8c2e7b302e4f35 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Jul 2024 19:13:17 -0400 Subject: [PATCH 09/33] Update credit_billing.ts --- apps/api/src/services/billing/credit_billing.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/billing/credit_billing.ts b/apps/api/src/services/billing/credit_billing.ts index b39c42c2..765d028e 100644 --- a/apps/api/src/services/billing/credit_billing.ts +++ b/apps/api/src/services/billing/credit_billing.ts @@ -6,6 +6,7 @@ import { Logger } from "../../lib/logger"; import { getValue, setValue } from "../redis"; import Redlock from "redlock"; import Client from "ioredis"; + const FREE_CREDITS = 500; const redlock = new Redlock( @@ -19,10 +20,10 @@ const redlock = new Redlock( // The max number of times Redlock will attempt to lock a resource // before erroring. - retryCount: 10, + retryCount: 5, // the time in ms between attempts - retryDelay: 200, // time in ms + retryDelay: 100, // time in ms // the max time in ms randomly added to retries // to improve performance under high contention From ad6f6eff4b19ca89bd864cf31cbfff641576829f Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 30 Jul 2024 19:15:54 -0400 Subject: [PATCH 10/33] Update fireEngine.ts --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 461d8e16..0f4c2320 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -81,7 +81,6 @@ export async function scrapWithFireEngine({ timeout: universalTimeout + waitParam, } ); - console.log(response.data?.pageStatusCode); if (response.status !== 200) { Logger.debug( From f48ff36b32978fb4d0c54dd7efcbaad5f22e9a5d Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Wed, 31 Jul 2024 09:28:43 -0300 Subject: [PATCH 11/33] added .inc files and forced lower case comparison --- apps/api/src/scraper/WebScraper/crawler.ts | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 5ee8cda8..60dc301b 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -164,7 +164,7 @@ export class WebCrawler { concurrencyLimit, inProgress ); - + if ( urls.length === 0 && this.filterLinks([this.initialUrl], limit, this.maxCrawledDepth).length > 0 @@ -420,9 +420,10 @@ export class WebCrawler { ".woff", ".ttf", ".woff2", - ".webp" + ".webp", + ".inc" ]; - return fileExtensions.some((ext) => url.endsWith(ext)); + return fileExtensions.some((ext) => url.toLowerCase().endsWith(ext)); } private isSocialMediaOrEmail(url: string): boolean { From 8568b61015331b2622548a66574351f1cd63c424 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 
2 Aug 2024 11:03:01 -0300 Subject: [PATCH 12/33] bugfix for sitemaps --- apps/api/src/scraper/WebScraper/crawler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/crawler.ts b/apps/api/src/scraper/WebScraper/crawler.ts index 60dc301b..fc0eee3e 100644 --- a/apps/api/src/scraper/WebScraper/crawler.ts +++ b/apps/api/src/scraper/WebScraper/crawler.ts @@ -476,7 +476,7 @@ export class WebCrawler { try { const response = await axios.get(baseUrlSitemap, { timeout: axiosTimeout }); if (response.status === 200) { - sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap }); + sitemapLinks = await getLinksFromSitemap({ sitemapUrl: baseUrlSitemap, mode: 'fire-engine' }); } } catch (error) { Logger.debug(`Failed to fetch sitemap from ${baseUrlSitemap}: ${error}`); From 405163063217330893b061a6af6419993e2c29f4 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Fri, 2 Aug 2024 11:32:48 -0300 Subject: [PATCH 13/33] Update sitemap.ts --- apps/api/src/scraper/WebScraper/sitemap.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/scraper/WebScraper/sitemap.ts b/apps/api/src/scraper/WebScraper/sitemap.ts index 3dfc9a1c..b1a6a6ff 100644 --- a/apps/api/src/scraper/WebScraper/sitemap.ts +++ b/apps/api/src/scraper/WebScraper/sitemap.ts @@ -19,7 +19,7 @@ export async function getLinksFromSitemap( try { let content: string; try { - if (mode === 'axios') { + if (mode === 'axios' || process.env.FIRE_ENGINE_BETA_URL === '') { const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); content = response.data; } else if (mode === 'fire-engine') { From b448e3c3ad9acc312fa49eb67c77ef71e5b5c6db Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 2 Aug 2024 14:26:35 -0400 Subject: [PATCH 14/33] Update website_params.ts --- .../scraper/WebScraper/utils/custom/website_params.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index c688061d..02e3e2e9 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -232,4 +232,14 @@ export const urlSpecificParams = { } }, }, + "amazon.com":{ + defaultScraper: "fire-engine", + params:{ + fireEngineOptions:{ + mobileProxy: true, + method: "get", + engine: "chrome-cdp", + }, + }, + }, }; From 2e83a8a8b42c4eacd5bccca3155084f0b06cac5d Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 2 Aug 2024 14:27:19 -0400 Subject: [PATCH 15/33] Delete check-redis.yml --- .github/workflows/check-redis.yml | 20 -------------------- 1 file changed, 20 deletions(-) delete mode 100644 .github/workflows/check-redis.yml diff --git a/.github/workflows/check-redis.yml b/.github/workflows/check-redis.yml deleted file mode 100644 index e5e9ff0d..00000000 --- a/.github/workflows/check-redis.yml +++ /dev/null @@ -1,20 +0,0 @@ -name: Check Redis -on: - schedule: - - cron: '*/5 * * * *' - -env: - BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} - -jobs: - clean-jobs: - runs-on: ubuntu-latest - steps: - - name: Send GET request to check queues - run: | - response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/redis-health) - if [ "$response" -ne 200 ]; then - echo "Failed to check queues. 
Response: $response" - exit 1 - fi - echo "Successfully checked queues. Response: $response" From 39aecd974bf7b628284cd08dab52da2fdcbbe7c5 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 2 Aug 2024 17:43:45 -0400 Subject: [PATCH 16/33] Update redis-health.ts --- .../api/src/controllers/admin/redis-health.ts | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/apps/api/src/controllers/admin/redis-health.ts b/apps/api/src/controllers/admin/redis-health.ts index e35d6db9..3b1e2518 100644 --- a/apps/api/src/controllers/admin/redis-health.ts +++ b/apps/api/src/controllers/admin/redis-health.ts @@ -1,7 +1,6 @@ import { Request, Response } from "express"; import Redis from "ioredis"; import { Logger } from "../../lib/logger"; -import { sendSlackWebhook } from "../../services/alerts/slack"; import { redisRateLimitClient } from "../../services/rate-limiter"; export async function redisHealthController(req: Request, res: Response) { @@ -63,22 +62,22 @@ export async function redisHealthController(req: Request, res: Response) { Logger.info( `Redis instances health check: ${JSON.stringify(healthStatus)}` ); - await sendSlackWebhook( - `[REDIS DOWN] Redis instances health check: ${JSON.stringify( - healthStatus - )}`, - true - ); + // await sendSlackWebhook( + // `[REDIS DOWN] Redis instances health check: ${JSON.stringify( + // healthStatus + // )}`, + // true + // ); return res .status(500) .json({ status: "unhealthy", details: healthStatus }); } } catch (error) { Logger.error(`Redis health check failed: ${error}`); - await sendSlackWebhook( - `[REDIS DOWN] Redis instances health check: ${error.message}`, - true - ); + // await sendSlackWebhook( + // `[REDIS DOWN] Redis instances health check: ${error.message}`, + // true + // ); return res .status(500) .json({ status: "unhealthy", message: error.message }); From 1742e4ceaeb322bb42c1162a8b8f3050c5e09406 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 2 Aug 2024 19:25:15 -0400 Subject: [PATCH 17/33] Nick: --- apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 3 ++- apps/api/src/scraper/WebScraper/utils/custom/website_params.ts | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts index 0f4c2320..ba67043c 100644 --- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts +++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts @@ -47,7 +47,7 @@ export async function scrapWithFireEngine({ try { const reqParams = await generateRequestParams(url); const waitParam = reqParams["params"]?.wait ?? waitFor; - const engineParam = reqParams["params"]?.engine ?? fireEngineOptions?.engine ?? "playwright"; + const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright"; const screenshotParam = reqParams["params"]?.screenshot ?? screenshot; const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions; @@ -64,6 +64,7 @@ export async function scrapWithFireEngine({ `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? 
"null"} }` ); + const response = await axios.post( process.env.FIRE_ENGINE_BETA_URL + endpoint, { diff --git a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts index 02e3e2e9..8583e614 100644 --- a/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts +++ b/apps/api/src/scraper/WebScraper/utils/custom/website_params.ts @@ -236,8 +236,6 @@ export const urlSpecificParams = { defaultScraper: "fire-engine", params:{ fireEngineOptions:{ - mobileProxy: true, - method: "get", engine: "chrome-cdp", }, }, From f32e8de156236dda6f1dd34a1c020a7174821b77 Mon Sep 17 00:00:00 2001 From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com> Date: Mon, 5 Aug 2024 18:13:31 -0300 Subject: [PATCH 18/33] fixes the empty excludes.filter undefined bug --- apps/api/src/lib/entities.ts | 4 ++-- apps/api/src/scraper/WebScraper/index.ts | 27 ++++++++++++++++++------ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts index 9ffa4810..8083b905 100644 --- a/apps/api/src/lib/entities.ts +++ b/apps/api/src/lib/entities.ts @@ -42,8 +42,8 @@ export type SearchOptions = { export type CrawlerOptions = { returnOnlyUrls?: boolean; - includes?: string[]; - excludes?: string[]; + includes?: string | string[]; + excludes?: string | string[]; maxCrawledLinks?: number; maxDepth?: number; limit?: number; diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index 9171b805..e667fa6b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -27,8 +27,8 @@ export class WebScraperDataProvider { private bullJobId: string; private urls: string[] = [""]; private mode: "single_urls" | "sitemap" | "crawl" = "single_urls"; - private includes: string[]; - private excludes: string[]; + private includes: string | string[]; + private excludes: string | string[]; private maxCrawledLinks: number; private maxCrawledDepth: number = 10; private returnOnlyUrls: boolean; @@ -171,8 +171,8 @@ export class WebScraperDataProvider { const crawler = new WebCrawler({ jobId: this.jobId, initialUrl: this.urls[0], - includes: this.includes, - excludes: this.excludes, + includes: Array.isArray(this.includes) ? this.includes : this.includes.split(','), + excludes: Array.isArray(this.excludes) ? this.excludes : this.excludes.split(','), maxCrawledLinks: this.maxCrawledLinks, maxCrawledDepth: getAdjustedMaxDepth(this.urls[0], this.maxCrawledDepth), limit: this.limit, @@ -445,6 +445,10 @@ export class WebScraperDataProvider { const url = new URL(document.metadata.sourceURL); const path = url.pathname; + if (!Array.isArray(this.excludes)) { + this.excludes = this.excludes.split(','); + } + if (this.excludes.length > 0 && this.excludes[0] !== "") { // Check if the link should be excluded if ( @@ -456,6 +460,10 @@ export class WebScraperDataProvider { } } + if (!Array.isArray(this.includes)) { + this.includes = this.includes.split(','); + } + if (this.includes.length > 0 && this.includes[0] !== "") { // Check if the link matches the include patterns, if any are specified if (this.includes.length > 0) { @@ -567,8 +575,15 @@ export class WebScraperDataProvider { options.crawlerOptions?.replaceAllPathsWithAbsolutePaths ?? options.pageOptions?.replaceAllPathsWithAbsolutePaths ?? false; - //! @nicolas, for some reason this was being injected and breaking everything. 
Don't have time to find source of the issue so adding this check
-    this.excludes = this.excludes.filter((item) => item !== "");
+
+    if (typeof options.crawlerOptions?.excludes === 'string') {
+      this.excludes = options.crawlerOptions?.excludes.split(',').filter((item) => item.trim() !== "");
+    }
+
+    if (typeof options.crawlerOptions?.includes === 'string') {
+      this.includes = options.crawlerOptions?.includes.split(',').filter((item) => item.trim() !== "");
+    }
+
     this.crawlerMode = options.crawlerOptions?.mode ?? "default";
     this.ignoreSitemap = options.crawlerOptions?.ignoreSitemap ?? false;
     this.allowBackwardCrawling =

From 3edc3a3d1580b7ca10a51dbd852a37def6103c0c Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Mon, 5 Aug 2024 18:17:37 -0300
Subject: [PATCH 19/33] added fullpagescreenshot capabilities, wip on
 fire-engine side

---
 apps/api/openapi.json                                  | 10 ++++++++++
 apps/api/src/lib/default-values.ts                     |  1 +
 apps/api/src/lib/entities.ts                           |  1 +
 apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts |  4 ++++
 apps/api/src/scraper/WebScraper/single_url.ts          |  2 ++
 5 files changed, 18 insertions(+)

diff --git a/apps/api/openapi.json b/apps/api/openapi.json
index e0b583f0..fb0c4305 100644
--- a/apps/api/openapi.json
+++ b/apps/api/openapi.json
@@ -84,6 +84,11 @@
               "description": "Include a screenshot of the top of the page that you are scraping.",
               "default": false
             },
+            "fullPageScreenshot": {
+              "type": "boolean",
+              "description": "Include a full page screenshot of the page that you are scraping.",
+              "default": false
+            },
             "waitFor": {
               "type": "integer",
               "description": "Wait x amount of milliseconds for the page to load to fetch content",
@@ -317,6 +322,11 @@
               "description": "Include a screenshot of the top of the page that you are scraping.",
               "default": false
             },
+            "fullPageScreenshot": {
+              "type": "boolean",
+              "description": "Include a full page screenshot of the page that you are scraping.",
+              "default": false
+            },
             "waitFor": {
               "type": "integer",
               "description": "Wait x amount of milliseconds for the page to load to fetch content",
diff --git a/apps/api/src/lib/default-values.ts b/apps/api/src/lib/default-values.ts
index 3b303781..152f47d7 100644
--- a/apps/api/src/lib/default-values.ts
+++ b/apps/api/src/lib/default-values.ts
@@ -7,6 +7,7 @@ export const defaultPageOptions = {
   includeHtml: false,
   waitFor: 0,
   screenshot: false,
+  fullPageScreenshot: false,
   parsePDF: true
 };
 
diff --git a/apps/api/src/lib/entities.ts b/apps/api/src/lib/entities.ts
index 9ffa4810..4dc2050d 100644
--- a/apps/api/src/lib/entities.ts
+++ b/apps/api/src/lib/entities.ts
@@ -18,6 +18,7 @@ export type PageOptions = {
   fetchPageContent?: boolean;
   waitFor?: number;
   screenshot?: boolean;
+  fullPageScreenshot?: boolean;
   headers?: Record<string, string>;
   replaceAllPathsWithAbsolutePaths?: boolean;
   parsePDF?: boolean;
diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
index ba67043c..dfe23a89 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@@ -11,6 +11,7 @@ import { Logger } from "../../../lib/logger";
  * @param url The URL to scrape
  * @param waitFor The time to wait for the page to load
  * @param screenshot Whether to take a screenshot
+ * @param fullPageScreenshot Whether to take a full page screenshot
  * @param pageOptions The options for the page
  * @param headers The headers to send with the request
  * @param options The options for the request
@@ -20,6 +21,7 @@ export async function scrapWithFireEngine({
   url,
   waitFor = 0,
   screenshot = false,
+  fullPageScreenshot = false,
   pageOptions = { parsePDF: true },
   fireEngineOptions = {},
   headers,
@@ -28,6 +30,7 @@ export async function scrapWithFireEngine({
   url: string;
   waitFor?: number;
   screenshot?: boolean;
+  fullPageScreenshot?: boolean;
   pageOptions?: { scrollXPaths?: string[]; parsePDF?: boolean };
   fireEngineOptions?: FireEngineOptions;
   headers?: Record<string, string>;
@@ -71,6 +74,7 @@ export async function scrapWithFireEngine({
         url: url,
         wait: waitParam,
         screenshot: screenshotParam,
+        fullPageScreenshot: fullPageScreenshot,
         headers: headers,
         pageOptions: pageOptions,
         ...fireEngineOptionsParam,
diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts
index 4a44b23f..0fa2fc8b 100644
--- a/apps/api/src/scraper/WebScraper/single_url.ts
+++ b/apps/api/src/scraper/WebScraper/single_url.ts
@@ -128,6 +128,7 @@ export async function scrapSingleUrl(
     includeRawHtml: false,
     waitFor: 0,
     screenshot: false,
+    fullPageScreenshot: false,
     headers: undefined,
   },
   extractorOptions: ExtractorOptions = {
@@ -171,6 +172,7 @@ export async function scrapSingleUrl(
           url,
           waitFor: pageOptions.waitFor,
           screenshot: pageOptions.screenshot,
+          fullPageScreenshot: pageOptions.fullPageScreenshot,
           pageOptions: pageOptions,
           headers: pageOptions.headers,
           fireEngineOptions: {

From 4d24a99d50358a31a5aa30fde79309545227c3dc Mon Sep 17 00:00:00 2001
From: rafaelsideguide <150964962+rafaelsideguide@users.noreply.github.com>
Date: Tue, 6 Aug 2024 09:34:43 -0300
Subject: [PATCH 20/33] fix params

---
 apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts | 5 +++--
 apps/api/src/scraper/WebScraper/single_url.ts          | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
index dfe23a89..0bb9986f 100644
--- a/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
+++ b/apps/api/src/scraper/WebScraper/scrapers/fireEngine.ts
@@ -52,6 +52,7 @@ export async function scrapWithFireEngine({
     const waitParam = reqParams["params"]?.wait ?? waitFor;
     const engineParam = reqParams["params"]?.engine ?? reqParams["params"]?.fireEngineOptions?.engine ?? fireEngineOptions?.engine ?? "playwright";
     const screenshotParam = reqParams["params"]?.screenshot ?? screenshot;
+    const fullPageScreenshotParam = reqParams["params"]?.fullPageScreenshot ?? fullPageScreenshot;
     const fireEngineOptionsParam : FireEngineOptions = reqParams["params"]?.fireEngineOptions ?? fireEngineOptions;
 
 
@@ -64,7 +65,7 @@ export async function scrapWithFireEngine({
     let engine = engineParam; // do we want fireEngineOptions as first choice?
 
     Logger.info(
-      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, method: ${fireEngineOptionsParam?.method ?? "null"} }`
+      `⛏️ Fire-Engine (${engine}): Scraping ${url} | params: { wait: ${waitParam}, screenshot: ${screenshotParam}, fullPageScreenshot: ${fullPageScreenshot}, method: ${fireEngineOptionsParam?.method ??
"null"} }` ); @@ -74,7 +75,7 @@ export async function scrapWithFireEngine({ url: url, wait: waitParam, screenshot: screenshotParam, - fullPageScreenshot: fullPageScreenshot, + fullPageScreenshot: fullPageScreenshotParam, headers: headers, pageOptions: pageOptions, ...fireEngineOptionsParam, diff --git a/apps/api/src/scraper/WebScraper/single_url.ts b/apps/api/src/scraper/WebScraper/single_url.ts index 0fa2fc8b..12e075fd 100644 --- a/apps/api/src/scraper/WebScraper/single_url.ts +++ b/apps/api/src/scraper/WebScraper/single_url.ts @@ -308,7 +308,7 @@ export async function scrapSingleUrl( const scrapersInOrder = getScrapingFallbackOrder( defaultScraper, pageOptions && pageOptions.waitFor && pageOptions.waitFor > 0, - pageOptions && pageOptions.screenshot && pageOptions.screenshot === true, + pageOptions && (pageOptions.screenshot || pageOptions.fullPageScreenshot) && (pageOptions.screenshot === true || pageOptions.fullPageScreenshot === true), pageOptions && pageOptions.headers && pageOptions.headers !== undefined ); From a67a5c04c91409e80e81a19536736e70f9c478d8 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 6 Aug 2024 18:02:56 -0400 Subject: [PATCH 21/33] Revert "Merge pull request #432 from mendableai/mog/js-sdk-cjs" This reverts commit bb90e03dea5dc2f475dbf8dc70425af9b3dfe246, reversing changes made to 3321ca9398b5a129ffc1689ec2a88b2468cf16e9. --- apps/js-sdk/firecrawl/types/index.d.ts | 189 +++++++++++++++++++++++++ 1 file changed, 189 insertions(+) create mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts new file mode 100644 index 00000000..91a58043 --- /dev/null +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -0,0 +1,189 @@ +import { AxiosResponse, AxiosRequestHeaders } from "axios"; +import { z } from "zod"; +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { + apiKey?: string | null; + apiUrl?: string | null; +} +/** + * Metadata for a Firecrawl document. + */ +export interface FirecrawlDocumentMetadata { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dctermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dctermsType?: string; + dcType?: string; + dctermsAudience?: string; + dctermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dctermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + pageStatusCode?: number; + pageError?: string; + [key: string]: any; +} +/** + * Document interface for Firecrawl. + */ +export interface FirecrawlDocument { + id?: string; + url?: string; + content: string; + markdown?: string; + html?: string; + llm_extraction?: Record; + createdAt?: Date; + updatedAt?: Date; + type?: string; + metadata: FirecrawlDocumentMetadata; + childrenLinks?: string[]; + provider?: string; + warning?: string; + index?: number; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: FirecrawlDocument; + error?: string; +} +/** +* Response interface for searching operations. 
+*/
+export interface SearchResponse {
+    success: boolean;
+    data?: FirecrawlDocument[];
+    error?: string;
+}
+/**
+* Response interface for crawling operations.
+*/
+export interface CrawlResponse {
+    success: boolean;
+    jobId?: string;
+    data?: FirecrawlDocument[];
+    error?: string;
+}
+/**
+* Response interface for job status checks.
+*/
+export interface JobStatusResponse {
+    success: boolean;
+    status: string;
+    jobId?: string;
+    data?: FirecrawlDocument[];
+    partial_data?: FirecrawlDocument[];
+    error?: string;
+}
+/**
+ * Generic parameter interface.
+ */
+export interface Params {
+    [key: string]: any;
+    extractorOptions?: {
+        extractionSchema: z.ZodSchema | any;
+        mode?: "llm-extraction" | "llm-extraction-from-raw-html";
+        extractionPrompt?: string;
+    };
+}
+/**
+ * Main class for interacting with the Firecrawl API.
+ */
+export default class FirecrawlApp {
+    private apiKey;
+    private apiUrl;
+    /**
+     * Initializes a new instance of the FirecrawlApp class.
+     * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
+     */
+    constructor({ apiKey, apiUrl }: FirecrawlAppConfig);
+    /**
+     * Scrapes a URL using the Firecrawl API.
+     * @param {string} url - The URL to scrape.
+     * @param {Params | null} params - Additional parameters for the scrape request.
+     * @returns {Promise} The response from the scrape operation.
+     */
+    scrapeUrl(url: string, params?: Params | null): Promise<ScrapeResponse>;
+    /**
+     * Searches for a query using the Firecrawl API.
+     * @param {string} query - The query to search for.
+     * @param {Params | null} params - Additional parameters for the search request.
+     * @returns {Promise} The response from the search operation.
+     */
+    search(query: string, params?: Params | null): Promise<SearchResponse>;
+    /**
+     * Initiates a crawl job for a URL using the Firecrawl API.
+     * @param {string} url - The URL to crawl.
+     * @param {Params | null} params - Additional parameters for the crawl request.
+     * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete.
+     * @param {number} pollInterval - Time in seconds for job status checks.
+     * @param {string} idempotencyKey - Optional idempotency key for the request.
+     * @returns {Promise} The response from the crawl operation.
+     */
+    crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, pollInterval?: number, idempotencyKey?: string): Promise<CrawlResponse | any>;
+    /**
+     * Checks the status of a crawl job using the Firecrawl API.
+     * @param {string} jobId - The job ID of the crawl operation.
+     * @returns {Promise} The response containing the job status.
+     */
+    checkCrawlStatus(jobId: string): Promise<JobStatusResponse>;
+    /**
+     * Prepares the headers for an API request.
+     * @returns {AxiosRequestHeaders} The prepared headers.
+     */
+    prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders;
+    /**
+     * Sends a POST request to the specified URL.
+     * @param {string} url - The URL to send the request to.
+     * @param {Params} data - The data to send in the request.
+     * @param {AxiosRequestHeaders} headers - The headers for the request.
+     * @returns {Promise} The response from the POST request.
+     */
+    postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
+    /**
+     * Sends a GET request to the specified URL.
+     * @param {string} url - The URL to send the request to.
+     * @param {AxiosRequestHeaders} headers - The headers for the request.
+     * @returns {Promise} The response from the GET request.
+     */
+    getRequest(url: string, headers: AxiosRequestHeaders): Promise<AxiosResponse>;
+    /**
+     * Monitors the status of a crawl job until completion or failure.
+     * @param {string} jobId - The job ID of the crawl operation.
+     * @param {AxiosRequestHeaders} headers - The headers for the request.
+     * @param {number} timeout - Timeout in seconds for job status checks.
+     * @returns {Promise} The final job status or data.
+     */
+    monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise<any>;
+    /**
+     * Handles errors from API responses.
+     * @param {AxiosResponse} response - The response from the API.
+     * @param {string} action - The action being performed when the error occurred.
+     */
+    handleError(response: AxiosResponse, action: string): void;
+}

From 5da4472842497dbe4d462609293917e0b2199fc8 Mon Sep 17 00:00:00 2001
From: Nicolas <nicolascamara29@gmail.com>
Date: Tue, 6 Aug 2024 18:41:06 -0400
Subject: [PATCH 22/33] Revert "Merge pull request #432 from
 mendableai/mog/js-sdk-cjs"

This reverts commit bb90e03dea5dc2f475dbf8dc70425af9b3dfe246, reversing
changes made to 3321ca9398b5a129ffc1689ec2a88b2468cf16e9.

---
 apps/js-sdk/firecrawl/.gitignore     |   3 -
 apps/js-sdk/firecrawl/build/index.js | 265 +++++++++++++++++++++++++++
 apps/js-sdk/firecrawl/package.json   |   8 +-
 apps/js-sdk/package-lock.json        |  34 +---
 apps/js-sdk/package.json             |   1 -
 5 files changed, 270 insertions(+), 41 deletions(-)
 create mode 100644 apps/js-sdk/firecrawl/build/index.js

diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore
index 1acd6303..c6bba591 100644
--- a/apps/js-sdk/firecrawl/.gitignore
+++ b/apps/js-sdk/firecrawl/.gitignore
@@ -128,6 +128,3 @@ dist
 .yarn/build-state.yml
 .yarn/install-state.gz
 .pnp.*
-
-build
-types
diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js
new file mode 100644
index 00000000..99de5e2b
--- /dev/null
+++ b/apps/js-sdk/firecrawl/build/index.js
@@ -0,0 +1,265 @@
+var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) {
+    function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); }
+    return new (P || (P = Promise))(function (resolve, reject) {
+        function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } }
+        function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } }
+        function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); }
+        step((generator = generator.apply(thisArg, _arguments || [])).next());
+    });
+};
+import axios from "axios";
+import { z } from "zod";
+import { zodToJsonSchema } from "zod-to-json-schema";
+/**
+ * Main class for interacting with the Firecrawl API.
+ */
+export default class FirecrawlApp {
+    /**
+     * Initializes a new instance of the FirecrawlApp class.
+     * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance.
+     */
+    constructor({ apiKey = null, apiUrl = null }) {
+        this.apiKey = apiKey || "";
+        this.apiUrl = apiUrl || "https://api.firecrawl.dev";
+        if (!this.apiKey) {
+            throw new Error("No API key provided");
+        }
+    }
+    /**
+     * Scrapes a URL using the Firecrawl API.
+     * @param {string} url - The URL to scrape.
+     * @param {Params | null} params - Additional parameters for the scrape request.
+     * @returns {Promise} The response from the scrape operation.
+ */ + scrapeUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null) { + var _a; + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query_1) { + return __awaiter(this, arguments, void 0, function* (query, params = null) { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. 
+ */ + crawlUrl(url_1) { + return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. 
+ */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 5a6d6bfb..71d2362e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -2,15 +2,11 @@ "name": "@mendable/firecrawl-js", "version": "0.0.29", "description": "JavaScript SDK for Firecrawl API", - "main": "build/cjs/index.js", + "main": "build/index.js", "types": "types/index.d.ts", "type": "module", - "exports": { - "require": "./build/cjs/index.js", - "import": "./build/esm/index.js" - }, "scripts": { - "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", + "build": "tsc", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts" diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index ca337062..2bf3f001 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -13,7 +13,6 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", - "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { @@ -451,15 +450,6 @@ "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, - "node_modules/@types/node": { - "version": "20.14.11", - "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz", - "integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==", - "peer": true, - "dependencies": { - "undici-types": "~5.26.4" - } - }, "node_modules/acorn": { "version": "8.11.3", "resolved": 
"https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", @@ -738,24 +728,6 @@ "node": ">=14.17" } }, - "node_modules/undici-types": { - "version": "5.26.5", - "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", - "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", - "peer": true - }, - "node_modules/uuid": { - "version": "10.0.0", - "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", - "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", - "funding": [ - "https://github.com/sponsors/broofa", - "https://github.com/sponsors/ctavan" - ], - "bin": { - "uuid": "dist/bin/uuid" - } - }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", @@ -778,9 +750,9 @@ } }, "node_modules/zod-to-json-schema": { - "version": "3.23.1", - "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz", - "integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==", + "version": "3.23.0", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz", + "integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==", "peerDependencies": { "zod": "^3.23.3" } diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 2d2c36e8..0e93fe3c 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -15,7 +15,6 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", - "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { From f294d3922cde4f66a717d3e6b7e1664660b31b19 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Tue, 6 Aug 2024 18:44:45 -0400 Subject: [PATCH 23/33] Nick: revert --- apps/js-sdk/firecrawl/build/cjs/index.js | 271 +++++++++++++++++++ apps/js-sdk/firecrawl/build/cjs/package.json | 1 + apps/js-sdk/firecrawl/build/esm/index.js | 265 ++++++++++++++++++ apps/js-sdk/firecrawl/build/esm/package.json | 1 + apps/js-sdk/firecrawl/build/index.js | 14 +- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/types/index.d.ts | 22 +- 7 files changed, 559 insertions(+), 17 deletions(-) create mode 100644 apps/js-sdk/firecrawl/build/cjs/index.js create mode 100644 apps/js-sdk/firecrawl/build/cjs/package.json create mode 100644 apps/js-sdk/firecrawl/build/esm/index.js create mode 100644 apps/js-sdk/firecrawl/build/esm/package.json diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js new file mode 100644 index 00000000..da340cae --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/index.js @@ -0,0 +1,271 @@ +"use strict"; +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? 
resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +var __importDefault = (this && this.__importDefault) || function (mod) { + return (mod && mod.__esModule) ? mod : { "default": mod }; +}; +Object.defineProperty(exports, "__esModule", { value: true }); +const axios_1 = __importDefault(require("axios")); +const zod_1 = require("zod"); +const zod_to_json_schema_1 = require("zod-to-json-schema"); +/** + * Main class for interacting with the Firecrawl API. + */ +class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey = null, apiUrl = null }) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); + } + } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url, params = null) { + var _a; + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof zod_1.z.ZodSchema) { + schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query, params = null) { + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios_1.default.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. 
Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios_1.default.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. 
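+ * @example
+ * // Illustrative internal use only — this mirrors the call made by
+ * // monitorJobStatus below; `jobId` is a placeholder:
+ * // const res = await this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, this.prepareHeaders());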
+ * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios_1.default.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} +exports.default = FirecrawlApp; diff --git a/apps/js-sdk/firecrawl/build/cjs/package.json b/apps/js-sdk/firecrawl/build/cjs/package.json new file mode 100644 index 00000000..b731bd61 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/cjs/package.json @@ -0,0 +1 @@ +{"type": "commonjs"} diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js new file mode 100644 index 00000000..ef79f180 --- /dev/null +++ b/apps/js-sdk/firecrawl/build/esm/index.js @@ -0,0 +1,265 @@ +var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { + function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } + return new (P || (P = Promise))(function (resolve, reject) { + function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } + function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } + function step(result) { result.done ? resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } + step((generator = generator.apply(thisArg, _arguments || [])).next()); + }); +}; +import axios from "axios"; +import { z } from "zod"; +import { zodToJsonSchema } from "zod-to-json-schema"; +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. 
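+ * @example
+ * // Illustrative usage sketch; the key, URL, and schema fields are
+ * // placeholders, not part of this build output:
+ * const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });
+ * const schema = z.object({ title: z.string() });
+ * const result = await app.scrapeUrl("https://example.com", {
+ *   extractorOptions: { mode: "llm-extraction", extractionSchema: schema },
+ * });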
+ */ + constructor({ apiKey = null, apiUrl = null }) { + this.apiKey = apiKey || ""; + this.apiUrl = apiUrl || "https://api.firecrawl.dev"; + if (!this.apiKey) { + throw new Error("No API key provided"); + } + } + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url, params = null) { + var _a; + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = Object.assign({ url }, params); + if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { + let schema = params.extractorOptions.extractionSchema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } + jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "scrape URL"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query, params = null) { + return __awaiter(this, void 0, void 0, function* () { + const headers = { + "Content-Type": "application/json", + Authorization: `Bearer ${this.apiKey}`, + }; + let jsonData = { query }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); + if (response.status === 200) { + const responseData = response.data; + if (responseData.success) { + return responseData; + } + else { + throw new Error(`Failed to search. Error: ${responseData.error}`); + } + } + else { + this.handleError(response, "search"); + } + } + catch (error) { + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. + * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. 
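+ * @example
+ * // Fire-and-poll sketch (the URL and idempotency key are placeholders):
+ * const { jobId } = await app.crawlUrl("https://example.com", null, false, 2, "my-idempotency-key");
+ * const status = await app.checkCrawlStatus(jobId);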
+ */ + crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(idempotencyKey); + let jsonData = { url }; + if (params) { + jsonData = Object.assign(Object.assign({}, jsonData), params); + } + try { + const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); + if (response.status === 200) { + const jobId = response.data.jobId; + if (waitUntilDone) { + return this.monitorJobStatus(jobId, headers, pollInterval); + } + else { + return { success: true, jobId }; + } + } + else { + this.handleError(response, "start crawl job"); + } + } + catch (error) { + console.log(error); + throw new Error(error.message); + } + return { success: false, error: "Internal server error." }; + }); + } + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId) { + return __awaiter(this, void 0, void 0, function* () { + const headers = this.prepareHeaders(); + try { + const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (response.status === 200) { + return { + success: true, + status: response.data.status, + current: response.data.current, + current_url: response.data.current_url, + current_step: response.data.current_step, + total: response.data.total, + data: response.data.data, + partial_data: !response.data.data + ? response.data.partial_data + : undefined, + }; + } + else { + this.handleError(response, "check crawl status"); + } + } + catch (error) { + throw new Error(error.message); + } + return { + success: false, + status: "unknown", + current: 0, + current_url: "", + current_step: "", + total: 0, + error: "Internal server error.", + }; + }); + } + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey) { + return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); + } + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url, data, headers) { + return axios.post(url, data, { headers }); + } + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url, headers) { + return axios.get(url, { headers }); + } + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. 
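+ *
+ * Note: the `timeout` name in the tag above is stale — the actual parameter
+ * is `checkInterval`, the polling period in seconds; the loop below clamps it
+ * to a 2-second minimum and re-checks until the job completes or fails.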
+ */ + monitorJobStatus(jobId, headers, checkInterval) { + return __awaiter(this, void 0, void 0, function* () { + while (true) { + const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); + if (statusResponse.status === 200) { + const statusData = statusResponse.data; + if (statusData.status === "completed") { + if ("data" in statusData) { + return statusData.data; + } + else { + throw new Error("Crawl job completed but no data was returned"); + } + } + else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { + if (checkInterval < 2) { + checkInterval = 2; + } + yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again + } + else { + throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); + } + } + else { + this.handleError(statusResponse, "check crawl status"); + } + } + }); + } + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response, action) { + if ([402, 408, 409, 500].includes(response.status)) { + const errorMessage = response.data.error || "Unknown error occurred"; + throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); + } + else { + throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); + } + } +} diff --git a/apps/js-sdk/firecrawl/build/esm/package.json b/apps/js-sdk/firecrawl/build/esm/package.json new file mode 100644 index 00000000..6990891f --- /dev/null +++ b/apps/js-sdk/firecrawl/build/esm/package.json @@ -0,0 +1 @@ +{"type": "module"} diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js index 99de5e2b..ef79f180 100644 --- a/apps/js-sdk/firecrawl/build/index.js +++ b/apps/js-sdk/firecrawl/build/index.js @@ -31,9 +31,9 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the scrape request. * @returns {Promise} The response from the scrape operation. */ - scrapeUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null) { - var _a; + scrapeUrl(url, params = null) { + var _a; + return __awaiter(this, void 0, void 0, function* () { const headers = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, @@ -74,8 +74,8 @@ export default class FirecrawlApp { * @param {Params | null} params - Additional parameters for the search request. * @returns {Promise} The response from the search operation. */ - search(query_1) { - return __awaiter(this, arguments, void 0, function* (query, params = null) { + search(query, params = null) { + return __awaiter(this, void 0, void 0, function* () { const headers = { "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}`, @@ -114,8 +114,8 @@ export default class FirecrawlApp { * @param {string} idempotencyKey - Optional idempotency key for the request. * @returns {Promise} The response from the crawl operation. 
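 * (The -/+ pair that follows swaps one tsc emit shape for another: the removed
 * form forwards `arguments` into the generator and binds default parameters
 * there, while the restored form binds them on the method itself. The two are
 * behaviorally equivalent; the difference appears to come from the compiler
 * settings changed by this build revert.)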
*/ - crawlUrl(url_1) { - return __awaiter(this, arguments, void 0, function* (url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { + return __awaiter(this, void 0, void 0, function* () { const headers = this.prepareHeaders(idempotencyKey); let jsonData = { url }; if (params) { diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 71d2362e..e6a398e4 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "0.0.29", + "version": "0.0.34", "description": "JavaScript SDK for Firecrawl API", "main": "build/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index 91a58043..bd6cfc20 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -73,16 +73,16 @@ export interface ScrapeResponse { error?: string; } /** -* Response interface for searching operations. -*/ + * Response interface for searching operations. + */ export interface SearchResponse { success: boolean; data?: FirecrawlDocument[]; error?: string; } /** -* Response interface for crawling operations. -*/ + * Response interface for crawling operations. + */ export interface CrawlResponse { success: boolean; jobId?: string; @@ -90,24 +90,28 @@ export interface CrawlResponse { error?: string; } /** -* Response interface for job status checks. -*/ + * Response interface for job status checks. + */ export interface JobStatusResponse { success: boolean; status: string; + current?: number; + current_url?: string; + current_step?: string; + total?: number; jobId?: string; data?: FirecrawlDocument[]; partial_data?: FirecrawlDocument[]; error?: string; } /** - * Generic parameter interface. - */ + * Generic parameter interface. + */ export interface Params { [key: string]: any; extractorOptions?: { extractionSchema: z.ZodSchema | any; - mode?: "llm-extraction" | "llm-extraction-from-raw-html"; + mode?: "llm-extraction"; extractionPrompt?: string; }; } From 5f7724205f59bd7f415da16805216d5a7ad4657d Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 01:06:21 +0200 Subject: [PATCH 24/33] fix(js-sdk): re-add types --- apps/js-sdk/firecrawl/.gitignore | 1 - apps/js-sdk/firecrawl/types/index.d.ts | 193 +++++++++++++++++++++++++ 2 files changed, 193 insertions(+), 1 deletion(-) create mode 100644 apps/js-sdk/firecrawl/types/index.d.ts diff --git a/apps/js-sdk/firecrawl/.gitignore b/apps/js-sdk/firecrawl/.gitignore index 1acd6303..96e545b3 100644 --- a/apps/js-sdk/firecrawl/.gitignore +++ b/apps/js-sdk/firecrawl/.gitignore @@ -130,4 +130,3 @@ dist .pnp.* build -types diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts new file mode 100644 index 00000000..bd6cfc20 --- /dev/null +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -0,0 +1,193 @@ +import { AxiosResponse, AxiosRequestHeaders } from "axios"; +import { z } from "zod"; +/** + * Configuration interface for FirecrawlApp. + */ +export interface FirecrawlAppConfig { + apiKey?: string | null; + apiUrl?: string | null; +} +/** + * Metadata for a Firecrawl document. 
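+ *
+ * The field names mirror where the values are scraped from: OpenGraph tags
+ * (`og*`), Dublin Core tags (`dc*` / `dcterms*`), standard HTML metadata, and
+ * Firecrawl's own `pageStatusCode` / `pageError` diagnostics.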
+ */ +export interface FirecrawlDocumentMetadata { + title?: string; + description?: string; + language?: string; + keywords?: string; + robots?: string; + ogTitle?: string; + ogDescription?: string; + ogUrl?: string; + ogImage?: string; + ogAudio?: string; + ogDeterminer?: string; + ogLocale?: string; + ogLocaleAlternate?: string[]; + ogSiteName?: string; + ogVideo?: string; + dctermsCreated?: string; + dcDateCreated?: string; + dcDate?: string; + dctermsType?: string; + dcType?: string; + dctermsAudience?: string; + dctermsSubject?: string; + dcSubject?: string; + dcDescription?: string; + dctermsKeywords?: string; + modifiedTime?: string; + publishedTime?: string; + articleTag?: string; + articleSection?: string; + sourceURL?: string; + pageStatusCode?: number; + pageError?: string; + [key: string]: any; +} +/** + * Document interface for Firecrawl. + */ +export interface FirecrawlDocument { + id?: string; + url?: string; + content: string; + markdown?: string; + html?: string; + llm_extraction?: Record; + createdAt?: Date; + updatedAt?: Date; + type?: string; + metadata: FirecrawlDocumentMetadata; + childrenLinks?: string[]; + provider?: string; + warning?: string; + index?: number; +} +/** + * Response interface for scraping operations. + */ +export interface ScrapeResponse { + success: boolean; + data?: FirecrawlDocument; + error?: string; +} +/** + * Response interface for searching operations. + */ +export interface SearchResponse { + success: boolean; + data?: FirecrawlDocument[]; + error?: string; +} +/** + * Response interface for crawling operations. + */ +export interface CrawlResponse { + success: boolean; + jobId?: string; + data?: FirecrawlDocument[]; + error?: string; +} +/** + * Response interface for job status checks. + */ +export interface JobStatusResponse { + success: boolean; + status: string; + current?: number; + current_url?: string; + current_step?: string; + total?: number; + jobId?: string; + data?: FirecrawlDocument[]; + partial_data?: FirecrawlDocument[]; + error?: string; +} +/** + * Generic parameter interface. + */ +export interface Params { + [key: string]: any; + extractorOptions?: { + extractionSchema: z.ZodSchema | any; + mode?: "llm-extraction"; + extractionPrompt?: string; + }; +} +/** + * Main class for interacting with the Firecrawl API. + */ +export default class FirecrawlApp { + private apiKey; + private apiUrl; + /** + * Initializes a new instance of the FirecrawlApp class. + * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. + */ + constructor({ apiKey, apiUrl }: FirecrawlAppConfig); + /** + * Scrapes a URL using the Firecrawl API. + * @param {string} url - The URL to scrape. + * @param {Params | null} params - Additional parameters for the scrape request. + * @returns {Promise} The response from the scrape operation. + */ + scrapeUrl(url: string, params?: Params | null): Promise; + /** + * Searches for a query using the Firecrawl API. + * @param {string} query - The query to search for. + * @param {Params | null} params - Additional parameters for the search request. + * @returns {Promise} The response from the search operation. + */ + search(query: string, params?: Params | null): Promise; + /** + * Initiates a crawl job for a URL using the Firecrawl API. + * @param {string} url - The URL to crawl. + * @param {Params | null} params - Additional parameters for the crawl request. + * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. 
+ * @param {number} pollInterval - Time in seconds for job status checks. + * @param {string} idempotencyKey - Optional idempotency key for the request. + * @returns {Promise} The response from the crawl operation. + */ + crawlUrl(url: string, params?: Params | null, waitUntilDone?: boolean, pollInterval?: number, idempotencyKey?: string): Promise; + /** + * Checks the status of a crawl job using the Firecrawl API. + * @param {string} jobId - The job ID of the crawl operation. + * @returns {Promise} The response containing the job status. + */ + checkCrawlStatus(jobId: string): Promise; + /** + * Prepares the headers for an API request. + * @returns {AxiosRequestHeaders} The prepared headers. + */ + prepareHeaders(idempotencyKey?: string): AxiosRequestHeaders; + /** + * Sends a POST request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {Params} data - The data to send in the request. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the POST request. + */ + postRequest(url: string, data: Params, headers: AxiosRequestHeaders): Promise; + /** + * Sends a GET request to the specified URL. + * @param {string} url - The URL to send the request to. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @returns {Promise} The response from the GET request. + */ + getRequest(url: string, headers: AxiosRequestHeaders): Promise; + /** + * Monitors the status of a crawl job until completion or failure. + * @param {string} jobId - The job ID of the crawl operation. + * @param {AxiosRequestHeaders} headers - The headers for the request. + * @param {number} timeout - Timeout in seconds for job status checks. + * @returns {Promise} The final job status or data. + */ + monitorJobStatus(jobId: string, headers: AxiosRequestHeaders, checkInterval: number): Promise; + /** + * Handles errors from API responses. + * @param {AxiosResponse} response - The response from the API. + * @param {string} action - The action being performed when the error occurred. + */ + handleError(response: AxiosResponse, action: string): void; +} From 020a5efdb761ee96253c1100f748537cd1b1dd00 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 01:27:26 +0200 Subject: [PATCH 25/33] Revert "Revert "Merge pull request #432 from mendableai/mog/js-sdk-cjs"" This reverts commit 5da4472842497dbe4d462609293917e0b2199fc8. --- apps/js-sdk/firecrawl/build/index.js | 265 --------------------------- apps/js-sdk/firecrawl/package.json | 8 +- apps/js-sdk/package-lock.json | 34 +++- apps/js-sdk/package.json | 1 + 4 files changed, 38 insertions(+), 270 deletions(-) delete mode 100644 apps/js-sdk/firecrawl/build/index.js diff --git a/apps/js-sdk/firecrawl/build/index.js b/apps/js-sdk/firecrawl/build/index.js deleted file mode 100644 index ef79f180..00000000 --- a/apps/js-sdk/firecrawl/build/index.js +++ /dev/null @@ -1,265 +0,0 @@ -var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, generator) { - function adopt(value) { return value instanceof P ? value : new P(function (resolve) { resolve(value); }); } - return new (P || (P = Promise))(function (resolve, reject) { - function fulfilled(value) { try { step(generator.next(value)); } catch (e) { reject(e); } } - function rejected(value) { try { step(generator["throw"](value)); } catch (e) { reject(e); } } - function step(result) { result.done ? 
resolve(result.value) : adopt(result.value).then(fulfilled, rejected); } - step((generator = generator.apply(thisArg, _arguments || [])).next()); - }); -}; -import axios from "axios"; -import { z } from "zod"; -import { zodToJsonSchema } from "zod-to-json-schema"; -/** - * Main class for interacting with the Firecrawl API. - */ -export default class FirecrawlApp { - /** - * Initializes a new instance of the FirecrawlApp class. - * @param {FirecrawlAppConfig} config - Configuration options for the FirecrawlApp instance. - */ - constructor({ apiKey = null, apiUrl = null }) { - this.apiKey = apiKey || ""; - this.apiUrl = apiUrl || "https://api.firecrawl.dev"; - if (!this.apiKey) { - throw new Error("No API key provided"); - } - } - /** - * Scrapes a URL using the Firecrawl API. - * @param {string} url - The URL to scrape. - * @param {Params | null} params - Additional parameters for the scrape request. - * @returns {Promise} The response from the scrape operation. - */ - scrapeUrl(url, params = null) { - var _a; - return __awaiter(this, void 0, void 0, function* () { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = Object.assign({ url }, params); - if ((_a = params === null || params === void 0 ? void 0 : params.extractorOptions) === null || _a === void 0 ? void 0 : _a.extractionSchema) { - let schema = params.extractorOptions.extractionSchema; - // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { - schema = zodToJsonSchema(schema); - } - jsonData = Object.assign(Object.assign({}, jsonData), { extractorOptions: Object.assign(Object.assign({}, params.extractorOptions), { extractionSchema: schema, mode: params.extractorOptions.mode || "llm-extraction" }) }); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/scrape", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to scrape URL. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "scrape URL"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Searches for a query using the Firecrawl API. - * @param {string} query - The query to search for. - * @param {Params | null} params - Additional parameters for the search request. - * @returns {Promise} The response from the search operation. - */ - search(query, params = null) { - return __awaiter(this, void 0, void 0, function* () { - const headers = { - "Content-Type": "application/json", - Authorization: `Bearer ${this.apiKey}`, - }; - let jsonData = { query }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield axios.post(this.apiUrl + "/v0/search", jsonData, { headers }); - if (response.status === 200) { - const responseData = response.data; - if (responseData.success) { - return responseData; - } - else { - throw new Error(`Failed to search. Error: ${responseData.error}`); - } - } - else { - this.handleError(response, "search"); - } - } - catch (error) { - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Initiates a crawl job for a URL using the Firecrawl API. - * @param {string} url - The URL to crawl. 
- * @param {Params | null} params - Additional parameters for the crawl request. - * @param {boolean} waitUntilDone - Whether to wait for the crawl job to complete. - * @param {number} pollInterval - Time in seconds for job status checks. - * @param {string} idempotencyKey - Optional idempotency key for the request. - * @returns {Promise} The response from the crawl operation. - */ - crawlUrl(url, params = null, waitUntilDone = true, pollInterval = 2, idempotencyKey) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(idempotencyKey); - let jsonData = { url }; - if (params) { - jsonData = Object.assign(Object.assign({}, jsonData), params); - } - try { - const response = yield this.postRequest(this.apiUrl + "/v0/crawl", jsonData, headers); - if (response.status === 200) { - const jobId = response.data.jobId; - if (waitUntilDone) { - return this.monitorJobStatus(jobId, headers, pollInterval); - } - else { - return { success: true, jobId }; - } - } - else { - this.handleError(response, "start crawl job"); - } - } - catch (error) { - console.log(error); - throw new Error(error.message); - } - return { success: false, error: "Internal server error." }; - }); - } - /** - * Checks the status of a crawl job using the Firecrawl API. - * @param {string} jobId - The job ID of the crawl operation. - * @returns {Promise} The response containing the job status. - */ - checkCrawlStatus(jobId) { - return __awaiter(this, void 0, void 0, function* () { - const headers = this.prepareHeaders(); - try { - const response = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (response.status === 200) { - return { - success: true, - status: response.data.status, - current: response.data.current, - current_url: response.data.current_url, - current_step: response.data.current_step, - total: response.data.total, - data: response.data.data, - partial_data: !response.data.data - ? response.data.partial_data - : undefined, - }; - } - else { - this.handleError(response, "check crawl status"); - } - } - catch (error) { - throw new Error(error.message); - } - return { - success: false, - status: "unknown", - current: 0, - current_url: "", - current_step: "", - total: 0, - error: "Internal server error.", - }; - }); - } - /** - * Prepares the headers for an API request. - * @returns {AxiosRequestHeaders} The prepared headers. - */ - prepareHeaders(idempotencyKey) { - return Object.assign({ "Content-Type": "application/json", Authorization: `Bearer ${this.apiKey}` }, (idempotencyKey ? { "x-idempotency-key": idempotencyKey } : {})); - } - /** - * Sends a POST request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {Params} data - The data to send in the request. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the POST request. - */ - postRequest(url, data, headers) { - return axios.post(url, data, { headers }); - } - /** - * Sends a GET request to the specified URL. - * @param {string} url - The URL to send the request to. - * @param {AxiosRequestHeaders} headers - The headers for the request. - * @returns {Promise} The response from the GET request. - */ - getRequest(url, headers) { - return axios.get(url, { headers }); - } - /** - * Monitors the status of a crawl job until completion or failure. - * @param {string} jobId - The job ID of the crawl operation. - * @param {AxiosRequestHeaders} headers - The headers for the request. 
- * @param {number} timeout - Timeout in seconds for job status checks. - * @returns {Promise} The final job status or data. - */ - monitorJobStatus(jobId, headers, checkInterval) { - return __awaiter(this, void 0, void 0, function* () { - while (true) { - const statusResponse = yield this.getRequest(this.apiUrl + `/v0/crawl/status/${jobId}`, headers); - if (statusResponse.status === 200) { - const statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - return statusData.data; - } - else { - throw new Error("Crawl job completed but no data was returned"); - } - } - else if (["active", "paused", "pending", "queued"].includes(statusData.status)) { - if (checkInterval < 2) { - checkInterval = 2; - } - yield new Promise((resolve) => setTimeout(resolve, checkInterval * 1000)); // Wait for the specified timeout before checking again - } - else { - throw new Error(`Crawl job failed or was stopped. Status: ${statusData.status}`); - } - } - else { - this.handleError(statusResponse, "check crawl status"); - } - } - }); - } - /** - * Handles errors from API responses. - * @param {AxiosResponse} response - The response from the API. - * @param {string} action - The action being performed when the error occurred. - */ - handleError(response, action) { - if ([402, 408, 409, 500].includes(response.status)) { - const errorMessage = response.data.error || "Unknown error occurred"; - throw new Error(`Failed to ${action}. Status code: ${response.status}. Error: ${errorMessage}`); - } - else { - throw new Error(`Unexpected error occurred while trying to ${action}. Status code: ${response.status}`); - } - } -} diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index e6a398e4..f50e7a4e 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -2,11 +2,15 @@ "name": "@mendable/firecrawl-js", "version": "0.0.34", "description": "JavaScript SDK for Firecrawl API", - "main": "build/index.js", + "main": "build/cjs/index.js", "types": "types/index.d.ts", "type": "module", + "exports": { + "require": "./build/cjs/index.js", + "import": "./build/esm/index.js" + }, "scripts": { - "build": "tsc", + "build": "tsc --module commonjs --moduleResolution node10 --outDir build/cjs/ && echo '{\"type\": \"commonjs\"}' > build/cjs/package.json && npx tsc --module NodeNext --moduleResolution NodeNext --outDir build/esm/ && echo '{\"type\": \"module\"}' > build/esm/package.json", "build-and-publish": "npm run build && npm publish --access public", "publish-beta": "npm run build && npm publish --access public --tag beta", "test": "NODE_OPTIONS=--experimental-vm-modules jest --verbose src/__tests__/**/*.test.ts" diff --git a/apps/js-sdk/package-lock.json b/apps/js-sdk/package-lock.json index 2bf3f001..ca337062 100644 --- a/apps/js-sdk/package-lock.json +++ b/apps/js-sdk/package-lock.json @@ -13,6 +13,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { @@ -450,6 +451,15 @@ "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.4.tgz", "integrity": "sha512-vxhUy4J8lyeyinH7Azl1pdd43GJhZH/tP2weN8TntQblOY+A0XbT8DJk1/oCPuOOyg/Ja757rG0CgHcWC8OfMA==" }, + "node_modules/@types/node": { + "version": "20.14.11", + "resolved": "https://registry.npmjs.org/@types/node/-/node-20.14.11.tgz", + "integrity": "sha512-kprQpL8MMeszbz6ojB5/tU8PLN4kesnN8Gjzw349rDlNgsSzg90lAVj3llK99Dh7JON+t9AuscPPFW6mPbTnSA==", + "peer": true, + "dependencies": { 
+ "undici-types": "~5.26.4" + } + }, "node_modules/acorn": { "version": "8.11.3", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.11.3.tgz", @@ -728,6 +738,24 @@ "node": ">=14.17" } }, + "node_modules/undici-types": { + "version": "5.26.5", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-5.26.5.tgz", + "integrity": "sha512-JlCMO+ehdEIKqlFxk6IfVoAUVmgz7cU7zD/h9XZ0qzeosSHmUJVOzSQvvYSYWXkFXC+IfLKSIffhv0sVZup6pA==", + "peer": true + }, + "node_modules/uuid": { + "version": "10.0.0", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-10.0.0.tgz", + "integrity": "sha512-8XkAphELsDnEGrDxUOHB3RGvXz6TeuYSGEZBOjtTtPm2lwhGBjLgOzLHB63IUWfBpNucQjND6d3AOudO+H3RWQ==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, "node_modules/v8-compile-cache-lib": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", @@ -750,9 +778,9 @@ } }, "node_modules/zod-to-json-schema": { - "version": "3.23.0", - "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.0.tgz", - "integrity": "sha512-az0uJ243PxsRIa2x1WmNE/pnuA05gUq/JB8Lwe1EDCCL/Fz9MgjYQ0fPlyc2Tcv6aF2ZA7WM5TWaRZVEFaAIag==", + "version": "3.23.1", + "resolved": "https://registry.npmjs.org/zod-to-json-schema/-/zod-to-json-schema-3.23.1.tgz", + "integrity": "sha512-oT9INvydob1XV0v1d2IadrR74rLtDInLvDFfAa1CG0Pmg/vxATk7I2gSelfj271mbzeM4Da0uuDQE/Nkj3DWNw==", "peerDependencies": { "zod": "^3.23.3" } diff --git a/apps/js-sdk/package.json b/apps/js-sdk/package.json index 0e93fe3c..2d2c36e8 100644 --- a/apps/js-sdk/package.json +++ b/apps/js-sdk/package.json @@ -15,6 +15,7 @@ "axios": "^1.6.8", "ts-node": "^10.9.2", "typescript": "^5.4.5", + "uuid": "^10.0.0", "zod": "^3.23.8" }, "devDependencies": { From b5ec47fd96b2d36a398186f96a09be601271f79d Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 13:53:04 +0200 Subject: [PATCH 26/33] fix(runWebScraper): don't fetch next job --- apps/api/src/main/runWebScraper.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 5e7d2279..76665aa2 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -131,13 +131,13 @@ const saveJob = async (job: Job, result: any) => { if (error) throw new Error(error.message); try { - await job.moveToCompleted(null); + await job.moveToCompleted(null, false, false); } catch (error) { // I think the job won't exist here anymore } } else { try { - await job.moveToCompleted(result); + await job.moveToCompleted(result, false, false); } catch (error) { // I think the job won't exist here anymore } From 2e2e80d679d067ccc65d18a7443fda3b3d512cdc Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:17:50 +0200 Subject: [PATCH 27/33] fix(scrape-events): updateScrapeResult fix --- apps/api/src/lib/scrape-events.ts | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index 8d677279..cd8cfa9a 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -59,6 +59,12 @@ export class ScrapeEvents { try { const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; + + if (!previousLog) { + Logger.warn("Previous log not found."); + return; + } + await supabase.from("scrape_events").update({ content: 
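// The guard added above exists because supabase-js's `.single()` resolves
// with `data: null` rather than throwing when no matching row is found, so
// reading `previousLog.content` here would otherwise throw a TypeError.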
{ ...previousLog.content, From 8216266d16564fa137ae3fb632114afec697162b Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:19:20 +0200 Subject: [PATCH 28/33] fix(scrape_log): display error properly --- apps/api/src/services/logging/scrape_log.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/services/logging/scrape_log.ts b/apps/api/src/services/logging/scrape_log.ts index 208159da..099e4a0b 100644 --- a/apps/api/src/services/logging/scrape_log.ts +++ b/apps/api/src/services/logging/scrape_log.ts @@ -44,9 +44,9 @@ export async function logScrape( ]); if (error) { - Logger.error(`Error logging proxy:\n${error}`); + Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`); } } catch (error) { - Logger.error(`Error logging proxy:\n${error}`); + Logger.error(`Error logging proxy:\n${JSON.stringify(error)}`); } } From 7bb922071cce3008529128b99f8a0004021f75f3 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:35:20 +0200 Subject: [PATCH 29/33] fix(queue-worker): manually renew lock (testing) --- apps/api/src/scraper/WebScraper/index.ts | 2 +- apps/api/src/services/queue-worker.ts | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index e667fa6b..c3834bcd 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -94,7 +94,7 @@ export class WebScraperDataProvider { const jobStatus = await job.getState(); if (jobStatus === "failed") { Logger.info( - "Job has failed or has been cancelled by the user. Stopping the job..." + "Job " + job.id + " has failed or has been cancelled by the user. Stopping the job..." ); return [] as Document[]; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index e7767809..e46ffc1a 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -22,6 +22,11 @@ const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { Logger.debug(`🐂 Worker taking job ${job.id}`); + const lockInterval = setInterval(() => { + Logger.debug(`🐂 Renewing lock for ${job.id}`); + job.extendLock(60000); + }, 15000); + try { job.progress({ current: 1, @@ -62,6 +67,7 @@ async function processJob(job: Job, done) { origin: job.data.origin, }); Logger.debug(`🐂 Job done ${job.id}`); + clearInterval(lockInterval); done(null, data); } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); @@ -108,8 +114,9 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); + clearInterval(lockInterval); done(null, data); - } + } } wsq.process( From 9df8719efa52f4465e5345a772b473d2a3d96264 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 14:56:04 +0200 Subject: [PATCH 30/33] fix(queue-worker): raise queue log level to info --- apps/api/src/services/queue-worker.ts | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index e46ffc1a..a82f12cd 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -20,10 +20,10 @@ if (process.env.ENV === 'production') { const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { - Logger.debug(`🐂 Worker taking job ${job.id}`); + Logger.info(`🐂 Worker taking job ${job.id}`); const lockInterval = setInterval(() => { - 
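// Why renew manually: Bull marks a job as stalled once its processing lock
// lapses (the default lock duration is 30s) and may hand it to another worker.
// Extending a 60s lock every 15s keeps long-running scrape jobs safely owned
// by this worker; the interval is cleared on both the success and error paths.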
Logger.debug(`🐂 Renewing lock for ${job.id}`); + Logger.info(`🐂 Renewing lock for ${job.id}`); job.extendLock(60000); }, 15000); @@ -66,7 +66,7 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); - Logger.debug(`🐂 Job done ${job.id}`); + Logger.info(`🐂 Job done ${job.id}`); clearInterval(lockInterval); done(null, data); } catch (error) { From cdf7bad5b4bc001c739096387f7e078fa043174b Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 15:20:56 +0200 Subject: [PATCH 31/33] fix(runWebScraper): don't move to completed --- apps/api/src/main/runWebScraper.ts | 2 +- apps/api/src/services/queue-worker.ts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index 76665aa2..ba22f28b 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -131,7 +131,7 @@ const saveJob = async (job: Job, result: any) => { if (error) throw new Error(error.message); try { - await job.moveToCompleted(null, false, false); + // await job.moveToCompleted(null, false, false); } catch (error) { // I think the job won't exist here anymore } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index a82f12cd..bddf9300 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -68,7 +68,7 @@ async function processJob(job: Job, done) { }); Logger.info(`🐂 Job done ${job.id}`); clearInterval(lockInterval); - done(null, data); + done(null, null); } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); if (await getWebScraperQueue().isPaused(false)) { From b7c01dcb9b325036a1ca768cf8bfdcfe5e93df27 Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 16:31:50 +0200 Subject: [PATCH 32/33] fix(webScraperQueue): reduce retries to 2 --- apps/api/src/services/queue-service.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/api/src/services/queue-service.ts b/apps/api/src/services/queue-service.ts index d531c2db..0cd65f32 100644 --- a/apps/api/src/services/queue-service.ts +++ b/apps/api/src/services/queue-service.ts @@ -14,7 +14,7 @@ export function getWebScraperQueue() { maxStalledCount: 10, }, defaultJobOptions:{ - attempts: 5 + attempts: 2 } }); Logger.info("Web scraper queue created"); From fe9fdb578b1ce2adbd5966b9df677c44e7c9b07e Mon Sep 17 00:00:00 2001 From: Gergo Moricz Date: Wed, 7 Aug 2024 16:34:25 +0200 Subject: [PATCH 33/33] revert bad hotfixes --- apps/api/src/lib/scrape-events.ts | 6 ------ apps/api/src/main/runWebScraper.ts | 2 +- apps/api/src/scraper/WebScraper/index.ts | 2 +- apps/api/src/services/queue-worker.ts | 11 ++--------- 4 files changed, 4 insertions(+), 17 deletions(-) diff --git a/apps/api/src/lib/scrape-events.ts b/apps/api/src/lib/scrape-events.ts index cd8cfa9a..8d677279 100644 --- a/apps/api/src/lib/scrape-events.ts +++ b/apps/api/src/lib/scrape-events.ts @@ -59,12 +59,6 @@ export class ScrapeEvents { try { const previousLog = (await supabase.from("scrape_events").select().eq("id", logId).single()).data as any; - - if (!previousLog) { - Logger.warn("Previous log not found."); - return; - } - await supabase.from("scrape_events").update({ content: { ...previousLog.content, diff --git a/apps/api/src/main/runWebScraper.ts b/apps/api/src/main/runWebScraper.ts index ba22f28b..76665aa2 100644 --- a/apps/api/src/main/runWebScraper.ts +++ b/apps/api/src/main/runWebScraper.ts @@ -131,7 +131,7 @@ 
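// Restoring this call re-applies the earlier "don't fetch next job" fix
// ([PATCH 26/33]): per that commit's subject, the extra boolean arguments
// keep Bull from pulling the next job as a side effect of completing this
// one — a reading based on the commit message, since the booleans' exact
// semantics vary across Bull versions.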
const saveJob = async (job: Job, result: any) => { if (error) throw new Error(error.message); try { - // await job.moveToCompleted(null, false, false); + await job.moveToCompleted(null, false, false); } catch (error) { // I think the job won't exist here anymore } diff --git a/apps/api/src/scraper/WebScraper/index.ts b/apps/api/src/scraper/WebScraper/index.ts index c3834bcd..e667fa6b 100644 --- a/apps/api/src/scraper/WebScraper/index.ts +++ b/apps/api/src/scraper/WebScraper/index.ts @@ -94,7 +94,7 @@ export class WebScraperDataProvider { const jobStatus = await job.getState(); if (jobStatus === "failed") { Logger.info( - "Job " + job.id + " has failed or has been cancelled by the user. Stopping the job..." + "Job has failed or has been cancelled by the user. Stopping the job..." ); return [] as Document[]; } diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts index bddf9300..cc92b3ab 100644 --- a/apps/api/src/services/queue-worker.ts +++ b/apps/api/src/services/queue-worker.ts @@ -22,11 +22,6 @@ const wsq = getWebScraperQueue(); async function processJob(job: Job, done) { Logger.info(`🐂 Worker taking job ${job.id}`); - const lockInterval = setInterval(() => { - Logger.info(`🐂 Renewing lock for ${job.id}`); - job.extendLock(60000); - }, 15000); - try { job.progress({ current: 1, @@ -67,8 +62,7 @@ async function processJob(job: Job, done) { origin: job.data.origin, }); Logger.info(`🐂 Job done ${job.id}`); - clearInterval(lockInterval); - done(null, null); + done(null, data); } catch (error) { Logger.error(`🐂 Job errored ${job.id} - ${error}`); if (await getWebScraperQueue().isPaused(false)) { @@ -114,9 +108,8 @@ async function processJob(job: Job, done) { pageOptions: job.data.pageOptions, origin: job.data.origin, }); - clearInterval(lockInterval); done(null, data); - } + } } wsq.process(