From e19f7a102ec3db5ff304045a522c7be65c887a93 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gerg=C5=91=20M=C3=B3ricz?= Date: Tue, 10 Sep 2024 19:29:38 +0200 Subject: [PATCH] feat(js-sdk): paginate next on checkCrawlStatus + better types for CSR --- apps/js-sdk/firecrawl/package.json | 2 +- apps/js-sdk/firecrawl/src/index.ts | 62 ++++++++++++++++++------------ 2 files changed, 38 insertions(+), 26 deletions(-) diff --git a/apps/js-sdk/firecrawl/package.json b/apps/js-sdk/firecrawl/package.json index 7114a625..75ebe390 100644 --- a/apps/js-sdk/firecrawl/package.json +++ b/apps/js-sdk/firecrawl/package.json @@ -1,6 +1,6 @@ { "name": "@mendable/firecrawl-js", - "version": "1.2.2", + "version": "1.2.3", "description": "JavaScript SDK for Firecrawl API", "main": "build/cjs/index.js", "types": "types/index.d.ts", diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index 8b16adfb..55c5be0b 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -131,15 +131,14 @@ export interface CrawlResponse { */ export interface CrawlStatusResponse { success: true; - total: number; + status: "scraping" | "completed" | "failed" | "cancelled"; completed: number; + total: number; creditsUsed: number; expiresAt: Date; - status: "scraping" | "completed" | "failed"; - next: string; - data?: FirecrawlDocument[]; - error?: string; -} + next?: string; + data: FirecrawlDocument[]; +}; /** * Parameters for mapping operations. @@ -329,9 +328,10 @@ export default class FirecrawlApp { /** * Checks the status of a crawl job using the Firecrawl API. * @param id - The ID of the crawl operation. + * @param getAllData - Paginate through all the pages of documents, returning the full list of all documents. (default: `false`) * @returns The response containing the job status. */ - async checkCrawlStatus(id?: string): Promise { + async checkCrawlStatus(id?: string, getAllData = false): Promise { if (!id) { throw new Error("No crawl ID provided"); } @@ -342,17 +342,29 @@ export default class FirecrawlApp { `${this.apiUrl}/v1/crawl/${id}`, headers ); - if (response.status === 200) { + if (response.status === 200 && getAllData) { + let allData = response.data.data; + if (response.data.status === "completed") { + let statusData = response.data + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusData = (await this.getRequest(statusData.next, headers)).data; + data = data.concat(statusData.data); + } + allData = data; + } + } return ({ - success: true, + success: response.data.success, status: response.data.status, total: response.data.total, completed: response.data.completed, creditsUsed: response.data.creditsUsed, expiresAt: new Date(response.data.expiresAt), next: response.data.next, - data: response.data.data, - error: response.data.error + data: allData, + error: response.data.error, }) } else { this.handleError(response, "check crawl status"); @@ -452,7 +464,7 @@ export default class FirecrawlApp { id: string, headers: AxiosRequestHeaders, checkInterval: number - ): Promise { + ): Promise { while (true) { let statusResponse: AxiosResponse = await this.getRequest( `${this.apiUrl}/v1/crawl/${id}`, @@ -460,20 +472,20 @@ export default class FirecrawlApp { ); if (statusResponse.status === 200) { let statusData = statusResponse.data; - if (statusData.status === "completed") { - if ("data" in statusData) { - let data = statusData.data; - while ('next' in statusData) { - statusResponse = await this.getRequest(statusData.next, headers); - statusData = statusResponse.data; - data = data.concat(statusData.data); + if (statusData.status === "completed") { + if ("data" in statusData) { + let data = statusData.data; + while ('next' in statusData) { + statusResponse = await this.getRequest(statusData.next, headers); + statusData = statusResponse.data; + data = data.concat(statusData.data); + } + statusData.data = data; + return statusData; + } else { + throw new Error("Crawl job completed but no data was returned"); } - statusData.data = data; - return statusData; - } else { - throw new Error("Crawl job completed but no data was returned"); - } - } else if ( + } else if ( ["active", "paused", "pending", "queued", "waiting", "scraping"].includes(statusData.status) ) { checkInterval = Math.max(checkInterval, 2);