From a2881e92889643d8b50c587a5cc83e4c8fedf966 Mon Sep 17 00:00:00 2001 From: Nicolas Date: Fri, 30 Aug 2024 13:43:19 -0300 Subject: [PATCH] Nick: llm extract support on node sdk --- apps/js-sdk/firecrawl/build/cjs/index.js | 16 ++++++- apps/js-sdk/firecrawl/build/esm/index.js | 16 ++++++- apps/js-sdk/firecrawl/src/index.ts | 53 ++++++++++++++---------- apps/js-sdk/firecrawl/types/index.d.ts | 8 +++- 4 files changed, 69 insertions(+), 24 deletions(-) diff --git a/apps/js-sdk/firecrawl/build/cjs/index.js b/apps/js-sdk/firecrawl/build/cjs/index.js index c6e93e00..4f7c531f 100644 --- a/apps/js-sdk/firecrawl/build/cjs/index.js +++ b/apps/js-sdk/firecrawl/build/cjs/index.js @@ -35,7 +35,7 @@ class FirecrawlApp { Authorization: `Bearer ${this.apiKey}`, }; let jsonData = { url, ...params }; - if (jsonData?.extractorOptions?.extractionSchema) { + if (this.version === 'v0' && jsonData?.extractorOptions?.extractionSchema) { let schema = jsonData.extractorOptions.extractionSchema; // Check if schema is an instance of ZodSchema to correctly identify Zod schemas if (schema instanceof zod_1.z.ZodSchema) { @@ -50,6 +50,20 @@ class FirecrawlApp { }, }; } + else if (this.version === 'v1' && jsonData?.extract?.schema) { + let schema = jsonData.extract.schema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof zod_1.z.ZodSchema) { + schema = (0, zod_to_json_schema_1.zodToJsonSchema)(schema); + } + jsonData = { + ...jsonData, + extract: { + ...jsonData.extract, + schema: schema, + }, + }; + } try { const response = await axios_1.default.post(this.apiUrl + `/${this.version}/scrape`, jsonData, { headers }); if (response.status === 200) { diff --git a/apps/js-sdk/firecrawl/build/esm/index.js b/apps/js-sdk/firecrawl/build/esm/index.js index 3491a673..a9aeb92d 100644 --- a/apps/js-sdk/firecrawl/build/esm/index.js +++ b/apps/js-sdk/firecrawl/build/esm/index.js @@ -30,7 +30,7 @@ export default class FirecrawlApp { Authorization: `Bearer ${this.apiKey}`, }; let jsonData = { url, ...params }; - if (jsonData?.extractorOptions?.extractionSchema) { + if (this.version === 'v0' && jsonData?.extractorOptions?.extractionSchema) { let schema = jsonData.extractorOptions.extractionSchema; // Check if schema is an instance of ZodSchema to correctly identify Zod schemas if (schema instanceof z.ZodSchema) { @@ -45,6 +45,20 @@ export default class FirecrawlApp { }, }; } + else if (this.version === 'v1' && jsonData?.extract?.schema) { + let schema = jsonData.extract.schema; + // Check if schema is an instance of ZodSchema to correctly identify Zod schemas + if (schema instanceof z.ZodSchema) { + schema = zodToJsonSchema(schema); + } + jsonData = { + ...jsonData, + extract: { + ...jsonData.extract, + schema: schema, + }, + }; + } try { const response = await axios.post(this.apiUrl + `/${this.version}/scrape`, jsonData, { headers }); if (response.status === 200) { diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts index a5b3af2f..3cdd2a99 100644 --- a/apps/js-sdk/firecrawl/src/index.ts +++ b/apps/js-sdk/firecrawl/src/index.ts @@ -106,7 +106,7 @@ export interface FirecrawlDocumentV0 { * Defines the options and configurations available for scraping web content. */ export interface ScrapeParams { - formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot")[]; + formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract")[]; headers?: Record; includeTags?: string[]; excludeTags?: string[]; @@ -114,6 +114,11 @@ export interface ScrapeParams { screenshotMode?: "desktop" | "full-desktop" | "mobile" | "full-mobile"; waitFor?: number; timeout?: number; + extract?: { + prompt?: string; + schema?: z.ZodSchema | any; + systemPrompt?: string; + }; } /** @@ -345,30 +350,36 @@ export default class FirecrawlApp { Authorization: `Bearer ${this.apiKey}`, } as AxiosRequestHeaders; let jsonData: any = { url, ...params }; - if (jsonData?.extractorOptions?.extractionSchema || jsonData?.extract?.schema) { - let schema = jsonData.extractorOptions?.extractionSchema || jsonData.extract?.schema; + if (this.version === 'v0' && jsonData?.extractorOptions?.extractionSchema) { + let schema = jsonData.extractorOptions.extractionSchema; // Check if schema is an instance of ZodSchema to correctly identify Zod schemas - if (schema instanceof z.ZodSchema) { + if (schema instanceof z.ZodSchema || schema instanceof z.ZodObject) { schema = zodToJsonSchema(schema); } - if(this.version === 'v0') { - jsonData = { - ...jsonData, - extractorOptions: { - ...jsonData.extractorOptions, - extractionSchema: schema, - mode: jsonData.extractorOptions.mode || "llm-extraction", - }, - }; - } else { - jsonData = { - ...jsonData, - extract: { - ...jsonData.extract, - schema: schema, - }, - }; + jsonData = { + ...jsonData, + extractorOptions: { + ...jsonData.extractorOptions, + extractionSchema: schema, + mode: jsonData.extractorOptions.mode || "llm-extraction", + }, + }; + } else if (this.version === 'v1' && jsonData?.extract?.schema) { + let schema = jsonData.extract.schema; + + // Try parsing the schema as a Zod schema + try { + schema = zodToJsonSchema(schema); + } catch (error) { + } + jsonData = { + ...jsonData, + extract: { + ...jsonData.extract, + schema: schema, + }, + }; } try { const response: AxiosResponse = await axios.post( diff --git a/apps/js-sdk/firecrawl/types/index.d.ts b/apps/js-sdk/firecrawl/types/index.d.ts index 3ca10744..2552a1b0 100644 --- a/apps/js-sdk/firecrawl/types/index.d.ts +++ b/apps/js-sdk/firecrawl/types/index.d.ts @@ -69,6 +69,7 @@ export interface FirecrawlDocument { html?: string; rawHtml?: string; links?: string[]; + extract?: Record; screenshot?: string; metadata: FirecrawlDocumentMetadata; } @@ -97,7 +98,7 @@ export interface FirecrawlDocumentV0 { * Defines the options and configurations available for scraping web content. */ export interface ScrapeParams { - formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot")[]; + formats: ("markdown" | "html" | "rawHtml" | "content" | "links" | "screenshot" | "extract")[]; headers?: Record; includeTags?: string[]; excludeTags?: string[]; @@ -105,6 +106,11 @@ export interface ScrapeParams { screenshotMode?: "desktop" | "full-desktop" | "mobile" | "full-mobile"; waitFor?: number; timeout?: number; + extract?: { + prompt?: string; + schema?: z.ZodSchema | any; + systemPrompt?: string; + }; } /** * Parameters for scraping operations on v0.