From a50dc106ef02b73b8cd39a35668ae4330f309146 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 24 Mar 2025 12:13:52 -0400
Subject: [PATCH] (feat/deep-research) Deep Research Alpha v1 - Structured Outputs + Customizability (#1365)

* Nick:

* Nick: structured output support

* Nick: support for zod and pydantic
---
 .../controllers/v1/deep-research-status.ts    |  1 +
 apps/api/src/controllers/v1/deep-research.ts  | 11 ++-
 .../lib/deep-research/deep-research-redis.ts  |  1 +
 .../deep-research/deep-research-service.ts    | 89 ++++++++++++-------
 .../src/lib/deep-research/research-manager.ts | 59 +++++++-----
 apps/api/src/services/queue-worker.ts         |  3 +
 apps/js-sdk/firecrawl/src/index.ts            | 45 ++++++++--
 apps/python-sdk/firecrawl/firecrawl.py        |  9 +-
 8 files changed, 156 insertions(+), 62 deletions(-)

diff --git a/apps/api/src/controllers/v1/deep-research-status.ts b/apps/api/src/controllers/v1/deep-research-status.ts
index bae49d0a..b261e7be 100644
--- a/apps/api/src/controllers/v1/deep-research-status.ts
+++ b/apps/api/src/controllers/v1/deep-research-status.ts
@@ -37,6 +37,7 @@ export async function deepResearchStatusController(
       finalAnalysis: research.finalAnalysis,
       sources: research.sources,
       activities: research.activities,
+      json: research.json,
       // completedSteps: research.completedSteps,
       // totalSteps: research.totalExpectedSteps,
     },
diff --git a/apps/api/src/controllers/v1/deep-research.ts b/apps/api/src/controllers/v1/deep-research.ts
index 7e454c3d..3d52d19d 100644
--- a/apps/api/src/controllers/v1/deep-research.ts
+++ b/apps/api/src/controllers/v1/deep-research.ts
@@ -1,5 +1,5 @@
 import { Request, Response } from "express";
-import { RequestWithAuth } from "./types";
+import { extractOptions, RequestWithAuth } from "./types";
 import { getDeepResearchQueue } from "../../services/queue-service";
 import * as Sentry from "@sentry/node";
 import { saveDeepResearch } from "../../lib/deep-research/deep-research-redis";
@@ -11,10 +11,19 @@ export const deepResearchRequestSchema = z.object({
   maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
   timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
   analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
+  systemPrompt: z.string().describe('The system prompt to use for the research agent').optional(),
+  formats: z.array(z.enum(['markdown', 'json'])).default(['markdown']),
   // @deprecated Use query instead
   topic: z.string().describe('The topic or question to research').optional(),
+  jsonOptions: extractOptions.optional(),
 }).refine(data => data.query || data.topic, {
   message: "Either query or topic must be provided"
+}).refine((obj) => {
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions);
+}, {
+  message: "When 'json' format is specified, jsonOptions must be provided, and vice versa"
 }).transform(data => ({
   ...data,
   query: data.topic || data.query // Use topic as query if provided
diff --git a/apps/api/src/lib/deep-research/deep-research-redis.ts b/apps/api/src/lib/deep-research/deep-research-redis.ts
index 8b1d9c7a..acefaacc 100644
--- a/apps/api/src/lib/deep-research/deep-research-redis.ts
+++ b/apps/api/src/lib/deep-research/deep-research-redis.ts
@@ -45,6 +45,7 @@ export type StoredDeepResearch = {
   activities: DeepResearchActivity[];
   summaries: string[];
   finalAnalysis?: string;
+  json?: any;
 };
 
 // TTL of 6 hours
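A quick sketch of how the two refinements above interact (not part of the patch; the payload values are invented, and the jsonOptions fields assume the extractOptions schema from ./types accepts a plain JSON Schema object under `schema`):

    // Accepted: "json" is requested and jsonOptions is provided.
    const parsed = deepResearchRequestSchema.parse({
      query: "State of solid-state battery research",
      formats: ["markdown", "json"],
      systemPrompt: "You are a battery industry analyst.",
      jsonOptions: {
        schema: {
          type: "object",
          properties: {
            keyFindings: { type: "array", items: { type: "string" } },
          },
        },
      },
    });

    // Rejected by the second refine: "json" requested without jsonOptions.
    deepResearchRequestSchema.parse({
      query: "State of solid-state battery research",
      formats: ["json"],
    });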
diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts
index 8a404d10..590014a8 100644
--- a/apps/api/src/lib/deep-research/deep-research-service.ts
+++ b/apps/api/src/lib/deep-research/deep-research-service.ts
@@ -5,6 +5,7 @@ import { searchAndScrapeSearchResult } from "../../controllers/v1/search";
 import { ResearchLLMService, ResearchStateManager } from "./research-manager";
 import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
+import { ExtractOptions } from "../../controllers/v1/types";
 
 interface DeepResearchServiceOptions {
   researchId: string;
@@ -15,6 +16,9 @@
   maxUrls: number;
   timeLimit: number;
   analysisPrompt: string;
+  systemPrompt: string;
+  formats: string[];
+  jsonOptions: ExtractOptions;
   subId?: string;
 }
 
@@ -54,13 +58,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       await state.incrementDepth();
 
       // Search phase
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "processing",
         message: `Generating deeper search queries for "${currentTopic}"`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       const nextSearchTopic = state.getNextSearchTopic();
       logger.debug("[Deep Research] Next search topic:", { nextSearchTopic });
@@ -74,23 +78,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       logger.debug("[Deep Research] Generated search queries:", { searchQueries });
 
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "processing",
         message: `Starting ${searchQueries.length} parallel searches for "${currentTopic}"`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
+      await state.addActivity(searchQueries.map(searchQuery => ({
+        type: "search",
+        status: "processing",
+        message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
+        timestamp: new Date().toISOString(),
+        depth: state.getCurrentDepth(),
+      })))
 
       // Run all searches in parallel
       const searchPromises = searchQueries.map(async (searchQuery) => {
-        await state.addActivity({
-          type: "search",
-          status: "processing",
-          message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
-          timestamp: new Date().toISOString(),
-          depth: state.getCurrentDepth(),
-        });
 
         const response = await searchAndScrapeSearchResult(searchQuery.query, {
           teamId: options.teamId,
@@ -126,13 +130,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           "[Deep Research] No results found for topic:",
           { currentTopic },
         );
-        await state.addActivity({
+        await state.addActivity([{
           type: "search",
           status: "error",
           message: `No results found for any queries about "${currentTopic}"`,
           timestamp: new Date().toISOString(),
           depth: state.getCurrentDepth(),
-        });
+        }]);
         continue;
       }
@@ -163,23 +167,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           "[Deep Research] No new unique results found for topic:",
           { currentTopic },
         );
-        await state.addActivity({
+        await state.addActivity([{
           type: "search",
           status: "error",
           message: `Found ${searchResults.length} results but all URLs were already processed for "${currentTopic}"`,
           timestamp: new Date().toISOString(),
           depth: state.getCurrentDepth(),
-        });
+        }]);
         continue;
       }
 
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "complete",
         message: `Found ${newSearchResults.length} new relevant results across ${searchQueries.length} parallel queries`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       await state.addFindings(
         newSearchResults.map((result) => ({
@@ -189,13 +193,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       );
 
       // Analysis phase
-      await state.addActivity({
+      await state.addActivity([{
         type: "analyze",
         status: "processing",
         message: "Analyzing findings and planning next steps",
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       const timeRemaining = timeLimit * 1000 - (Date.now() - startTime);
       logger.debug("[Deep Research] Time remaining (ms):", { timeRemaining });
@@ -204,17 +208,18 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
         state.getFindings(),
         currentTopic,
         timeRemaining,
+        options.systemPrompt ?? "",
       );
 
       if (!analysis) {
         logger.debug("[Deep Research] Analysis failed");
-        await state.addActivity({
+        await state.addActivity([{
           type: "analyze",
           status: "error",
           message: "Failed to analyze findings",
           timestamp: new Date().toISOString(),
           depth: state.getCurrentDepth(),
-        });
+        }]);
 
         state.incrementFailedAttempts();
         if (state.hasReachedMaxFailedAttempts()) {
@@ -232,13 +237,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
 
       state.setNextSearchTopic(analysis.nextSearchTopic || "");
 
-      await state.addActivity({
+      await state.addActivity([{
         type: "analyze",
         status: "complete",
         message: "Analyzed findings",
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       if (!analysis.shouldContinue || analysis.gaps.length === 0) {
         logger.debug("[Deep Research] No more gaps to research, ending search");
@@ -251,28 +256,42 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
 
     // Final synthesis
     logger.debug("[Deep Research] Starting final synthesis");
-    await state.addActivity({
+    await state.addActivity([{
       type: "synthesis",
       status: "processing",
       message: "Preparing final analysis",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
 
-    const finalAnalysis = await llmService.generateFinalAnalysis(
-      options.query,
-      state.getFindings(),
-      state.getSummaries(),
-      options.analysisPrompt,
-    );
+    let finalAnalysis = "";
+    let finalAnalysisJson = null;
+    if(options.formats.includes('json')) {
+      finalAnalysisJson = await llmService.generateFinalAnalysis(
+        options.query,
+        state.getFindings(),
+        state.getSummaries(),
+        options.analysisPrompt,
+        options.formats,
+        options.jsonOptions,
+      );
+    }
+    if(options.formats.includes('markdown')) {
+      finalAnalysis = await llmService.generateFinalAnalysis(
+        options.query,
+        state.getFindings(),
+        state.getSummaries(),
+        options.analysisPrompt,
+      );
+    }
 
-    await state.addActivity({
+    await state.addActivity([{
       type: "synthesis",
       status: "complete",
       message: "Research completed",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
 
     const progress = state.getProgress();
     logger.debug("[Deep Research] Research completed successfully");
@@ -283,7 +302,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       success: true,
       message: "Research completed",
       num_docs: 1,
-      docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
+      docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources(), json: finalAnalysisJson }],
       time_taken: (Date.now() - startTime) / 1000,
       team_id: teamId,
       mode: "deep-research",
@@ -296,6 +315,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     await updateDeepResearch(researchId, {
       status: "completed",
       finalAnalysis: finalAnalysis,
+      json: finalAnalysisJson,
     });
     // Bill team for usage based on URLs analyzed
     billTeam(teamId, subId, Math.min(urlsAnalyzed, options.maxUrls), logger).catch(
@@ -310,6 +330,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       data: {
         finalAnalysis: finalAnalysis,
         sources: state.getSources(),
+        json: finalAnalysisJson,
       },
     };
   } catch (error: any) {
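Together with the status controller change above, a completed job now carries the structured result alongside the markdown report. A rough sketch of the data a client sees when both formats were requested — field names come from this patch, while the endpoint path and the per-source shape are assumptions:

    // Hypothetical response data for GET /v1/deep-research/:id once status is "completed"
    interface DeepResearchStatusData {
      finalAnalysis: string;   // markdown report; empty string when "markdown" was not requested
      json: any | null;        // structured output; null unless "json" was requested
      sources: Array<{ url: string; title?: string }>; // element shape assumed, not defined in this patch
      activities: Array<{
        type: "search" | "analyze" | "synthesis";
        status: "processing" | "complete" | "error";
        message: string;
        timestamp: string;
        depth: number;
      }>;
    }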
diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts
index d5f4fdd9..70e8067b 100644
--- a/apps/api/src/lib/deep-research/research-manager.ts
+++ b/apps/api/src/lib/deep-research/research-manager.ts
@@ -6,7 +6,7 @@ import {
   updateDeepResearch,
 } from "./deep-research-redis";
 import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
-
+import { ExtractOptions } from "../../controllers/v1/types";
 interface AnalysisResult {
   gaps: string[];
   nextSteps: string[];
@@ -50,13 +50,13 @@ export class ResearchStateManager {
     return this.seenUrls;
   }
 
-  async addActivity(activity: DeepResearchActivity): Promise<void> {
-    if (activity.status === "complete") {
+  async addActivity(activities: DeepResearchActivity[]): Promise<void> {
+    if (activities.some(activity => activity.status === "complete")) {
       this.completedSteps++;
     }
 
     await updateDeepResearch(this.researchId, {
-      activities: [activity],
+      activities: activities,
       completedSteps: this.completedSteps,
     });
   }
@@ -199,6 +199,7 @@ export class ResearchLLMService {
     findings: DeepResearchFinding[],
     currentTopic: string,
     timeRemaining: number,
+    systemPrompt: string,
   ): Promise<AnalysisResult | null> {
     try {
       const timeRemainingMinutes =
@@ -211,6 +212,7 @@ export class ResearchLLMService {
       options: {
         mode: "llm",
         systemPrompt:
+          systemPrompt +
           "You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " +
           new Date().toISOString().split("T")[0],
         schema: {
@@ -254,33 +256,48 @@ export class ResearchLLMService {
     findings: DeepResearchFinding[],
     summaries: string[],
     analysisPrompt: string,
-  ): Promise<string> {
+    formats?: string[],
+    jsonOptions?: ExtractOptions,
+  ): Promise<any> {
+    if(!formats) {
+      formats = ['markdown'];
+    }
+    if(!jsonOptions) {
+      jsonOptions = undefined;
+    }
+
     const { extract } = await generateCompletions({
       logger: this.logger.child({
         method: "generateFinalAnalysis",
       }),
-      mode: "no-object",
+      mode: formats.includes('json') ? 'object' : 'no-object',
      options: {
         mode: "llm",
-        systemPrompt:
-          "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
-          new Date().toISOString().split("T")[0],
+        ...(formats.includes('json') && {
+          ...jsonOptions
+        }),
+        systemPrompt: formats.includes('json')
+          ? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
+          : "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
+            new Date().toISOString().split("T")[0],
         prompt: trimToTokenLimit(
           analysisPrompt
             ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
+            : formats.includes('json')
+              ? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
             : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
 
-          Research data:
-          ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
-
-          Requirements:
-          - Format the report in Markdown with proper headers and sections
-          - Include specific citations to sources where appropriate
-          - Provide detailed analysis in each section
-          - Make it comprehensive and thorough (aim for 4+ pages worth of content)
-          - Include all relevant findings and insights from the research
-          - Cite sources
-          - Use bullet points and lists where appropriate for readability`,
+            Research data:
+            ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
+
+            Requirements:
+            - Format the report in Markdown with proper headers and sections
+            - Include specific citations to sources where appropriate
+            - Provide detailed analysis in each section
+            - Make it comprehensive and thorough (aim for 4+ pages worth of content)
+            - Include all relevant findings and insights from the research
+            - Cite sources
+            - Use bullet points and lists where appropriate for readability`,
           100000,
         ).text,
       },
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 66931fce..b6cfc454 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -413,6 +413,9 @@ const processDeepResearchJobInternal = async (
       subId: job.data.subId,
       maxUrls: job.data.request.maxUrls,
       analysisPrompt: job.data.request.analysisPrompt,
+      systemPrompt: job.data.request.systemPrompt,
+      formats: job.data.request.formats,
+      jsonOptions: job.data.request.jsonOptions,
     });
 
     if(result.success) {
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index de4ead7f..5cc9d119 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -356,7 +356,7 @@ export interface CrawlErrorsResponse {
  * Parameters for deep research operations.
  * Defines options for conducting deep research on a query.
  */
-export interface DeepResearchParams {
+export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
   /**
    * Maximum depth of research iterations (1-10)
    * @default 7
@@ -377,9 +377,25 @@
    */
   analysisPrompt?: string;
   /**
+   * The system prompt to use for the research agent
+   */
+  systemPrompt?: string;
+  /**
+   * The formats to use for the final analysis
+   */
+  formats?: ("markdown" | "json")[];
+  /**
+   * The JSON options to use for the final analysis
+   */
+  jsonOptions?:{
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  };
+  /**
    * Experimental flag for streaming steps
    */
-  __experimental_streamSteps?: boolean;
+  // __experimental_streamSteps?: boolean;
 }
 
 /**
@@ -1420,7 +1436,7 @@ export default class FirecrawlApp {
    */
   async deepResearch(
     query: string,
-    params: DeepResearchParams,
+    params: DeepResearchParams<any>,
     onActivity?: (activity: {
       type: string;
       status: string;
@@ -1505,12 +1521,31 @@ export default class FirecrawlApp {
   * @param params - Parameters for the deep research operation.
   * @returns The response containing the research job ID.
   */
-  async asyncDeepResearch(query: string, params: DeepResearchParams): Promise {
+  async asyncDeepResearch(query: string, params: DeepResearchParams<any>): Promise {
     const headers = this.prepareHeaders();
+    let jsonData: any = { query, ...params };
+
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
+
     try {
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/deep-research`,
-        { query, ...params },
+        jsonData,
        headers
       );
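With the schema handling added to asyncDeepResearch, callers can pass either a plain JSON Schema or a Zod schema; zodToJsonSchema converts the latter before the request is sent. A usage sketch under that assumption — the package name, API key, and schema fields are illustrative, and the exact response shape is only hinted at by the doc comment above:

    import FirecrawlApp from "@mendable/firecrawl-js";
    import { z } from "zod";

    const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

    // Kick off a research job that returns both a markdown report and structured JSON.
    const job = await app.asyncDeepResearch("State of solid-state battery research", {
      maxDepth: 5,
      timeLimit: 300,
      maxUrls: 20,
      formats: ["markdown", "json"],
      systemPrompt: "You are a battery industry analyst.",
      jsonOptions: {
        // A Zod schema; the SDK converts it with zodToJsonSchema before POSTing.
        schema: z.object({
          keyFindings: z.array(z.string()),
          notableCompanies: z.array(z.string()),
        }),
      },
    });

    // `job` carries the research job ID (per the doc comment above); poll the
    // deep-research status endpoint with it to retrieve finalAnalysis and json.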
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 436f8764..d8d22421 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -49,6 +49,7 @@ class DeepResearchParams(pydantic.BaseModel):
     timeLimit: Optional[int] = 270
     maxUrls: Optional[int] = 20
     analysisPrompt: Optional[str] = None
+    systemPrompt: Optional[str] = None
     __experimental_streamSteps: Optional[bool] = None
 
 class DeepResearchResponse(pydantic.BaseModel):
@@ -1171,7 +1172,6 @@ class FirecrawlApp:
                 time.sleep(2)  # Polling interval
 
         return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
-
     def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
         """
         Initiates an asynchronous deep research operation.
@@ -1195,8 +1195,15 @@ class FirecrawlApp:
             research_params = params
 
         headers = self._prepare_headers()
+
         json_data = {'query': query, **research_params.dict(exclude_none=True)}
+        # Handle json options schema if present
+        if 'jsonOptions' in json_data:
+            json_opts = json_data['jsonOptions']
+            if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
+                json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
+
         try:
             response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
             if response.status_code == 200: