(feat/deep-research) Deep Research Alpha v1 - Structured Outputs + Customizability (#1365)

* Nick:

* Nick: structured output support

* Nick: support for zod and pydantic
Nicolas 2025-03-24 12:13:52 -04:00 committed by GitHub
parent 3ee58f7a9e
commit a50dc106ef
8 changed files with 156 additions and 62 deletions

View File

@ -37,6 +37,7 @@ export async function deepResearchStatusController(
finalAnalysis: research.finalAnalysis,
sources: research.sources,
activities: research.activities,
json: research.json,
// completedSteps: research.completedSteps,
// totalSteps: research.totalExpectedSteps,
},
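
For orientation, a minimal sketch of the data block the status endpoint now returns for a completed structured-output job. Only the json field is new in this hunk; the other fields mirror the ones already returned above, and the values and the shapes of sources/activities are illustrative assumptions.

// Hypothetical completed-status data; field values are made up.
const exampleStatusData = {
  finalAnalysis: "# Research Report\n\n...",
  sources: [{ url: "https://example.com/article", title: "Example source" }],
  activities: [{ type: "synthesis", status: "complete", message: "Research completed" }],
  json: { summary: "...", keyFindings: ["..."] }, // populated only when "json" was requested
};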

View File

@ -1,5 +1,5 @@
import { Request, Response } from "express";
import { RequestWithAuth } from "./types";
import { extractOptions, RequestWithAuth } from "./types";
import { getDeepResearchQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node";
import { saveDeepResearch } from "../../lib/deep-research/deep-research-redis";
@ -11,10 +11,19 @@ export const deepResearchRequestSchema = z.object({
maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
systemPrompt: z.string().describe('The system prompt to use for the research agent').optional(),
formats: z.array(z.enum(['markdown', 'json'])).default(['markdown']),
// @deprecated Use query instead
topic: z.string().describe('The topic or question to research').optional(),
jsonOptions: extractOptions.optional(),
}).refine(data => data.query || data.topic, {
message: "Either query or topic must be provided"
}).refine((obj) => {
const hasJsonFormat = obj.formats?.includes("json");
const hasJsonOptions = obj.jsonOptions !== undefined;
return (hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions);
}, {
message: "When 'json' format is specified, jsonOptions must be provided, and vice versa"
}).transform(data => ({
...data,
query: data.topic || data.query // Use topic as query if provided
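
To make the paired refinement concrete, here is a sketch of a request body the schema would accept: a query is present, and because "json" appears in formats, jsonOptions is supplied as well. Sending "json" in formats without jsonOptions (or the reverse) fails with the message above. The schema contents are illustrative; jsonOptions follows the prompt/schema shape used by the SDK changes later in this commit.

// Illustrative POST /v1/deep-research body; all values are made up.
const exampleRequestBody = {
  query: "Adoption of WebGPU in production web apps",
  maxUrls: 20,
  timeLimit: 300,
  formats: ["markdown", "json"],
  jsonOptions: {
    prompt: "Extract the main adopters and their use cases",
    schema: {
      type: "object",
      properties: {
        adopters: { type: "array", items: { type: "string" } },
        summary: { type: "string" },
      },
    },
  },
};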

View File

@ -45,6 +45,7 @@ export type StoredDeepResearch = {
activities: DeepResearchActivity[];
summaries: string[];
finalAnalysis?: string;
json?: any;
};
// TTL of 6 hours

View File

@ -5,6 +5,7 @@ import { searchAndScrapeSearchResult } from "../../controllers/v1/search";
import { ResearchLLMService, ResearchStateManager } from "./research-manager";
import { logJob } from "../../services/logging/log_job";
import { billTeam } from "../../services/billing/credit_billing";
import { ExtractOptions } from "../../controllers/v1/types";
interface DeepResearchServiceOptions {
researchId: string;
@ -15,6 +16,9 @@ interface DeepResearchServiceOptions {
maxUrls: number;
timeLimit: number;
analysisPrompt: string;
systemPrompt: string;
formats: string[];
jsonOptions: ExtractOptions;
subId?: string;
}
@ -54,13 +58,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
await state.incrementDepth();
// Search phase
await state.addActivity({
await state.addActivity([{
type: "search",
status: "processing",
message: `Generating deeper search queries for "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const nextSearchTopic = state.getNextSearchTopic();
logger.debug("[Deep Research] Next search topic:", { nextSearchTopic });
@ -74,23 +78,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
logger.debug("[Deep Research] Generated search queries:", { searchQueries });
await state.addActivity({
await state.addActivity([{
type: "search",
status: "processing",
message: `Starting ${searchQueries.length} parallel searches for "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
await state.addActivity(searchQueries.map(searchQuery => ({
type: "search",
status: "processing",
message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
})))
// Run all searches in parallel
const searchPromises = searchQueries.map(async (searchQuery) => {
await state.addActivity({
type: "search",
status: "processing",
message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
const response = await searchAndScrapeSearchResult(searchQuery.query, {
teamId: options.teamId,
@ -126,13 +130,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
"[Deep Research] No results found for topic:",
{ currentTopic },
);
await state.addActivity({
await state.addActivity([{
type: "search",
status: "error",
message: `No results found for any queries about "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
continue;
}
@ -163,23 +167,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
"[Deep Research] No new unique results found for topic:",
{ currentTopic },
);
await state.addActivity({
await state.addActivity([{
type: "search",
status: "error",
message: `Found ${searchResults.length} results but all URLs were already processed for "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
continue;
}
await state.addActivity({
await state.addActivity([{
type: "search",
status: "complete",
message: `Found ${newSearchResults.length} new relevant results across ${searchQueries.length} parallel queries`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
await state.addFindings(
newSearchResults.map((result) => ({
@ -189,13 +193,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
);
// Analysis phase
await state.addActivity({
await state.addActivity([{
type: "analyze",
status: "processing",
message: "Analyzing findings and planning next steps",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const timeRemaining = timeLimit * 1000 - (Date.now() - startTime);
logger.debug("[Deep Research] Time remaining (ms):", { timeRemaining });
@ -204,17 +208,18 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
state.getFindings(),
currentTopic,
timeRemaining,
options.systemPrompt ?? "",
);
if (!analysis) {
logger.debug("[Deep Research] Analysis failed");
await state.addActivity({
await state.addActivity([{
type: "analyze",
status: "error",
message: "Failed to analyze findings",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
state.incrementFailedAttempts();
if (state.hasReachedMaxFailedAttempts()) {
@ -232,13 +237,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
state.setNextSearchTopic(analysis.nextSearchTopic || "");
await state.addActivity({
await state.addActivity([{
type: "analyze",
status: "complete",
message: "Analyzed findings",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
if (!analysis.shouldContinue || analysis.gaps.length === 0) {
logger.debug("[Deep Research] No more gaps to research, ending search");
@ -251,28 +256,42 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
// Final synthesis
logger.debug("[Deep Research] Starting final synthesis");
await state.addActivity({
await state.addActivity([{
type: "synthesis",
status: "processing",
message: "Preparing final analysis",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const finalAnalysis = await llmService.generateFinalAnalysis(
options.query,
state.getFindings(),
state.getSummaries(),
options.analysisPrompt,
);
let finalAnalysis = "";
let finalAnalysisJson = null;
if(options.formats.includes('json')) {
finalAnalysisJson = await llmService.generateFinalAnalysis(
options.query,
state.getFindings(),
state.getSummaries(),
options.analysisPrompt,
options.formats,
options.jsonOptions,
);
}
if(options.formats.includes('markdown')) {
finalAnalysis = await llmService.generateFinalAnalysis(
options.query,
state.getFindings(),
state.getSummaries(),
options.analysisPrompt,
);
}
await state.addActivity({
await state.addActivity([{
type: "synthesis",
status: "complete",
message: "Research completed",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const progress = state.getProgress();
logger.debug("[Deep Research] Research completed successfully");
@ -283,7 +302,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
success: true,
message: "Research completed",
num_docs: 1,
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources(), json: finalAnalysisJson }],
time_taken: (Date.now() - startTime) / 1000,
team_id: teamId,
mode: "deep-research",
@ -296,6 +315,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
await updateDeepResearch(researchId, {
status: "completed",
finalAnalysis: finalAnalysis,
json: finalAnalysisJson,
});
// Bill team for usage based on URLs analyzed
billTeam(teamId, subId, Math.min(urlsAnalyzed, options.maxUrls), logger).catch(
@ -310,6 +330,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
data: {
finalAnalysis: finalAnalysis,
sources: state.getSources(),
json: finalAnalysisJson,
},
};
} catch (error: any) {

View File

@ -6,7 +6,7 @@ import {
updateDeepResearch,
} from "./deep-research-redis";
import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
import { ExtractOptions } from "../../controllers/v1/types";
interface AnalysisResult {
gaps: string[];
nextSteps: string[];
@ -50,13 +50,13 @@ export class ResearchStateManager {
return this.seenUrls;
}
async addActivity(activity: DeepResearchActivity): Promise<void> {
if (activity.status === "complete") {
async addActivity(activities: DeepResearchActivity[]): Promise<void> {
if (activities.some(activity => activity.status === "complete")) {
this.completedSteps++;
}
await updateDeepResearch(this.researchId, {
activities: [activity],
activities: activities,
completedSteps: this.completedSteps,
});
}
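
With the new signature every call site passes an array, even for a single activity, and related updates can be batched into one write, as the service changes above do. A minimal sketch of the batched form, using the same activity fields seen throughout this diff:

// Batch several related activities into a single update.
await state.addActivity(searchQueries.map((q) => ({
  type: "search",
  status: "processing",
  message: `Searching for "${q.query}"`,
  timestamp: new Date().toISOString(),
  depth: state.getCurrentDepth(),
})));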
@ -199,6 +199,7 @@ export class ResearchLLMService {
findings: DeepResearchFinding[],
currentTopic: string,
timeRemaining: number,
systemPrompt: string,
): Promise<AnalysisResult | null> {
try {
const timeRemainingMinutes =
@ -211,6 +212,7 @@ export class ResearchLLMService {
options: {
mode: "llm",
systemPrompt:
systemPrompt +
"You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " +
new Date().toISOString().split("T")[0],
schema: {
@ -254,33 +256,48 @@ export class ResearchLLMService {
findings: DeepResearchFinding[],
summaries: string[],
analysisPrompt: string,
): Promise<string> {
formats?: string[],
jsonOptions?: ExtractOptions,
): Promise<any> {
if(!formats) {
formats = ['markdown'];
}
if(!jsonOptions) {
jsonOptions = undefined;
}
const { extract } = await generateCompletions({
logger: this.logger.child({
method: "generateFinalAnalysis",
}),
mode: "no-object",
mode: formats.includes('json') ? 'object' : 'no-object',
options: {
mode: "llm",
systemPrompt:
"You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
new Date().toISOString().split("T")[0],
...(formats.includes('json') && {
...jsonOptions
}),
systemPrompt: formats.includes('json')
? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
: "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
new Date().toISOString().split("T")[0],
prompt: trimToTokenLimit(
analysisPrompt
? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
: formats.includes('json')
? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
: `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
Research data:
${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
Requirements:
- Format the report in Markdown with proper headers and sections
- Include specific citations to sources where appropriate
- Provide detailed analysis in each section
- Make it comprehensive and thorough (aim for 4+ pages worth of content)
- Include all relevant findings and insights from the research
- Cite sources
- Use bullet points and lists where appropriate for readability`,
100000,
).text,
},
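
Putting the signature change together, a sketch of how the two output modes are requested from generateFinalAnalysis, mirroring the calls made in the service file earlier in this commit (the schema here is illustrative):

// Markdown report: formats/jsonOptions omitted, so the default "markdown" path runs.
const report = await llmService.generateFinalAnalysis(
  query,
  state.getFindings(),
  state.getSummaries(),
  analysisPrompt,
);

// Structured output: "json" in formats switches generateCompletions to object mode,
// and the jsonOptions (prompt/schema) are spread into the LLM options.
const structured = await llmService.generateFinalAnalysis(
  query,
  state.getFindings(),
  state.getSummaries(),
  analysisPrompt,
  ["json"],
  { schema: { type: "object", properties: { keyFindings: { type: "array", items: { type: "string" } } } } },
);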

View File

@ -413,6 +413,9 @@ const processDeepResearchJobInternal = async (
subId: job.data.subId,
maxUrls: job.data.request.maxUrls,
analysisPrompt: job.data.request.analysisPrompt,
systemPrompt: job.data.request.systemPrompt,
formats: job.data.request.formats,
jsonOptions: job.data.request.jsonOptions,
});
if(result.success) {

View File

@ -356,7 +356,7 @@ export interface CrawlErrorsResponse {
* Parameters for deep research operations.
* Defines options for conducting deep research on a query.
*/
export interface DeepResearchParams {
export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
/**
* Maximum depth of research iterations (1-10)
* @default 7
@ -376,10 +376,26 @@ export interface DeepResearchParams {
* The prompt to use for the final analysis
*/
analysisPrompt?: string;
/**
* The system prompt to use for the research agent
*/
systemPrompt?: string;
/**
* The formats to use for the final analysis
*/
formats?: ("markdown" | "json")[];
/**
* The JSON options to use for the final analysis
*/
jsonOptions?:{
prompt?: string;
schema?: LLMSchema;
systemPrompt?: string;
};
/**
* Experimental flag for streaming steps
*/
__experimental_streamSteps?: boolean;
// __experimental_streamSteps?: boolean;
}
/**
@ -1420,7 +1436,7 @@ export default class FirecrawlApp {
*/
async deepResearch(
query: string,
params: DeepResearchParams,
params: DeepResearchParams<zt.ZodSchema>,
onActivity?: (activity: {
type: string;
status: string;
@ -1505,12 +1521,31 @@ export default class FirecrawlApp {
* @param params - Parameters for the deep research operation.
* @returns The response containing the research job ID.
*/
async asyncDeepResearch(query: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
const headers = this.prepareHeaders();
let jsonData: any = { query, ...params };
if (jsonData?.jsonOptions?.schema) {
let schema = jsonData.jsonOptions.schema;
// Try parsing the schema as a Zod schema
try {
schema = zodToJsonSchema(schema);
} catch (error) {
}
jsonData = {
...jsonData,
jsonOptions: {
...jsonData.jsonOptions,
schema: schema,
},
};
}
try {
const response: AxiosResponse = await this.postRequest(
`${this.apiUrl}/v1/deep-research`,
{ query, ...params },
jsonData,
headers
);
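
An end-to-end sketch of the SDK surface added here, using a Zod schema that asyncDeepResearch converts to JSON Schema via zodToJsonSchema before sending the request. The package import and API key are assumptions for illustration; the params mirror DeepResearchParams above.

import { z } from "zod";
import FirecrawlApp from "@mendable/firecrawl-js"; // assumed published package name

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });

const analysisSchema = z.object({
  summary: z.string(),
  keyFindings: z.array(z.string()),
});

const job = await app.asyncDeepResearch(
  "How are teams using structured outputs with research agents?",
  {
    maxUrls: 20,
    timeLimit: 300,
    formats: ["markdown", "json"],
    jsonOptions: { schema: analysisSchema, prompt: "Fill the schema from the research findings" },
  },
);

The synchronous deepResearch(query, params, onActivity) path accepts the same params; once the job completes, the server changes above expose the structured result next to finalAnalysis in a json field.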

View File

@ -49,6 +49,7 @@ class DeepResearchParams(pydantic.BaseModel):
timeLimit: Optional[int] = 270
maxUrls: Optional[int] = 20
analysisPrompt: Optional[str] = None
systemPrompt: Optional[str] = None
__experimental_streamSteps: Optional[bool] = None
class DeepResearchResponse(pydantic.BaseModel):
@ -1171,7 +1172,6 @@ class FirecrawlApp:
time.sleep(2) # Polling interval
return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
"""
Initiates an asynchronous deep research operation.
@ -1195,8 +1195,15 @@ class FirecrawlApp:
research_params = params
headers = self._prepare_headers()
json_data = {'query': query, **research_params.dict(exclude_none=True)}
# Handle json options schema if present
if 'jsonOptions' in json_data:
json_opts = json_data['jsonOptions']
if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
try:
response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
if response.status_code == 200: