From a50dc106ef02b73b8cd39a35668ae4330f309146 Mon Sep 17 00:00:00 2001
From: Nicolas
Date: Mon, 24 Mar 2025 12:13:52 -0400
Subject: [PATCH] (feat/deep-research) Deep Research Alpha v1 - Structured Outputs + Customizability (#1365)

* Nick:

* Nick: structured output support

* Nick: support for zod and pydantic
---
 .../controllers/v1/deep-research-status.ts    |  1 +
 apps/api/src/controllers/v1/deep-research.ts  | 11 ++-
 .../lib/deep-research/deep-research-redis.ts  |  1 +
 .../deep-research/deep-research-service.ts    | 89 ++++++++++++-------
 .../src/lib/deep-research/research-manager.ts | 59 +++++++-----
 apps/api/src/services/queue-worker.ts         |  3 +
 apps/js-sdk/firecrawl/src/index.ts            | 45 ++++++++--
 apps/python-sdk/firecrawl/firecrawl.py        |  9 +-
 8 files changed, 156 insertions(+), 62 deletions(-)

diff --git a/apps/api/src/controllers/v1/deep-research-status.ts b/apps/api/src/controllers/v1/deep-research-status.ts
index bae49d0a..b261e7be 100644
--- a/apps/api/src/controllers/v1/deep-research-status.ts
+++ b/apps/api/src/controllers/v1/deep-research-status.ts
@@ -37,6 +37,7 @@ export async function deepResearchStatusController(
       finalAnalysis: research.finalAnalysis,
       sources: research.sources,
       activities: research.activities,
+      json: research.json,
       // completedSteps: research.completedSteps,
       // totalSteps: research.totalExpectedSteps,
     },
diff --git a/apps/api/src/controllers/v1/deep-research.ts b/apps/api/src/controllers/v1/deep-research.ts
index 7e454c3d..3d52d19d 100644
--- a/apps/api/src/controllers/v1/deep-research.ts
+++ b/apps/api/src/controllers/v1/deep-research.ts
@@ -1,5 +1,5 @@
 import { Request, Response } from "express";
-import { RequestWithAuth } from "./types";
+import { extractOptions, RequestWithAuth } from "./types";
 import { getDeepResearchQueue } from "../../services/queue-service";
 import * as Sentry from "@sentry/node";
 import { saveDeepResearch } from "../../lib/deep-research/deep-research-redis";
@@ -11,10 +11,19 @@ export const deepResearchRequestSchema = z.object({
   maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
   timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
   analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
+  systemPrompt: z.string().describe('The system prompt to use for the research agent').optional(),
+  formats: z.array(z.enum(['markdown', 'json'])).default(['markdown']),
   // @deprecated Use query instead
   topic: z.string().describe('The topic or question to research').optional(),
+  jsonOptions: extractOptions.optional(),
 }).refine(data => data.query || data.topic, {
   message: "Either query or topic must be provided"
+}).refine((obj) => {
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions);
+}, {
+  message: "When 'json' format is specified, jsonOptions must be provided, and vice versa"
 }).transform(data => ({
   ...data,
   query: data.topic || data.query // Use topic as query if provided
diff --git a/apps/api/src/lib/deep-research/deep-research-redis.ts b/apps/api/src/lib/deep-research/deep-research-redis.ts
index 8b1d9c7a..acefaacc 100644
--- a/apps/api/src/lib/deep-research/deep-research-redis.ts
+++ b/apps/api/src/lib/deep-research/deep-research-redis.ts
@@ -45,6 +45,7 @@ export type StoredDeepResearch = {
   activities: DeepResearchActivity[];
   summaries: string[];
   finalAnalysis?: string;
+  json?: any;
 };
 
 // TTL of 6 hours
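A quick sketch of how the two refinements above interact (not part of the patch; the payload values are invented, and the jsonOptions fields assume the extractOptions schema from ./types accepts a plain JSON Schema object under `schema`):

    // Accepted: "json" is requested and jsonOptions is provided.
    const parsed = deepResearchRequestSchema.parse({
      query: "State of solid-state battery research",
      formats: ["markdown", "json"],
      systemPrompt: "You are a battery industry analyst.",
      jsonOptions: {
        schema: {
          type: "object",
          properties: {
            keyFindings: { type: "array", items: { type: "string" } },
          },
        },
      },
    });

    // Rejected by the second refine: "json" requested without jsonOptions.
    deepResearchRequestSchema.parse({
      query: "State of solid-state battery research",
      formats: ["json"],
    });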
diff --git a/apps/api/src/lib/deep-research/deep-research-service.ts b/apps/api/src/lib/deep-research/deep-research-service.ts
index 8a404d10..590014a8 100644
--- a/apps/api/src/lib/deep-research/deep-research-service.ts
+++ b/apps/api/src/lib/deep-research/deep-research-service.ts
@@ -5,6 +5,7 @@ import { searchAndScrapeSearchResult } from "../../controllers/v1/search";
 import { ResearchLLMService, ResearchStateManager } from "./research-manager";
 import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
+import { ExtractOptions } from "../../controllers/v1/types";
 
 interface DeepResearchServiceOptions {
   researchId: string;
@@ -15,6 +16,9 @@
   maxUrls: number;
   timeLimit: number;
   analysisPrompt: string;
+  systemPrompt: string;
+  formats: string[];
+  jsonOptions: ExtractOptions;
   subId?: string;
 }
 
@@ -54,13 +58,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       await state.incrementDepth();
 
       // Search phase
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "processing",
         message: `Generating deeper search queries for "${currentTopic}"`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       const nextSearchTopic = state.getNextSearchTopic();
       logger.debug("[Deep Research] Next search topic:", { nextSearchTopic });
@@ -74,23 +78,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       logger.debug("[Deep Research] Generated search queries:", { searchQueries });
 
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "processing",
         message: `Starting ${searchQueries.length} parallel searches for "${currentTopic}"`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
+      await state.addActivity(searchQueries.map(searchQuery => ({
+        type: "search",
+        status: "processing",
+        message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
+        timestamp: new Date().toISOString(),
+        depth: state.getCurrentDepth(),
+      })))
 
       // Run all searches in parallel
       const searchPromises = searchQueries.map(async (searchQuery) => {
-        await state.addActivity({
-          type: "search",
-          status: "processing",
-          message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
-          timestamp: new Date().toISOString(),
-          depth: state.getCurrentDepth(),
-        });
 
         const response = await searchAndScrapeSearchResult(searchQuery.query, {
           teamId: options.teamId,
@@ -126,13 +130,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           "[Deep Research] No results found for topic:",
           { currentTopic },
         );
-        await state.addActivity({
+        await state.addActivity([{
           type: "search",
           status: "error",
           message: `No results found for any queries about "${currentTopic}"`,
           timestamp: new Date().toISOString(),
           depth: state.getCurrentDepth(),
-        });
+        }]);
         continue;
       }
@@ -163,23 +167,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
           "[Deep Research] No new unique results found for topic:",
           { currentTopic },
         );
-        await state.addActivity({
+        await state.addActivity([{
           type: "search",
           status: "error",
           message: `Found ${searchResults.length} results but all URLs were already processed for "${currentTopic}"`,
           timestamp: new Date().toISOString(),
           depth: state.getCurrentDepth(),
-        });
+        }]);
         continue;
       }
 
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "complete",
         message: `Found ${newSearchResults.length} new relevant results across ${searchQueries.length} parallel queries`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       await state.addFindings(
         newSearchResults.map((result) => ({
@@ -189,13 +193,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       );
 
       // Analysis phase
-      await state.addActivity({
+      await state.addActivity([{
         type: "analyze",
         status: "processing",
         message: "Analyzing findings and planning next steps",
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       const timeRemaining = timeLimit * 1000 - (Date.now() - startTime);
       logger.debug("[Deep Research] Time remaining (ms):", { timeRemaining });
@@ -204,17 +208,18 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
         state.getFindings(),
         currentTopic,
         timeRemaining,
+        options.systemPrompt ?? "",
       );
 
       if (!analysis) {
         logger.debug("[Deep Research] Analysis failed");
-        await state.addActivity({
+        await state.addActivity([{
           type: "analyze",
           status: "error",
           message: "Failed to analyze findings",
           timestamp: new Date().toISOString(),
           depth: state.getCurrentDepth(),
-        });
+        }]);
 
         state.incrementFailedAttempts();
         if (state.hasReachedMaxFailedAttempts()) {
@@ -232,13 +237,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
 
       state.setNextSearchTopic(analysis.nextSearchTopic || "");
 
-      await state.addActivity({
+      await state.addActivity([{
         type: "analyze",
         status: "complete",
         message: "Analyzed findings",
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
 
       if (!analysis.shouldContinue || analysis.gaps.length === 0) {
         logger.debug("[Deep Research] No more gaps to research, ending search");
@@ -251,28 +256,42 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
 
     // Final synthesis
     logger.debug("[Deep Research] Starting final synthesis");
-    await state.addActivity({
+    await state.addActivity([{
       type: "synthesis",
       status: "processing",
       message: "Preparing final analysis",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
 
-    const finalAnalysis = await llmService.generateFinalAnalysis(
-      options.query,
-      state.getFindings(),
-      state.getSummaries(),
-      options.analysisPrompt,
-    );
+    let finalAnalysis = "";
+    let finalAnalysisJson = null;
+    if(options.formats.includes('json')) {
+      finalAnalysisJson = await llmService.generateFinalAnalysis(
+        options.query,
+        state.getFindings(),
+        state.getSummaries(),
+        options.analysisPrompt,
+        options.formats,
+        options.jsonOptions,
+      );
+    }
+    if(options.formats.includes('markdown')) {
+      finalAnalysis = await llmService.generateFinalAnalysis(
+        options.query,
+        state.getFindings(),
+        state.getSummaries(),
+        options.analysisPrompt,
+      );
+    }
 
-    await state.addActivity({
+    await state.addActivity([{
       type: "synthesis",
       status: "complete",
       message: "Research completed",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
 
     const progress = state.getProgress();
     logger.debug("[Deep Research] Research completed successfully");
@@ -283,7 +302,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       success: true,
       message: "Research completed",
       num_docs: 1,
-      docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
+      docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources(), json: finalAnalysisJson }],
       time_taken: (Date.now() - startTime) / 1000,
       team_id: teamId,
       mode: "deep-research",
@@ -296,6 +315,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     await updateDeepResearch(researchId, {
       status: "completed",
       finalAnalysis: finalAnalysis,
+      json: finalAnalysisJson,
     });
     // Bill team for usage based on URLs analyzed
     billTeam(teamId, subId, Math.min(urlsAnalyzed, options.maxUrls), logger).catch(
@@ -310,6 +330,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       data: {
         finalAnalysis: finalAnalysis,
         sources: state.getSources(),
+        json: finalAnalysisJson,
       },
     };
   } catch (error: any) {
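Together with the status controller change above, a completed job now carries the structured result alongside the markdown report. A rough sketch of the data a client sees when both formats were requested — field names come from this patch, while the endpoint path and the per-source shape are assumptions:

    // Hypothetical response data for GET /v1/deep-research/:id once status is "completed"
    interface DeepResearchStatusData {
      finalAnalysis: string;   // markdown report; empty string when "markdown" was not requested
      json: any | null;        // structured output; null unless "json" was requested
      sources: Array<{ url: string; title?: string }>; // element shape assumed, not defined in this patch
      activities: Array<{
        type: "search" | "analyze" | "synthesis";
        status: "processing" | "complete" | "error";
        message: string;
        timestamp: string;
        depth: number;
      }>;
    }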
diff --git a/apps/api/src/lib/deep-research/research-manager.ts b/apps/api/src/lib/deep-research/research-manager.ts
index d5f4fdd9..70e8067b 100644
--- a/apps/api/src/lib/deep-research/research-manager.ts
+++ b/apps/api/src/lib/deep-research/research-manager.ts
@@ -6,7 +6,7 @@ import {
   updateDeepResearch,
 } from "./deep-research-redis";
 import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
-
+import { ExtractOptions } from "../../controllers/v1/types";
 interface AnalysisResult {
   gaps: string[];
   nextSteps: string[];
@@ -50,13 +50,13 @@ export class ResearchStateManager {
     return this.seenUrls;
   }
 
-  async addActivity(activity: DeepResearchActivity): Promise<void> {
-    if (activity.status === "complete") {
+  async addActivity(activities: DeepResearchActivity[]): Promise<void> {
+    if (activities.some(activity => activity.status === "complete")) {
       this.completedSteps++;
     }
 
     await updateDeepResearch(this.researchId, {
-      activities: [activity],
+      activities: activities,
       completedSteps: this.completedSteps,
     });
   }
@@ -199,6 +199,7 @@ export class ResearchLLMService {
     findings: DeepResearchFinding[],
     currentTopic: string,
     timeRemaining: number,
+    systemPrompt: string,
   ): Promise<AnalysisResult | null> {
     try {
       const timeRemainingMinutes =
@@ -211,6 +212,7 @@ export class ResearchLLMService {
       options: {
         mode: "llm",
         systemPrompt:
+          systemPrompt +
           "You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " +
           new Date().toISOString().split("T")[0],
         schema: {
@@ -254,33 +256,48 @@ export class ResearchLLMService {
     findings: DeepResearchFinding[],
     summaries: string[],
     analysisPrompt: string,
-  ): Promise<string> {
+    formats?: string[],
+    jsonOptions?: ExtractOptions,
+  ): Promise<any> {
+    if(!formats) {
+      formats = ['markdown'];
+    }
+    if(!jsonOptions) {
+      jsonOptions = undefined;
+    }
+
     const { extract } = await generateCompletions({
       logger: this.logger.child({
         method: "generateFinalAnalysis",
       }),
-      mode: "no-object",
+      mode: formats.includes('json') ? 'object' : 'no-object',
      options: {
         mode: "llm",
-        systemPrompt:
-          "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
-          new Date().toISOString().split("T")[0],
+        ...(formats.includes('json') && {
+          ...jsonOptions
+        }),
+        systemPrompt: formats.includes('json')
+          ? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
+          : "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
+            new Date().toISOString().split("T")[0],
         prompt: trimToTokenLimit(
           analysisPrompt
             ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
+            : formats.includes('json')
+              ? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
             : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
 
-          Research data:
-          ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
-
-          Requirements:
-          - Format the report in Markdown with proper headers and sections
-          - Include specific citations to sources where appropriate
-          - Provide detailed analysis in each section
-          - Make it comprehensive and thorough (aim for 4+ pages worth of content)
-          - Include all relevant findings and insights from the research
-          - Cite sources
-          - Use bullet points and lists where appropriate for readability`,
+            Research data:
+            ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
+
+            Requirements:
+            - Format the report in Markdown with proper headers and sections
+            - Include specific citations to sources where appropriate
+            - Provide detailed analysis in each section
+            - Make it comprehensive and thorough (aim for 4+ pages worth of content)
+            - Include all relevant findings and insights from the research
+            - Cite sources
+            - Use bullet points and lists where appropriate for readability`,
           100000,
         ).text,
       },
diff --git a/apps/api/src/services/queue-worker.ts b/apps/api/src/services/queue-worker.ts
index 66931fce..b6cfc454 100644
--- a/apps/api/src/services/queue-worker.ts
+++ b/apps/api/src/services/queue-worker.ts
@@ -413,6 +413,9 @@ const processDeepResearchJobInternal = async (
       subId: job.data.subId,
       maxUrls: job.data.request.maxUrls,
       analysisPrompt: job.data.request.analysisPrompt,
+      systemPrompt: job.data.request.systemPrompt,
+      formats: job.data.request.formats,
+      jsonOptions: job.data.request.jsonOptions,
     });
 
     if(result.success) {
diff --git a/apps/js-sdk/firecrawl/src/index.ts b/apps/js-sdk/firecrawl/src/index.ts
index de4ead7f..5cc9d119 100644
--- a/apps/js-sdk/firecrawl/src/index.ts
+++ b/apps/js-sdk/firecrawl/src/index.ts
@@ -356,7 +356,7 @@ export interface CrawlErrorsResponse {
  * Parameters for deep research operations.
  * Defines options for conducting deep research on a query.
  */
-export interface DeepResearchParams {
+export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
   /**
    * Maximum depth of research iterations (1-10)
    * @default 7
@@ -377,9 +377,25 @@
    */
   analysisPrompt?: string;
   /**
+   * The system prompt to use for the research agent
+   */
+  systemPrompt?: string;
+  /**
+   * The formats to use for the final analysis
+   */
+  formats?: ("markdown" | "json")[];
+  /**
+   * The JSON options to use for the final analysis
+   */
+  jsonOptions?:{
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  };
+  /**
    * Experimental flag for streaming steps
    */
-  __experimental_streamSteps?: boolean;
+  // __experimental_streamSteps?: boolean;
 }
 
 /**
@@ -1420,7 +1436,7 @@ export default class FirecrawlApp {
    */
   async deepResearch(
     query: string,
-    params: DeepResearchParams,
+    params: DeepResearchParams<any>,
     onActivity?: (activity: {
       type: string;
       status: string;
@@ -1505,12 +1521,31 @@ export default class FirecrawlApp {
   * @param params - Parameters for the deep research operation.
   * @returns The response containing the research job ID.
   */
-  async asyncDeepResearch(query: string, params: DeepResearchParams): Promise {
+  async asyncDeepResearch(query: string, params: DeepResearchParams<any>): Promise {
     const headers = this.prepareHeaders();
+    let jsonData: any = { query, ...params };
+
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
+
     try {
       const response: AxiosResponse = await this.postRequest(
         `${this.apiUrl}/v1/deep-research`,
-        { query, ...params },
+        jsonData,
        headers
       );
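With the schema handling added to asyncDeepResearch, callers can pass either a plain JSON Schema or a Zod schema; zodToJsonSchema converts the latter before the request is sent. A usage sketch under that assumption — the package name, API key, and schema fields are illustrative, and the exact response shape is only hinted at by the doc comment above:

    import FirecrawlApp from "@mendable/firecrawl-js";
    import { z } from "zod";

    const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" });

    // Kick off a research job that returns both a markdown report and structured JSON.
    const job = await app.asyncDeepResearch("State of solid-state battery research", {
      maxDepth: 5,
      timeLimit: 300,
      maxUrls: 20,
      formats: ["markdown", "json"],
      systemPrompt: "You are a battery industry analyst.",
      jsonOptions: {
        // A Zod schema; the SDK converts it with zodToJsonSchema before POSTing.
        schema: z.object({
          keyFindings: z.array(z.string()),
          notableCompanies: z.array(z.string()),
        }),
      },
    });

    // `job` carries the research job ID (per the doc comment above); poll the
    // deep-research status endpoint with it to retrieve finalAnalysis and json.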
diff --git a/apps/python-sdk/firecrawl/firecrawl.py b/apps/python-sdk/firecrawl/firecrawl.py
index 436f8764..d8d22421 100644
--- a/apps/python-sdk/firecrawl/firecrawl.py
+++ b/apps/python-sdk/firecrawl/firecrawl.py
@@ -49,6 +49,7 @@ class DeepResearchParams(pydantic.BaseModel):
     timeLimit: Optional[int] = 270
     maxUrls: Optional[int] = 20
     analysisPrompt: Optional[str] = None
+    systemPrompt: Optional[str] = None
     __experimental_streamSteps: Optional[bool] = None
 
 class DeepResearchResponse(pydantic.BaseModel):
@@ -1171,7 +1172,6 @@ class FirecrawlApp:
                 time.sleep(2)  # Polling interval
 
         return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
-
     def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
         """
         Initiates an asynchronous deep research operation.
@@ -1195,8 +1195,15 @@ class FirecrawlApp:
             research_params = params
 
         headers = self._prepare_headers()
+
         json_data = {'query': query, **research_params.dict(exclude_none=True)}
+        # Handle json options schema if present
+        if 'jsonOptions' in json_data:
+            json_opts = json_data['jsonOptions']
+            if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
+                json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
+
         try:
             response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
             if response.status_code == 200: