(feat/deep-research) Deep Research Alpha v1 - Structured Outputs + Customizability (#1365)

* Nick:

* Nick: structured output support

* Nick: support for zod and pydantic
Nicolas 2025-03-24 12:13:52 -04:00 committed by GitHub
parent 3ee58f7a9e
commit a50dc106ef
8 changed files with 156 additions and 62 deletions
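
The gist of the change, as a usage sketch: callers can now set a custom systemPrompt for the research agent and request a structured json result alongside (or instead of) the markdown report. A minimal example through the JS SDK, assuming the package is consumed as @mendable/firecrawl-js with FirecrawlApp as its default export and that deepResearch delegates to the asyncDeepResearch shown later in this diff for the Zod-to-JSON-Schema conversion; the API key, query, and schema are illustrative, and the result fields mirror the status controller change in this commit:

import FirecrawlApp from "@mendable/firecrawl-js";
import { z } from "zod";

const app = new FirecrawlApp({ apiKey: "fc-YOUR-API-KEY" }); // placeholder key

// Zod schema describing the structured output we want back in `json`
const companySchema = z.object({
  companies: z.array(
    z.object({
      name: z.string(),
      focusArea: z.string(),
      notableFunding: z.string().optional(),
    }),
  ),
});

async function main() {
  const result = await app.deepResearch(
    "Emerging open-source LLM agent frameworks",
    {
      maxUrls: 15,
      timeLimit: 180,
      // New in this commit: customize the research agent's system prompt
      systemPrompt: "You are a developer-tooling analyst.",
      // New in this commit: ask for structured output alongside the markdown report
      formats: ["markdown", "json"],
      jsonOptions: { schema: companySchema },
    },
  );

  if (result.success) {
    console.log(result.data.finalAnalysis); // markdown report
    console.log(result.data.json);          // object shaped like companySchema (per the status payload below)
  }
}

main().catch(console.error);

On the Python side, systemPrompt is added to DeepResearchParams and async_deep_research converts a Pydantic model passed as jsonOptions['schema'] into JSON Schema (last file in this diff).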

File 1 of 8: deep-research status controller

@@ -37,6 +37,7 @@ export async function deepResearchStatusController(
       finalAnalysis: research.finalAnalysis,
       sources: research.sources,
       activities: research.activities,
+      json: research.json,
       // completedSteps: research.completedSteps,
       // totalSteps: research.totalExpectedSteps,
     },
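
For API consumers, the practical effect is that the status payload now exposes the structured result next to the markdown one. A rough sketch of the data object returned by the deep-research status endpoint for a completed job, based only on the fields visible in this hunk (the element types of sources and activities are not shown here):

// Sketch of the `data` object returned by the deep-research status endpoint.
// `json` is only populated when the originating request included the "json" format.
type DeepResearchStatusData = {
  finalAnalysis: string;   // markdown report
  sources: unknown[];      // collected sources
  activities: unknown[];   // activity log entries
  json?: any;              // structured output matching jsonOptions.schema
};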

File 2 of 8: v1 deep-research request controller

@@ -1,5 +1,5 @@
 import { Request, Response } from "express";
-import { RequestWithAuth } from "./types";
+import { extractOptions, RequestWithAuth } from "./types";
 import { getDeepResearchQueue } from "../../services/queue-service";
 import * as Sentry from "@sentry/node";
 import { saveDeepResearch } from "../../lib/deep-research/deep-research-redis";

@@ -11,10 +11,19 @@ export const deepResearchRequestSchema = z.object({
   maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
   timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
   analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
+  systemPrompt: z.string().describe('The system prompt to use for the research agent').optional(),
+  formats: z.array(z.enum(['markdown', 'json'])).default(['markdown']),
   // @deprecated Use query instead
   topic: z.string().describe('The topic or question to research').optional(),
+  jsonOptions: extractOptions.optional(),
 }).refine(data => data.query || data.topic, {
   message: "Either query or topic must be provided"
+}).refine((obj) => {
+  const hasJsonFormat = obj.formats?.includes("json");
+  const hasJsonOptions = obj.jsonOptions !== undefined;
+  return (hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions);
+}, {
+  message: "When 'json' format is specified, jsonOptions must be provided, and vice versa"
 }).transform(data => ({
   ...data,
   query: data.topic || data.query // Use topic as query if provided
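
The second refine ties the two new fields together: a request may only include jsonOptions when formats contains "json", and must include it when it does. An illustrative request body and call against the v1 endpoint (the endpoint path comes from the SDK code later in this diff; the cloud base URL, API key, and schema values are assumptions):

// Passes the schema above: "json" is requested AND jsonOptions is provided.
const body = {
  query: "State of WebGPU adoption in production apps",
  maxUrls: 25,
  timeLimit: 240,
  systemPrompt: "Answer like a graphics infrastructure engineer.", // new field
  formats: ["markdown", "json"],
  jsonOptions: {
    // jsonOptions follows the shared extractOptions shape (schema / prompt / systemPrompt)
    schema: {
      type: "object",
      properties: {
        adopters: { type: "array", items: { type: "string" } },
        blockers: { type: "array", items: { type: "string" } },
      },
    },
  },
};

async function submit() {
  const res = await fetch("https://api.firecrawl.dev/v1/deep-research", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: "Bearer fc-YOUR-API-KEY", // placeholder key
    },
    body: JSON.stringify(body),
  });
  console.log(await res.json()); // on acceptance, carries the research job id
}

submit().catch(console.error);

Dropping jsonOptions while keeping formats: ["json"], or the reverse, fails validation with the message shown above.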

File 3 of 8: deep-research Redis store types

@@ -45,6 +45,7 @@ export type StoredDeepResearch = {
   activities: DeepResearchActivity[];
   summaries: string[];
   finalAnalysis?: string;
+  json?: any;
 };
 // TTL of 6 hours

File 4 of 8: deep-research service (performDeepResearch)

@@ -5,6 +5,7 @@ import { searchAndScrapeSearchResult } from "../../controllers/v1/search";
 import { ResearchLLMService, ResearchStateManager } from "./research-manager";
 import { logJob } from "../../services/logging/log_job";
 import { billTeam } from "../../services/billing/credit_billing";
+import { ExtractOptions } from "../../controllers/v1/types";

 interface DeepResearchServiceOptions {
   researchId: string;

@@ -15,6 +16,9 @@ interface DeepResearchServiceOptions {
   maxUrls: number;
   timeLimit: number;
   analysisPrompt: string;
+  systemPrompt: string;
+  formats: string[];
+  jsonOptions: ExtractOptions;
   subId?: string;
 }
@@ -54,13 +58,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     await state.incrementDepth();
     // Search phase
-    await state.addActivity({
+    await state.addActivity([{
       type: "search",
       status: "processing",
       message: `Generating deeper search queries for "${currentTopic}"`,
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
     const nextSearchTopic = state.getNextSearchTopic();
     logger.debug("[Deep Research] Next search topic:", { nextSearchTopic });

@@ -74,23 +78,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     logger.debug("[Deep Research] Generated search queries:", { searchQueries });
-    await state.addActivity({
+    await state.addActivity([{
       type: "search",
       status: "processing",
       message: `Starting ${searchQueries.length} parallel searches for "${currentTopic}"`,
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
+    await state.addActivity(searchQueries.map(searchQuery => ({
+      type: "search",
+      status: "processing",
+      message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
+      timestamp: new Date().toISOString(),
+      depth: state.getCurrentDepth(),
+    })))
     // Run all searches in parallel
     const searchPromises = searchQueries.map(async (searchQuery) => {
-      await state.addActivity({
-        type: "search",
-        status: "processing",
-        message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
-        timestamp: new Date().toISOString(),
-        depth: state.getCurrentDepth(),
-      });
       const response = await searchAndScrapeSearchResult(searchQuery.query, {
         teamId: options.teamId,
@@ -126,13 +130,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
         "[Deep Research] No results found for topic:",
         { currentTopic },
       );
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "error",
         message: `No results found for any queries about "${currentTopic}"`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
       continue;
     }

@@ -163,23 +167,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
         "[Deep Research] No new unique results found for topic:",
         { currentTopic },
       );
-      await state.addActivity({
+      await state.addActivity([{
         type: "search",
         status: "error",
         message: `Found ${searchResults.length} results but all URLs were already processed for "${currentTopic}"`,
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
       continue;
     }
-    await state.addActivity({
+    await state.addActivity([{
       type: "search",
       status: "complete",
       message: `Found ${newSearchResults.length} new relevant results across ${searchQueries.length} parallel queries`,
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
     await state.addFindings(
       newSearchResults.map((result) => ({
@@ -189,13 +193,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     );
     // Analysis phase
-    await state.addActivity({
+    await state.addActivity([{
       type: "analyze",
       status: "processing",
       message: "Analyzing findings and planning next steps",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
     const timeRemaining = timeLimit * 1000 - (Date.now() - startTime);
     logger.debug("[Deep Research] Time remaining (ms):", { timeRemaining });

@@ -204,17 +208,18 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       state.getFindings(),
       currentTopic,
       timeRemaining,
+      options.systemPrompt ?? "",
     );
     if (!analysis) {
       logger.debug("[Deep Research] Analysis failed");
-      await state.addActivity({
+      await state.addActivity([{
         type: "analyze",
         status: "error",
         message: "Failed to analyze findings",
         timestamp: new Date().toISOString(),
         depth: state.getCurrentDepth(),
-      });
+      }]);
       state.incrementFailedAttempts();
       if (state.hasReachedMaxFailedAttempts()) {

@@ -232,13 +237,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     state.setNextSearchTopic(analysis.nextSearchTopic || "");
-    await state.addActivity({
+    await state.addActivity([{
       type: "analyze",
       status: "complete",
       message: "Analyzed findings",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
     if (!analysis.shouldContinue || analysis.gaps.length === 0) {
       logger.debug("[Deep Research] No more gaps to research, ending search");
@@ -251,28 +256,42 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     // Final synthesis
     logger.debug("[Deep Research] Starting final synthesis");
-    await state.addActivity({
+    await state.addActivity([{
       type: "synthesis",
       status: "processing",
       message: "Preparing final analysis",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
-    const finalAnalysis = await llmService.generateFinalAnalysis(
-      options.query,
-      state.getFindings(),
-      state.getSummaries(),
-      options.analysisPrompt,
-    );
+    let finalAnalysis = "";
+    let finalAnalysisJson = null;
+    if(options.formats.includes('json')) {
+      finalAnalysisJson = await llmService.generateFinalAnalysis(
+        options.query,
+        state.getFindings(),
+        state.getSummaries(),
+        options.analysisPrompt,
+        options.formats,
+        options.jsonOptions,
+      );
+    }
+    if(options.formats.includes('markdown')) {
+      finalAnalysis = await llmService.generateFinalAnalysis(
+        options.query,
+        state.getFindings(),
+        state.getSummaries(),
+        options.analysisPrompt,
+      );
+    }
-    await state.addActivity({
+    await state.addActivity([{
       type: "synthesis",
       status: "complete",
       message: "Research completed",
       timestamp: new Date().toISOString(),
       depth: state.getCurrentDepth(),
-    });
+    }]);
     const progress = state.getProgress();
     logger.debug("[Deep Research] Research completed successfully");
@@ -283,7 +302,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       success: true,
       message: "Research completed",
       num_docs: 1,
-      docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
+      docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources(), json: finalAnalysisJson }],
       time_taken: (Date.now() - startTime) / 1000,
       team_id: teamId,
       mode: "deep-research",

@@ -296,6 +315,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
     await updateDeepResearch(researchId, {
       status: "completed",
       finalAnalysis: finalAnalysis,
+      json: finalAnalysisJson,
     });
     // Bill team for usage based on URLs analyzed
     billTeam(teamId, subId, Math.min(urlsAnalyzed, options.maxUrls), logger).catch(

@@ -310,6 +330,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
       data: {
         finalAnalysis: finalAnalysis,
         sources: state.getSources(),
+        json: finalAnalysisJson,
       },
     };
   } catch (error: any) {

File 5 of 8: research manager (ResearchStateManager / ResearchLLMService)

@@ -6,7 +6,7 @@ import {
   updateDeepResearch,
 } from "./deep-research-redis";
 import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
+import { ExtractOptions } from "../../controllers/v1/types";

 interface AnalysisResult {
   gaps: string[];
   nextSteps: string[];

@@ -50,13 +50,13 @@ export class ResearchStateManager {
     return this.seenUrls;
   }
-  async addActivity(activity: DeepResearchActivity): Promise<void> {
-    if (activity.status === "complete") {
+  async addActivity(activities: DeepResearchActivity[]): Promise<void> {
+    if (activities.some(activity => activity.status === "complete")) {
       this.completedSteps++;
     }
     await updateDeepResearch(this.researchId, {
-      activities: [activity],
+      activities: activities,
       completedSteps: this.completedSteps,
     });
   }
@@ -199,6 +199,7 @@ export class ResearchLLMService {
     findings: DeepResearchFinding[],
     currentTopic: string,
     timeRemaining: number,
+    systemPrompt: string,
   ): Promise<AnalysisResult | null> {
     try {
       const timeRemainingMinutes =

@@ -211,6 +212,7 @@
       options: {
         mode: "llm",
         systemPrompt:
+          systemPrompt +
           "You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " +
           new Date().toISOString().split("T")[0],
         schema: {
@@ -254,33 +256,48 @@
     findings: DeepResearchFinding[],
     summaries: string[],
     analysisPrompt: string,
-  ): Promise<string> {
+    formats?: string[],
+    jsonOptions?: ExtractOptions,
+  ): Promise<any> {
+    if(!formats) {
+      formats = ['markdown'];
+    }
+    if(!jsonOptions) {
+      jsonOptions = undefined;
+    }
     const { extract } = await generateCompletions({
       logger: this.logger.child({
         method: "generateFinalAnalysis",
       }),
-      mode: "no-object",
+      mode: formats.includes('json') ? 'object' : 'no-object',
       options: {
         mode: "llm",
-        systemPrompt:
-          "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
-          new Date().toISOString().split("T")[0],
+        ...(formats.includes('json') && {
+          ...jsonOptions
+        }),
+        systemPrompt: formats.includes('json')
+          ? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
+          : "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
+            new Date().toISOString().split("T")[0],
         prompt: trimToTokenLimit(
           analysisPrompt
             ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
-            : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
+            : formats.includes('json')
+              ? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
+              : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
             Research data:
             ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
             Requirements:
             - Format the report in Markdown with proper headers and sections
             - Include specific citations to sources where appropriate
             - Provide detailed analysis in each section
             - Make it comprehensive and thorough (aim for 4+ pages worth of content)
             - Include all relevant findings and insights from the research
             - Cite sources
             - Use bullet points and lists where appropriate for readability`,
           100000,
         ).text,
       },

File 6 of 8: deep-research queue worker

@@ -413,6 +413,9 @@ const processDeepResearchJobInternal = async (
     subId: job.data.subId,
     maxUrls: job.data.request.maxUrls,
     analysisPrompt: job.data.request.analysisPrompt,
+    systemPrompt: job.data.request.systemPrompt,
+    formats: job.data.request.formats,
+    jsonOptions: job.data.request.jsonOptions,
   });
   if(result.success) {

File 7 of 8: JS SDK (FirecrawlApp)

@@ -356,7 +356,7 @@ export interface CrawlErrorsResponse {
  * Parameters for deep research operations.
  * Defines options for conducting deep research on a query.
  */
-export interface DeepResearchParams {
+export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
   /**
    * Maximum depth of research iterations (1-10)
    * @default 7

@@ -377,9 +377,25 @@
    */
   analysisPrompt?: string;
   /**
+   * The system prompt to use for the research agent
+   */
+  systemPrompt?: string;
+  /**
+   * The formats to use for the final analysis
+   */
+  formats?: ("markdown" | "json")[];
+  /**
+   * The JSON options to use for the final analysis
+   */
+  jsonOptions?:{
+    prompt?: string;
+    schema?: LLMSchema;
+    systemPrompt?: string;
+  };
+  /**
    * Experimental flag for streaming steps
    */
-  __experimental_streamSteps?: boolean;
+  // __experimental_streamSteps?: boolean;
 }
/** /**
@@ -1420,7 +1436,7 @@ export default class FirecrawlApp {
   */
  async deepResearch(
    query: string,
-    params: DeepResearchParams,
+    params: DeepResearchParams<zt.ZodSchema>,
    onActivity?: (activity: {
      type: string;
      status: string;
@@ -1505,12 +1521,31 @@
   * @param params - Parameters for the deep research operation.
   * @returns The response containing the research job ID.
   */
-  async asyncDeepResearch(query: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
+  async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
    const headers = this.prepareHeaders();
+    let jsonData: any = { query, ...params };
+    if (jsonData?.jsonOptions?.schema) {
+      let schema = jsonData.jsonOptions.schema;
+      // Try parsing the schema as a Zod schema
+      try {
+        schema = zodToJsonSchema(schema);
+      } catch (error) {
+      }
+      jsonData = {
+        ...jsonData,
+        jsonOptions: {
+          ...jsonData.jsonOptions,
+          schema: schema,
+        },
+      };
+    }
    try {
      const response: AxiosResponse = await this.postRequest(
        `${this.apiUrl}/v1/deep-research`,
-        { query, ...params },
+        jsonData,
        headers
      );
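
The empty catch above is what keeps plain JSON Schema objects working: when the conversion fails (for anything that is not a Zod schema), the value is sent through unchanged. A small sketch of what the conversion produces for a Zod schema, assuming zodToJsonSchema comes from the zod-to-json-schema package (exact output varies by version):

import { z } from "zod";
import { zodToJsonSchema } from "zod-to-json-schema";

const schema = z.object({
  summary: z.string(),
  keyFindings: z.array(z.string()),
});

// Roughly what ends up in jsonOptions.schema on the wire: a draft-07 JSON Schema like
// { "type": "object",
//   "properties": { "summary": { "type": "string" },
//                   "keyFindings": { "type": "array", "items": { "type": "string" } } },
//   "required": ["summary", "keyFindings"], ... }
console.log(JSON.stringify(zodToJsonSchema(schema), null, 2));

The Python SDK change below does the analogous thing for Pydantic models via .schema().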

File 8 of 8: Python SDK (FirecrawlApp)

@@ -49,6 +49,7 @@ class DeepResearchParams(pydantic.BaseModel):
     timeLimit: Optional[int] = 270
     maxUrls: Optional[int] = 20
     analysisPrompt: Optional[str] = None
+    systemPrompt: Optional[str] = None
     __experimental_streamSteps: Optional[bool] = None

 class DeepResearchResponse(pydantic.BaseModel):

@@ -1171,7 +1172,6 @@
             time.sleep(2)  # Polling interval
         return {'success': False, 'error': 'Deep research job terminated unexpectedly'}

     def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
         """
         Initiates an asynchronous deep research operation.
@@ -1195,8 +1195,15 @@
         research_params = params
         headers = self._prepare_headers()
         json_data = {'query': query, **research_params.dict(exclude_none=True)}
+        # Handle json options schema if present
+        if 'jsonOptions' in json_data:
+            json_opts = json_data['jsonOptions']
+            if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
+                json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
         try:
             response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
             if response.status_code == 200: