(feat/deep-research) Deep Research Alpha v1 - Structured Outputs + Customizability (#1365)

* Nick:

* Nick: structured output support

* Nick: support for zod and pydantic
Nicolas 2025-03-24 12:13:52 -04:00 committed by GitHub
parent 3ee58f7a9e
commit a50dc106ef
8 changed files with 156 additions and 62 deletions

View File

@ -37,6 +37,7 @@ export async function deepResearchStatusController(
finalAnalysis: research.finalAnalysis,
sources: research.sources,
activities: research.activities,
json: research.json,
// completedSteps: research.completedSteps,
// totalSteps: research.totalExpectedSteps,
},
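
For orientation, a minimal sketch of the data block the status endpoint now returns for a completed structured-output job. Only the json field is new in this hunk; the other fields mirror the ones already returned above, and the values and the shapes of sources/activities are illustrative assumptions.

// Hypothetical completed-status data; field values are made up.
const exampleStatusData = {
  finalAnalysis: "# Research Report\n\n...",
  sources: [{ url: "https://example.com/article", title: "Example source" }],
  activities: [{ type: "synthesis", status: "complete", message: "Research completed" }],
  json: { summary: "...", keyFindings: ["..."] }, // populated only when "json" was requested
};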

View File

@ -1,5 +1,5 @@
import { Request, Response } from "express";
import { RequestWithAuth } from "./types";
import { extractOptions, RequestWithAuth } from "./types";
import { getDeepResearchQueue } from "../../services/queue-service";
import * as Sentry from "@sentry/node";
import { saveDeepResearch } from "../../lib/deep-research/deep-research-redis";
@ -11,10 +11,19 @@ export const deepResearchRequestSchema = z.object({
maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
systemPrompt: z.string().describe('The system prompt to use for the research agent').optional(),
formats: z.array(z.enum(['markdown', 'json'])).default(['markdown']),
// @deprecated Use query instead
topic: z.string().describe('The topic or question to research').optional(),
jsonOptions: extractOptions.optional(),
}).refine(data => data.query || data.topic, {
message: "Either query or topic must be provided"
}).refine((obj) => {
const hasJsonFormat = obj.formats?.includes("json");
const hasJsonOptions = obj.jsonOptions !== undefined;
return (hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions);
}, {
message: "When 'json' format is specified, jsonOptions must be provided, and vice versa"
}).transform(data => ({
...data,
query: data.topic || data.query // Use topic as query if provided
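
To make the paired refinement concrete, here is a sketch of a request body the schema would accept: a query is present, and because "json" appears in formats, jsonOptions is supplied as well. Sending "json" in formats without jsonOptions (or the reverse) fails with the message above. The schema contents are illustrative; jsonOptions follows the prompt/schema shape used by the SDK changes later in this commit.

// Illustrative POST /v1/deep-research body; all values are made up.
const exampleRequestBody = {
  query: "Adoption of WebGPU in production web apps",
  maxUrls: 20,
  timeLimit: 300,
  formats: ["markdown", "json"],
  jsonOptions: {
    prompt: "Extract the main adopters and their use cases",
    schema: {
      type: "object",
      properties: {
        adopters: { type: "array", items: { type: "string" } },
        summary: { type: "string" },
      },
    },
  },
};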

View File

@ -45,6 +45,7 @@ export type StoredDeepResearch = {
activities: DeepResearchActivity[];
summaries: string[];
finalAnalysis?: string;
json?: any;
};
// TTL of 6 hours

View File

@ -5,6 +5,7 @@ import { searchAndScrapeSearchResult } from "../../controllers/v1/search";
import { ResearchLLMService, ResearchStateManager } from "./research-manager";
import { logJob } from "../../services/logging/log_job";
import { billTeam } from "../../services/billing/credit_billing";
import { ExtractOptions } from "../../controllers/v1/types";
interface DeepResearchServiceOptions {
researchId: string;
@ -15,6 +16,9 @@ interface DeepResearchServiceOptions {
maxUrls: number;
timeLimit: number;
analysisPrompt: string;
systemPrompt: string;
formats: string[];
jsonOptions: ExtractOptions;
subId?: string;
}
@ -54,13 +58,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
await state.incrementDepth();
// Search phase
await state.addActivity({
await state.addActivity([{
type: "search",
status: "processing",
message: `Generating deeper search queries for "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const nextSearchTopic = state.getNextSearchTopic();
logger.debug("[Deep Research] Next search topic:", { nextSearchTopic });
@ -74,23 +78,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
logger.debug("[Deep Research] Generated search queries:", { searchQueries });
await state.addActivity({
await state.addActivity([{
type: "search",
status: "processing",
message: `Starting ${searchQueries.length} parallel searches for "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
await state.addActivity(searchQueries.map(searchQuery => ({
type: "search",
status: "processing",
message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
})))
// Run all searches in parallel
const searchPromises = searchQueries.map(async (searchQuery) => {
await state.addActivity({
type: "search",
status: "processing",
message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
const response = await searchAndScrapeSearchResult(searchQuery.query, {
teamId: options.teamId,
@ -126,13 +130,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
"[Deep Research] No results found for topic:",
{ currentTopic },
);
await state.addActivity({
await state.addActivity([{
type: "search",
status: "error",
message: `No results found for any queries about "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
continue;
}
@ -163,23 +167,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
"[Deep Research] No new unique results found for topic:",
{ currentTopic },
);
await state.addActivity({
await state.addActivity([{
type: "search",
status: "error",
message: `Found ${searchResults.length} results but all URLs were already processed for "${currentTopic}"`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
continue;
}
await state.addActivity({
await state.addActivity([{
type: "search",
status: "complete",
message: `Found ${newSearchResults.length} new relevant results across ${searchQueries.length} parallel queries`,
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
await state.addFindings(
newSearchResults.map((result) => ({
@ -189,13 +193,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
);
// Analysis phase
await state.addActivity({
await state.addActivity([{
type: "analyze",
status: "processing",
message: "Analyzing findings and planning next steps",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const timeRemaining = timeLimit * 1000 - (Date.now() - startTime);
logger.debug("[Deep Research] Time remaining (ms):", { timeRemaining });
@ -204,17 +208,18 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
state.getFindings(),
currentTopic,
timeRemaining,
options.systemPrompt ?? "",
);
if (!analysis) {
logger.debug("[Deep Research] Analysis failed");
await state.addActivity({
await state.addActivity([{
type: "analyze",
status: "error",
message: "Failed to analyze findings",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
state.incrementFailedAttempts();
if (state.hasReachedMaxFailedAttempts()) {
@ -232,13 +237,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
state.setNextSearchTopic(analysis.nextSearchTopic || "");
await state.addActivity({
await state.addActivity([{
type: "analyze",
status: "complete",
message: "Analyzed findings",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
if (!analysis.shouldContinue || analysis.gaps.length === 0) {
logger.debug("[Deep Research] No more gaps to research, ending search");
@ -251,28 +256,42 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
// Final synthesis
logger.debug("[Deep Research] Starting final synthesis");
await state.addActivity({
await state.addActivity([{
type: "synthesis",
status: "processing",
message: "Preparing final analysis",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const finalAnalysis = await llmService.generateFinalAnalysis(
options.query,
state.getFindings(),
state.getSummaries(),
options.analysisPrompt,
);
let finalAnalysis = "";
let finalAnalysisJson = null;
if(options.formats.includes('json')) {
finalAnalysisJson = await llmService.generateFinalAnalysis(
options.query,
state.getFindings(),
state.getSummaries(),
options.analysisPrompt,
options.formats,
options.jsonOptions,
);
}
if(options.formats.includes('markdown')) {
finalAnalysis = await llmService.generateFinalAnalysis(
options.query,
state.getFindings(),
state.getSummaries(),
options.analysisPrompt,
);
}
await state.addActivity({
await state.addActivity([{
type: "synthesis",
status: "complete",
message: "Research completed",
timestamp: new Date().toISOString(),
depth: state.getCurrentDepth(),
});
}]);
const progress = state.getProgress();
logger.debug("[Deep Research] Research completed successfully");
@ -283,7 +302,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
success: true,
message: "Research completed",
num_docs: 1,
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources(), json: finalAnalysisJson }],
time_taken: (Date.now() - startTime) / 1000,
team_id: teamId,
mode: "deep-research",
@ -296,6 +315,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
await updateDeepResearch(researchId, {
status: "completed",
finalAnalysis: finalAnalysis,
json: finalAnalysisJson,
});
// Bill team for usage based on URLs analyzed
billTeam(teamId, subId, Math.min(urlsAnalyzed, options.maxUrls), logger).catch(
@ -310,6 +330,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
data: {
finalAnalysis: finalAnalysis,
sources: state.getSources(),
json: finalAnalysisJson,
},
};
} catch (error: any) {

View File

@ -6,7 +6,7 @@ import {
updateDeepResearch,
} from "./deep-research-redis";
import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
import { ExtractOptions } from "../../controllers/v1/types";
interface AnalysisResult {
gaps: string[];
nextSteps: string[];
@ -50,13 +50,13 @@ export class ResearchStateManager {
return this.seenUrls;
}
async addActivity(activity: DeepResearchActivity): Promise<void> {
if (activity.status === "complete") {
async addActivity(activities: DeepResearchActivity[]): Promise<void> {
if (activities.some(activity => activity.status === "complete")) {
this.completedSteps++;
}
await updateDeepResearch(this.researchId, {
activities: [activity],
activities: activities,
completedSteps: this.completedSteps,
});
}
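
With the new signature every call site passes an array, even for a single activity, and related updates can be batched into one write, as the service changes above do. A minimal sketch of the batched form, using the same activity fields seen throughout this diff:

// Batch several related activities into a single update.
await state.addActivity(searchQueries.map((q) => ({
  type: "search",
  status: "processing",
  message: `Searching for "${q.query}"`,
  timestamp: new Date().toISOString(),
  depth: state.getCurrentDepth(),
})));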
@ -199,6 +199,7 @@ export class ResearchLLMService {
findings: DeepResearchFinding[],
currentTopic: string,
timeRemaining: number,
systemPrompt: string,
): Promise<AnalysisResult | null> {
try {
const timeRemainingMinutes =
@ -211,6 +212,7 @@ export class ResearchLLMService {
options: {
mode: "llm",
systemPrompt:
systemPrompt +
"You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " +
new Date().toISOString().split("T")[0],
schema: {
@ -254,33 +256,48 @@ export class ResearchLLMService {
findings: DeepResearchFinding[],
summaries: string[],
analysisPrompt: string,
): Promise<string> {
formats?: string[],
jsonOptions?: ExtractOptions,
): Promise<any> {
if(!formats) {
formats = ['markdown'];
}
if(!jsonOptions) {
jsonOptions = undefined;
}
const { extract } = await generateCompletions({
logger: this.logger.child({
method: "generateFinalAnalysis",
}),
mode: "no-object",
mode: formats.includes('json') ? 'object' : 'no-object',
options: {
mode: "llm",
systemPrompt:
"You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
new Date().toISOString().split("T")[0],
...(formats.includes('json') && {
...jsonOptions
}),
systemPrompt: formats.includes('json')
? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
: "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
new Date().toISOString().split("T")[0],
prompt: trimToTokenLimit(
analysisPrompt
? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
: formats.includes('json')
? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
: `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
Research data:
${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
Requirements:
- Format the report in Markdown with proper headers and sections
- Include specific citations to sources where appropriate
- Provide detailed analysis in each section
- Make it comprehensive and thorough (aim for 4+ pages worth of content)
- Include all relevant findings and insights from the research
- Cite sources
- Use bullet points and lists where appropriate for readability`,
100000,
).text,
},
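
Putting the signature change together, a sketch of how the two output modes are requested from generateFinalAnalysis, mirroring the calls made in the service file earlier in this commit (the schema here is illustrative):

// Markdown report: formats/jsonOptions omitted, so the default "markdown" path runs.
const report = await llmService.generateFinalAnalysis(
  query,
  state.getFindings(),
  state.getSummaries(),
  analysisPrompt,
);

// Structured output: "json" in formats switches generateCompletions to object mode,
// and the jsonOptions (prompt/schema) are spread into the LLM options.
const structured = await llmService.generateFinalAnalysis(
  query,
  state.getFindings(),
  state.getSummaries(),
  analysisPrompt,
  ["json"],
  { schema: { type: "object", properties: { keyFindings: { type: "array", items: { type: "string" } } } } },
);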

View File

@ -413,6 +413,9 @@ const processDeepResearchJobInternal = async (
subId: job.data.subId,
maxUrls: job.data.request.maxUrls,
analysisPrompt: job.data.request.analysisPrompt,
systemPrompt: job.data.request.systemPrompt,
formats: job.data.request.formats,
jsonOptions: job.data.request.jsonOptions,
});
if(result.success) {

View File

@ -356,7 +356,7 @@ export interface CrawlErrorsResponse {
* Parameters for deep research operations.
* Defines options for conducting deep research on a query.
*/
export interface DeepResearchParams {
export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
/**
* Maximum depth of research iterations (1-10)
* @default 7
@ -376,10 +376,26 @@ export interface DeepResearchParams {
* The prompt to use for the final analysis
*/
analysisPrompt?: string;
/**
* The system prompt to use for the research agent
*/
systemPrompt?: string;
/**
* The formats to use for the final analysis
*/
formats?: ("markdown" | "json")[];
/**
* The JSON options to use for the final analysis
*/
jsonOptions?:{
prompt?: string;
schema?: LLMSchema;
systemPrompt?: string;
};
/**
* Experimental flag for streaming steps
*/
__experimental_streamSteps?: boolean;
// __experimental_streamSteps?: boolean;
}
/**
@ -1420,7 +1436,7 @@ export default class FirecrawlApp {
*/
async deepResearch(
query: string,
params: DeepResearchParams,
params: DeepResearchParams<zt.ZodSchema>,
onActivity?: (activity: {
type: string;
status: string;
@ -1505,12 +1521,31 @@ export default class FirecrawlApp {
* @param params - Parameters for the deep research operation.
* @returns The response containing the research job ID.
*/
async asyncDeepResearch(query: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
const headers = this.prepareHeaders();
let jsonData: any = { query, ...params };
if (jsonData?.jsonOptions?.schema) {
let schema = jsonData.jsonOptions.schema;
// Try parsing the schema as a Zod schema
try {
schema = zodToJsonSchema(schema);
} catch (error) {
}
jsonData = {
...jsonData,
jsonOptions: {
...jsonData.jsonOptions,
schema: schema,
},
};
}
try {
const response: AxiosResponse = await this.postRequest(
`${this.apiUrl}/v1/deep-research`,
{ query, ...params },
jsonData,
headers
);
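
An end-to-end sketch of the SDK surface added here, using a Zod schema that asyncDeepResearch converts to JSON Schema via zodToJsonSchema before sending the request. The package import and API key are assumptions for illustration; the params mirror DeepResearchParams above.

import { z } from "zod";
import FirecrawlApp from "@mendable/firecrawl-js"; // assumed published package name

const app = new FirecrawlApp({ apiKey: "fc-YOUR-KEY" });

const analysisSchema = z.object({
  summary: z.string(),
  keyFindings: z.array(z.string()),
});

const job = await app.asyncDeepResearch(
  "How are teams using structured outputs with research agents?",
  {
    maxUrls: 20,
    timeLimit: 300,
    formats: ["markdown", "json"],
    jsonOptions: { schema: analysisSchema, prompt: "Fill the schema from the research findings" },
  },
);

The synchronous deepResearch(query, params, onActivity) path accepts the same params; once the job completes, the server changes above expose the structured result next to finalAnalysis in a json field.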

View File

@ -49,6 +49,7 @@ class DeepResearchParams(pydantic.BaseModel):
timeLimit: Optional[int] = 270
maxUrls: Optional[int] = 20
analysisPrompt: Optional[str] = None
systemPrompt: Optional[str] = None
__experimental_streamSteps: Optional[bool] = None
class DeepResearchResponse(pydantic.BaseModel):
@ -1171,7 +1172,6 @@ class FirecrawlApp:
time.sleep(2) # Polling interval
return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
"""
Initiates an asynchronous deep research operation.
@ -1195,8 +1195,15 @@ class FirecrawlApp:
research_params = params
headers = self._prepare_headers()
json_data = {'query': query, **research_params.dict(exclude_none=True)}
# Handle json options schema if present
if 'jsonOptions' in json_data:
json_opts = json_data['jsonOptions']
if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
try:
response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
if response.status_code == 200: