mirror of
https://git.mirrors.martin98.com/https://github.com/mendableai/firecrawl
synced 2025-08-01 05:02:00 +08:00
(feat/deep-research) Deep Research Alpha v1 - Structured Outputs + Customizability (#1365)
* Nick: * Nick: structured output support * Nick: support for zod and pydantic
This commit is contained in:
parent
3ee58f7a9e
commit
a50dc106ef
@ -37,6 +37,7 @@ export async function deepResearchStatusController(
|
||||
finalAnalysis: research.finalAnalysis,
|
||||
sources: research.sources,
|
||||
activities: research.activities,
|
||||
json: research.json,
|
||||
// completedSteps: research.completedSteps,
|
||||
// totalSteps: research.totalExpectedSteps,
|
||||
},
|
||||
|
@ -1,5 +1,5 @@
|
||||
import { Request, Response } from "express";
|
||||
import { RequestWithAuth } from "./types";
|
||||
import { extractOptions, RequestWithAuth } from "./types";
|
||||
import { getDeepResearchQueue } from "../../services/queue-service";
|
||||
import * as Sentry from "@sentry/node";
|
||||
import { saveDeepResearch } from "../../lib/deep-research/deep-research-redis";
|
||||
@ -11,10 +11,19 @@ export const deepResearchRequestSchema = z.object({
|
||||
maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
|
||||
timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
|
||||
analysisPrompt: z.string().describe('The prompt to use for the final analysis').optional(),
|
||||
systemPrompt: z.string().describe('The system prompt to use for the research agent').optional(),
|
||||
formats: z.array(z.enum(['markdown', 'json'])).default(['markdown']),
|
||||
// @deprecated Use query instead
|
||||
topic: z.string().describe('The topic or question to research').optional(),
|
||||
jsonOptions: extractOptions.optional(),
|
||||
}).refine(data => data.query || data.topic, {
|
||||
message: "Either query or topic must be provided"
|
||||
}).refine((obj) => {
|
||||
const hasJsonFormat = obj.formats?.includes("json");
|
||||
const hasJsonOptions = obj.jsonOptions !== undefined;
|
||||
return (hasJsonFormat && hasJsonOptions) || (!hasJsonFormat && !hasJsonOptions);
|
||||
}, {
|
||||
message: "When 'json' format is specified, jsonOptions must be provided, and vice versa"
|
||||
}).transform(data => ({
|
||||
...data,
|
||||
query: data.topic || data.query // Use topic as query if provided
|
||||
|
@ -45,6 +45,7 @@ export type StoredDeepResearch = {
|
||||
activities: DeepResearchActivity[];
|
||||
summaries: string[];
|
||||
finalAnalysis?: string;
|
||||
json?: any;
|
||||
};
|
||||
|
||||
// TTL of 6 hours
|
||||
|
@ -5,6 +5,7 @@ import { searchAndScrapeSearchResult } from "../../controllers/v1/search";
|
||||
import { ResearchLLMService, ResearchStateManager } from "./research-manager";
|
||||
import { logJob } from "../../services/logging/log_job";
|
||||
import { billTeam } from "../../services/billing/credit_billing";
|
||||
import { ExtractOptions } from "../../controllers/v1/types";
|
||||
|
||||
interface DeepResearchServiceOptions {
|
||||
researchId: string;
|
||||
@ -15,6 +16,9 @@ interface DeepResearchServiceOptions {
|
||||
maxUrls: number;
|
||||
timeLimit: number;
|
||||
analysisPrompt: string;
|
||||
systemPrompt: string;
|
||||
formats: string[];
|
||||
jsonOptions: ExtractOptions;
|
||||
subId?: string;
|
||||
}
|
||||
|
||||
@ -54,13 +58,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
await state.incrementDepth();
|
||||
|
||||
// Search phase
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "search",
|
||||
status: "processing",
|
||||
message: `Generating deeper search queries for "${currentTopic}"`,
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
|
||||
const nextSearchTopic = state.getNextSearchTopic();
|
||||
logger.debug("[Deep Research] Next search topic:", { nextSearchTopic });
|
||||
@ -74,23 +78,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
|
||||
logger.debug("[Deep Research] Generated search queries:", { searchQueries });
|
||||
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "search",
|
||||
status: "processing",
|
||||
message: `Starting ${searchQueries.length} parallel searches for "${currentTopic}"`,
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
await state.addActivity(searchQueries.map(searchQuery => ({
|
||||
type: "search",
|
||||
status: "processing",
|
||||
message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
})))
|
||||
|
||||
// Run all searches in parallel
|
||||
const searchPromises = searchQueries.map(async (searchQuery) => {
|
||||
await state.addActivity({
|
||||
type: "search",
|
||||
status: "processing",
|
||||
message: `Searching for "${searchQuery.query}" - Goal: ${searchQuery.researchGoal}`,
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
|
||||
const response = await searchAndScrapeSearchResult(searchQuery.query, {
|
||||
teamId: options.teamId,
|
||||
@ -126,13 +130,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
"[Deep Research] No results found for topic:",
|
||||
{ currentTopic },
|
||||
);
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "search",
|
||||
status: "error",
|
||||
message: `No results found for any queries about "${currentTopic}"`,
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
continue;
|
||||
}
|
||||
|
||||
@ -163,23 +167,23 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
"[Deep Research] No new unique results found for topic:",
|
||||
{ currentTopic },
|
||||
);
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "search",
|
||||
status: "error",
|
||||
message: `Found ${searchResults.length} results but all URLs were already processed for "${currentTopic}"`,
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
continue;
|
||||
}
|
||||
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "search",
|
||||
status: "complete",
|
||||
message: `Found ${newSearchResults.length} new relevant results across ${searchQueries.length} parallel queries`,
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
|
||||
await state.addFindings(
|
||||
newSearchResults.map((result) => ({
|
||||
@ -189,13 +193,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
);
|
||||
|
||||
// Analysis phase
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "analyze",
|
||||
status: "processing",
|
||||
message: "Analyzing findings and planning next steps",
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
|
||||
const timeRemaining = timeLimit * 1000 - (Date.now() - startTime);
|
||||
logger.debug("[Deep Research] Time remaining (ms):", { timeRemaining });
|
||||
@ -204,17 +208,18 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
state.getFindings(),
|
||||
currentTopic,
|
||||
timeRemaining,
|
||||
options.systemPrompt ?? "",
|
||||
);
|
||||
|
||||
if (!analysis) {
|
||||
logger.debug("[Deep Research] Analysis failed");
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "analyze",
|
||||
status: "error",
|
||||
message: "Failed to analyze findings",
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
|
||||
state.incrementFailedAttempts();
|
||||
if (state.hasReachedMaxFailedAttempts()) {
|
||||
@ -232,13 +237,13 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
|
||||
state.setNextSearchTopic(analysis.nextSearchTopic || "");
|
||||
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "analyze",
|
||||
status: "complete",
|
||||
message: "Analyzed findings",
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
|
||||
if (!analysis.shouldContinue || analysis.gaps.length === 0) {
|
||||
logger.debug("[Deep Research] No more gaps to research, ending search");
|
||||
@ -251,28 +256,42 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
|
||||
// Final synthesis
|
||||
logger.debug("[Deep Research] Starting final synthesis");
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "synthesis",
|
||||
status: "processing",
|
||||
message: "Preparing final analysis",
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
|
||||
const finalAnalysis = await llmService.generateFinalAnalysis(
|
||||
options.query,
|
||||
state.getFindings(),
|
||||
state.getSummaries(),
|
||||
options.analysisPrompt,
|
||||
);
|
||||
let finalAnalysis = "";
|
||||
let finalAnalysisJson = null;
|
||||
if(options.formats.includes('json')) {
|
||||
finalAnalysisJson = await llmService.generateFinalAnalysis(
|
||||
options.query,
|
||||
state.getFindings(),
|
||||
state.getSummaries(),
|
||||
options.analysisPrompt,
|
||||
options.formats,
|
||||
options.jsonOptions,
|
||||
);
|
||||
}
|
||||
if(options.formats.includes('markdown')) {
|
||||
finalAnalysis = await llmService.generateFinalAnalysis(
|
||||
options.query,
|
||||
state.getFindings(),
|
||||
state.getSummaries(),
|
||||
options.analysisPrompt,
|
||||
);
|
||||
}
|
||||
|
||||
await state.addActivity({
|
||||
await state.addActivity([{
|
||||
type: "synthesis",
|
||||
status: "complete",
|
||||
message: "Research completed",
|
||||
timestamp: new Date().toISOString(),
|
||||
depth: state.getCurrentDepth(),
|
||||
});
|
||||
}]);
|
||||
|
||||
const progress = state.getProgress();
|
||||
logger.debug("[Deep Research] Research completed successfully");
|
||||
@ -283,7 +302,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
success: true,
|
||||
message: "Research completed",
|
||||
num_docs: 1,
|
||||
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
|
||||
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources(), json: finalAnalysisJson }],
|
||||
time_taken: (Date.now() - startTime) / 1000,
|
||||
team_id: teamId,
|
||||
mode: "deep-research",
|
||||
@ -296,6 +315,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
await updateDeepResearch(researchId, {
|
||||
status: "completed",
|
||||
finalAnalysis: finalAnalysis,
|
||||
json: finalAnalysisJson,
|
||||
});
|
||||
// Bill team for usage based on URLs analyzed
|
||||
billTeam(teamId, subId, Math.min(urlsAnalyzed, options.maxUrls), logger).catch(
|
||||
@ -310,6 +330,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
|
||||
data: {
|
||||
finalAnalysis: finalAnalysis,
|
||||
sources: state.getSources(),
|
||||
json: finalAnalysisJson,
|
||||
},
|
||||
};
|
||||
} catch (error: any) {
|
||||
|
@ -6,7 +6,7 @@ import {
|
||||
updateDeepResearch,
|
||||
} from "./deep-research-redis";
|
||||
import { generateCompletions, trimToTokenLimit } from "../../scraper/scrapeURL/transformers/llmExtract";
|
||||
|
||||
import { ExtractOptions } from "../../controllers/v1/types";
|
||||
interface AnalysisResult {
|
||||
gaps: string[];
|
||||
nextSteps: string[];
|
||||
@ -50,13 +50,13 @@ export class ResearchStateManager {
|
||||
return this.seenUrls;
|
||||
}
|
||||
|
||||
async addActivity(activity: DeepResearchActivity): Promise<void> {
|
||||
if (activity.status === "complete") {
|
||||
async addActivity(activities: DeepResearchActivity[]): Promise<void> {
|
||||
if (activities.some(activity => activity.status === "complete")) {
|
||||
this.completedSteps++;
|
||||
}
|
||||
|
||||
await updateDeepResearch(this.researchId, {
|
||||
activities: [activity],
|
||||
activities: activities,
|
||||
completedSteps: this.completedSteps,
|
||||
});
|
||||
}
|
||||
@ -199,6 +199,7 @@ export class ResearchLLMService {
|
||||
findings: DeepResearchFinding[],
|
||||
currentTopic: string,
|
||||
timeRemaining: number,
|
||||
systemPrompt: string,
|
||||
): Promise<AnalysisResult | null> {
|
||||
try {
|
||||
const timeRemainingMinutes =
|
||||
@ -211,6 +212,7 @@ export class ResearchLLMService {
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt:
|
||||
systemPrompt +
|
||||
"You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " +
|
||||
new Date().toISOString().split("T")[0],
|
||||
schema: {
|
||||
@ -254,33 +256,48 @@ export class ResearchLLMService {
|
||||
findings: DeepResearchFinding[],
|
||||
summaries: string[],
|
||||
analysisPrompt: string,
|
||||
): Promise<string> {
|
||||
formats?: string[],
|
||||
jsonOptions?: ExtractOptions,
|
||||
): Promise<any> {
|
||||
if(!formats) {
|
||||
formats = ['markdown'];
|
||||
}
|
||||
if(!jsonOptions) {
|
||||
jsonOptions = undefined;
|
||||
}
|
||||
|
||||
const { extract } = await generateCompletions({
|
||||
logger: this.logger.child({
|
||||
method: "generateFinalAnalysis",
|
||||
}),
|
||||
mode: "no-object",
|
||||
mode: formats.includes('json') ? 'object' : 'no-object',
|
||||
options: {
|
||||
mode: "llm",
|
||||
systemPrompt:
|
||||
"You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
|
||||
new Date().toISOString().split("T")[0],
|
||||
...(formats.includes('json') && {
|
||||
...jsonOptions
|
||||
}),
|
||||
systemPrompt: formats.includes('json')
|
||||
? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
|
||||
: "You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
|
||||
new Date().toISOString().split("T")[0],
|
||||
prompt: trimToTokenLimit(
|
||||
analysisPrompt
|
||||
? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
|
||||
: `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
|
||||
: formats.includes('json')
|
||||
? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
|
||||
: `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
|
||||
|
||||
Research data:
|
||||
${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
|
||||
Research data:
|
||||
${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
|
||||
|
||||
Requirements:
|
||||
- Format the report in Markdown with proper headers and sections
|
||||
- Include specific citations to sources where appropriate
|
||||
- Provide detailed analysis in each section
|
||||
- Make it comprehensive and thorough (aim for 4+ pages worth of content)
|
||||
- Include all relevant findings and insights from the research
|
||||
- Cite sources
|
||||
- Use bullet points and lists where appropriate for readability`,
|
||||
Requirements:
|
||||
- Format the report in Markdown with proper headers and sections
|
||||
- Include specific citations to sources where appropriate
|
||||
- Provide detailed analysis in each section
|
||||
- Make it comprehensive and thorough (aim for 4+ pages worth of content)
|
||||
- Include all relevant findings and insights from the research
|
||||
- Cite sources
|
||||
- Use bullet points and lists where appropriate for readability`,
|
||||
100000,
|
||||
).text,
|
||||
},
|
||||
|
@ -413,6 +413,9 @@ const processDeepResearchJobInternal = async (
|
||||
subId: job.data.subId,
|
||||
maxUrls: job.data.request.maxUrls,
|
||||
analysisPrompt: job.data.request.analysisPrompt,
|
||||
systemPrompt: job.data.request.systemPrompt,
|
||||
formats: job.data.request.formats,
|
||||
jsonOptions: job.data.request.jsonOptions,
|
||||
});
|
||||
|
||||
if(result.success) {
|
||||
|
@ -356,7 +356,7 @@ export interface CrawlErrorsResponse {
|
||||
* Parameters for deep research operations.
|
||||
* Defines options for conducting deep research on a query.
|
||||
*/
|
||||
export interface DeepResearchParams {
|
||||
export interface DeepResearchParams<LLMSchema extends zt.ZodSchema = any> {
|
||||
/**
|
||||
* Maximum depth of research iterations (1-10)
|
||||
* @default 7
|
||||
@ -376,10 +376,26 @@ export interface DeepResearchParams {
|
||||
* The prompt to use for the final analysis
|
||||
*/
|
||||
analysisPrompt?: string;
|
||||
/**
|
||||
* The system prompt to use for the research agent
|
||||
*/
|
||||
systemPrompt?: string;
|
||||
/**
|
||||
* The formats to use for the final analysis
|
||||
*/
|
||||
formats?: ("markdown" | "json")[];
|
||||
/**
|
||||
* The JSON options to use for the final analysis
|
||||
*/
|
||||
jsonOptions?:{
|
||||
prompt?: string;
|
||||
schema?: LLMSchema;
|
||||
systemPrompt?: string;
|
||||
};
|
||||
/**
|
||||
* Experimental flag for streaming steps
|
||||
*/
|
||||
__experimental_streamSteps?: boolean;
|
||||
// __experimental_streamSteps?: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
@ -1420,7 +1436,7 @@ export default class FirecrawlApp {
|
||||
*/
|
||||
async deepResearch(
|
||||
query: string,
|
||||
params: DeepResearchParams,
|
||||
params: DeepResearchParams<zt.ZodSchema>,
|
||||
onActivity?: (activity: {
|
||||
type: string;
|
||||
status: string;
|
||||
@ -1505,12 +1521,31 @@ export default class FirecrawlApp {
|
||||
* @param params - Parameters for the deep research operation.
|
||||
* @returns The response containing the research job ID.
|
||||
*/
|
||||
async asyncDeepResearch(query: string, params: DeepResearchParams): Promise<DeepResearchResponse | ErrorResponse> {
|
||||
async asyncDeepResearch(query: string, params: DeepResearchParams<zt.ZodSchema>): Promise<DeepResearchResponse | ErrorResponse> {
|
||||
const headers = this.prepareHeaders();
|
||||
let jsonData: any = { query, ...params };
|
||||
|
||||
if (jsonData?.jsonOptions?.schema) {
|
||||
let schema = jsonData.jsonOptions.schema;
|
||||
// Try parsing the schema as a Zod schema
|
||||
try {
|
||||
schema = zodToJsonSchema(schema);
|
||||
} catch (error) {
|
||||
|
||||
}
|
||||
jsonData = {
|
||||
...jsonData,
|
||||
jsonOptions: {
|
||||
...jsonData.jsonOptions,
|
||||
schema: schema,
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
try {
|
||||
const response: AxiosResponse = await this.postRequest(
|
||||
`${this.apiUrl}/v1/deep-research`,
|
||||
{ query, ...params },
|
||||
jsonData,
|
||||
headers
|
||||
);
|
||||
|
||||
|
@ -49,6 +49,7 @@ class DeepResearchParams(pydantic.BaseModel):
|
||||
timeLimit: Optional[int] = 270
|
||||
maxUrls: Optional[int] = 20
|
||||
analysisPrompt: Optional[str] = None
|
||||
systemPrompt: Optional[str] = None
|
||||
__experimental_streamSteps: Optional[bool] = None
|
||||
|
||||
class DeepResearchResponse(pydantic.BaseModel):
|
||||
@ -1171,7 +1172,6 @@ class FirecrawlApp:
|
||||
time.sleep(2) # Polling interval
|
||||
|
||||
return {'success': False, 'error': 'Deep research job terminated unexpectedly'}
|
||||
|
||||
def async_deep_research(self, query: str, params: Optional[Union[Dict[str, Any], DeepResearchParams]] = None) -> Dict[str, Any]:
|
||||
"""
|
||||
Initiates an asynchronous deep research operation.
|
||||
@ -1195,8 +1195,15 @@ class FirecrawlApp:
|
||||
research_params = params
|
||||
|
||||
headers = self._prepare_headers()
|
||||
|
||||
json_data = {'query': query, **research_params.dict(exclude_none=True)}
|
||||
|
||||
# Handle json options schema if present
|
||||
if 'jsonOptions' in json_data:
|
||||
json_opts = json_data['jsonOptions']
|
||||
if json_opts and 'schema' in json_opts and hasattr(json_opts['schema'], 'schema'):
|
||||
json_data['jsonOptions']['schema'] = json_opts['schema'].schema()
|
||||
|
||||
try:
|
||||
response = self._post_request(f'{self.api_url}/v1/deep-research', json_data, headers)
|
||||
if response.status_code == 200:
|
||||
|
Loading…
x
Reference in New Issue
Block a user