(feat/deep-research-alpha) Added Max Urls, Sources and Fixes (#1271)

* Nick: fixes

* Nick:

* Update deep-research-status.ts
This commit is contained in:
Nicolas 2025-02-27 13:24:40 -03:00 committed by GitHub
parent 1d3757b391
commit 289e351c14
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 97 additions and 53 deletions

View File

@ -32,6 +32,8 @@ export async function deepResearchStatusController(
success: research.status === "failed" ? false : true,
data: {
finalAnalysis: research.finalAnalysis,
sources: research.sources,
activities: research.activities,
// completedSteps: research.completedSteps,
// totalSteps: research.totalExpectedSteps,
},
@ -40,6 +42,7 @@ export async function deepResearchStatusController(
currentDepth: research.currentDepth,
maxDepth: research.maxDepth,
status: research.status,
// DO NOT remove - backwards compatibility
activities: research.activities,
sources: research.sources,
// summaries: research.summaries,

View File

@ -8,6 +8,7 @@ import { z } from "zod";
// Request-body validation for the deep-research endpoint.
// Numeric bounds are enforced by zod; defaults apply when a field is omitted.
export const deepResearchRequestSchema = z.object({
topic: z.string().describe('The topic or question to research'),
maxDepth: z.number().min(1).max(10).default(7).describe('Maximum depth of research iterations'),
maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
// Streams intermediate steps; experimental flag, may change without notice.
__experimental_streamSteps: z.boolean().optional(),
});

View File

@ -89,6 +89,8 @@ export async function updateDeepResearch(
: current.summaries
};
await redisConnection.set("deep-research:" + id, JSON.stringify(updatedResearch));
await redisConnection.expire("deep-research:" + id, DEEP_RESEARCH_TTL);
}

View File

@ -13,14 +13,16 @@ interface DeepResearchServiceOptions {
plan: string;
topic: string;
maxDepth: number;
maxUrls: number;
timeLimit: number;
subId?: string;
}
export async function performDeepResearch(options: DeepResearchServiceOptions) {
const { researchId, teamId, plan, timeLimit, subId } = options;
const { researchId, teamId, plan, timeLimit, subId, maxUrls } = options;
const startTime = Date.now();
let currentTopic = options.topic;
let urlsAnalyzed = 0;
const logger = _logger.child({
module: "deep-research",
@ -41,7 +43,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
const llmService = new ResearchLLMService(logger);
try {
while (!state.hasReachedMaxDepth()) {
while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) {
logger.debug("[Deep Research] Current depth:", state.getCurrentDepth());
const timeElapsed = Date.now() - startTime;
if (timeElapsed >= timeLimit * 1000) {
@ -135,14 +137,22 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
}
// Filter out already seen URLs and track new ones
const newSearchResults = searchResults.filter((result) => {
const newSearchResults = searchResults.filter(async (result) => {
if (!result.url || state.hasSeenUrl(result.url)) {
return false;
}
state.addSeenUrl(result.url);
urlsAnalyzed++;
return true;
});
await state.addSources(newSearchResults.map((result) => ({
url: result.url ?? "",
title: result.title ?? "",
description: result.description ?? "",
icon: result.metadata?.favicon ?? "",
})));
logger.debug(
"[Deep Research] New unique results count:",
{ length: newSearchResults.length },
@ -272,7 +282,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
success: true,
message: "Research completed",
num_docs: 1,
docs: [{ finalAnalysis: finalAnalysis }],
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
time_taken: (Date.now() - startTime) / 1000,
team_id: teamId,
mode: "deep-research",
@ -281,17 +291,16 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
origin: "api",
num_tokens: 0,
tokens_billed: 0,
sources: {},
});
await updateDeepResearch(researchId, {
status: "completed",
finalAnalysis: finalAnalysis,
});
// Bill team for usage
billTeam(teamId, subId, state.getFindings().length, logger).catch(
// Bill team for usage based on URLs analyzed
billTeam(teamId, subId, urlsAnalyzed, logger).catch(
(error) => {
logger.error(
`Failed to bill team ${teamId} for ${state.getFindings().length} findings`, { teamId, count: state.getFindings().length, error },
`Failed to bill team ${teamId} for ${urlsAnalyzed} URLs analyzed`, { teamId, count: urlsAnalyzed, error },
);
},
);

View File

@ -25,7 +25,7 @@ export class ResearchStateManager {
private completedSteps: number = 0;
private readonly totalExpectedSteps: number;
private seenUrls: Set<string> = new Set();
private sources: DeepResearchSource[] = [];
constructor(
private readonly researchId: string,
private readonly teamId: string,
@ -61,9 +61,9 @@ export class ResearchStateManager {
});
}
async addSource(source: DeepResearchSource): Promise<void> {
async addSources(sources: DeepResearchSource[]): Promise<void> {
await updateDeepResearch(this.researchId, {
sources: [source],
sources: sources,
});
}
@ -136,6 +136,10 @@ export class ResearchStateManager {
getUrlToSearch(): string {
return this.urlToSearch;
}
// Accessor for the sources accumulated in memory during this research run.
// NOTE(review): returns the live array, not a copy — callers must not mutate.
getSources(): DeepResearchSource[] {
return this.sources;
}
}
export class ResearchLLMService {
@ -254,17 +258,12 @@ export class ResearchLLMService {
logger: this.logger.child({
method: "generateFinalAnalysis",
}),
mode: "no-object",
options: {
mode: "llm",
systemPrompt:
"You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
new Date().toISOString().split("T")[0],
schema: {
type: "object",
properties: {
report: { type: "string" },
},
},
prompt: trimToTokenLimit(
`Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
@ -285,6 +284,6 @@ export class ResearchLLMService {
markdown: "",
});
return extract.report;
return extract;
}
}

View File

@ -156,6 +156,7 @@ export async function generateCompletions({
previousWarning,
isExtractEndpoint,
model = getModel("gpt-4o-mini"),
mode = "object",
}: {
model?: LanguageModel;
logger: Logger;
@ -163,6 +164,7 @@ export async function generateCompletions({
markdown?: string;
previousWarning?: string;
isExtractEndpoint?: boolean;
mode?: "object" | "no-object";
}): Promise<{
extract: any;
numTokens: number;
@ -192,44 +194,67 @@ export async function generateCompletions({
markdown = trimmedMarkdown;
warning = trimWarning;
let schema = options.schema;
// Normalize the bad json schema users write (mogery)
if (schema && !(schema instanceof z.ZodType)) {
// let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}
if (schema && schema.type === "array") {
schema = {
type: "object",
properties: {
items: options.schema,
},
required: ["items"],
additionalProperties: false,
};
} else if (schema && typeof schema === "object" && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
}),
),
required: Object.keys(schema),
additionalProperties: false,
};
}
schema = normalizeSchema(schema);
}
try {
const prompt = options.prompt !== undefined
? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}`
: `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`;
// Plain-text path: skip schema handling entirely and return the raw model
// output (used e.g. by generateFinalAnalysis for Markdown reports).
if (mode === "no-object") {
  const result = await generateText({
    model: model,
    // `?? ""` guards against an omitted prompt — bare string concatenation
    // with undefined would inject the literal text "undefined" into the
    // prompt (the object path above guards this with a ternary).
    prompt: (options.prompt ?? "") + (markdown ? `\n\nData:${markdown}` : ""),
    temperature: options.temperature ?? 0,
    system: options.systemPrompt,
  });
  extract = result.text;
  return {
    extract,
    warning,
    numTokens,
    // Prompt tokens were counted before trimming; completion tokens come
    // from the provider's usage report (0 if unavailable).
    totalUsage: {
      promptTokens: numTokens,
      completionTokens: result.usage?.completionTokens ?? 0,
      totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
    },
    model: model.modelId,
  };
}
let schema = options.schema;
// Normalize the bad json schema users write (mogery)
if (schema && !(schema instanceof z.ZodType)) {
// let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}
if (schema && schema.type === "array") {
schema = {
type: "object",
properties: {
items: options.schema,
},
required: ["items"],
additionalProperties: false,
};
} else if (schema && typeof schema === "object" && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
}),
),
required: Object.keys(schema),
additionalProperties: false,
};
}
schema = normalizeSchema(schema);
}
const repairConfig = {
experimental_repairText: async ({ text, error }) => {
const { text: fixedText } = await generateText({
@ -241,7 +266,6 @@ export async function generateCompletions({
}
};
const generateObjectConfig = {
model: model,
prompt: prompt,

View File

@ -410,6 +410,7 @@ const processDeepResearchJobInternal = async (
maxDepth: job.data.request.maxDepth,
timeLimit: job.data.request.timeLimit,
subId: job.data.subId,
maxUrls: job.data.request.maxUrls,
});
if(result.success) {

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.18.3-beta.1",
"version": "1.18.4",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -364,6 +364,11 @@ export interface DeepResearchParams {
* @default 270
*/
timeLimit?: number;
/**
* Maximum number of URLs to analyze (1-1000)
* @default 20
*/
maxUrls?: number;
/**
* Experimental flag for streaming steps
*/