(feat/deep-research-alpha) Added Max Urls, Sources and Fixes (#1271)

* Nick: fixes

* Nick:

* Update deep-research-status.ts
This commit is contained in:
Nicolas 2025-02-27 13:24:40 -03:00 committed by GitHub
parent 1d3757b391
commit 289e351c14
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 97 additions and 53 deletions

View File

@ -32,6 +32,8 @@ export async function deepResearchStatusController(
success: research.status === "failed" ? false : true,
data: {
finalAnalysis: research.finalAnalysis,
sources: research.sources,
activities: research.activities,
// completedSteps: research.completedSteps,
// totalSteps: research.totalExpectedSteps,
},
@ -40,6 +42,7 @@ export async function deepResearchStatusController(
currentDepth: research.currentDepth,
maxDepth: research.maxDepth,
status: research.status,
// DO NOT remove - backwards compatibility
activities: research.activities,
sources: research.sources,
// summaries: research.summaries,

View File

@ -8,6 +8,7 @@ import { z } from "zod";
// Request-body validation for the deep-research endpoint.
// Numeric bounds are enforced by zod; defaults apply when a field is omitted.
export const deepResearchRequestSchema = z.object({
topic: z.string().describe('The topic or question to research'),
maxDepth: z.number().min(1).max(10).default(7).describe('Maximum depth of research iterations'),
maxUrls: z.number().min(1).max(1000).default(20).describe('Maximum number of URLs to analyze'),
timeLimit: z.number().min(30).max(600).default(300).describe('Time limit in seconds'),
// Streams intermediate steps; experimental flag, may change without notice.
__experimental_streamSteps: z.boolean().optional(),
});

View File

@ -89,6 +89,8 @@ export async function updateDeepResearch(
: current.summaries
};
await redisConnection.set("deep-research:" + id, JSON.stringify(updatedResearch));
await redisConnection.expire("deep-research:" + id, DEEP_RESEARCH_TTL);
}

View File

@ -13,14 +13,16 @@ interface DeepResearchServiceOptions {
plan: string;
topic: string;
maxDepth: number;
maxUrls: number;
timeLimit: number;
subId?: string;
}
export async function performDeepResearch(options: DeepResearchServiceOptions) {
const { researchId, teamId, plan, timeLimit, subId } = options;
const { researchId, teamId, plan, timeLimit, subId, maxUrls } = options;
const startTime = Date.now();
let currentTopic = options.topic;
let urlsAnalyzed = 0;
const logger = _logger.child({
module: "deep-research",
@ -41,7 +43,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
const llmService = new ResearchLLMService(logger);
try {
while (!state.hasReachedMaxDepth()) {
while (!state.hasReachedMaxDepth() && urlsAnalyzed < maxUrls) {
logger.debug("[Deep Research] Current depth:", state.getCurrentDepth());
const timeElapsed = Date.now() - startTime;
if (timeElapsed >= timeLimit * 1000) {
@ -135,14 +137,22 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
}
// Filter out already seen URLs and track new ones
const newSearchResults = searchResults.filter((result) => {
const newSearchResults = searchResults.filter(async (result) => {
if (!result.url || state.hasSeenUrl(result.url)) {
return false;
}
state.addSeenUrl(result.url);
urlsAnalyzed++;
return true;
});
await state.addSources(newSearchResults.map((result) => ({
url: result.url ?? "",
title: result.title ?? "",
description: result.description ?? "",
icon: result.metadata?.favicon ?? "",
})));
logger.debug(
"[Deep Research] New unique results count:",
{ length: newSearchResults.length },
@ -272,7 +282,7 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
success: true,
message: "Research completed",
num_docs: 1,
docs: [{ finalAnalysis: finalAnalysis }],
docs: [{ finalAnalysis: finalAnalysis, sources: state.getSources() }],
time_taken: (Date.now() - startTime) / 1000,
team_id: teamId,
mode: "deep-research",
@ -281,17 +291,16 @@ export async function performDeepResearch(options: DeepResearchServiceOptions) {
origin: "api",
num_tokens: 0,
tokens_billed: 0,
sources: {},
});
await updateDeepResearch(researchId, {
status: "completed",
finalAnalysis: finalAnalysis,
});
// Bill team for usage
billTeam(teamId, subId, state.getFindings().length, logger).catch(
// Bill team for usage based on URLs analyzed
billTeam(teamId, subId, urlsAnalyzed, logger).catch(
(error) => {
logger.error(
`Failed to bill team ${teamId} for ${state.getFindings().length} findings`, { teamId, count: state.getFindings().length, error },
`Failed to bill team ${teamId} for ${urlsAnalyzed} URLs analyzed`, { teamId, count: urlsAnalyzed, error },
);
},
);

View File

@ -25,7 +25,7 @@ export class ResearchStateManager {
private completedSteps: number = 0;
private readonly totalExpectedSteps: number;
private seenUrls: Set<string> = new Set();
private sources: DeepResearchSource[] = [];
constructor(
private readonly researchId: string,
private readonly teamId: string,
@ -61,9 +61,9 @@ export class ResearchStateManager {
});
}
async addSource(source: DeepResearchSource): Promise<void> {
async addSources(sources: DeepResearchSource[]): Promise<void> {
await updateDeepResearch(this.researchId, {
sources: [source],
sources: sources,
});
}
@ -136,6 +136,10 @@ export class ResearchStateManager {
getUrlToSearch(): string {
return this.urlToSearch;
}
// Accessor for the sources accumulated in memory during this research run.
// NOTE(review): returns the live array, not a copy — callers must not mutate.
getSources(): DeepResearchSource[] {
return this.sources;
}
}
export class ResearchLLMService {
@ -254,17 +258,12 @@ export class ResearchLLMService {
logger: this.logger.child({
method: "generateFinalAnalysis",
}),
mode: "no-object",
options: {
mode: "llm",
systemPrompt:
"You are an expert research analyst who creates comprehensive, well-structured reports. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
new Date().toISOString().split("T")[0],
schema: {
type: "object",
properties: {
report: { type: "string" },
},
},
prompt: trimToTokenLimit(
`Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
@ -285,6 +284,6 @@ export class ResearchLLMService {
markdown: "",
});
return extract.report;
return extract;
}
}

View File

@ -156,6 +156,7 @@ export async function generateCompletions({
previousWarning,
isExtractEndpoint,
model = getModel("gpt-4o-mini"),
mode = "object",
}: {
model?: LanguageModel;
logger: Logger;
@ -163,6 +164,7 @@ export async function generateCompletions({
markdown?: string;
previousWarning?: string;
isExtractEndpoint?: boolean;
mode?: "object" | "no-object";
}): Promise<{
extract: any;
numTokens: number;
@ -192,44 +194,67 @@ export async function generateCompletions({
markdown = trimmedMarkdown;
warning = trimWarning;
let schema = options.schema;
// Normalize the bad json schema users write (mogery)
if (schema && !(schema instanceof z.ZodType)) {
// let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}
if (schema && schema.type === "array") {
schema = {
type: "object",
properties: {
items: options.schema,
},
required: ["items"],
additionalProperties: false,
};
} else if (schema && typeof schema === "object" && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
}),
),
required: Object.keys(schema),
additionalProperties: false,
};
}
schema = normalizeSchema(schema);
}
try {
const prompt = options.prompt !== undefined
? `Transform the following content into structured JSON output based on the provided schema and this user request: ${options.prompt}. If schema is provided, strictly follow it.\n\n${markdown}`
: `Transform the following content into structured JSON output based on the provided schema if any.\n\n${markdown}`;
// Plain-text path: skip schema handling entirely and return the raw model
// output (used e.g. by generateFinalAnalysis for Markdown reports).
if (mode === "no-object") {
  const result = await generateText({
    model: model,
    // `?? ""` guards against an omitted prompt — bare string concatenation
    // with undefined would inject the literal text "undefined" into the
    // prompt (the object path above guards this with a ternary).
    prompt: (options.prompt ?? "") + (markdown ? `\n\nData:${markdown}` : ""),
    temperature: options.temperature ?? 0,
    system: options.systemPrompt,
  });
  extract = result.text;
  return {
    extract,
    warning,
    numTokens,
    // Prompt tokens were counted before trimming; completion tokens come
    // from the provider's usage report (0 if unavailable).
    totalUsage: {
      promptTokens: numTokens,
      completionTokens: result.usage?.completionTokens ?? 0,
      totalTokens: numTokens + (result.usage?.completionTokens ?? 0),
    },
    model: model.modelId,
  };
}
let schema = options.schema;
// Normalize the bad json schema users write (mogery)
if (schema && !(schema instanceof z.ZodType)) {
// let schema = options.schema;
if (schema) {
schema = removeDefaultProperty(schema);
}
if (schema && schema.type === "array") {
schema = {
type: "object",
properties: {
items: options.schema,
},
required: ["items"],
additionalProperties: false,
};
} else if (schema && typeof schema === "object" && !schema.type) {
schema = {
type: "object",
properties: Object.fromEntries(
Object.entries(schema).map(([key, value]) => {
return [key, removeDefaultProperty(value)];
}),
),
required: Object.keys(schema),
additionalProperties: false,
};
}
schema = normalizeSchema(schema);
}
const repairConfig = {
experimental_repairText: async ({ text, error }) => {
const { text: fixedText } = await generateText({
@ -241,7 +266,6 @@ export async function generateCompletions({
}
};
const generateObjectConfig = {
model: model,
prompt: prompt,

View File

@ -410,6 +410,7 @@ const processDeepResearchJobInternal = async (
maxDepth: job.data.request.maxDepth,
timeLimit: job.data.request.timeLimit,
subId: job.data.subId,
maxUrls: job.data.request.maxUrls,
});
if(result.success) {

View File

@ -1,6 +1,6 @@
{
"name": "@mendable/firecrawl-js",
"version": "1.18.3-beta.1",
"version": "1.18.4",
"description": "JavaScript SDK for Firecrawl API",
"main": "dist/index.js",
"types": "dist/index.d.ts",

View File

@ -364,6 +364,11 @@ export interface DeepResearchParams {
* @default 270
*/
timeLimit?: number;
/**
* Maximum number of URLs to analyze (1-1000)
* @default 20
*/
maxUrls?: number;
/**
* Experimental flag for streaming steps
*/