firecrawl/apps/api/src/lib/deep-research/research-manager.ts

import { Logger } from "winston";
import {
  DeepResearchActivity,
  DeepResearchFinding,
  DeepResearchSource,
  updateDeepResearch,
} from "./deep-research-redis";
import {
  generateCompletions,
  trimToTokenLimit,
} from "../../scraper/scrapeURL/transformers/llmExtract";
import { ExtractOptions } from "../../controllers/v1/types";
import { getModel } from "../generic-ai";

interface AnalysisResult {
  gaps: string[];
  nextSteps: string[];
  shouldContinue: boolean;
  nextSearchTopic?: string;
}
export class ResearchStateManager {
  private findings: DeepResearchFinding[] = [];
  private summaries: string[] = [];
  private nextSearchTopic: string = "";
  private urlToSearch: string = "";
  private currentDepth: number = 0;
  private failedAttempts: number = 0;
  private readonly maxFailedAttempts: number = 3;
  private completedSteps: number = 0;
  private readonly totalExpectedSteps: number;
  private seenUrls: Set<string> = new Set();
  private sources: DeepResearchSource[] = [];

  constructor(
    private readonly researchId: string,
    private readonly teamId: string,
    private readonly maxDepth: number,
    private readonly logger: Logger,
    private readonly topic: string,
  ) {
    this.totalExpectedSteps = maxDepth * 5; // 5 steps per depth level
    this.nextSearchTopic = topic;
  }

  hasSeenUrl(url: string): boolean {
    return this.seenUrls.has(url);
  }

  addSeenUrl(url: string): void {
    this.seenUrls.add(url);
  }

  getSeenUrls(): Set<string> {
    return this.seenUrls;
  }

  async addActivity(activities: DeepResearchActivity[]): Promise<void> {
    if (activities.some((activity) => activity.status === "complete")) {
      this.completedSteps++;
    }
    await updateDeepResearch(this.researchId, {
      activities: activities,
      completedSteps: this.completedSteps,
    });
  }

  async addSources(sources: DeepResearchSource[]): Promise<void> {
    await updateDeepResearch(this.researchId, {
      sources: sources,
    });
  }

  async addFindings(findings: DeepResearchFinding[]): Promise<void> {
    // Only keep the most recent 50 findings
    // To avoid memory issues for now
    this.findings = [...this.findings, ...findings].slice(-50);
    await updateDeepResearch(this.researchId, {
      findings: findings,
    });
  }

  async addSummary(summary: string): Promise<void> {
    this.summaries.push(summary);
    await updateDeepResearch(this.researchId, {
      summaries: [summary],
    });
  }

  async incrementDepth(): Promise<void> {
    this.currentDepth++;
    await updateDeepResearch(this.researchId, {
      currentDepth: this.currentDepth,
    });
  }

  incrementFailedAttempts(): void {
    this.failedAttempts++;
  }

  getFindings(): DeepResearchFinding[] {
    return this.findings;
  }

  getSummaries(): string[] {
    return this.summaries;
  }

  getCurrentDepth(): number {
    return this.currentDepth;
  }

  hasReachedMaxDepth(): boolean {
    return this.currentDepth >= this.maxDepth;
  }

  hasReachedMaxFailedAttempts(): boolean {
    return this.failedAttempts >= this.maxFailedAttempts;
  }

  getProgress(): { completedSteps: number; totalSteps: number } {
    return {
      completedSteps: this.completedSteps,
      totalSteps: this.totalExpectedSteps,
    };
  }

  setNextSearchTopic(topic: string): void {
    this.nextSearchTopic = topic;
  }

  getNextSearchTopic(): string {
    return this.nextSearchTopic;
  }

  setUrlToSearch(url: string): void {
    this.urlToSearch = url;
  }

  getUrlToSearch(): string {
    return this.urlToSearch;
  }

  getSources(): DeepResearchSource[] {
    return this.sources;
  }
}
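
// Usage sketch: one way a research loop might drive the state manager. The
// loop shape and the `performSearch` callback are illustrative assumptions,
// not the actual deep-research pipeline.
/*
async function runResearchLoop(
  manager: ResearchStateManager,
  performSearch: (query: string) => Promise<DeepResearchFinding[]>,
): Promise<void> {
  while (
    !manager.hasReachedMaxDepth() &&
    !manager.hasReachedMaxFailedAttempts()
  ) {
    const results = await performSearch(manager.getNextSearchTopic());
    // Skip sources that were already processed in a previous iteration.
    const fresh = results.filter((f) => !manager.hasSeenUrl(f.source));
    fresh.forEach((f) => manager.addSeenUrl(f.source));
    if (fresh.length === 0) {
      manager.incrementFailedAttempts();
    } else {
      await manager.addFindings(fresh);
    }
    await manager.incrementDepth();
  }
}
*/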
export class ResearchLLMService {
  constructor(private readonly logger: Logger) {}

  async generateSearchQueries(
    topic: string,
    findings: DeepResearchFinding[] = [],
  ): Promise<{ query: string; researchGoal: string }[]> {
    const { extract } = await generateCompletions({
      logger: this.logger.child({
        method: "generateSearchQueries",
      }),
      options: {
        mode: "llm",
        systemPrompt:
          "You are an expert research agent that generates search queries (SERP) to explore topics deeply and thoroughly. Do not generate repeated queries. Today's date is " +
          new Date().toISOString().split("T")[0],
        schema: {
          type: "object",
          properties: {
            queries: {
              type: "array",
              items: {
                type: "object",
                properties: {
                  query: {
                    type: "string",
                    description: "The search query to use",
                  },
                  researchGoal: {
                    type: "string",
                    description:
                      "The specific goal this query aims to achieve and how it advances the research",
                  },
                },
              },
            },
          },
        },
        prompt: `Generate a list of 3-5 search queries to deeply research this topic: "${topic}"
${findings.length > 0 ? `\nBased on these previous findings, generate more specific queries:\n${trimToTokenLimit(findings.map((f) => `- ${f.text}`).join("\n"), 10000).text}` : ""}
Each query should be specific and focused on a particular aspect.
Build upon previous findings when available.
Be specific and go deep, not wide - always following the original topic.
Every search query is a new SERP query so make sure the whole context is added without overwhelming the search engine.
The first SERP query you generate should be a very concise, simple version of the topic. `,
      },
      markdown: "",
    });

    return extract.queries;
  }
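
  // Illustrative call (a sketch, not part of the pipeline): the topic string
  // and logger are assumed to be supplied by the caller.
  //
  //   const service = new ResearchLLMService(logger);
  //   const queries = await service.generateSearchQueries(
  //     "impact of solid-state batteries on EV range",
  //   );
  //   // queries -> [{ query: "...", researchGoal: "..." }, ...]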
  async analyzeAndPlan(
    findings: DeepResearchFinding[],
    currentTopic: string,
    timeRemaining: number,
    systemPrompt: string,
  ): Promise<AnalysisResult | null> {
    try {
      const timeRemainingMinutes =
        Math.round((timeRemaining / 1000 / 60) * 10) / 10;
      const { extract } = await generateCompletions({
        logger: this.logger.child({
          method: "analyzeAndPlan",
        }),
        options: {
          mode: "llm",
          systemPrompt:
            systemPrompt +
            "You are an expert research agent that is analyzing findings. Your goal is to synthesize information and identify gaps for further research. Today's date is " +
            new Date().toISOString().split("T")[0],
          schema: {
            type: "object",
            properties: {
              analysis: {
                type: "object",
                properties: {
                  gaps: { type: "array", items: { type: "string" } },
                  nextSteps: { type: "array", items: { type: "string" } },
                  shouldContinue: { type: "boolean" },
                  nextSearchTopic: { type: "string" },
                },
                required: ["gaps", "nextSteps", "shouldContinue"],
              },
            },
          },
          prompt: trimToTokenLimit(
            `You are researching: ${currentTopic}
You have ${timeRemainingMinutes} minutes remaining to complete the research but you don't need to use all of it.
Current findings: ${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
What has been learned? What gaps remain, if any? What specific aspects should be investigated next if any?
If you need to search for more information inside the same topic pick a sub-topic by including a nextSearchTopic - which should be highly related to the original topic / user's query.
Important: If less than 1 minute remains, set shouldContinue to false to allow time for final synthesis.
If I have enough information, set shouldContinue to false.`,
            120000,
          ).text,
        },
        markdown: "",
      });

      return extract.analysis;
    } catch (error) {
      this.logger.error("Analysis error", { error });
      return null;
    }
  }
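
  // Illustrative follow-up (a sketch): how a caller might apply the returned
  // AnalysisResult. `state` and `deadline` are assumed to exist in the caller.
  //
  //   const analysis = await service.analyzeAndPlan(
  //     state.getFindings(),
  //     state.getNextSearchTopic(),
  //     deadline - Date.now(),
  //     "",
  //   );
  //   if (!analysis || !analysis.shouldContinue) {
  //     // stop searching and move on to final synthesis
  //   } else if (analysis.nextSearchTopic) {
  //     state.setNextSearchTopic(analysis.nextSearchTopic);
  //   }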
  async generateFinalAnalysis(
    topic: string,
    findings: DeepResearchFinding[],
    summaries: string[],
    analysisPrompt: string,
    formats?: string[],
    jsonOptions?: ExtractOptions,
  ): Promise<any> {
    if (!formats) {
      formats = ["markdown"];
    }
    if (!jsonOptions) {
      jsonOptions = undefined;
    }

    const { extract } = await generateCompletions({
      logger: this.logger.child({
        method: "generateFinalAnalysis",
      }),
      mode: formats.includes("json") ? "object" : "no-object",
      options: {
        mode: "llm",
        ...(formats.includes("json") && {
          ...jsonOptions,
        }),
        systemPrompt: formats.includes("json")
          ? "You are an expert research analyst who creates comprehensive, structured analysis following the provided JSON schema exactly."
          : "You are an expert research analyst who creates comprehensive, well-structured reports. Don't begin the report by saying 'Here is the report', nor 'Below is the report', nor something similar. ALWAYS start with a great title that reflects the research topic and findings. Your reports are detailed, properly formatted in Markdown, and include clear sections with citations. Today's date is " +
            new Date().toISOString().split("T")[0],
        prompt: trimToTokenLimit(
          analysisPrompt
            ? `${analysisPrompt}\n\nResearch data:\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
            : formats.includes("json")
              ? `Analyze the following research data on "${topic}" and structure the output according to the provided schema: Schema: ${JSON.stringify(jsonOptions?.schema)}\n\nFindings:\n\n${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}`
              : `Create a comprehensive research report on "${topic}" based on the collected findings and analysis.
Research data:
${findings.map((f) => `[From ${f.source}]: ${f.text}`).join("\n")}
Requirements:
- Format the report in Markdown with proper headers and sections
- Include specific citations to sources where appropriate
- Provide detailed analysis in each section
- Make it comprehensive and thorough (aim for 4+ pages worth of content)
- Include all relevant findings and insights from the research
- Cite sources
- Cite sources throughout the report
- Use bullet points and lists where appropriate for readability
- Don't begin the report by saying "Here is the report", nor "Below is the report", nor something similar.
- ALWAYS Start with a great title that reflects the research topic and findings - concise and to the point. That's the first thing you should output.
Begin!`,
          100000,
        ).text,
      },
      markdown: "",
      model: getModel("o3-mini"),
    });

    return extract;
  }
}
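
// Usage sketch: producing the final output once the loop stops. With the
// default ["markdown"] format the result is a report string; passing
// ["json"] together with jsonOptions returns a structured object instead.
// `logger`, `topic`, `state`, and `analysisPrompt` are assumed to be
// provided by the caller.
/*
const llm = new ResearchLLMService(logger);
const report = await llm.generateFinalAnalysis(
  topic,
  state.getFindings(),
  state.getSummaries(),
  analysisPrompt, // custom instructions; may be ""
  ["markdown"],
);
*/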