From b67b04ff5d2adaa1507c3cfb4a220de7ab1e7873 Mon Sep 17 00:00:00 2001
From: Li Xin <lixin.henry@bytedance.com>
Date: Mon, 21 Apr 2025 19:50:34 +0800
Subject: [PATCH] feat: support multi-language

---
 src/graph/nodes.py           | 18 +++++++------
 src/graph/types.py           |  4 ++-
 src/prompts/coder.md         | 14 +++++-----
 src/prompts/planner.md       | 51 ++++++++++++++++++------------------
 src/prompts/planner_model.py |  8 ++++--
 src/prompts/researcher.md    |  4 +--
 6 files changed, 54 insertions(+), 45 deletions(-)

diff --git a/src/graph/nodes.py b/src/graph/nodes.py
index 4a7a5ca..d3397eb 100644
--- a/src/graph/nodes.py
+++ b/src/graph/nodes.py
@@ -1,22 +1,23 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 
-import logging
 import json
-from typing import Literal, Annotated
+import logging
+from typing import Annotated, Literal
 
-from langchain_core.messages import HumanMessage, AIMessage
-from langchain_core.tools import tool
+from langchain_core.messages import AIMessage, HumanMessage
 from langchain_core.runnables import RunnableConfig
+from langchain_core.tools import tool
 from langgraph.types import Command, interrupt
 
-from src.llms.llm import get_llm_by_type
+from src.agents.agents import coder_agent, research_agent
 from src.config.agents import AGENT_LLM_MAP
 from src.config.configuration import Configuration
-from src.prompts.template import apply_prompt_template
+from src.llms.llm import get_llm_by_type
 from src.prompts.planner_model import Plan, StepType
+from src.prompts.template import apply_prompt_template
 from src.utils.json_utils import repair_json_output
-from src.agents.agents import research_agent, coder_agent
+
 from .types import State
 
 logger = logging.getLogger(__name__)
@@ -117,6 +118,7 @@ def human_feedback_node(
         update={
             "current_plan": Plan.model_validate(new_plan),
             "plan_iterations": plan_iterations,
+            "locale": new_plan["locale"],
         },
         goto=goto,
     )
@@ -209,7 +211,7 @@ def _execute_agent_step(
     agent_input = {
         "messages": [
             HumanMessage(
-                content=f"#Task\n\n##title\n\n{step.title}\n\n##description\n\n{step.description}"
+                content=f"#Task\n\n##title\n\n{step.title}\n\n##description\n\n{step.description}\n\n##locale\n\n{state.get('locale', 'en-US')}"
             )
         ]
     }
diff --git a/src/graph/types.py b/src/graph/types.py
index 71a208e..128bc67 100644
--- a/src/graph/types.py
+++ b/src/graph/types.py
@@ -2,9 +2,10 @@
 # SPDX-License-Identifier: MIT
 
 import operator
+from typing import Annotated
 
 from langgraph.graph import MessagesState
-from typing import Annotated
+
 from src.prompts.planner_model import Plan
 
 
@@ -12,6 +13,7 @@ class State(MessagesState):
     """State for the agent system, extends MessagesState with next field."""
 
     # Runtime Variables
+    locale: str = "en-US"
     observations: Annotated[list[str], operator.add] = []
     plan_iterations: int = 0
     current_plan: Plan | str = None
diff --git a/src/prompts/coder.md b/src/prompts/coder.md
index 8222f74..9093044 100644
--- a/src/prompts/coder.md
+++ b/src/prompts/coder.md
@@ -23,12 +23,12 @@ You are a professional software engineer proficient in Python scripting. Your ta
 - Use comments in code to improve readability and maintainability.
 - If you want to see the output of a value, you MUST print it out with `print(...)`.
 - Always and only use Python to do the math.
-- Always use the same language as the initial question.
 - Always use `yfinance` for financial market data:
-  - Get historical data with `yf.download()`
-  - Access company info with `Ticker` objects
-  - Use appropriate date ranges for data retrieval
+    - Get historical data with `yf.download()`
+    - Access company info with `Ticker` objects
+    - Use appropriate date ranges for data retrieval
 - Required Python packages are pre-installed:
-  - `pandas` for data manipulation
-  - `numpy` for numerical operations
-  - `yfinance` for financial market data
+    - `pandas` for data manipulation
+    - `numpy` for numerical operations
+    - `yfinance` for financial market data
+- Always output in the locale of **{{ locale }}**.
diff --git a/src/prompts/planner.md b/src/prompts/planner.md
index 619187b..1c5d4be 100644
--- a/src/prompts/planner.md
+++ b/src/prompts/planner.md
@@ -14,7 +14,7 @@ As a Deep Researcher, you can breakdown the major subject into sub-topics and ex
 
 The successful research plan must meet these standards:
 
-1. **Comprehensive Coverage**: 
+1. **Comprehensive Coverage**:
    - Information must cover ALL aspects of the topic
    - Multiple perspectives must be represented
    - Both mainstream and alternative viewpoints should be included
@@ -74,51 +74,51 @@ Different types of steps have different web search requirements:
 ## Exclusions
 
 - **No Direct Calculations in Research Steps**:
-  - Research steps should only gather data and information
-  - All mathematical calculations must be handled by processing steps
-  - Numerical analysis must be delegated to processing steps
-  - Research steps focus on information gathering only
+    - Research steps should only gather data and information
+    - All mathematical calculations must be handled by processing steps
+    - Numerical analysis must be delegated to processing steps
+    - Research steps focus on information gathering only
 
 ## Analysis Framework
 
 When planning information gathering, consider these key aspects and ensure COMPREHENSIVE coverage:
 
-1. **Historical Context**: 
+1. **Historical Context**:
    - What historical data and trends are needed?
    - What is the complete timeline of relevant events?
    - How has the subject evolved over time?
 
-2. **Current State**: 
+2. **Current State**:
    - What current data points need to be collected?
    - What is the present landscape/situation in detail?
    - What are the most recent developments?
 
-3. **Future Indicators**: 
+3. **Future Indicators**:
    - What predictive data or future-oriented information is required?
    - What are all relevant forecasts and projections?
    - What potential future scenarios should be considered?
 
-4. **Stakeholder Data**: 
+4. **Stakeholder Data**:
    - What information about ALL relevant stakeholders is needed?
    - How are different groups affected or involved?
    - What are the various perspectives and interests?
 
-5. **Quantitative Data**: 
+5. **Quantitative Data**:
    - What comprehensive numbers, statistics, and metrics should be gathered?
    - What numerical data is needed from multiple sources?
    - What statistical analyses are relevant?
 
-6. **Qualitative Data**: 
+6. **Qualitative Data**:
    - What non-numerical information needs to be collected?
    - What opinions, testimonials, and case studies are relevant?
    - What descriptive information provides context?
 
-7. **Comparative Data**: 
+7. **Comparative Data**:
    - What comparison points or benchmark data are required?
    - What similar cases or alternatives should be examined?
    - How does this compare across different contexts?
 
-8. **Risk Data**: 
+8. **Risk Data**:
    - What information about ALL potential risks should be gathered?
    - What are the challenges, limitations, and obstacles?
    - What contingencies and mitigations exist?
@@ -135,16 +135,16 @@ When planning information gathering, consider these key aspects and ensure COMPR
 - To begin with, repeat user's requirement in your own words as `thought`.
 - Rigorously assess if there is sufficient context to answer the question using the strict criteria above.
 - If context is sufficient:
-  - Set `has_enough_context` to true
-  - No need to create information gathering steps
+    - Set `has_enough_context` to true
+    - No need to create information gathering steps
 - If context is insufficient (default assumption):
-  - Break down the required information using the Analysis Framework
-  - Create NO MORE THAN {{ max_step_num }} focused and comprehensive steps that cover the most essential aspects
-  - Ensure each step is substantial and covers related information categories
-  - Prioritize breadth and depth within the {{ max_step_num }}-step constraint
-  - For each step, carefully assess if web search is needed:
-    - Research and external data gathering: Set `need_web_search: true`
-    - Internal data processing: Set `need_web_search: false`
+    - Break down the required information using the Analysis Framework
+    - Create NO MORE THAN {{ max_step_num }} focused and comprehensive steps that cover the most essential aspects
+    - Ensure each step is substantial and covers related information categories
+    - Prioritize breadth and depth within the {{ max_step_num }}-step constraint
+    - For each step, carefully assess if web search is needed:
+        - Research and external data gathering: Set `need_web_search: true`
+        - Internal data processing: Set `need_web_search: false`
 - Specify the exact data to be collected in step's `description`. Include a `note` if necessary.
 - Prioritize depth and volume of relevant information - limited information is not acceptable.
 - Use the same language as the user to generate the plan.
@@ -163,6 +163,7 @@ interface Step {
 }
 
 interface Plan {
+  locale: string; // e.g. "en-US" or "zh-CN", based on the user's language or specific request
   has_enough_context: boolean;
   thought: string;
   title: string;
@@ -179,7 +180,7 @@ interface Plan {
 - Never settle for minimal information - the goal is a comprehensive, detailed final report
 - Limited or insufficient information will lead to an inadequate final report
 - Carefully assess each step's web search requirement based on its nature:
-  - Research steps (`need_web_search: true`) for gathering information
-  - Processing steps (`need_web_search: false`) for calculations and data processing
+    - Research steps (`need_web_search: true`) for gathering information
+    - Processing steps (`need_web_search: false`) for calculations and data processing
 - Default to gathering more information unless the strictest sufficient context criteria are met
-- Always Use the same language as the user
+- Always use the same language as the user
diff --git a/src/prompts/planner_model.py b/src/prompts/planner_model.py
index 615d701..b75d00b 100644
--- a/src/prompts/planner_model.py
+++ b/src/prompts/planner_model.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 
-from pydantic import BaseModel, Field
-from typing import List, Optional
 from enum import Enum
+from typing import List, Optional
+
+from pydantic import BaseModel, Field
 
 
 class StepType(str, Enum):
@@ -24,6 +25,9 @@ class Step(BaseModel):
 
 
 class Plan(BaseModel):
+    locale: str = Field(
+        ..., description="e.g. 'en-US' or 'zh-CN', based on the user's language"
+    )
     has_enough_context: bool
     thought: str
     title: str
diff --git a/src/prompts/researcher.md b/src/prompts/researcher.md
index 53699a2..208a6cf 100644
--- a/src/prompts/researcher.md
+++ b/src/prompts/researcher.md
@@ -33,7 +33,7 @@ You are dedicated to conducting thorough investigations and providing comprehens
 
       - [Source Title](https://example.com/page2)
       ```
-- Always use the same language as the initial question.
+- Always output in the locale of **{{ locale }}**.
 - DO NOT include inline citations in the text. Instead, track all sources and list them in the References section at the end using link reference format.
 
 # Notes
@@ -49,4 +49,4 @@ You are dedicated to conducting thorough investigations and providing comprehens
 - When presenting information from multiple sources, clearly indicate which source each piece of information comes from.
 - Include images using `![Image Description](image_url)` in a separate section.
 - The included images should **only** be from the information gathered **from the search results or the crawled content**. **Never** include images that are not from the search results or the crawled content.
-- Always use the same language as the initial question.
+- Always output in the locale of **{{ locale }}**.