feat: integrate volcengine tts functionality

2025-10-03 20:46:37 +08:00 · 2025-04-18 15:28:31 +08:00 · 2025-04-18 15:28:31 +08:00 · a6ab97c970
commit a6ab97c970
parent b2f14d1737
6 changed files with 251 additions and 8 deletions
--- a/.env.example
+++ b/.env.example
@ -2,10 +2,14 @@
 DEBUG=True
 APP_ENV=development
-# Add other environment variables as needed
+# Search Engine
 # tavily, duckduckgo, brave_search, arxiv
 SEARCH_API=tavily
 TAVILY_API_KEY=tvly-xxx
 BRAVE_SEARCH_API_KEY=brave-xxx
 # JINA_API_KEY=jina_xxx # Optional, default is None
 # Volcengine TTS
 VOLCENGINE_TTS_APPID=xxx
 VOLCENGINE_TTS_ACCESS_TOKEN=xxx
 # VOLCENGINE_TTS_CLUSTER=volcano_tts # Optional, default is volcano_tts
 # VOLCENGINE_TTS_VOICE_TYPE=BV700_V2_streaming # Optional, default is BV700_V2_streaming
--- a/README.md
+++ b/README.md
@ -17,12 +17,13 @@ cd deer-flow
 # Install dependencies, uv will take care of the python interpreter and venv creation, and install the required packages
 uv sync
-# Configure .env with your Search Engine API keys
+# Configure .env with your API keys
 # Tavily: https://app.tavily.com/home
 # Brave_SEARCH: https://brave.com/search/api/
 # Volcengine TTS: add your TTS credentials if you have them
 cp .env.example .env
-# See the 'Supported Search Engines' section below for all available options
+# See the 'Supported Search Engines' and 'Text-to-Speech Integration' sections below for all available options
 # Configure conf.yaml for your LLM model and API keys
 # Gemini: https://ai.google.dev/gemini-api/docs/openai
@ -120,6 +121,34 @@ The system employs a streamlined workflow with the following components:
   - Processes and structures the collected information
   - Generates comprehensive research reports
 ## Text-to-Speech Integration
 DeerFlow now includes a Text-to-Speech (TTS) feature that allows you to convert research reports to speech. This feature uses the Volcengine TTS API to generate high-quality audio from text. To enable it, set `VOLCENGINE_TTS_APPID` and `VOLCENGINE_TTS_ACCESS_TOKEN` in your `.env` file.
 ### Features
 - Convert any text or research report to natural-sounding speech
 - Adjust speech parameters like speed, volume, and pitch
 - Support for multiple voice types
 - Available through both API and web interface
 ### Using the TTS API
 You can access the TTS functionality through the `/api/tts` endpoint:
 ```bash
 # Example API call using curl
 curl --location 'http://localhost:8000/api/tts' \
 --header 'Content-Type: application/json' \
 --data '{
    "text": "This is a test of the text-to-speech functionality.",
    "speed_ratio": 1.0,
    "volume_ratio": 1.0,
    "pitch_ratio": 1.0
 }' \
 --output speech.mp3
 ```
 ## Examples
 The following examples demonstrate the capabilities of DeerFlow:
--- a/src/server/app.py
+++ b/src/server/app.py
@ -1,19 +1,22 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 import base64
 import json
 import logging
 import os
 from typing import List, cast
 from uuid import uuid4
-from fastapi import FastAPI
+from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import StreamingResponse
+from fastapi.responses import StreamingResponse, Response
 from langchain_core.messages import AIMessageChunk, ToolMessage
 from langgraph.types import Command
 from src.graph.builder import build_graph
-from src.server.chat_request import ChatMessage, ChatRequest
+from src.server.chat_request import ChatMessage, ChatRequest, TTSRequest
 from src.tools import VolcengineTTS
 logger = logging.getLogger(__name__)
@ -137,3 +140,59 @@ def _make_event(event_type: str, data: dict[str, any]):
    if data.get("content") == "":
        data.pop("content")
    return f"event: {event_type}\ndata: {json.dumps(data, ensure_ascii=False)}\n\n"
@app.post("/api/tts")
async def text_to_speech(request: TTSRequest):
    """Convert text to speech using volcengine TTS API.

    Credentials and voice settings are read from the environment
    (``VOLCENGINE_TTS_APPID``, ``VOLCENGINE_TTS_ACCESS_TOKEN``,
    ``VOLCENGINE_TTS_CLUSTER``, ``VOLCENGINE_TTS_VOICE_TYPE``). The request
    text is truncated to its first 1024 characters before being sent.

    Args:
        request: The TTS parameters supplied by the client.

    Returns:
        A ``Response`` whose body is the decoded audio, served as an
        attachment named ``tts_output.<encoding>``.

    Raises:
        HTTPException: 400 when a required credential variable is unset;
            500 when the TTS client reports a failure or an unexpected
            error occurs.
    """
    try:
        app_id = os.getenv("VOLCENGINE_TTS_APPID", "")
        if not app_id:
            raise HTTPException(
                status_code=400, detail="VOLCENGINE_TTS_APPID is not set"
            )
        access_token = os.getenv("VOLCENGINE_TTS_ACCESS_TOKEN", "")
        if not access_token:
            raise HTTPException(
                status_code=400, detail="VOLCENGINE_TTS_ACCESS_TOKEN is not set"
            )
        cluster = os.getenv("VOLCENGINE_TTS_CLUSTER", "volcano_tts")
        # NOTE(review): request.voice_type is not consulted here; the voice
        # always comes from the environment (or the default below).
        voice_type = os.getenv("VOLCENGINE_TTS_VOICE_TYPE", "BV700_V2_streaming")

        tts_client = VolcengineTTS(
            appid=app_id,
            access_token=access_token,
            cluster=cluster,
            voice_type=voice_type,
        )
        # Call the TTS API.
        # NOTE(review): this is a synchronous call inside an async handler, so
        # the event loop is blocked while the upstream request is in flight.
        result = tts_client.text_to_speech(
            text=request.text[:1024],
            encoding=request.encoding,
            speed_ratio=request.speed_ratio,
            volume_ratio=request.volume_ratio,
            pitch_ratio=request.pitch_ratio,
            text_type=request.text_type,
            with_frontend=request.with_frontend,
            frontend_type=request.frontend_type,
        )
        if not result["success"]:
            raise HTTPException(status_code=500, detail=str(result["error"]))

        # Decode the base64 audio data
        audio_data = base64.b64decode(result["audio_data"])

        # Return the audio file
        return Response(
            content=audio_data,
            media_type=f"audio/{request.encoding}",
            headers={
                "Content-Disposition": (
                    f"attachment; filename=tts_output.{request.encoding}"
                )
            },
        )
    except HTTPException:
        # HTTP errors raised on purpose above must keep their own status code
        # and detail; without this clause the generic handler below would
        # rewrite the 400 responses as 500.
        raise
    except Exception as e:
        logger.exception("Error in TTS endpoint: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
--- a/src/server/chat_request.py
+++ b/src/server/chat_request.py
@ -1,7 +1,7 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
-from typing import List, Optional, Union
+from typing import List, Optional, Union, Dict, Any
 from pydantic import BaseModel, Field
@ -44,3 +44,19 @@ class ChatRequest(BaseModel):
    interrupt_feedback: Optional[str] = Field(
        None, description="Interrupt feedback from the user on the plan"
    )
 class TTSRequest(BaseModel):
     # Request body for the text-to-speech endpoint. Only ``text`` is
     # required; every other field falls back to the default given here.
     text: str = Field(..., description="The text to convert to speech")

     # Voice and output format.
     voice_type: Optional[str] = Field(
         default="BV700_V2_streaming",
         description="The voice type to use",
     )
     encoding: Optional[str] = Field(
         default="mp3",
         description="The audio encoding format",
     )

     # Prosody multipliers; 1.0 is the default for each.
     speed_ratio: Optional[float] = Field(
         default=1.0,
         description="Speech speed ratio",
     )
     volume_ratio: Optional[float] = Field(
         default=1.0,
         description="Speech volume ratio",
     )
     pitch_ratio: Optional[float] = Field(
         default=1.0,
         description="Speech pitch ratio",
     )

     # Text interpretation and frontend options.
     text_type: Optional[str] = Field(
         default="plain",
         description="Text type (plain or ssml)",
     )
     with_frontend: Optional[int] = Field(
         default=1,
         description="Whether to use frontend processing",
     )
     frontend_type: Optional[str] = Field(
         default="unitTson",
         description="Frontend type",
     )
--- a/src/tools/init.py
+++ b/src/tools/init.py
@ -1,6 +1,8 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 import os
 from .crawl import crawl_tool
 from .python_repl import python_repl_tool
 from .search import (
@ -9,6 +11,7 @@ from .search import (
    brave_search_tool,
    arxiv_search_tool,
 )
 from .tts import VolcengineTTS
 from src.config import SELECTED_SEARCH_ENGINE, SearchEngine
 # Map search engine names to their respective tools
@ -25,4 +28,5 @@ __all__ = [
    "crawl_tool",
    "web_search_tool",
    "python_repl_tool",
    "VolcengineTTS",
 ]
--- a/src/tools/tts.py
+++ b/src/tools/tts.py
@ -0,0 +1,131 @@
 # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates
 # SPDX-License-Identifier: MIT
 """
 Text-to-Speech module using volcengine TTS API.
 """
 import json
 import uuid
 import logging
 import requests
 from typing import Optional, Dict, Any
 logger = logging.getLogger(__name__)
 class VolcengineTTS:
     """
     Client for volcengine Text-to-Speech API.

     Posts a JSON request to ``https://<host>/api/v1/tts`` and hands back the
     base64-encoded audio found in the response's ``data`` field.
     """

     def __init__(
         self,
         appid: str,
         access_token: str,
         cluster: str = "volcano_tts",
         voice_type: str = "BV700_V2_streaming",
         host: str = "openspeech.bytedance.com",
         timeout: float = 30.0,
     ):
         """
         Initialize the volcengine TTS client.

         Args:
             appid: Platform application ID
             access_token: Access token for authentication
             cluster: TTS cluster name
             voice_type: Voice type to use
             host: API host
             timeout: Seconds to wait for the HTTP request before giving up
         """
         self.appid = appid
         self.access_token = access_token
         self.cluster = cluster
         self.voice_type = voice_type
         self.host = host
         self.timeout = timeout
         self.api_url = f"https://{host}/api/v1/tts"
         # NOTE(review): the ';' between "Bearer" and the token is kept exactly
         # as written; presumably this is the format the volcengine API
         # expects. Confirm against its documentation before "fixing" it.
         self.header = {"Authorization": f"Bearer;{access_token}"}

     def text_to_speech(
         self,
         text: str,
         encoding: str = "mp3",
         speed_ratio: float = 1.0,
         volume_ratio: float = 1.0,
         pitch_ratio: float = 1.0,
         text_type: str = "plain",
         with_frontend: int = 1,
         frontend_type: str = "unitTson",
         uid: Optional[str] = None,
     ) -> Dict[str, Any]:
         """
         Convert text to speech using volcengine TTS API.

         This method does not raise: every failure is reported through the
         returned dictionary.

         Args:
             text: Text to convert to speech
             encoding: Audio encoding format
             speed_ratio: Speech speed ratio
             volume_ratio: Speech volume ratio
             pitch_ratio: Speech pitch ratio
             text_type: Text type (plain or ssml)
             with_frontend: Whether to use frontend processing
             frontend_type: Frontend type
             uid: User ID (generated if not provided)

         Returns:
             On success: ``{"success": True, "response": <parsed JSON>,
             "audio_data": <base64 string>}``.
             On failure: ``{"success": False, "error": <details>,
             "audio_data": None}``.
         """
         # Nothing to synthesize: fail fast instead of spending a network
         # round trip on a request that carries no text.
         if not text or not text.strip():
             return {"success": False, "error": "Text is empty", "audio_data": None}

         if not uid:
             uid = str(uuid.uuid4())

         request_json = {
             "app": {
                 "appid": self.appid,
                 "token": self.access_token,
                 "cluster": self.cluster,
             },
             "user": {"uid": uid},
             "audio": {
                 "voice_type": self.voice_type,
                 "encoding": encoding,
                 "speed_ratio": speed_ratio,
                 "volume_ratio": volume_ratio,
                 "pitch_ratio": pitch_ratio,
             },
             "request": {
                 # A fresh request id is generated for every call.
                 "reqid": str(uuid.uuid4()),
                 "text": text,
                 "text_type": text_type,
                 "operation": "query",
                 "with_frontend": with_frontend,
                 "frontend_type": frontend_type,
             },
         }

         try:
             logger.debug("Sending TTS request for text: %s...", text[:50])
             response = requests.post(
                 self.api_url,
                 data=json.dumps(request_json),
                 headers=self.header,
                 # Without a timeout a stalled upstream would block forever.
                 timeout=self.timeout,
             )

             try:
                 response_json = response.json()
             except ValueError:
                 # The body is not JSON (for example an HTML error page from a
                 # proxy); report the HTTP status rather than a decode error.
                 logger.error(
                     "TTS API returned a non-JSON response (HTTP %s)",
                     response.status_code,
                 )
                 return {
                     "success": False,
                     "error": (
                         "Invalid response from TTS API "
                         f"(HTTP {response.status_code})"
                     ),
                     "audio_data": None,
                 }

             if response.status_code != 200:
                 logger.error("TTS API error: %s", response_json)
                 return {"success": False, "error": response_json, "audio_data": None}

             if "data" not in response_json:
                 logger.error("TTS API returned no data: %s", response_json)
                 return {
                     "success": False,
                     "error": "No audio data returned",
                     "audio_data": None,
                 }

             return {
                 "success": True,
                 "response": response_json,
                 "audio_data": response_json["data"],  # Base64 encoded audio data
             }
         except Exception as e:
             # Boundary handler: callers rely on a result dictionary and never
             # expect this method to raise (network errors, timeouts, ...).
             logger.exception("Error in TTS API call: %s", e)
             return {"success": False, "error": str(e), "audio_data": None}