add support for TongyiQwen tts (#2311)

### What problem does this PR solve?

Add support for TongyiQwen TTS (#1853).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Zhedong Cen <cenzhedong2@126.com>
2025-08-16 19:06:00 +08:00 · 2024-09-09 11:01:43 +08:00 · 2024-09-09 11:01:43 +08:00 · cb69c742b0
commit cb69c742b0
parent 2ac72899ef
3 changed files with 73 additions and 8 deletions
--- a/conf/llm_factories.json
+++ b/conf/llm_factories.json
@@ -104,18 +104,24 @@
                    "max_tokens": 2048,
                    "model_type": "embedding"
                },
                {
                    "llm_name": "sambert-zhide-v1",
                    "tags": "TTS",
                    "max_tokens": 2048,
                    "model_type": "tts"
                },
                {
                    "llm_name": "sambert-zhiru-v1",
                    "tags": "TTS",
                    "max_tokens": 2048,
                    "model_type": "tts"
                },
                {
                    "llm_name": "text-embedding-v3",
                    "tags": "TEXT EMBEDDING,8K",
                    "max_tokens": 8192,
                    "model_type": "embedding"
                },
                {
                    "llm_name": "paraformer-realtime-8k-v1",
                    "tags": "SPEECH2TEXT",
                    "max_tokens": 26214400,
                    "model_type": "speech2text"
                },
                {
                    "llm_name": "qwen-vl-max",
                    "tags": "LLM,CHAT,IMAGE2TEXT",
--- a/rag/llm/__init__.py
+++ b/rag/llm/__init__.py
@@ -137,5 +137,6 @@ Seq2txtModel = {
 }
 TTSModel = {
-    "Fish Audio": FishAudioTTS
+    "Fish Audio": FishAudioTTS,
    "Tongyi-Qianwen": QwenTTS
 }
--- a/rag/llm/tts_model.py
+++ b/rag/llm/tts_model.py
@@ -22,7 +22,7 @@ from pydantic import BaseModel, conint
 from rag.utils import num_tokens_from_string
 import json
 import re
-
+import time
 class ServeReferenceAudio(BaseModel):
    audio: bytes
    text: str
@@ -96,3 +96,61 @@ class FishAudioTTS(Base):
            except httpx.HTTPStatusError as e:
                raise RuntimeError(f"**ERROR**: {e}")
 class QwenTTS(Base):
    """Streaming text-to-speech client built on the DashScope speech synthesizer.

    Audio is requested in MP3 format from
    ``dashscope.audio.tts.SpeechSynthesizer`` and handed to the caller frame
    by frame through a generator.
    """

    def __init__(self, key, model_name, base_url=""):
        """Remember the model name and register the DashScope API key.

        Args:
            key: DashScope API key. It is assigned to the module-level
                ``dashscope.api_key`` attribute.
            model_name: Model name passed to the synthesizer on every call.
            base_url: Not used by this class.
        """
        import dashscope

        self.model_name = model_name
        dashscope.api_key = key

    def tts(self, text):
        """Synthesize ``text`` and stream the resulting MP3 audio.

        Args:
            text: Text to speak; it is passed through ``self.normalize_text``
                before being sent to the synthesizer.

        Yields:
            Every non-empty audio frame reported by the synthesizer, followed
            by one final value: ``num_tokens_from_string`` of the normalized
            text.

        Raises:
            RuntimeError: If the synthesizer reports an error or any other
                exception occurs while synthesizing or streaming. The message
                is prefixed with ``**ERROR**:``.
        """
        from queue import SimpleQueue

        from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
        from dashscope.audio.tts import ResultCallback, SpeechSynthesizer, SpeechSynthesisResult

        class Callback(ResultCallback):
            """Collects synthesizer events into a queue drained by ``_run``."""

            def __init__(self) -> None:
                # Audio frames in arrival order; ``None`` marks end of stream.
                self.frames = SimpleQueue()
                # Stringified error response, set by ``on_error``.
                self.error = None

            def _run(self):
                """Yield queued frames until the end-of-stream marker."""
                while True:
                    # A blocking get waits for the next item without spinning.
                    frame = self.frames.get()
                    # Identity check: only the explicit marker ends the
                    # stream, never a merely falsy frame.
                    if frame is None:
                        break
                    yield frame
                if self.error is not None:
                    raise RuntimeError(self.error)

            def on_open(self):
                pass

            def on_complete(self):
                self.frames.put(None)

            def on_error(self, response: SpeechSynthesisResponse):
                # Record the failure and close the stream so the consumer is
                # released and reports it, rather than raising from inside
                # the SDK's callback.
                self.error = str(response)
                self.frames.put(None)

            def on_close(self):
                pass

            def on_event(self, result: SpeechSynthesisResult):
                frame = result.get_audio_frame()
                # Missing or empty frames carry no audio; skip them.
                if frame:
                    self.frames.put(frame)

        text = self.normalize_text(text)
        callback = Callback()
        try:
            SpeechSynthesizer.call(model=self.model_name,
                                   text=text,
                                   callback=callback,
                                   format="mp3")
            # NOTE(review): assumes the SDK may already have invoked on_error
            # by the time call() returns -- confirm. If so, fail before any
            # partial audio is emitted.
            if callback.error is not None:
                raise RuntimeError(callback.error)
            for data in callback._run():
                yield data
            yield num_tokens_from_string(text)
        except Exception as e:
            # Same message shape as the FishAudioTTS error path; keep the
            # original exception as the cause for debugging.
            raise RuntimeError(f"**ERROR**: {e}") from e