Add Tongyi TTS & TTS function optimization (#2177)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Charlie.Wei 2024-01-24 20:32:04 +08:00 committed by GitHub
parent a96cae4f44
commit ac4bb5c35f
11 changed files with 278 additions and 108 deletions


@@ -1,8 +1,13 @@
import uuid
import hashlib
import subprocess
from abc import abstractmethod
from typing import Optional
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.model_providers.__base.ai_model import AIModel
from core.model_runtime.entities.model_entities import ModelPropertyKey
class TTSModel(AIModel):
@@ -40,3 +45,96 @@ class TTSModel(AIModel):
:return: translated audio file
"""
raise NotImplementedError
def _get_model_voice(self, model: str, credentials: dict) -> any:
"""
Get voice for given tts model
:param model: model name
:param credentials: model credentials
:return: voice
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
def _get_model_audio_type(self, model: str, credentials: dict) -> str:
"""
Get audio type for given tts model
:param model: model name
:param credentials: model credentials
:return: audio type
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
def _get_model_word_limit(self, model: str, credentials: dict) -> int:
"""
Get word limit for given tts model
:return: word limit
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
"""
Get max workers for given tts model
:return: max workers
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
if delimiters is None:
delimiters = set('。!?;\n')
buf = []
word_count = 0
for char in text:
buf.append(char)
if char in delimiters:
if word_count >= limit:
yield ''.join(buf)
buf = []
word_count = 0
else:
word_count += 1
else:
word_count += 1
if buf:
yield ''.join(buf)
@staticmethod
def _is_ffmpeg_installed():
try:
output = subprocess.check_output("ffmpeg -version", shell=True)
if "ffmpeg version" in output.decode("utf-8"):
return True
else:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
except Exception:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
# TODO: improve the streaming function
@staticmethod
def _get_file_name(file_content: str) -> str:
hash_object = hashlib.sha256(file_content.encode())
hex_digest = hash_object.hexdigest()
namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
return str(unique_uuid)
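
A note on the two helpers above: `_split_text_into_sentences` buffers characters and flushes a chunk at the first delimiter once the running count has reached `limit`, so chunks always end on a sentence boundary rather than at exactly `limit` characters; `_get_file_name` derives a deterministic UUIDv5 from the SHA-256 of the content, so identical text always maps to the same file name. A minimal standalone sketch of the splitter (the sample text is invented for illustration):

def split_text_into_sentences(text: str, limit: int, delimiters=None):
    # Same logic as TTSModel._split_text_into_sentences: flush the buffer
    # at the first delimiter at or past `limit` characters.
    if delimiters is None:
        delimiters = set('。!?;\n')
    buf, count = [], 0
    for char in text:
        buf.append(char)
        if char in delimiters and count >= limit:
            yield ''.join(buf)
            buf, count = [], 0
        else:
            count += 1
    if buf:
        yield ''.join(buf)

print(list(split_text_into_sentences('第一句。第二句!第三句?尾巴', limit=4)))
# ['第一句。第二句!', '第三句?尾巴']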


@@ -1,18 +1,13 @@
import uuid
import hashlib
import subprocess
from io import BytesIO
from typing import Optional
from functools import reduce
from pydub import AudioSegment
from core.model_runtime.entities.model_entities import ModelPropertyKey
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
from typing_extensions import Literal
from flask import Response, stream_with_context
from openai import OpenAI
import concurrent.futures
@@ -22,9 +17,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
"""
Model class for the OpenAI text-to-speech model.
"""
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
user: Optional[str] = None) -> any:
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
"""
_invoke text2speech model
@@ -65,7 +58,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
"""
_tts_invoke text2speech model
@@ -104,8 +97,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
raise InvokeBadRequestError(str(ex))
# TODO: improve the streaming function
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
user: Optional[str] = None) -> any:
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
"""
_tts_invoke_streaming text2speech model
@@ -131,84 +123,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _get_model_voice(self, model: str, credentials: dict) -> Literal[
"alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
"""
Get voice for given tts model
:param model: model name
:param credentials: model credentials
:return: voice
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
def _get_model_audio_type(self, model: str, credentials: dict) -> str:
"""
Get audio type for given tts model
:param model: model name
:param credentials: model credentials
:return: audio type
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
def _get_model_word_limit(self, model: str, credentials: dict) -> int:
"""
Get word limit for given tts model
:return: word limit
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
"""
Get max workers for given tts model
:return: max workers
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
if delimiters is None:
delimiters = set('。!?;\n')
buf = []
word_count = 0
for char in text:
buf.append(char)
if char in delimiters:
if word_count >= limit:
yield ''.join(buf)
buf = []
word_count = 0
else:
word_count += 1
else:
word_count += 1
if buf:
yield ''.join(buf)
@staticmethod
def _get_file_name(file_content: str) -> str:
hash_object = hashlib.sha256(file_content.encode())
hex_digest = hash_object.hexdigest()
namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
return str(unique_uuid)
def _process_sentence(self, sentence: str, model: str, credentials: dict):
"""
_tts_invoke openai text2speech model api
@@ -226,18 +140,3 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
audio_bytes = response.read()  # read the body once; a second read() would return empty
if isinstance(audio_bytes, bytes):
return audio_bytes
@staticmethod
def _is_ffmpeg_installed():
try:
output = subprocess.check_output("ffmpeg -version", shell=True)
if "ffmpeg version" in output.decode("utf-8"):
return True
else:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
except Exception:
raise InvokeBadRequestError("ffmpeg is not installed, "
"details: https://docs.dify.ai/getting-started/install-self-hosted"
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")


@@ -0,0 +1,23 @@
from core.model_runtime.errors.invoke import InvokeError
class _CommonTongyi:
@staticmethod
def _to_credential_kwargs(credentials: dict) -> dict:
credentials_kwargs = {
"dashscope_api_key": credentials['dashscope_api_key'],
}
return credentials_kwargs
@property
def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
"""
Map model invoke error to unified error
The key is the error type thrown to the caller
The value is the error type thrown by the model,
which needs to be converted into a unified error type for the caller.
:return: Invoke error mapping
"""
pass
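
As committed, `_invoke_error_mapping` is a stub: the `pass` body makes it return `None`. For illustration only, a populated mapping would route low-level exceptions to the unified invoke errors; the exception choices below are hypothetical placeholders, not DashScope's actual exception classes:

from core.model_runtime.errors.invoke import (
    InvokeBadRequestError,
    InvokeConnectionError,
    InvokeError,
)

class _CommonTongyiSketch:
    @property
    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
        # Hypothetical mapping for illustration; the committed code
        # leaves this unimplemented.
        return {
            InvokeConnectionError: [ConnectionError, TimeoutError],
            InvokeBadRequestError: [ValueError],
        }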


@@ -16,6 +16,7 @@ help:
en_US: https://dashscope.console.aliyun.com/api-key_management
supported_model_types:
- llm
- tts
configurate_methods:
- predefined-model
provider_credential_schema:


@@ -0,0 +1,7 @@
model: tts-1
model_type: tts
model_properties:
default_voice: 'sambert-zhiru-v1' # for available voices, see https://help.aliyun.com/zh/dashscope/model-list
word_limit: 120
audio_type: 'mp3'
max_workers: 5
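
These four `model_properties` are exactly what the base-class getters read: `default_voice` feeds `_get_model_voice`, `audio_type` the MIME type and pydub format, `word_limit` the sentence splitter, and `max_workers` the thread pool size. A minimal sketch of loading the file with PyYAML (the path is illustrative):

import yaml  # PyYAML

with open('tts-1.yaml', encoding='utf-8') as f:  # illustrative path
    schema = yaml.safe_load(f)

props = schema['model_properties']
print(props['default_voice'], props['word_limit'], props['audio_type'], props['max_workers'])
# sambert-zhiru-v1 120 mp3 5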


@@ -0,0 +1,142 @@
from io import BytesIO
from typing import Optional
from functools import reduce
from pydub import AudioSegment
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
import dashscope
from flask import Response, stream_with_context
import concurrent.futures
class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
"""
Model class for the Tongyi text-to-speech model.
"""
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
"""
_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param streaming: output is streaming
:param user: unique user id
:return: text translated to audio file
"""
self._is_ffmpeg_installed()
audio_type = self._get_model_audio_type(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
user=user)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
"""
Validate credentials for text2speech model
:param model: model name
:param credentials: model credentials
:param user: unique user id
:return: None
"""
try:
self._tts_invoke(
model=model,
credentials=credentials,
content_text='Hello world!',
user=user
)
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param user: unique user id
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
audio_bytes_list = list()
# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
credentials=credentials, audio_type=audio_type) for sentence in sentences]
for future in futures:
try:
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))
# TODO: improve the streaming function
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
"""
_tts_invoke_streaming text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param user: unique user id
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
dashscope.api_key = credentials.get('dashscope_api_key')
voice_name = self._get_model_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
for sentence in sentences:
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
format=audio_type, word_timestamp_enabled=True,
phoneme_timestamp_enabled=True)
if isinstance(response.get_audio_data(), bytes):
return response.get_audio_data()
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
"""
_tts_invoke Tongyi text2speech model api
:param model: model name
:param credentials: model credentials
:param sentence: text content to be translated
:param audio_type: audio file type
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
dashscope.api_key = credentials.get('dashscope_api_key')
voice_name = self._get_model_voice(model, credentials)
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
if isinstance(response.get_audio_data(), bytes):
return response.get_audio_data()
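
End to end, `_tts_invoke` splits the text, synthesizes each chunk through DashScope in a thread pool, and stitches the resulting MP3 chunks with pydub. A condensed sketch of the same flow outside the provider class (the API key and sentences are placeholders; the DashScope call signature is the one used in `_process_sentence` above):

import concurrent.futures
from functools import reduce
from io import BytesIO

import dashscope
from pydub import AudioSegment

dashscope.api_key = 'sk-...'  # placeholder credential

def synth(sentence: str) -> bytes:
    # Same call shape as _process_sentence.
    response = dashscope.audio.tts.SpeechSynthesizer.call(
        model='sambert-zhiru-v1', sample_rate=48000,
        text=sentence.strip(), format='mp3')
    return response.get_audio_data()

sentences = ['你好。', '世界!']
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    chunks = list(executor.map(synth, sentences))  # map preserves sentence order

segments = [AudioSegment.from_file(BytesIO(b), format='mp3') for b in chunks if b]
combined = reduce(lambda x, y: x + y, segments)
combined.export('output.mp3', format='mp3')  # merging requires ffmpeg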


@@ -495,7 +495,7 @@ The text generation application offers non-session support and is ideal for tran
/>
<Row>
<Col>
Text to speech, only supports the OpenAI model.
Text to speech.
### Request Body


@@ -458,7 +458,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
/>
<Row>
<Col>
Text to speech, only supports the OpenAI model.
Text to speech.
### Request Body


@@ -845,7 +845,7 @@ Chat applications support session persistence, allowing previous chat history to
/>
<Row>
<Col>
Text to speech, only supports the OpenAI model.
Text to speech.
### Request Body


@@ -917,7 +917,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
/>
<Row>
<Col>
Text to speech, only supports the OpenAI model.
Text to speech.
### Request Body
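
With the provider restriction lifted, the endpoint documented here works with any configured TTS model, Tongyi included. A hedged request sketch (assuming the `/v1/text-to-audio` route and bearer-token auth used elsewhere in these API pages; check the surrounding reference for the authoritative parameters):

import requests

resp = requests.post(
    'https://api.dify.ai/v1/text-to-audio',  # assumed endpoint shape
    headers={'Authorization': 'Bearer {api_key}'},
    json={'text': '你好,世界!', 'user': 'user-123'},
)
with open('speech.mp3', 'wb') as f:
    f.write(resp.content)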