Add tongyi tts&tts function optimization (#2177)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
2025-08-12 03:29:01 +08:00 · 2024-01-24 20:32:04 +08:00 · 2024-01-24 20:32:04 +08:00 · ac4bb5c35f
commit ac4bb5c35f
parent a96cae4f44
11 changed files with 278 additions and 108 deletions
--- a/api/core/model_runtime/model_providers/__base/tts_model.py
+++ b/api/core/model_runtime/model_providers/__base/tts_model.py
@ -1,8 +1,13 @@
 import uuid
 import hashlib
 import subprocess
 from abc import abstractmethod
 from typing import Optional
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.model_providers.__base.ai_model import AIModel
 from core.model_runtime.entities.model_entities import ModelPropertyKey
 class TTSModel(AIModel):
@ -40,3 +45,96 @@ class TTSModel(AIModel):
        :return: translated audio file
        """
        raise NotImplementedError
    def _get_model_voice(self, model: str, credentials: dict) -> any:
        """
        Look up the default voice configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.DEFAULT_VOICE in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.DEFAULT_VOICE not in properties:
            return None
        return properties[ModelPropertyKey.DEFAULT_VOICE]
    def _get_model_audio_type(self, model: str, credentials: dict) -> str:
        """
        Look up the audio type configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.AUDOI_TYPE in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.AUDOI_TYPE not in properties:
            return None
        return properties[ModelPropertyKey.AUDOI_TYPE]
    def _get_model_word_limit(self, model: str, credentials: dict) -> int:
        """
        Look up the word limit configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.WORD_LIMIT in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.WORD_LIMIT not in properties:
            return None
        return properties[ModelPropertyKey.WORD_LIMIT]
    def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
        """
        Look up the max workers setting configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.MAX_WORKERS in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.MAX_WORKERS not in properties:
            return None
        return properties[ModelPropertyKey.MAX_WORKERS]
    @staticmethod
    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
        if delimiters is None:
            delimiters = set('。！？；\n')
        buf = []
        word_count = 0
        for char in text:
            buf.append(char)
            if char in delimiters:
                if word_count >= limit:
                    yield ''.join(buf)
                    buf = []
                    word_count = 0
                else:
                    word_count += 1
            else:
                word_count += 1
        if buf:
            yield ''.join(buf)
    @staticmethod
    def _is_ffmpeg_installed():
        """
        Check that the ffmpeg executable can be run

        Runs ``ffmpeg -version`` and looks for the "ffmpeg version" banner in its output.

        :return: True when the banner is found
        :raises InvokeBadRequestError: when the command cannot be started, exits with an
                                       error, or does not print the expected banner
        """
        error_message = ("ffmpeg is not installed, "
                         "details: https://docs.dify.ai/getting-started/install-self-hosted"
                         "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
        try:
            # Argument list instead of a shell string: no shell is spawned just to probe the binary
            output = subprocess.check_output(["ffmpeg", "-version"])
        except Exception as ex:
            # Keep the original failure as the cause so it is not lost when debugging
            raise InvokeBadRequestError(error_message) from ex
        if "ffmpeg version" not in output.decode("utf-8", errors="replace"):
            raise InvokeBadRequestError(error_message)
        return True
    # Todo: To improve the streaming function
    @staticmethod
    def _get_file_name(file_content: str) -> str:
        hash_object = hashlib.sha256(file_content.encode())
        hex_digest = hash_object.hexdigest()
        namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
        unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
        return str(unique_uuid)
--- a/api/core/model_runtime/model_providers/openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/openai/tts/tts.py
@ -1,18 +1,13 @@
 import uuid
 import hashlib
 import subprocess
 from io import BytesIO
 from typing import Optional
 from functools import reduce
 from pydub import AudioSegment
 from core.model_runtime.entities.model_entities import ModelPropertyKey
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
 from typing_extensions import Literal
 from flask import Response, stream_with_context
 from openai import OpenAI
 import concurrent.futures
@ -22,9 +17,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
    """
    Model class for OpenAI Speech to text model.
    """
-
+    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
                user: Optional[str] = None) -> any:
        """
        _invoke text2speech model
@ -65,7 +58,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
        """
        _tts_invoke text2speech model
@ -104,8 +97,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
            raise InvokeBadRequestError(str(ex))
    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
                              user: Optional[str] = None) -> any:
        """
        _tts_invoke_streaming text2speech model
@ -131,84 +123,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))
    def _get_model_voice(self, model: str, credentials: dict) -> Literal[
        "alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
        """
        Look up the default voice configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.DEFAULT_VOICE in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.DEFAULT_VOICE not in properties:
            return None
        return properties[ModelPropertyKey.DEFAULT_VOICE]
    def _get_model_audio_type(self, model: str, credentials: dict) -> str:
        """
        Look up the audio type configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.AUDOI_TYPE in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.AUDOI_TYPE not in properties:
            return None
        return properties[ModelPropertyKey.AUDOI_TYPE]
    def _get_model_word_limit(self, model: str, credentials: dict) -> int:
        """
        Look up the word limit configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.WORD_LIMIT in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.WORD_LIMIT not in properties:
            return None
        return properties[ModelPropertyKey.WORD_LIMIT]
    def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
        """
        Look up the max workers setting configured for a tts model

        :param model: model name
        :param credentials: model credentials
        :return: value stored under ModelPropertyKey.MAX_WORKERS in the schema's model
                 properties, or None when there is no schema or the key is absent
        """
        schema = self.get_model_schema(model, credentials)
        if not schema:
            return None
        properties = schema.model_properties
        if ModelPropertyKey.MAX_WORKERS not in properties:
            return None
        return properties[ModelPropertyKey.MAX_WORKERS]
    @staticmethod
    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
        if delimiters is None:
            delimiters = set('。！？；\n')
        buf = []
        word_count = 0
        for char in text:
            buf.append(char)
            if char in delimiters:
                if word_count >= limit:
                    yield ''.join(buf)
                    buf = []
                    word_count = 0
                else:
                    word_count += 1
            else:
                word_count += 1
        if buf:
            yield ''.join(buf)
    @staticmethod
    def _get_file_name(file_content: str) -> str:
        hash_object = hashlib.sha256(file_content.encode())
        hex_digest = hash_object.hexdigest()
        namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
        unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
        return str(unique_uuid)
    def _process_sentence(self, sentence: str, model: str, credentials: dict):
        """
        _tts_invoke openai text2speech model api
@ -226,18 +140,3 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
        if isinstance(response.read(), bytes):
            return response.read()
    @staticmethod
    def _is_ffmpeg_installed():
        """
        Check that the ffmpeg executable can be run

        Runs ``ffmpeg -version`` and looks for the "ffmpeg version" banner in its output.

        :return: True when the banner is found
        :raises InvokeBadRequestError: when the command cannot be started, exits with an
                                       error, or does not print the expected banner
        """
        error_message = ("ffmpeg is not installed, "
                         "details: https://docs.dify.ai/getting-started/install-self-hosted"
                         "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
        try:
            # Argument list instead of a shell string: no shell is spawned just to probe the binary
            output = subprocess.check_output(["ffmpeg", "-version"])
        except Exception as ex:
            # Keep the original failure as the cause so it is not lost when debugging
            raise InvokeBadRequestError(error_message) from ex
        if "ffmpeg version" not in output.decode("utf-8", errors="replace"):
            raise InvokeBadRequestError(error_message)
        return True
--- a/api/core/model_runtime/model_providers/tongyi/_common.py
+++ b/api/core/model_runtime/model_providers/tongyi/_common.py
@ -0,0 +1,23 @@
 from core.model_runtime.errors.invoke import InvokeError
 class _CommonTongyi:
    """
    Helpers shared by the Tongyi model classes
    """

    @staticmethod
    def _to_credential_kwargs(credentials: dict) -> dict:
        """
        Build the keyword arguments carrying the DashScope API key

        :param credentials: model credentials, must contain 'dashscope_api_key'
        :return: dict with the single key 'dashscope_api_key'
        :raises KeyError: when 'dashscope_api_key' is missing from credentials
        """
        credentials_kwargs = {
            "dashscope_api_key": credentials['dashscope_api_key'],
        }
        return credentials_kwargs

    @property
    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
        """
        Map model invoke error to unified error
        The key is the error type thrown to the caller
        The value is the error type thrown by the model,
        which needs to be converted into a unified error type for the caller.

        :return: Invoke error mapping; currently empty because no Tongyi-specific
                 exception types are mapped here
        """
        # Return an empty mapping rather than None so the property honours its declared
        # dict return type and can be iterated safely.
        return {}
--- a/api/core/model_runtime/model_providers/tongyi/tongyi.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/tongyi.yaml
@ -16,6 +16,7 @@ help:
    en_US: https://dashscope.console.aliyun.com/api-key_management
 supported_model_types:
  - llm
  - tts
 configurate_methods:
  - predefined-model
 provider_credential_schema:
--- a/api/core/model_runtime/model_providers/tongyi/tts/init.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/init.py
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
@ -0,0 +1,7 @@
 model: tts-1
 model_type: tts
 model_properties:
  default_voice: 'sambert-zhiru-v1' # voice reference: see https://help.aliyun.com/zh/dashscope/model-list for the voices that can be configured
  word_limit: 120
  audio_type: 'mp3'
  max_workers: 5
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@ -0,0 +1,142 @@
 from io import BytesIO
 from typing import Optional
 from functools import reduce
 from pydub import AudioSegment
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
 import dashscope
 from flask import Response, stream_with_context
 import concurrent.futures
 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
    """
    Model class for Tongyi text to speech model.
    """

    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
        """
        _invoke text2speech model

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param streaming: output is streaming
        :param user: unique user id
        :return: text translated to audio file
        :raises InvokeBadRequestError: when ffmpeg is unavailable or synthesis fails
        """
        self._is_ffmpeg_installed()
        audio_type = self._get_model_audio_type(model, credentials)
        if streaming:
            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                           credentials=credentials,
                                                                           content_text=content_text,
                                                                           user=user)),
                            status=200, mimetype=f'audio/{audio_type}')
        else:
            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)

    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
        """
        validate credentials text2speech model

        Synthesizes a short fixed phrase; any failure is reported as a credentials error.

        :param model: model name
        :param credentials: model credentials
        :param user: unique user id
        :return: None
        :raises CredentialsValidateFailedError: when the test synthesis fails
        """
        try:
            self._tts_invoke(
                model=model,
                credentials=credentials,
                content_text='Hello world!',
                user=user
            )
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))

    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
        """
        _tts_invoke text2speech model

        Splits the text into sentences, synthesizes them in a thread pool and joins the
        resulting audio into a single response body.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param user: unique user id
        :return: text translated to audio file
        :raises InvokeBadRequestError: when synthesis or audio merging fails, or when no
                                       audio data was produced
        """
        audio_type = self._get_model_audio_type(model, credentials)
        word_limit = self._get_model_word_limit(model, credentials)
        max_workers = self._get_model_workers_limit(model, credentials)
        try:
            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
            # Synthesize sentences concurrently; results are read back in submission order so
            # the audio is joined in the same order as the text
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
                                           credentials=credentials, audio_type=audio_type)
                           for sentence in sentences]
                audio_bytes_list = [future.result() for future in futures]

            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                              for audio_bytes in audio_bytes_list if audio_bytes]
            if not audio_segments:
                # Without this guard reduce() fails with an unrelated "empty iterable" message
                raise InvokeBadRequestError("No audio data was returned by the text to speech model")
            combined_segment = reduce(lambda x, y: x + y, audio_segments)
            buffer: BytesIO = BytesIO()
            combined_segment.export(buffer, format=audio_type)
            return Response(buffer.getvalue(), status=200, mimetype=f"audio/{audio_type}")
        except InvokeBadRequestError:
            # Already the unified error type; do not wrap it a second time
            raise
        except Exception as ex:
            raise InvokeBadRequestError(str(ex)) from ex

    # Todo: To improve the streaming function
    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
        """
        _tts_invoke_streaming text2speech model

        Generator: synthesizes the text sentence by sentence and yields the audio bytes of
        each sentence as soon as it is available.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param user: unique user id
        :return: generator of audio byte chunks, one per synthesized sentence
        :raises InvokeBadRequestError: when synthesis fails while the stream is consumed
        """
        # the api key is set on the dashscope module before calling the synthesizer
        dashscope.api_key = credentials.get('dashscope_api_key')
        voice_name = self._get_model_voice(model, credentials)
        word_limit = self._get_model_word_limit(model, credentials)
        audio_type = self._get_model_audio_type(model, credentials)
        try:
            for sentence in self._split_text_into_sentences(text=content_text, limit=word_limit):
                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000,
                                                                      text=sentence.strip(),
                                                                      format=audio_type, word_timestamp_enabled=True,
                                                                      phoneme_timestamp_enabled=True)
                audio_data = response.get_audio_data()
                if isinstance(audio_data, bytes):
                    # yield (not return) so every sentence is streamed, not only the first one
                    yield audio_data
        except Exception as ex:
            raise InvokeBadRequestError(str(ex)) from ex

    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
        """
        _tts_invoke Tongyi text2speech model api

        :param model: model name
        :param credentials: model credentials
        :param sentence: text content to be translated
        :param audio_type: audio file type
        :return: audio bytes for the sentence, or None when the response carries no bytes
        """
        # the api key is set on the dashscope module before calling the synthesizer
        dashscope.api_key = credentials.get('dashscope_api_key')
        voice_name = self._get_model_voice(model, credentials)
        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000,
                                                              text=sentence.strip(), format=audio_type)
        audio_data = response.get_audio_data()
        if isinstance(audio_data, bytes):
            return audio_data
        return None
--- a/web/app/components/develop/template/template.en.mdx
+++ b/web/app/components/develop/template/template.en.mdx
@ -495,7 +495,7 @@ The text generation application offers non-session support and is ideal for tran
 />
 <Row>
  <Col>
-    Text to speech, only supports openai model.
+    Text to speech.
    ### Request Body
--- a/web/app/components/develop/template/template.zh.mdx
+++ b/web/app/components/develop/template/template.zh.mdx
@ -458,7 +458,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
 />
 <Row>
  <Col>
-    文字转语音，仅支持 openai 模型。
+    文字转语音。
    ### Request Body
--- a/web/app/components/develop/template/template_chat.en.mdx
+++ b/web/app/components/develop/template/template_chat.en.mdx
@ -845,7 +845,7 @@ Chat applications support session persistence, allowing previous chat history to
 />
 <Row>
  <Col>
-    Text to speech, only supports openai model.
+    Text to speech.
    ### Request Body
--- a/web/app/components/develop/template/template_chat.zh.mdx
+++ b/web/app/components/develop/template/template_chat.zh.mdx
@ -917,7 +917,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
 />
 <Row>
  <Col>
-    文字转语音，仅支持 openai 模型。
+    文字转语音。
    ### Request Body