Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git, synced 2025-08-12 22:49:00 +08:00
Remove tts (blocking call) (#6869)

parent f31142e758
commit a9cd6df97e
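In each provider below, validate_credentials now exercises _tts_invoke_streaming instead of the removed _tts_invoke, and the thread-pool/pydub machinery that synthesized every sentence before returning anything is deleted. A minimal sketch of the shape difference, with a stand-in synth() in place of a real TTS request:

from collections.abc import Iterator

def synth(sentence: str) -> bytes:
    return sentence.encode()  # stand-in for a real TTS request

def blocking_tts(sentences: list[str]) -> bytes:
    # Old shape: synthesize everything, concatenate, return once at the end.
    return b"".join(synth(s) for s in sentences)

def streaming_tts(sentences: list[str]) -> Iterator[bytes]:
    # New shape: hand each chunk to the caller as soon as it exists.
    for s in sentences:
        yield synth(s)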
--- a/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
@@ -1,12 +1,8 @@
 import concurrent.futures
 import copy
-from functools import reduce
-from io import BytesIO
 from typing import Optional
 
-from flask import Response
 from openai import AzureOpenAI
-from pydub import AudioSegment
 
 from core.model_runtime.entities.model_entities import AIModelEntity
 from core.model_runtime.errors.invoke import InvokeBadRequestError
@@ -51,7 +47,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -60,45 +56,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
@@ -144,7 +101,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :param sentence: text content to be translated
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
         client = AzureOpenAI(**credentials_kwargs)
         response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
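The deleted method condenses to the pattern below: fan sentences out to a thread pool, wait on every future, then concatenate the clips with pydub and return a single finished payload. Nothing reaches the caller until the slowest sentence is done, which is the blocking behaviour this commit removes. A standalone restatement for reference (process_sentence is a placeholder for the per-sentence TTS call, not a Dify API):

import concurrent.futures
from functools import reduce
from io import BytesIO

from pydub import AudioSegment  # decoding/encoding needs ffmpeg installed

def synthesize_all(sentences, process_sentence, audio_type="mp3", max_workers=3):
    # Fan out: one TTS request per sentence, in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_sentence, s) for s in sentences]
        # Wait for *all* results before producing any output at all.
        audio_bytes_list = [f.result() for f in futures if f.result()]

    if not audio_bytes_list:
        return b""
    # Concatenate every clip into one in-memory audio file.
    segments = [AudioSegment.from_file(BytesIO(b), format=audio_type)
                for b in audio_bytes_list]
    combined = reduce(lambda x, y: x + y, segments)
    buffer = BytesIO()
    combined.export(buffer, format=audio_type)
    return buffer.getvalue()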
--- a/api/core/model_runtime/model_providers/openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/openai/tts/tts.py
@@ -1,11 +1,7 @@
 import concurrent.futures
-from functools import reduce
-from io import BytesIO
 from typing import Optional
 
-from flask import Response
 from openai import OpenAI
-from pydub import AudioSegment
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@@ -32,7 +28,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :return: text translated to audio file
         """
 
-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
+        if not voice or voice not in [d['value'] for d in
+                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
         # if streaming:
         return self._tts_invoke_streaming(model=model,
@@ -50,7 +47,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -59,46 +56,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
@@ -114,7 +71,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         # doc: https://platform.openai.com/docs/guides/text-to-speech
         credentials_kwargs = self._to_credential_kwargs(credentials)
         client = OpenAI(**credentials_kwargs)
-        model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)]
+        model_support_voice = [x.get("value") for x in
+                               self.get_tts_model_voices(model=model, credentials=credentials)]
         if not voice or voice not in model_support_voice:
             voice = self._get_model_default_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
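The streaming replacement is not shown in full in these hunks, but with the v1 openai SDK a streamed synthesis loop looks roughly like this sketch (with_streaming_response is the SDK's streaming helper; the exact call inside Dify's _tts_invoke_streaming may differ):

from openai import OpenAI

def stream_speech(client: OpenAI, model: str, voice: str, text: str):
    # Yield audio bytes as the API produces them instead of buffering the
    # complete file, so playback can start almost immediately.
    with client.audio.speech.with_streaming_response.create(
        model=model, voice=voice, input=text
    ) as response:
        yield from response.iter_bytes(chunk_size=4096)

# Hypothetical usage:
#   client = OpenAI(api_key="sk-...")
#   for chunk in stream_speech(client, "tts-1", "alloy", "Hello Dify!"):
#       sink.write(chunk)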
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@@ -1,7 +1,4 @@
-import concurrent.futures
 import threading
-from functools import reduce
-from io import BytesIO
 from queue import Queue
 from typing import Optional
 
@@ -9,8 +6,6 @@ import dashscope
 from dashscope import SpeechSynthesizer
 from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
 from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
-from flask import Response
-from pydub import AudioSegment
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@@ -55,7 +50,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -64,46 +59,6 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param voice: model timbre
-        :param content_text: text content to be translated
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence,
-                                           credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
-                           sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
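The tongyi file keeps threading, Queue, ResultCallback, and SpeechSynthesisResult, which points at DashScope's callback-driven synthesis for the streaming path. A sketch of that queue-plus-callback pattern, assuming the dashscope v1 TTS API (an illustration, not Dify's exact implementation):

import threading
from queue import Queue

import dashscope
from dashscope import SpeechSynthesizer
from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult

class QueueCallback(ResultCallback):
    # Push each synthesized frame into a queue the moment it arrives.
    def __init__(self, queue: Queue):
        self._queue = queue

    def on_event(self, result: SpeechSynthesisResult):
        frame = result.get_audio_frame()
        if frame:
            self._queue.put(frame)

    def on_complete(self):
        self._queue.put(None)  # sentinel: synthesis finished

    def on_error(self, response):
        self._queue.put(None)  # sentinel: stop draining on errors too

def stream_tts(model: str, text: str, api_key: str):
    dashscope.api_key = api_key
    queue: Queue = Queue()
    callback = QueueCallback(queue)
    # Run the blocking SDK call in a worker thread; the callback feeds the
    # queue while this generator drains it, yielding audio as it arrives.
    worker = threading.Thread(
        target=SpeechSynthesizer.call,
        kwargs={"model": model, "text": text, "callback": callback},
    )
    worker.start()
    while (frame := queue.get()) is not None:
        yield frame
    worker.join()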