Remove tts (blocking call) (#6869)

2025-08-13 04:18:58 +08:00 · 2024-08-01 14:50:22 +08:00 · 2024-08-01 14:50:22 +08:00 · a9cd6df97e
commit a9cd6df97e
parent f31142e758
3 changed files with 7 additions and 138 deletions
--- a/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
@ -1,12 +1,8 @@
 import concurrent.futures
 import copy
-from functools import reduce
-from io import BytesIO
 from typing import Optional

-from flask import Response
 from openai import AzureOpenAI
-from pydub import AudioSegment

 from core.model_runtime.entities.model_entities import AIModelEntity
 from core.model_runtime.errors.invoke import InvokeBadRequestError
@ -51,7 +47,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
        :return: text translated to audio file
        """
        try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                model=model,
                credentials=credentials,
                content_text='Hello Dify!',
@ -60,45 +56,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))

-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
    def _tts_invoke_streaming(self, model: str,  credentials: dict, content_text: str,
                              voice: str) -> any:
        """
@ -144,7 +101,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
        :param sentence: text content to be translated
        :return: text translated to audio file
        """
-        # transform credentials to kwargs for model instance
        credentials_kwargs = self._to_credential_kwargs(credentials)
        client = AzureOpenAI(**credentials_kwargs)
        response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
--- a/api/core/model_runtime/model_providers/openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/openai/tts/tts.py
@ -1,11 +1,7 @@
 import concurrent.futures
-from functools import reduce
-from io import BytesIO
 from typing import Optional

-from flask import Response
 from openai import OpenAI
-from pydub import AudioSegment

 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@ -32,7 +28,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        :return: text translated to audio file
        """

-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
+        if not voice or voice not in [d['value'] for d in
+                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
            voice = self._get_model_default_voice(model, credentials)
        # if streaming:
        return self._tts_invoke_streaming(model=model,
@ -50,7 +47,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        :return: text translated to audio file
        """
        try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                model=model,
                credentials=credentials,
                content_text='Hello Dify!',
@ -59,46 +56,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))

-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
-
    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                              voice: str) -> any:
        """
@ -114,7 +71,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
            # doc: https://platform.openai.com/docs/guides/text-to-speech
            credentials_kwargs = self._to_credential_kwargs(credentials)
            client = OpenAI(**credentials_kwargs)
-            model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)]
+            model_support_voice = [x.get("value") for x in
+                                   self.get_tts_model_voices(model=model, credentials=credentials)]
            if not voice or voice not in model_support_voice:
                voice = self._get_model_default_voice(model, credentials)
            word_limit = self._get_model_word_limit(model, credentials)
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@ -1,7 +1,4 @@
-import concurrent.futures
 import threading
-from functools import reduce
-from io import BytesIO
 from queue import Queue
 from typing import Optional

@ -9,8 +6,6 @@ import dashscope
 from dashscope import SpeechSynthesizer
 from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
 from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
-from flask import Response
-from pydub import AudioSegment

 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@ -55,7 +50,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
        :return: text translated to audio file
        """
        try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                model=model,
                credentials=credentials,
                content_text='Hello Dify!',
@ -64,46 +59,6 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))

-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param voice: model timbre
-        :param content_text: text content to be translated
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence,
-                                           credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
-                           sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                              voice: str) -> any:
        """