diff --git a/api/core/model_runtime/model_providers/azure_openai/tts/tts.py b/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
index 50c125b873..3d2bac1c31 100644
--- a/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
@@ -1,12 +1,8 @@
 import concurrent.futures
 import copy
-from functools import reduce
-from io import BytesIO
 from typing import Optional
 
-from flask import Response
 from openai import AzureOpenAI
-from pydub import AudioSegment
 
 from core.model_runtime.entities.model_entities import AIModelEntity
 from core.model_runtime.errors.invoke import InvokeBadRequestError
@@ -51,7 +47,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -60,45 +56,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
@@ -144,7 +101,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :param sentence: text content to be translated
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
         client = AzureOpenAI(**credentials_kwargs)
         response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
diff --git a/api/core/model_runtime/model_providers/openai/tts/tts.py b/api/core/model_runtime/model_providers/openai/tts/tts.py
index d3fcf731f1..afa5d4b88a 100644
--- a/api/core/model_runtime/model_providers/openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/openai/tts/tts.py
@@ -1,11 +1,7 @@
 import concurrent.futures
-from functools import reduce
-from io import BytesIO
 from typing import Optional
 
-from flask import Response
 from openai import OpenAI
-from pydub import AudioSegment
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@@ -32,7 +28,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :return: text translated to audio file
         """
 
-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
+        if not voice or voice not in [d['value'] for d in
+                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
         # if streaming:
         return self._tts_invoke_streaming(model=model,
@@ -50,7 +47,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -59,46 +56,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
@@ -114,7 +71,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         # doc: https://platform.openai.com/docs/guides/text-to-speech
         credentials_kwargs = self._to_credential_kwargs(credentials)
         client = OpenAI(**credentials_kwargs)
-        model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)]
+        model_support_voice = [x.get("value") for x in
+                               self.get_tts_model_voices(model=model, credentials=credentials)]
         if not voice or voice not in model_support_voice:
             voice = self._get_model_default_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts.py b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
index 655ed2d1d0..664b02cd92 100644
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@@ -1,7 +1,4 @@
-import concurrent.futures
 import threading
-from functools import reduce
-from io import BytesIO
 from queue import Queue
 from typing import Optional
 
@@ -9,8 +6,6 @@ import dashscope
 from dashscope import SpeechSynthesizer
 from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
 from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
-from flask import Response
-from pydub import AudioSegment
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@@ -55,7 +50,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -64,46 +59,6 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param voice: model timbre
-        :param content_text: text content to be translated
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence,
-                                           credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
-                           sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
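
Note: the body of `_tts_invoke_streaming` sits outside these hunks, so the sketch below is an assumption about the streaming path that `validate_credentials` now exercises, not the PR's code. It uses the `client.audio.speech` API from the openai package (the same API `_process_sentence` retains); the helper name `stream_speech` is hypothetical.

```python
# Hypothetical sketch of a streaming TTS path like the one the new
# validate_credentials calls; NOT the PR's _tts_invoke_streaming body.
from openai import OpenAI


def stream_speech(client: OpenAI, model: str, voice: str, text: str):
    # with_streaming_response yields audio bytes incrementally instead of
    # buffering a whole file (the behaviour the removed _tts_invoke provided).
    with client.audio.speech.with_streaming_response.create(
        model=model, voice=voice, input=text.strip()
    ) as response:
        yield from response.iter_bytes(1024)


# Credential check in the spirit of the new validate_credentials: any
# failure while synthesizing a short probe text surfaces as an exception.
# client = OpenAI(api_key="sk-...")
# b"".join(stream_speech(client, "tts-1", "alloy", "Hello Dify!"))
```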
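The deleted `_tts_invoke` concatenated per-sentence audio into one blocking response via pydub. If a caller still needs a single combined file rather than a stream, the same logic can be reproduced outside the model runtime. A minimal sketch, assuming pydub/ffmpeg are installed and a `synthesize(sentence) -> bytes` callable stands in for the provider's `_process_sentence`:

```python
# Re-creation of the removed _tts_invoke concatenation logic.
# `synthesize` is a hypothetical stand-in for _process_sentence.
import concurrent.futures
from functools import reduce
from io import BytesIO
from typing import Callable

from pydub import AudioSegment  # decoding mp3 etc. requires ffmpeg


def combine_tts(sentences: list[str], synthesize: Callable[[str], bytes],
                audio_type: str = "mp3", max_workers: int = 3) -> bytes:
    # Synthesize sentences in parallel; executor.map preserves input order,
    # matching the ordered futures loop in the removed code.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        chunks = list(executor.map(synthesize, sentences))
    # Decode each chunk and append the segments end to end.
    segments = [AudioSegment.from_file(BytesIO(b), format=audio_type)
                for b in chunks if b]
    combined = reduce(lambda x, y: x + y, segments)
    buffer = BytesIO()
    combined.export(buffer, format=audio_type)
    return buffer.getvalue()
```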