Remove tts (blocking call) (#6869)

This commit is contained in:
chenxu9741 2024-08-01 14:50:22 +08:00 committed by GitHub
parent f31142e758
commit a9cd6df97e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 7 additions and 138 deletions

View File

@ -1,12 +1,8 @@
import concurrent.futures import concurrent.futures
import copy import copy
from functools import reduce
from io import BytesIO
from typing import Optional from typing import Optional
from flask import Response
from openai import AzureOpenAI from openai import AzureOpenAI
from pydub import AudioSegment
from core.model_runtime.entities.model_entities import AIModelEntity from core.model_runtime.entities.model_entities import AIModelEntity
from core.model_runtime.errors.invoke import InvokeBadRequestError from core.model_runtime.errors.invoke import InvokeBadRequestError
@ -51,7 +47,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
:return: text translated to audio file :return: text translated to audio file
""" """
try: try:
self._tts_invoke( self._tts_invoke_streaming(
model=model, model=model,
credentials=credentials, credentials=credentials,
content_text='Hello Dify!', content_text='Hello Dify!',
@ -60,45 +56,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
except Exception as ex: except Exception as ex:
raise CredentialsValidateFailedError(str(ex)) raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
credentials=credentials) for sentence in sentences]
for future in futures:
try:
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any: voice: str) -> any:
""" """
@ -144,7 +101,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
:param sentence: text content to be translated :param sentence: text content to be translated
:return: text translated to audio file :return: text translated to audio file
""" """
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials) credentials_kwargs = self._to_credential_kwargs(credentials)
client = AzureOpenAI(**credentials_kwargs) client = AzureOpenAI(**credentials_kwargs)
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip()) response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())

View File

@ -1,11 +1,7 @@
import concurrent.futures import concurrent.futures
from functools import reduce
from io import BytesIO
from typing import Optional from typing import Optional
from flask import Response
from openai import OpenAI from openai import OpenAI
from pydub import AudioSegment
from core.model_runtime.errors.invoke import InvokeBadRequestError from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError from core.model_runtime.errors.validate import CredentialsValidateFailedError
@ -32,7 +28,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
:return: text translated to audio file :return: text translated to audio file
""" """
if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]: if not voice or voice not in [d['value'] for d in
self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials) voice = self._get_model_default_voice(model, credentials)
# if streaming: # if streaming:
return self._tts_invoke_streaming(model=model, return self._tts_invoke_streaming(model=model,
@ -50,7 +47,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
:return: text translated to audio file :return: text translated to audio file
""" """
try: try:
self._tts_invoke( self._tts_invoke_streaming(
model=model, model=model,
credentials=credentials, credentials=credentials,
content_text='Hello Dify!', content_text='Hello Dify!',
@ -59,46 +56,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex: except Exception as ex:
raise CredentialsValidateFailedError(str(ex)) raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
credentials=credentials) for sentence in sentences]
for future in futures:
try:
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any: voice: str) -> any:
""" """
@ -114,7 +71,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
# doc: https://platform.openai.com/docs/guides/text-to-speech # doc: https://platform.openai.com/docs/guides/text-to-speech
credentials_kwargs = self._to_credential_kwargs(credentials) credentials_kwargs = self._to_credential_kwargs(credentials)
client = OpenAI(**credentials_kwargs) client = OpenAI(**credentials_kwargs)
model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)] model_support_voice = [x.get("value") for x in
self.get_tts_model_voices(model=model, credentials=credentials)]
if not voice or voice not in model_support_voice: if not voice or voice not in model_support_voice:
voice = self._get_model_default_voice(model, credentials) voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials) word_limit = self._get_model_word_limit(model, credentials)

View File

@ -1,7 +1,4 @@
import concurrent.futures
import threading import threading
from functools import reduce
from io import BytesIO
from queue import Queue from queue import Queue
from typing import Optional from typing import Optional
@ -9,8 +6,6 @@ import dashscope
from dashscope import SpeechSynthesizer from dashscope import SpeechSynthesizer
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
from flask import Response
from pydub import AudioSegment
from core.model_runtime.errors.invoke import InvokeBadRequestError from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError from core.model_runtime.errors.validate import CredentialsValidateFailedError
@ -55,7 +50,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
:return: text translated to audio file :return: text translated to audio file
""" """
try: try:
self._tts_invoke( self._tts_invoke_streaming(
model=model, model=model,
credentials=credentials, credentials=credentials,
content_text='Hello Dify!', content_text='Hello Dify!',
@ -64,46 +59,6 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
except Exception as ex: except Exception as ex:
raise CredentialsValidateFailedError(str(ex)) raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, sentence=sentence,
credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
sentences]
for future in futures:
try:
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any: voice: str) -> any:
""" """