Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git, synced 2025-08-12 22:49:00 +08:00
Remove tts (blocking call) (#6869)

parent f31142e758
commit a9cd6df97e
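In each provider below, validate_credentials now exercises _tts_invoke_streaming instead of the removed _tts_invoke, and the thread-pool/pydub machinery that synthesized every sentence before returning anything is deleted. A minimal sketch of the shape difference, with a stand-in synth() in place of a real TTS request:

from collections.abc import Iterator

def synth(sentence: str) -> bytes:
    return sentence.encode()  # stand-in for a real TTS request

def blocking_tts(sentences: list[str]) -> bytes:
    # Old shape: synthesize everything, concatenate, return once at the end.
    return b"".join(synth(s) for s in sentences)

def streaming_tts(sentences: list[str]) -> Iterator[bytes]:
    # New shape: hand each chunk to the caller as soon as it exists.
    for s in sentences:
        yield synth(s)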
--- a/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
@@ -1,12 +1,8 @@
 import concurrent.futures
 import copy
-from functools import reduce
-from io import BytesIO
 from typing import Optional
 
-from flask import Response
 from openai import AzureOpenAI
-from pydub import AudioSegment
 
 from core.model_runtime.entities.model_entities import AIModelEntity
 from core.model_runtime.errors.invoke import InvokeBadRequestError
@@ -51,7 +47,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -60,45 +56,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
@@ -144,7 +101,6 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :param sentence: text content to be translated
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
         client = AzureOpenAI(**credentials_kwargs)
         response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
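The deleted method condenses to the pattern below: fan sentences out to a thread pool, wait on every future, then concatenate the clips with pydub and return a single finished payload. Nothing reaches the caller until the slowest sentence is done, which is the blocking behaviour this commit removes. A standalone restatement for reference (process_sentence is a placeholder for the per-sentence TTS call, not a Dify API):

import concurrent.futures
from functools import reduce
from io import BytesIO

from pydub import AudioSegment  # decoding/encoding needs ffmpeg installed

def synthesize_all(sentences, process_sentence, audio_type="mp3", max_workers=3):
    # Fan out: one TTS request per sentence, in parallel.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_sentence, s) for s in sentences]
        # Wait for *all* results before producing any output at all.
        audio_bytes_list = [f.result() for f in futures if f.result()]

    if not audio_bytes_list:
        return b""
    # Concatenate every clip into one in-memory audio file.
    segments = [AudioSegment.from_file(BytesIO(b), format=audio_type)
                for b in audio_bytes_list]
    combined = reduce(lambda x, y: x + y, segments)
    buffer = BytesIO()
    combined.export(buffer, format=audio_type)
    return buffer.getvalue()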
--- a/api/core/model_runtime/model_providers/openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/openai/tts/tts.py
@@ -1,11 +1,7 @@
 import concurrent.futures
-from functools import reduce
-from io import BytesIO
 from typing import Optional
 
-from flask import Response
 from openai import OpenAI
-from pydub import AudioSegment
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@@ -32,7 +28,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :return: text translated to audio file
         """
 
-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
+        if not voice or voice not in [d['value'] for d in
+                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
         # if streaming:
         return self._tts_invoke_streaming(model=model,
@@ -50,7 +47,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -59,46 +56,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param content_text: text content to be translated
-        :param voice: model timbre
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
-                                           credentials=credentials) for sentence in sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
@@ -114,7 +71,8 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         # doc: https://platform.openai.com/docs/guides/text-to-speech
         credentials_kwargs = self._to_credential_kwargs(credentials)
         client = OpenAI(**credentials_kwargs)
-        model_support_voice = [x.get("value") for x in self.get_tts_model_voices(model=model, credentials=credentials)]
+        model_support_voice = [x.get("value") for x in
+                               self.get_tts_model_voices(model=model, credentials=credentials)]
         if not voice or voice not in model_support_voice:
             voice = self._get_model_default_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
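The streaming replacement is not shown in full in these hunks, but with the v1 openai SDK a streamed synthesis loop looks roughly like this sketch (with_streaming_response is the SDK's streaming helper; the exact call inside Dify's _tts_invoke_streaming may differ):

from openai import OpenAI

def stream_speech(client: OpenAI, model: str, voice: str, text: str):
    # Yield audio bytes as the API produces them instead of buffering the
    # complete file, so playback can start almost immediately.
    with client.audio.speech.with_streaming_response.create(
        model=model, voice=voice, input=text
    ) as response:
        yield from response.iter_bytes(chunk_size=4096)

# Hypothetical usage:
#   client = OpenAI(api_key="sk-...")
#   for chunk in stream_speech(client, "tts-1", "alloy", "Hello Dify!"):
#       sink.write(chunk)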
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@@ -1,7 +1,4 @@
-import concurrent.futures
 import threading
-from functools import reduce
-from io import BytesIO
 from queue import Queue
 from typing import Optional
 
@@ -9,8 +6,6 @@ import dashscope
 from dashscope import SpeechSynthesizer
 from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
 from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
-from flask import Response
-from pydub import AudioSegment
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
@@ -55,7 +50,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         :return: text translated to audio file
         """
         try:
-            self._tts_invoke(
+            self._tts_invoke_streaming(
                 model=model,
                 credentials=credentials,
                 content_text='Hello Dify!',
@@ -64,46 +59,6 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
-        """
-        _tts_invoke text2speech model
-
-        :param model: model name
-        :param credentials: model credentials
-        :param voice: model timbre
-        :param content_text: text content to be translated
-        :return: text translated to audio file
-        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        max_workers = self._get_model_workers_limit(model, credentials)
-        try:
-            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
-            audio_bytes_list = []
-
-            # Create a thread pool and map the function to the list of sentences
-            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence=sentence,
-                                           credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
-                           sentences]
-                for future in futures:
-                    try:
-                        if future.result():
-                            audio_bytes_list.append(future.result())
-                    except Exception as ex:
-                        raise InvokeBadRequestError(str(ex))
-
-            if len(audio_bytes_list) > 0:
-                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                                  audio_bytes_list if audio_bytes]
-                combined_segment = reduce(lambda x, y: x + y, audio_segments)
-                buffer: BytesIO = BytesIO()
-                combined_segment.export(buffer, format=audio_type)
-                buffer.seek(0)
-                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
-        except Exception as ex:
-            raise InvokeBadRequestError(str(ex))
-
     def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
         """
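The tongyi file keeps threading, Queue, ResultCallback, and SpeechSynthesisResult, which points at DashScope's callback-driven synthesis for the streaming path. A sketch of that queue-plus-callback pattern, assuming the dashscope v1 TTS API (an illustration, not Dify's exact implementation):

import threading
from queue import Queue

import dashscope
from dashscope import SpeechSynthesizer
from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult

class QueueCallback(ResultCallback):
    # Push each synthesized frame into a queue the moment it arrives.
    def __init__(self, queue: Queue):
        self._queue = queue

    def on_event(self, result: SpeechSynthesisResult):
        frame = result.get_audio_frame()
        if frame:
            self._queue.put(frame)

    def on_complete(self):
        self._queue.put(None)  # sentinel: synthesis finished

    def on_error(self, response):
        self._queue.put(None)  # sentinel: stop draining on errors too

def stream_tts(model: str, text: str, api_key: str):
    dashscope.api_key = api_key
    queue: Queue = Queue()
    callback = QueueCallback(queue)
    # Run the blocking SDK call in a worker thread; the callback feeds the
    # queue while this generator drains it, yielding audio as it arrives.
    worker = threading.Thread(
        target=SpeechSynthesizer.call,
        kwargs={"model": model, "text": text, "callback": callback},
    )
    worker.start()
    while (frame := queue.get()) is not None:
        yield frame
    worker.join()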