mirror of
https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git
synced 2025-08-12 03:29:01 +08:00
Add tongyi tts&tts function optimization (#2177)
Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM> Co-authored-by: crazywoola <427733928@qq.com> Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
parent
a96cae4f44
commit
ac4bb5c35f
@ -1,8 +1,13 @@
|
|||||||
|
import uuid
|
||||||
|
import hashlib
|
||||||
|
import subprocess
|
||||||
from abc import abstractmethod
|
from abc import abstractmethod
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
from core.model_runtime.errors.invoke import InvokeBadRequestError
|
||||||
from core.model_runtime.entities.model_entities import ModelType
|
from core.model_runtime.entities.model_entities import ModelType
|
||||||
from core.model_runtime.model_providers.__base.ai_model import AIModel
|
from core.model_runtime.model_providers.__base.ai_model import AIModel
|
||||||
|
from core.model_runtime.entities.model_entities import ModelPropertyKey
|
||||||
|
|
||||||
|
|
||||||
class TTSModel(AIModel):
|
class TTSModel(AIModel):
|
||||||
@ -40,3 +45,96 @@ class TTSModel(AIModel):
|
|||||||
:return: translated audio file
|
:return: translated audio file
|
||||||
"""
|
"""
|
||||||
raise NotImplementedError
|
raise NotImplementedError
|
||||||
|
|
||||||
|
def _get_model_voice(self, model: str, credentials: dict) -> any:
|
||||||
|
"""
|
||||||
|
Get voice for given tts model
|
||||||
|
|
||||||
|
:param model: model name
|
||||||
|
:param credentials: model credentials
|
||||||
|
:return: voice
|
||||||
|
"""
|
||||||
|
model_schema = self.get_model_schema(model, credentials)
|
||||||
|
|
||||||
|
if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
|
||||||
|
return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
|
||||||
|
|
||||||
|
def _get_model_audio_type(self, model: str, credentials: dict) -> str:
|
||||||
|
"""
|
||||||
|
Get audio type for given tts model
|
||||||
|
|
||||||
|
:param model: model name
|
||||||
|
:param credentials: model credentials
|
||||||
|
:return: voice
|
||||||
|
"""
|
||||||
|
model_schema = self.get_model_schema(model, credentials)
|
||||||
|
|
||||||
|
if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
|
||||||
|
return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
|
||||||
|
|
||||||
|
def _get_model_word_limit(self, model: str, credentials: dict) -> int:
|
||||||
|
"""
|
||||||
|
Get audio type for given tts model
|
||||||
|
:return: audio type
|
||||||
|
"""
|
||||||
|
model_schema = self.get_model_schema(model, credentials)
|
||||||
|
|
||||||
|
if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
|
||||||
|
return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
|
||||||
|
|
||||||
|
def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
|
||||||
|
"""
|
||||||
|
Get audio max workers for given tts model
|
||||||
|
:return: audio type
|
||||||
|
"""
|
||||||
|
model_schema = self.get_model_schema(model, credentials)
|
||||||
|
|
||||||
|
if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
|
||||||
|
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
|
||||||
|
if delimiters is None:
|
||||||
|
delimiters = set('。!?;\n')
|
||||||
|
|
||||||
|
buf = []
|
||||||
|
word_count = 0
|
||||||
|
for char in text:
|
||||||
|
buf.append(char)
|
||||||
|
if char in delimiters:
|
||||||
|
if word_count >= limit:
|
||||||
|
yield ''.join(buf)
|
||||||
|
buf = []
|
||||||
|
word_count = 0
|
||||||
|
else:
|
||||||
|
word_count += 1
|
||||||
|
else:
|
||||||
|
word_count += 1
|
||||||
|
|
||||||
|
if buf:
|
||||||
|
yield ''.join(buf)
|
||||||
|
|
||||||
|
@staticmethod
def _is_ffmpeg_installed():
    """
    Check that the ``ffmpeg`` executable can be run.

    ``ffmpeg -version`` is executed directly (argument list, no shell) and its
    output is checked for the ``ffmpeg version`` banner.

    :return: True when ffmpeg is available
    :raises InvokeBadRequestError: when ffmpeg cannot be started, exits with
        an error, or does not print the expected banner
    """
    not_installed_message = ("ffmpeg is not installed, "
                             "details: https://docs.dify.ai/getting-started/install-self-hosted"
                             "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")

    try:
        output = subprocess.check_output(["ffmpeg", "-version"])
    except (OSError, subprocess.SubprocessError) as ex:
        # OSError: executable missing or not runnable;
        # SubprocessError: it ran but exited with a non-zero status.
        raise InvokeBadRequestError(not_installed_message) from ex

    # errors="replace" keeps an unexpected byte in the banner from masking
    # the real outcome with a UnicodeDecodeError.
    if "ffmpeg version" not in output.decode("utf-8", errors="replace"):
        raise InvokeBadRequestError(not_installed_message)

    return True
|
||||||
|
|
||||||
|
# Todo: To improve the streaming function
|
||||||
|
@staticmethod
|
||||||
|
def _get_file_name(file_content: str) -> str:
|
||||||
|
hash_object = hashlib.sha256(file_content.encode())
|
||||||
|
hex_digest = hash_object.hexdigest()
|
||||||
|
|
||||||
|
namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
|
||||||
|
unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
|
||||||
|
return str(unique_uuid)
|
||||||
|
@ -1,18 +1,13 @@
|
|||||||
import uuid
|
|
||||||
import hashlib
|
|
||||||
import subprocess
|
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from functools import reduce
|
from functools import reduce
|
||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
|
|
||||||
from core.model_runtime.entities.model_entities import ModelPropertyKey
|
|
||||||
from core.model_runtime.errors.validate import CredentialsValidateFailedError
|
from core.model_runtime.errors.validate import CredentialsValidateFailedError
|
||||||
from core.model_runtime.errors.invoke import InvokeBadRequestError
|
from core.model_runtime.errors.invoke import InvokeBadRequestError
|
||||||
from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
||||||
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
|
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
|
||||||
|
|
||||||
from typing_extensions import Literal
|
|
||||||
from flask import Response, stream_with_context
|
from flask import Response, stream_with_context
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
import concurrent.futures
|
import concurrent.futures
|
||||||
@ -22,9 +17,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
|||||||
"""
|
"""
|
||||||
Model class for OpenAI Speech to text model.
|
Model class for OpenAI Speech to text model.
|
||||||
"""
|
"""
|
||||||
|
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
|
||||||
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
|
|
||||||
user: Optional[str] = None) -> any:
|
|
||||||
"""
|
"""
|
||||||
_invoke text2speech model
|
_invoke text2speech model
|
||||||
|
|
||||||
@ -65,7 +58,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
|||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
raise CredentialsValidateFailedError(str(ex))
|
raise CredentialsValidateFailedError(str(ex))
|
||||||
|
|
||||||
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
|
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
|
||||||
"""
|
"""
|
||||||
_tts_invoke text2speech model
|
_tts_invoke text2speech model
|
||||||
|
|
||||||
@ -104,8 +97,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
|||||||
raise InvokeBadRequestError(str(ex))
|
raise InvokeBadRequestError(str(ex))
|
||||||
|
|
||||||
# Todo: To improve the streaming function
|
# Todo: To improve the streaming function
|
||||||
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
|
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
|
||||||
user: Optional[str] = None) -> any:
|
|
||||||
"""
|
"""
|
||||||
_tts_invoke_streaming text2speech model
|
_tts_invoke_streaming text2speech model
|
||||||
|
|
||||||
@ -131,84 +123,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
|||||||
except Exception as ex:
|
except Exception as ex:
|
||||||
raise InvokeBadRequestError(str(ex))
|
raise InvokeBadRequestError(str(ex))
|
||||||
|
|
||||||
def _get_model_voice(self, model: str, credentials: dict) -> Literal[
|
|
||||||
"alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
|
|
||||||
"""
|
|
||||||
Get voice for given tts model
|
|
||||||
|
|
||||||
:param model: model name
|
|
||||||
:param credentials: model credentials
|
|
||||||
:return: voice
|
|
||||||
"""
|
|
||||||
model_schema = self.get_model_schema(model, credentials)
|
|
||||||
|
|
||||||
if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
|
|
||||||
return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
|
|
||||||
|
|
||||||
def _get_model_audio_type(self, model: str, credentials: dict) -> str:
|
|
||||||
"""
|
|
||||||
Get audio type for given tts model
|
|
||||||
|
|
||||||
:param model: model name
|
|
||||||
:param credentials: model credentials
|
|
||||||
:return: voice
|
|
||||||
"""
|
|
||||||
model_schema = self.get_model_schema(model, credentials)
|
|
||||||
|
|
||||||
if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
|
|
||||||
return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
|
|
||||||
|
|
||||||
def _get_model_word_limit(self, model: str, credentials: dict) -> int:
|
|
||||||
"""
|
|
||||||
Get audio type for given tts model
|
|
||||||
:return: audio type
|
|
||||||
"""
|
|
||||||
model_schema = self.get_model_schema(model, credentials)
|
|
||||||
|
|
||||||
if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
|
|
||||||
return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
|
|
||||||
|
|
||||||
def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
|
|
||||||
"""
|
|
||||||
Get audio max workers for given tts model
|
|
||||||
:return: audio type
|
|
||||||
"""
|
|
||||||
model_schema = self.get_model_schema(model, credentials)
|
|
||||||
|
|
||||||
if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
|
|
||||||
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
|
|
||||||
if delimiters is None:
|
|
||||||
delimiters = set('。!?;\n')
|
|
||||||
|
|
||||||
buf = []
|
|
||||||
word_count = 0
|
|
||||||
for char in text:
|
|
||||||
buf.append(char)
|
|
||||||
if char in delimiters:
|
|
||||||
if word_count >= limit:
|
|
||||||
yield ''.join(buf)
|
|
||||||
buf = []
|
|
||||||
word_count = 0
|
|
||||||
else:
|
|
||||||
word_count += 1
|
|
||||||
else:
|
|
||||||
word_count += 1
|
|
||||||
|
|
||||||
if buf:
|
|
||||||
yield ''.join(buf)
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _get_file_name(file_content: str) -> str:
|
|
||||||
hash_object = hashlib.sha256(file_content.encode())
|
|
||||||
hex_digest = hash_object.hexdigest()
|
|
||||||
|
|
||||||
namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
|
|
||||||
unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
|
|
||||||
return str(unique_uuid)
|
|
||||||
|
|
||||||
def _process_sentence(self, sentence: str, model: str, credentials: dict):
|
def _process_sentence(self, sentence: str, model: str, credentials: dict):
|
||||||
"""
|
"""
|
||||||
_tts_invoke openai text2speech model api
|
_tts_invoke openai text2speech model api
|
||||||
@ -226,18 +140,3 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
|
|||||||
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
|
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
|
||||||
if isinstance(response.read(), bytes):
|
if isinstance(response.read(), bytes):
|
||||||
return response.read()
|
return response.read()
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def _is_ffmpeg_installed():
|
|
||||||
try:
|
|
||||||
output = subprocess.check_output("ffmpeg -version", shell=True)
|
|
||||||
if "ffmpeg version" in output.decode("utf-8"):
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
raise InvokeBadRequestError("ffmpeg is not installed, "
|
|
||||||
"details: https://docs.dify.ai/getting-started/install-self-hosted"
|
|
||||||
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
|
|
||||||
except Exception:
|
|
||||||
raise InvokeBadRequestError("ffmpeg is not installed, "
|
|
||||||
"details: https://docs.dify.ai/getting-started/install-self-hosted"
|
|
||||||
"/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
|
|
||||||
|
23
api/core/model_runtime/model_providers/tongyi/_common.py
Normal file
23
api/core/model_runtime/model_providers/tongyi/_common.py
Normal file
@ -0,0 +1,23 @@
|
|||||||
|
from core.model_runtime.errors.invoke import InvokeError
|
||||||
|
|
||||||
|
|
||||||
|
class _CommonTongyi:
    """
    Helpers shared by the Tongyi model classes.
    """

    @staticmethod
    def _to_credential_kwargs(credentials: dict) -> dict:
        """
        Transform provider credentials to keyword arguments.

        :param credentials: model credentials, must contain ``dashscope_api_key``
        :return: dict holding the ``dashscope_api_key`` entry
        :raises KeyError: when ``dashscope_api_key`` is missing
        """
        credentials_kwargs = {
            "dashscope_api_key": credentials['dashscope_api_key'],
        }

        return credentials_kwargs

    @property
    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
        """
        Map model invoke error to unified error
        The key is the error type thrown to the caller
        The value is the error type thrown by the model,
        which needs to be converted into a unified error type for the caller.

        :return: Invoke error mapping
        """
        # No Tongyi-specific exception types are mapped here. Return an empty
        # mapping instead of None so the declared dict contract holds for any
        # caller that iterates over the result.
        return {}
|
@ -16,6 +16,7 @@ help:
|
|||||||
en_US: https://dashscope.console.aliyun.com/api-key_management
|
en_US: https://dashscope.console.aliyun.com/api-key_management
|
||||||
supported_model_types:
|
supported_model_types:
|
||||||
- llm
|
- llm
|
||||||
|
- tts
|
||||||
configurate_methods:
|
configurate_methods:
|
||||||
- predefined-model
|
- predefined-model
|
||||||
provider_credential_schema:
|
provider_credential_schema:
|
||||||
|
@ -0,0 +1,7 @@
|
|||||||
|
model: tts-1
|
||||||
|
model_type: tts
|
||||||
|
model_properties:
|
||||||
|
default_voice: 'sambert-zhiru-v1' # 音色参考 https://help.aliyun.com/zh/dashscope/model-list 配置
|
||||||
|
word_limit: 120
|
||||||
|
audio_type: 'mp3'
|
||||||
|
max_workers: 5
|
142
api/core/model_runtime/model_providers/tongyi/tts/tts.py
Normal file
142
api/core/model_runtime/model_providers/tongyi/tts/tts.py
Normal file
@ -0,0 +1,142 @@
|
|||||||
|
from io import BytesIO
|
||||||
|
from typing import Optional
|
||||||
|
from functools import reduce
|
||||||
|
from pydub import AudioSegment
|
||||||
|
|
||||||
|
from core.model_runtime.errors.validate import CredentialsValidateFailedError
|
||||||
|
from core.model_runtime.errors.invoke import InvokeBadRequestError
|
||||||
|
from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
||||||
|
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
|
||||||
|
|
||||||
|
import dashscope
|
||||||
|
from flask import Response, stream_with_context
|
||||||
|
import concurrent.futures
|
||||||
|
|
||||||
|
|
||||||
|
class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
    """
    Model class for Tongyi text to speech model.
    """

    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
                user: Optional[str] = None) -> any:
        """
        _invoke text2speech model

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param streaming: output is streaming
        :param user: unique user id
        :return: text translated to audio file
        """
        # Raises when ffmpeg is unavailable, before any audio is requested.
        self._is_ffmpeg_installed()
        audio_type = self._get_model_audio_type(model, credentials)
        if streaming:
            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                            credentials=credentials,
                                                                            content_text=content_text,
                                                                            user=user)),
                            status=200, mimetype=f'audio/{audio_type}')
        else:
            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)

    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
        """
        validate credentials text2speech model

        A short fixed sentence is synthesized; any failure is reported as a
        credentials validation failure.

        :param model: model name
        :param credentials: model credentials
        :param user: unique user id
        :raises CredentialsValidateFailedError: when the test synthesis fails
        """
        try:
            self._tts_invoke(
                model=model,
                credentials=credentials,
                content_text='Hello world!',
                user=user
            )
        except Exception as ex:
            raise CredentialsValidateFailedError(str(ex))

    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
        """
        _tts_invoke text2speech model

        The text is split into sentences, each sentence is synthesized in a
        thread pool, and the resulting audio pieces are concatenated in text
        order into one response.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param user: unique user id
        :return: text translated to audio file
        :raises InvokeBadRequestError: when synthesis fails or yields no audio
        """
        audio_type = self._get_model_audio_type(model, credentials)
        word_limit = self._get_model_word_limit(model, credentials)
        max_workers = self._get_model_workers_limit(model, credentials)

        try:
            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))

            # Create a thread pool and map the function to the list of sentences
            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
                                           credentials=credentials, audio_type=audio_type)
                           for sentence in sentences]
                # Results are collected in submission order so the audio
                # pieces stay in the same order as the sentences.
                audio_bytes_list = [future.result() for future in futures]

            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type)
                              for audio_bytes in audio_bytes_list if audio_bytes]
            if not audio_segments:
                # reduce() on an empty list would fail with an unrelated
                # TypeError; report the actual problem instead.
                raise InvokeBadRequestError('No audio data was returned for the given text')

            combined_segment = reduce(lambda x, y: x + y, audio_segments)
            buffer: BytesIO = BytesIO()
            combined_segment.export(buffer, format=audio_type)
            buffer.seek(0)
            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
        except InvokeBadRequestError:
            # Already the unified error type, no need to wrap it again.
            raise
        except Exception as ex:
            raise InvokeBadRequestError(str(ex)) from ex

    # Todo: To improve the streaming function
    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                              user: Optional[str] = None) -> any:
        """
        _tts_invoke_streaming text2speech model

        Generator: the text is split into sentences and the audio of each
        sentence is yielded as soon as it has been synthesized.

        :param model: model name
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param user: unique user id
        :return: generator of audio byte chunks, one per sentence
        :raises InvokeBadRequestError: when synthesis fails
        """
        # The API key is set on the dashscope module before the calls below.
        dashscope.api_key = credentials.get('dashscope_api_key')
        voice_name = self._get_model_voice(model, credentials)
        word_limit = self._get_model_word_limit(model, credentials)
        audio_type = self._get_model_audio_type(model, credentials)
        try:
            for sentence in self._split_text_into_sentences(text=content_text, limit=word_limit):
                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000,
                                                                      text=sentence.strip(),
                                                                      format=audio_type,
                                                                      word_timestamp_enabled=True,
                                                                      phoneme_timestamp_enabled=True)
                audio_data = response.get_audio_data()
                # Yield (not return) so every sentence is streamed, not just
                # the first one.
                if isinstance(audio_data, bytes):
                    yield audio_data
        except Exception as ex:
            raise InvokeBadRequestError(str(ex)) from ex

    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
        """
        _tts_invoke Tongyi text2speech model api

        :param model: model name
        :param credentials: model credentials
        :param sentence: text content to be translated
        :param audio_type: audio file type
        :return: audio bytes for the sentence, or None when the response
            carries no byte data
        """
        # The API key is set on the dashscope module before the call below.
        dashscope.api_key = credentials.get('dashscope_api_key')
        voice_name = self._get_model_voice(model, credentials)

        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000,
                                                              text=sentence.strip(), format=audio_type)
        audio_data = response.get_audio_data()
        if isinstance(audio_data, bytes):
            return audio_data
        return None
|
@ -495,7 +495,7 @@ The text generation application offers non-session support and is ideal for tran
|
|||||||
/>
|
/>
|
||||||
<Row>
|
<Row>
|
||||||
<Col>
|
<Col>
|
||||||
Text to speech, only supports openai model.
|
Text to speech.
|
||||||
|
|
||||||
### Request Body
|
### Request Body
|
||||||
|
|
||||||
|
@ -458,7 +458,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
|
|||||||
/>
|
/>
|
||||||
<Row>
|
<Row>
|
||||||
<Col>
|
<Col>
|
||||||
文字转语音,仅支持 openai 模型。
|
文字转语音。
|
||||||
|
|
||||||
### Request Body
|
### Request Body
|
||||||
|
|
||||||
|
@ -845,7 +845,7 @@ Chat applications support session persistence, allowing previous chat history to
|
|||||||
/>
|
/>
|
||||||
<Row>
|
<Row>
|
||||||
<Col>
|
<Col>
|
||||||
Text to speech, only supports openai model.
|
Text to speech.
|
||||||
|
|
||||||
### Request Body
|
### Request Body
|
||||||
|
|
||||||
|
@ -917,7 +917,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
|
|||||||
/>
|
/>
|
||||||
<Row>
|
<Row>
|
||||||
<Col>
|
<Col>
|
||||||
文字转语音,仅支持 openai 模型。
|
文字转语音。
|
||||||
|
|
||||||
### Request Body
|
### Request Body
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user