Mirror of https://git.mirrors.martin98.com/https://github.com/langgenius/dify.git (synced 2025-08-13)

tts add voice choose (#2391)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>

parent e47b5b43b8
commit 300d9892a5
@@ -1,7 +1,7 @@
 import logging
 
 from flask import request
-from flask_restful import Resource
+from flask_restful import Resource, reqparse
 from werkzeug.exceptions import InternalServerError
 
 import services
@@ -23,6 +23,7 @@ from controllers.console.wraps import account_initialization_required
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
 from libs.login import login_required
+from models.model import AppModelConfig
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -45,7 +46,9 @@ class ChatMessageAudioApi(Resource):
         try:
             response = AudioService.transcript_asr(
                 tenant_id=app_model.tenant_id,
-                file=file
+                file=file,
+                end_user=None,
+                promot=app_model.app_model_config.pre_prompt
             )
 
             return response
@@ -71,7 +74,7 @@ class ChatMessageAudioApi(Resource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error, {str(e)}.")
             raise InternalServerError()
 
 
@@ -82,10 +85,17 @@ class ChatMessageTextApi(Resource):
     def post(self, app_id):
         app_id = str(app_id)
         app_model = _get_app(app_id, None)
+
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()
+
         try:
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
 
@@ -112,9 +122,54 @@ class ChatMessageTextApi(Resource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error, {str(e)}.")
+            raise InternalServerError()
+
+
+class TextModesApi(Resource):
+    def get(self, app_id: str):
+        app_model = _get_app(str(app_id))
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()
+
+        try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('language', type=str, required=True, location='args')
+            args = parser.parse_args()
+
+            response = AudioService.transcript_tts_voices(
+                tenant_id=app_model.tenant_id,
+                language=args['language'],
+            )
+
+            return response
+        except services.errors.audio.ProviderNotSupportTextToSpeechLanageServiceError:
+            raise AppUnavailableError("Text to audio voices language parameter loss.")
+        except NoAudioUploadedServiceError:
+            raise NoAudioUploadedError()
+        except AudioTooLargeServiceError as e:
+            raise AudioTooLargeError(str(e))
+        except UnsupportedAudioTypeServiceError:
+            raise UnsupportedAudioTypeError()
+        except ProviderNotSupportSpeechToTextServiceError:
+            raise ProviderNotSupportSpeechToTextError()
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
+        except QuotaExceededError:
+            raise ProviderQuotaExceededError()
+        except ModelCurrentlyNotSupportError:
+            raise ProviderModelCurrentlyNotSupportError()
+        except InvokeError as e:
+            raise CompletionRequestError(e.description)
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            logging.exception(f"internal server error, {str(e)}.")
             raise InternalServerError()
 
 
 api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')
 api.add_resource(ChatMessageTextApi, '/apps/<uuid:app_id>/text-to-audio')
+api.add_resource(TextModesApi, '/apps/<uuid:app_id>/text-to-audio/voices')
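For reference, the voices route registered above can be exercised as sketched below. This is not part of the commit: the base URL, app UUID, and auth header are placeholders, and the exact console API prefix depends on the deployment.

import requests  # third-party HTTP client, assumed available

BASE_URL = 'http://localhost:5001/console/api'   # placeholder deployment URL
APP_ID = '00000000-0000-0000-0000-000000000000'  # placeholder app UUID
TOKEN = 'your-console-token'                     # placeholder credential

# GET /apps/<uuid:app_id>/text-to-audio/voices?language=zh-CN
resp = requests.get(
    f'{BASE_URL}/apps/{APP_ID}/text-to-audio/voices',
    params={'language': 'zh-CN'},
    headers={'Authorization': f'Bearer {TOKEN}'},
)
print(resp.json())  # e.g. [{'name': 'Alloy', 'value': 'alloy'}, ...]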
@@ -85,6 +85,7 @@ class ChatTextApi(InstalledAppResource):
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
             return {'data': response.data.decode('latin1')}
@@ -86,6 +86,7 @@ class TextApi(AppApiResource):
             tenant_id=app_model.tenant_id,
             text=args['text'],
             end_user=args['user'],
+            voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
             streaming=args['streaming']
         )
 
@@ -68,17 +68,23 @@ class AudioApi(WebApiResource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error: {str(e)}")
             raise InternalServerError()
 
 
 class TextApi(WebApiResource):
     def post(self, app_model: App, end_user):
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()
+
         try:
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
                 end_user=end_user.external_user_id,
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
 
@@ -105,7 +111,7 @@ class TextApi(WebApiResource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error: {str(e)}")
             raise InternalServerError()
 
 
@@ -28,6 +28,7 @@ from core.entities.application_entities import (
     ModelConfigEntity,
     PromptTemplateEntity,
     SensitiveWordAvoidanceEntity,
+    TextToSpeechEntity,
 )
 from core.entities.model_entities import ModelStatus
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
@@ -572,7 +573,11 @@ class ApplicationManager:
         text_to_speech_dict = copy_app_model_config_dict.get('text_to_speech')
         if text_to_speech_dict:
             if 'enabled' in text_to_speech_dict and text_to_speech_dict['enabled']:
-                properties['text_to_speech'] = True
+                properties['text_to_speech'] = TextToSpeechEntity(
+                    enabled=text_to_speech_dict.get('enabled'),
+                    voice=text_to_speech_dict.get('voice'),
+                    language=text_to_speech_dict.get('language'),
+                )
 
         # sensitive word avoidance
         sensitive_word_avoidance_dict = copy_app_model_config_dict.get('sensitive_word_avoidance')
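Isolated from the ApplicationManager plumbing, the new dict-to-entity mapping can be sketched as a self-contained snippet. The entity definition is copied from this commit; the sample dict stands in for copy_app_model_config_dict.get('text_to_speech').

from typing import Optional
from pydantic import BaseModel

class TextToSpeechEntity(BaseModel):
    enabled: bool
    voice: Optional[str] = None
    language: Optional[str] = None

# sample of the stored app model config dict (placeholder values)
text_to_speech_dict = {"enabled": True, "voice": "alloy", "language": "en-US"}

properties = {}
if text_to_speech_dict and text_to_speech_dict.get("enabled"):
    # the commit replaces the old boolean flag with a structured entity
    properties["text_to_speech"] = TextToSpeechEntity(
        enabled=text_to_speech_dict.get("enabled"),
        voice=text_to_speech_dict.get("voice"),
        language=text_to_speech_dict.get("language"),
    )
print(properties["text_to_speech"])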
@@ -42,6 +42,7 @@ class AdvancedCompletionPromptTemplateEntity(BaseModel):
     """
     Advanced Completion Prompt Template Entity.
     """
+
     class RolePrefixEntity(BaseModel):
         """
         Role Prefix Entity.
@@ -57,6 +58,7 @@ class PromptTemplateEntity(BaseModel):
     """
     Prompt Template Entity.
     """
+
     class PromptType(Enum):
         """
         Prompt Type.
@@ -97,6 +99,7 @@ class DatasetRetrieveConfigEntity(BaseModel):
     """
     Dataset Retrieve Config Entity.
     """
+
     class RetrieveStrategy(Enum):
         """
         Dataset Retrieve Strategy.
@@ -143,6 +146,15 @@ class SensitiveWordAvoidanceEntity(BaseModel):
     config: dict[str, Any] = {}
 
 
+class TextToSpeechEntity(BaseModel):
+    """
+    Sensitive Word Avoidance Entity.
+    """
+    enabled: bool
+    voice: Optional[str] = None
+    language: Optional[str] = None
+
+
 class FileUploadEntity(BaseModel):
     """
     File Upload Entity.
@@ -159,6 +171,7 @@ class AgentToolEntity(BaseModel):
     tool_name: str
     tool_parameters: dict[str, Any] = {}
 
+
 class AgentPromptEntity(BaseModel):
     """
     Agent Prompt Entity.
@@ -166,6 +179,7 @@ class AgentPromptEntity(BaseModel):
     first_prompt: str
     next_iteration: str
 
+
 class AgentScratchpadUnit(BaseModel):
     """
     Agent First Prompt Entity.
@@ -184,10 +198,12 @@ class AgentScratchpadUnit(BaseModel):
     observation: Optional[str] = None
     action: Optional[Action] = None
 
+
 class AgentEntity(BaseModel):
     """
     Agent Entity.
     """
+
     class Strategy(Enum):
         """
         Agent Strategy.
@@ -202,6 +218,7 @@ class AgentEntity(BaseModel):
     tools: list[AgentToolEntity] = None
     max_iteration: int = 5
 
+
 class AppOrchestrationConfigEntity(BaseModel):
     """
     App Orchestration Config Entity.
@@ -219,7 +236,7 @@ class AppOrchestrationConfigEntity(BaseModel):
     show_retrieve_source: bool = False
     more_like_this: bool = False
     speech_to_text: bool = False
-    text_to_speech: bool = False
+    text_to_speech: dict = {}
     sensitive_word_avoidance: Optional[SensitiveWordAvoidanceEntity] = None
 
 
@@ -99,7 +99,8 @@ class ModelInstance:
             user=user
         )
 
-    def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, top_n: Optional[int] = None,
+    def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None,
+                      top_n: Optional[int] = None,
                       user: Optional[str] = None) \
             -> RerankResult:
         """
@@ -166,13 +167,15 @@ class ModelInstance:
             user=user
         )
 
-    def invoke_tts(self, content_text: str, streaming: bool, user: Optional[str] = None) \
+    def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \
             -> str:
         """
-        Invoke large language model
+        Invoke large language tts model
 
         :param content_text: text content to be translated
+        :param tenant_id: user tenant id
         :param user: unique user id
+        :param voice: model timbre
         :param streaming: output is streaming
         :return: text for given audio file
         """
@@ -185,9 +188,28 @@ class ModelInstance:
             credentials=self.credentials,
             content_text=content_text,
             user=user,
+            tenant_id=tenant_id,
+            voice=voice,
             streaming=streaming
         )
 
+    def get_tts_voices(self, language: str) -> list:
+        """
+        Invoke large language tts model voices
+
+        :param language: tts language
+        :return: tts model voices
+        """
+        if not isinstance(self.model_type_instance, TTSModel):
+            raise Exception("Model type instance is not TTSModel")
+
+        self.model_type_instance = cast(TTSModel, self.model_type_instance)
+        return self.model_type_instance.get_tts_model_voices(
+            model=self.model,
+            credentials=self.credentials,
+            language=language
+        )
+
 
 class ModelManager:
     def __init__(self) -> None:
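A minimal sketch of the guard-and-cast flow in the new ModelInstance.get_tts_voices. Both classes here are trimmed stand-ins for the real Dify classes, with the provider schema lookup replaced by a canned voice list:

from typing import cast

class TTSModel:  # stand-in for the real base class in core.model_runtime
    def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list:
        # canned result; the real method reads the provider YAML schema
        return [{'name': 'Alloy', 'value': 'alloy'}]

class ModelInstance:  # trimmed to the method added by this commit
    def __init__(self, model_type_instance, model: str, credentials: dict):
        self.model_type_instance = model_type_instance
        self.model = model
        self.credentials = credentials

    def get_tts_voices(self, language: str) -> list:
        if not isinstance(self.model_type_instance, TTSModel):
            raise Exception("Model type instance is not TTSModel")

        self.model_type_instance = cast(TTSModel, self.model_type_instance)
        return self.model_type_instance.get_tts_model_voices(
            model=self.model,
            credentials=self.credentials,
            language=language
        )

print(ModelInstance(TTSModel(), 'tts-1', {}).get_tts_voices('zh-CN'))
# [{'name': 'Alloy', 'value': 'alloy'}]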
@@ -48,6 +48,10 @@
 - `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`)
 - `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`)
 - `default_voice` (string) default voice, e.g.:alloy,echo,fable,onyx,nova,shimmer(available for model type `tts`)
+- `voices` (list) List of available voice.(available for model type `tts`)
+  - `mode` (string) voice model.(available for model type `tts`)
+  - `name` (string) voice model display name.(available for model type `tts`)
+  - `lanuage` (string) the voice model supports languages.(available for model type `tts`)
 - `word_limit` (int) Single conversion word limit, paragraphwise by default(available for model type `tts`)
 - `audio_type` (string) Support audio file extension format, e.g.:mp3,wav(available for model type `tts`)
 - `max_workers` (int) Number of concurrent workers supporting text and audio conversion(available for model type`tts`)
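Once the provider YAML is loaded, the documented model_properties block arrives as plain Python data. A sketch of the shape consumers can expect, with values taken from this commit's OpenAI YAML and abbreviated to two voices:

# what yaml.safe_load(...) would yield for the model_properties block
model_properties = {
    'default_voice': 'alloy',
    'voices': [
        {'mode': 'alloy', 'name': 'Alloy', 'language': ['zh-CN', 'en-US']},
        {'mode': 'echo', 'name': 'Echo', 'language': ['zh-CN', 'en-US']},
    ],
    'word_limit': 120,
    'audio_type': 'mp3',
    'max_workers': 5,
}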
@@ -48,7 +48,11 @@
 - `max_chunks` (int) 最大分块数量 (模型类型 `text-embedding ` `moderation` 可用)
 - `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用)
 - `supported_file_extensions` (string) 支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用)
-- `default_voice` (string) 缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
+- `default_voice` (string) 缺省音色,必选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
+- `voices` (list) 可选音色列表。
+  - `mode` (string) 音色模型。(模型类型 `tts` 可用)
+  - `name` (string) 音色模型显示名称。(模型类型 `tts` 可用)
+  - `lanuage` (string) 音色模型支持语言。(模型类型 `tts` 可用)
 - `word_limit` (int) 单次转换字数限制,默认按段落分段(模型类型 `tts` 可用)
 - `audio_type` (string) 支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用)
 - `max_workers` (int) 支持文字音频转换并发任务数(模型类型 `tts` 可用)
@@ -127,6 +127,7 @@ class ModelPropertyKey(Enum):
     SUPPORTED_FILE_EXTENSIONS = "supported_file_extensions"
     MAX_CHARACTERS_PER_CHUNK = "max_characters_per_chunk"
     DEFAULT_VOICE = "default_voice"
+    VOICES = "voices"
     WORD_LIMIT = "word_limit"
     AUDOI_TYPE = "audio_type"
     MAX_WORKERS = "max_workers"
@@ -15,29 +15,37 @@ class TTSModel(AIModel):
     """
     model_type: ModelType = ModelType.TTS
 
-    def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
+    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+               user: Optional[str] = None):
         """
         Invoke large language model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
         :param streaming: output is streaming
         :param user: unique user id
         :return: translated audio file
         """
         try:
-            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text)
+            self._is_ffmpeg_installed()
+            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
+                                content_text=content_text, voice=voice, tenant_id=tenant_id)
         except Exception as e:
             raise self._transform_invoke_error(e)
 
     @abstractmethod
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+                user: Optional[str] = None):
         """
         Invoke large language model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
         :param streaming: output is streaming
         :param user: unique user id
@@ -45,7 +53,22 @@ class TTSModel(AIModel):
         """
         raise NotImplementedError
 
-    def _get_model_voice(self, model: str, credentials: dict) -> any:
+    def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list:
+        """
+        Get voice for given tts model voices
+
+        :param language: tts language
+        :param model: model name
+        :param credentials: model credentials
+        :return: voices lists
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties:
+            voices = model_schema.model_properties[ModelPropertyKey.VOICES]
+            return [{'name': d['name'], 'value': d['mode']} for d in voices if language and language in d.get('language')]
+
+    def _get_model_default_voice(self, model: str, credentials: dict) -> any:
         """
         Get voice for given tts model
 
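To make the filtering behaviour concrete, the comprehension from get_tts_model_voices can be rerun on sample data. Note that `language in d.get('language')` is list membership, since each voice declares a list of supported languages:

voices = [
    {'mode': 'alloy', 'name': 'Alloy', 'language': ['zh-CN', 'en-US']},
    {'mode': 'sambert-camila-v1', 'name': 'Camila(西班牙语女声)', 'language': ['es-ES']},
]

language = 'es-ES'
result = [{'name': d['name'], 'value': d['mode']}
          for d in voices if language and language in d.get('language')]
print(result)  # [{'name': 'Camila(西班牙语女声)', 'value': 'sambert-camila-v1'}]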
@@ -1,7 +1,31 @@
-model: tts-1-hd
+model: tts-1
 model_type: tts
 model_properties:
   default_voice: 'alloy'
+  voices:
+    - mode: 'alloy'
+      name: 'Alloy'
+      language: ['zh-CN', 'en-US']
+    - mode: 'echo'
+      name: 'Echo'
+      language: ['zh-CN', 'en-US']
+    - mode: 'fable'
+      name: 'Fable'
+      language: ['zh-CN', 'en-US']
+    - mode: 'onyx'
+      name: 'Onyx'
+      language: ['zh-CN', 'en-US']
+    - mode: 'nova'
+      name: 'Nova'
+      language: ['zh-CN', 'en-US']
+    - mode: 'shimmer'
+      name: 'Shimmer'
+      language: ['zh-CN', 'en-US']
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
+pricing:
+  input: '0.03'
+  output: '0'
+  unit: '0.001'
+  currency: USD
@@ -2,6 +2,30 @@ model: tts-1
 model_type: tts
 model_properties:
   default_voice: 'alloy'
+  voices:
+    - mode: 'alloy'
+      name: 'Alloy'
+      language: ['zh-CN', 'en-US']
+    - mode: 'echo'
+      name: 'Echo'
+      language: ['zh-CN', 'en-US']
+    - mode: 'fable'
+      name: 'Fable'
+      language: ['zh-CN', 'en-US']
+    - mode: 'onyx'
+      name: 'Onyx'
+      language: ['zh-CN', 'en-US']
+    - mode: 'nova'
+      name: 'Nova'
+      language: ['zh-CN', 'en-US']
+    - mode: 'shimmer'
+      name: 'Shimmer'
+      language: ['zh-CN', 'en-US']
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
+pricing:
+  input: '0.015'
+  output: '0'
+  unit: '0.001'
+  currency: USD
@@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
+from extensions.ext_storage import storage
 
 
 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
     """
     Model class for OpenAI Speech to text model.
     """
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+
+    def _invoke(self, model: str, tenant_id: str, credentials: dict,
+                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
         """
         _invoke text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param content_text: text content to be translated
+        :param voice: model timbre
         :param streaming: output is streaming
         :param user: unique user id
         :return: text translated to audio file
         """
-        self._is_ffmpeg_installed()
         audio_type = self._get_model_audio_type(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         if streaming:
             return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                            credentials=credentials,
                                                                            content_text=content_text,
-                                                                           user=user)),
+                                                                           tenant_id=tenant_id,
+                                                                           voice=voice)),
                             status=200, mimetype=f'audio/{audio_type}')
         else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
@@ -52,40 +59,41 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
             self._tts_invoke(
                 model=model,
                 credentials=credentials,
-                content_text='Hello world!',
-                user=user
+                content_text='Hello Dify!',
+                voice=self._get_model_default_voice(model, credentials),
             )
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
         """
         _tts_invoke text2speech model
 
         :param model: model name
         :param credentials: model credentials
         :param content_text: text content to be translated
-        :param user: unique user id
+        :param voice: model timbre
         :return: text translated to audio file
         """
         audio_type = self._get_model_audio_type(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
 
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             audio_bytes_list = list()
 
             # Create a thread pool and map the function to the list of sentences
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence, model, credentials) for sentence
-                           in sentences]
+                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
+                                           credentials=credentials) for sentence in sentences]
                 for future in futures:
                     try:
+                        if future.result():
                             audio_bytes_list.append(future.result())
                     except Exception as ex:
                         raise InvokeBadRequestError(str(ex))
 
+            if len(audio_bytes_list) > 0:
                 audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
                                   audio_bytes_list if audio_bytes]
                 combined_segment = reduce(lambda x, y: x + y, audio_segments)
@@ -97,46 +105,50 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
             raise InvokeBadRequestError(str(ex))
 
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+                              voice: str) -> any:
         """
         _tts_invoke_streaming text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param content_text: text content to be translated
-        :param user: unique user id
+        :param voice: model timbre
         :return: text translated to audio file
         """
         # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
-        voice_name = self._get_model_voice(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
         tts_file_id = self._get_file_name(content_text)
-        file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}'
+        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
             client = OpenAI(**credentials_kwargs)
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
-                response.stream_to_file(file_path)
+                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
+                # response.stream_to_file(file_path)
+                storage.save(file_path, response.read())
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
 
-    def _process_sentence(self, sentence: str, model: str, credentials: dict):
+    def _process_sentence(self, sentence: str, model: str,
+                          voice, credentials: dict):
         """
         _tts_invoke openai text2speech model api
 
         :param model: model name
         :param credentials: model credentials
+        :param voice: model timbre
         :param sentence: text content to be translated
         :return: text translated to audio file
         """
         # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
-        voice_name = self._get_model_voice(model, credentials)
 
         client = OpenAI(**credentials_kwargs)
-        response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
+        response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
         if isinstance(response.read(), bytes):
             return response.read()
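The worker pattern in _tts_invoke, isolated as a runnable sketch. _process_sentence here is a placeholder that returns fake audio bytes instead of calling the OpenAI speech API, and the `if future.result():` guard is the check this commit adds before segment concatenation:

import concurrent.futures

def _process_sentence(sentence: str) -> bytes:
    # placeholder for client.audio.speech.create(...); returns fake audio bytes
    return sentence.strip().encode() if sentence.strip() else b''

sentences = ['Hello Dify!', '   ', 'Voice selection works.']
audio_bytes_list = []
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    futures = [executor.submit(_process_sentence, s) for s in sentences]
    for future in futures:
        if future.result():  # skip empty chunks before merging segments
            audio_bytes_list.append(future.result())
print(audio_bytes_list)  # [b'Hello Dify!', b'Voice selection works.']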
@@ -1,7 +1,134 @@
 model: tts-1
 model_type: tts
 model_properties:
-  default_voice: 'sambert-zhiru-v1' # 音色参考 https://help.aliyun.com/zh/dashscope/model-list 配置
+  default_voice: 'sambert-zhiru-v1'
+  voices:
+    - mode: "sambert-zhinan-v1"
+      name: "知楠(广告男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiqi-v1"
+      name: "知琪(温柔女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhichu-v1"
+      name: "知厨(新闻播报)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhide-v1"
+      name: "知德(新闻男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhijia-v1"
+      name: "知佳(标准女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiru-v1"
+      name: "知茹(新闻女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiqian-v1"
+      name: "知倩(配音解说、新闻播报)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhixiang-v1"
+      name: "知祥(配音解说)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiwei-v1"
+      name: "知薇(萝莉女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhihao-v1"
+      name: "知浩(咨询男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhijing-v1"
+      name: "知婧(严厉女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiming-v1"
+      name: "知茗(诙谐男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhimo-v1"
+      name: "知墨(情感男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhina-v1"
+      name: "知娜(浙普女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhishu-v1"
+      name: "知树(资讯男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhistella-v1"
+      name: "知莎(知性女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiting-v1"
+      name: "知婷(电台女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhixiao-v1"
+      name: "知笑(资讯女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiya-v1"
+      name: "知雅(严厉女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiye-v1"
+      name: "知晔(青年男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiying-v1"
+      name: "知颖(软萌童声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiyuan-v1"
+      name: "知媛(知心姐姐)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhigui-v1"
+      name: "知柜(直播女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhishuo-v1"
+      name: "知硕(自然男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhimiao-emo-v1"
+      name: "知妙(多种情感女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhimao-v1"
+      name: "知猫(直播女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhilun-v1"
+      name: "知伦(悬疑解说)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhifei-v1"
+      name: "知飞(激昂解说)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhida-v1"
+      name: "知达(标准男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-camila-v1"
+      name: "Camila(西班牙语女声)"
+      language: [ "es-ES" ]
+    - mode: "sambert-perla-v1"
+      name: "Perla(意大利语女声)"
+      language: [ "it-IT" ]
+    - mode: "sambert-indah-v1"
+      name: "Indah(印尼语女声)"
+      language: [ "id-ID" ]
+    - mode: "sambert-clara-v1"
+      name: "Clara(法语女声)"
+      language: [ "fr-FR" ]
+    - mode: "sambert-hanna-v1"
+      name: "Hanna(德语女声)"
+      language: [ "de-DE" ]
+    - mode: "sambert-beth-v1"
+      name: "Beth(咨询女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-betty-v1"
+      name: "Betty(客服女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-cally-v1"
+      name: "Cally(自然女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-cindy-v1"
+      name: "Cindy(对话女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-eva-v1"
+      name: "Eva(陪伴女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-donna-v1"
+      name: "Donna(教育女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-brian-v1"
+      name: "Brian(客服男声)"
+      language: [ "en-US" ]
+    - mode: "sambert-waan-v1"
+      name: "Waan(泰语女声)"
+      language: [ "th-TH" ]
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
@@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
+from extensions.ext_storage import storage
 
 
 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
     """
     Model class for Tongyi Speech to text model.
     """
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+                user: Optional[str] = None) -> any:
         """
         _invoke text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
         :param streaming: output is streaming
         :param user: unique user id
         :return: text translated to audio file
         """
-        self._is_ffmpeg_installed()
         audio_type = self._get_model_audio_type(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         if streaming:
             return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                            credentials=credentials,
                                                                            content_text=content_text,
-                                                                           user=user)),
+                                                                           voice=voice,
+                                                                           tenant_id=tenant_id)),
                             status=200, mimetype=f'audio/{audio_type}')
         else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
@@ -52,40 +59,42 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
             self._tts_invoke(
                 model=model,
                 credentials=credentials,
-                content_text='Hello world!',
-                user=user
+                content_text='Hello Dify!',
+                voice=self._get_model_default_voice(model, credentials),
             )
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
         """
         _tts_invoke text2speech model
 
         :param model: model name
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
-        :param user: unique user id
         :return: text translated to audio file
         """
         audio_type = self._get_model_audio_type(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
 
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             audio_bytes_list = list()
 
             # Create a thread pool and map the function to the list of sentences
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
-                           credentials=credentials, audio_type=audio_type) for sentence in sentences]
+                futures = [executor.submit(self._process_sentence, sentence=sentence,
+                                           credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
+                           sentences]
                 for future in futures:
                     try:
+                        if future.result():
                             audio_bytes_list.append(future.result())
                     except Exception as ex:
                         raise InvokeBadRequestError(str(ex))
 
+            if len(audio_bytes_list) > 0:
                 audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
                                   audio_bytes_list if audio_bytes]
                 combined_segment = reduce(lambda x, y: x + y, audio_segments)
@@ -97,46 +106,49 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
             raise InvokeBadRequestError(str(ex))
 
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+                              voice: str) -> any:
         """
         _tts_invoke_streaming text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
-        :param user: unique user id
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         dashscope.api_key = credentials.get('dashscope_api_key')
-        voice_name = self._get_model_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
+        tts_file_id = self._get_file_name(content_text)
+        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
        try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             for sentence in sentences:
-                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
+                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                                      text=sentence.strip(),
                                                                       format=audio_type, word_timestamp_enabled=True,
                                                                       phoneme_timestamp_enabled=True)
                 if isinstance(response.get_audio_data(), bytes):
-                    return response.get_audio_data()
+                    storage.save(file_path, response.get_audio_data())
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
 
-    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
+    @staticmethod
+    def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str):
         """
         _tts_invoke Tongyi text2speech model api
 
-        :param model: model name
         :param credentials: model credentials
         :param sentence: text content to be translated
+        :param voice: model timbre
         :param audio_type: audio file type
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         dashscope.api_key = credentials.get('dashscope_api_key')
-        voice_name = self._get_model_voice(model, credentials)
-
-        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
+        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                              text=sentence.strip(),
+                                                              format=audio_type)
         if isinstance(response.get_audio_data(), bytes):
             return response.get_audio_data()
@@ -98,7 +98,9 @@ class AppModelConfigService:
         # text_to_speech
         if 'text_to_speech' not in config or not config["text_to_speech"]:
             config["text_to_speech"] = {
-                "enabled": False
+                "enabled": False,
+                "voice": "",
+                "language": ""
             }
 
         if not isinstance(config["text_to_speech"], dict):
@@ -106,6 +108,8 @@ class AppModelConfigService:
 
         if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]:
             config["text_to_speech"]["enabled"] = False
+            config["text_to_speech"]["voice"] = ""
+            config["text_to_speech"]["language"] = ""
 
         if not isinstance(config["text_to_speech"]["enabled"], bool):
             raise ValueError("enabled in text_to_speech must be of boolean type")
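The default-filling behaviour added to AppModelConfigService, pulled out into a standalone helper for illustration (the function name is hypothetical; in the commit this logic lives inline in the validation method):

def fill_text_to_speech_defaults(config: dict) -> dict:
    # mirror of the two validation blocks above
    if 'text_to_speech' not in config or not config["text_to_speech"]:
        config["text_to_speech"] = {
            "enabled": False,
            "voice": "",
            "language": ""
        }

    if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]:
        config["text_to_speech"]["enabled"] = False
        config["text_to_speech"]["voice"] = ""
        config["text_to_speech"]["language"] = ""
    return config

print(fill_text_to_speech_defaults({}))
# {'text_to_speech': {'enabled': False, 'voice': '', 'language': ''}}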
@ -13,14 +13,14 @@ from services.errors.audio import (
|
|||||||
UnsupportedAudioTypeServiceError,
|
UnsupportedAudioTypeServiceError,
|
||||||
)
|
)
|
||||||
|
|
||||||
FILE_SIZE = 15
|
FILE_SIZE = 30
|
||||||
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
|
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
|
||||||
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
|
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
|
||||||
|
|
||||||
|
|
||||||
class AudioService:
|
class AudioService:
|
||||||
@classmethod
|
@classmethod
|
||||||
def transcript_asr(cls, tenant_id: str, file: FileStorage, end_user: Optional[str] = None):
|
def transcript_asr(cls, tenant_id: str, file: FileStorage, promot: str, end_user: Optional[str] = None):
|
||||||
if file is None:
|
if file is None:
|
||||||
raise NoAudioUploadedServiceError()
|
raise NoAudioUploadedServiceError()
|
||||||
|
|
||||||
@ -49,7 +49,7 @@ class AudioService:
|
|||||||
return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
|
return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def transcript_tts(cls, tenant_id: str, text: str, streaming: bool, end_user: Optional[str] = None):
|
def transcript_tts(cls, tenant_id: str, text: str, voice: str, streaming: bool, end_user: Optional[str] = None):
|
||||||
model_manager = ModelManager()
|
model_manager = ModelManager()
|
||||||
model_instance = model_manager.get_default_model_instance(
|
model_instance = model_manager.get_default_model_instance(
|
||||||
tenant_id=tenant_id,
|
tenant_id=tenant_id,
|
||||||
@@ -59,6 +59,21 @@ class AudioService:
             raise ProviderNotSupportTextToSpeechServiceError()
 
         try:
-            return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
+            return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming, tenant_id=tenant_id, voice=voice)
+        except Exception as e:
+            raise e
+
+    @classmethod
+    def transcript_tts_voices(cls, tenant_id: str, language: str):
+        model_manager = ModelManager()
+        model_instance = model_manager.get_default_model_instance(
+            tenant_id=tenant_id,
+            model_type=ModelType.TTS
+        )
+        if model_instance is None:
+            raise ProviderNotSupportTextToSpeechServiceError()
+
+        try:
+            return model_instance.get_tts_voices(language)
         except Exception as e:
             raise e
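Review note: transcript_tts_voices is the backend half of the new voice picker: it asks the tenant's default TTS model which voices it supports for a language, and transcript_tts now forwards the selected voice (plus tenant_id) into invoke_tts. A minimal usage sketch, assuming tenant_id comes from the calling controller and that voices come back as name/value pairs (the shape the frontend types in this commit expect):

    # Sketch: list voices for a language, then synthesize with one of them.
    voices = AudioService.transcript_tts_voices(tenant_id=tenant_id, language='en-US')
    audio = AudioService.transcript_tts(
        tenant_id=tenant_id,
        text='Hello, world',
        voice=voices[0]['value'] if voices else '',  # assumed shape: [{'name': ..., 'value': ...}]
        streaming=False,
    )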
@@ -16,3 +16,7 @@ class ProviderNotSupportSpeechToTextServiceError(Exception):
 
 class ProviderNotSupportTextToSpeechServiceError(Exception):
     pass
+
+
+class ProviderNotSupportTextToSpeechLanageServiceError(Exception):
+    pass
@@ -2,6 +2,7 @@
 import type { FC, ReactNode } from 'react'
 import React from 'react'
 import cn from 'classnames'
+import ParamsConfig from '@/app/components/app/configuration/config-voice/param-config'
 
 export type IFeaturePanelProps = {
   className?: string
@@ -12,6 +13,7 @@ export type IFeaturePanelProps = {
   isFocus?: boolean
   noBodySpacing?: boolean
   children?: ReactNode
+  isShowTextToSpeech?: boolean
 }
 
 const FeaturePanel: FC<IFeaturePanelProps> = ({
@@ -23,6 +25,7 @@ const FeaturePanel: FC<IFeaturePanelProps> = ({
   isFocus,
   noBodySpacing,
   children,
+  isShowTextToSpeech,
 }) => {
   return (
     <div
@@ -41,7 +44,13 @@ const FeaturePanel: FC<IFeaturePanelProps> = ({
           <div className='text-sm font-semibold text-gray-800'>{title}</div>
         </div>
         <div>
-          {headerRight}
+          {isShowTextToSpeech
+            ? (
+              <div className='flex items-center'>
+                <ParamsConfig/>
+              </div>
+            )
+            : headerRight}
         </div>
       </div>
     </div>
@@ -0,0 +1,187 @@
+'use client'
+import useSWR from 'swr'
+import type { FC } from 'react'
+import { useContext } from 'use-context-selector'
+import React, { Fragment } from 'react'
+import classNames from 'classnames'
+import { usePathname } from 'next/navigation'
+import { useTranslation } from 'react-i18next'
+import { Listbox, Transition } from '@headlessui/react'
+import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
+import type { Item } from '@/app/components/base/select'
+import ConfigContext from '@/context/debug-configuration'
+import { fetchAppVoices } from '@/service/apps'
+import Tooltip from '@/app/components/base/tooltip'
+import { HelpCircle } from '@/app/components/base/icons/src/vender/line/general'
+
+const VoiceParamConfig: FC = () => {
+  const { t } = useTranslation()
+  const pathname = usePathname()
+  const matched = pathname.match(/\/app\/([^/]+)/)
+  const appId = (matched?.length && matched[1]) ? matched[1] : ''
+
+  const LanguageItems = [
+    { value: 'zh-CN', name: '中文' },
+    { value: 'en-US', name: '英语' },
+    { value: 'de-DE', name: '德语' },
+    { value: 'fr-FR', name: '法语' },
+    { value: 'es-ES', name: '西班牙语' },
+    { value: 'it-IT', name: '意大利语' },
+    { value: 'th-TH', name: '泰语' },
+    { value: 'id-ID', name: '印尼语' },
+  ]
+  const {
+    textToSpeechConfig,
+    setTextToSpeechConfig,
+  } = useContext(ConfigContext)
+
+  const languageItem = LanguageItems.find(item => item.value === textToSpeechConfig.language)
+  const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
+
+  const voiceItems = useSWR({ url: `/apps/${appId}/text-to-audio/voices?language=${languageItem ? languageItem.value : 'zh-CN'}` }, fetchAppVoices).data
+  const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
+
+  return (
+    <div>
+      <div>
+        <div className='leading-6 text-base font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.title')}</div>
+        <div className='pt-3 space-y-6'>
+          <div>
+            <div className='mb-2 flex items-center space-x-1'>
+              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]' >
+                {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
+                  <div key={item}>{item}</div>
+                ))}
+              </div>} selector='config-resolution-tooltip'>
+                <HelpCircle className='w-[14px] h-[14px] text-gray-400' />
+              </Tooltip>
+            </div>
+            <Listbox
+              value={languageItem}
+              onChange={(value: Item) => {
+                setTextToSpeechConfig({
+                  ...textToSpeechConfig,
+                  language: String(value.value),
+                })
+              }}
+            >
+              <div className={'relative h-9'}>
+                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>{languageItem?.name ?? localLanguagePlaceholder}</span>
+                  <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
+                    <ChevronDownIcon
+                      className="h-5 w-5 text-gray-400"
+                      aria-hidden="true"
+                    />
+                  </span>
+                </Listbox.Button>
+                <Transition
+                  as={Fragment}
+                  leave="transition ease-in duration-100"
+                  leaveFrom="opacity-100"
+                  leaveTo="opacity-0"
+                >
+
+                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                    {LanguageItems.map((item: Item) => (
+                      <Listbox.Option
+                        key={item.value}
+                        className={({ active }) =>
+                          `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : ''
+                          }`
+                        }
+                        value={item}
+                        disabled={false}
+                      >
+                        {({ /* active, */ selected }) => (
+                          <>
+                            <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
+                            {(selected || item.value === textToSpeechConfig.language) && (
+                              <span
+                                className={classNames(
+                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
+                                )}
+                              >
+                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                              </span>
+                            )}
+                          </>
+                        )}
+                      </Listbox.Option>
+                    ))}
+                  </Listbox.Options>
+                </Transition>
+              </div>
+            </Listbox>
+          </div>
+
+          <div>
+            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <Listbox
+              value={voiceItem}
+              disabled={!languageItem}
+              onChange={(value: Item) => {
+                setTextToSpeechConfig({
+                  ...textToSpeechConfig,
+                  voice: String(value.value),
+                })
+              }}
+            >
+              <div className={'relative h-9'}>
+                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                  <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
+                    <ChevronDownIcon
+                      className="h-5 w-5 text-gray-400"
+                      aria-hidden="true"
+                    />
+                  </span>
+                </Listbox.Button>
+                <Transition
+                  as={Fragment}
+                  leave="transition ease-in duration-100"
+                  leaveFrom="opacity-100"
+                  leaveTo="opacity-0"
+                >
+
+                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                    {voiceItems?.map((item: Item) => (
+                      <Listbox.Option
+                        key={item.value}
+                        className={({ active }) =>
+                          `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : ''
+                          }`
+                        }
+                        value={item}
+                        disabled={false}
+                      >
+                        {({ /* active, */ selected }) => (
+                          <>
+                            <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
+                            {(selected || item.value === textToSpeechConfig.voice) && (
+                              <span
+                                className={classNames(
+                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
+                                )}
+                              >
+                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                              </span>
+                            )}
+                          </>
+                        )}
+                      </Listbox.Option>
+                    ))}
+                  </Listbox.Options>
+                </Transition>
+              </div>
+            </Listbox>
+          </div>
+        </div>
+      </div>
+    </div>
+  )
+}
+
+export default React.memo(VoiceParamConfig)
@@ -0,0 +1,41 @@
+'use client'
+import type { FC } from 'react'
+import { memo, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import cn from 'classnames'
+import VoiceParamConfig from './param-config-content'
+import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
+import {
+  PortalToFollowElem,
+  PortalToFollowElemContent,
+  PortalToFollowElemTrigger,
+} from '@/app/components/base/portal-to-follow-elem'
+
+const ParamsConfig: FC = () => {
+  const { t } = useTranslation()
+  const [open, setOpen] = useState(false)
+
+  return (
+    <PortalToFollowElem
+      open={open}
+      onOpenChange={setOpen}
+      placement='bottom-end'
+      offset={{
+        mainAxis: 4,
+      }}
+    >
+      <PortalToFollowElemTrigger onClick={() => setOpen(v => !v)}>
+        <div className={cn('flex items-center rounded-md h-7 px-3 space-x-1 text-gray-700 cursor-pointer hover:bg-gray-200', open && 'bg-gray-200')}>
+          <Settings01 className='w-3.5 h-3.5 ' />
+          <div className='ml-1 leading-[18px] text-xs font-medium '>{t('appDebug.voice.settings')}</div>
+        </div>
+      </PortalToFollowElemTrigger>
+      <PortalToFollowElemContent style={{ zIndex: 50 }}>
+        <div className='w-80 sm:w-[412px] p-4 bg-white rounded-lg border-[0.5px] border-gray-200 shadow-lg space-y-3'>
+          <VoiceParamConfig />
+        </div>
+      </PortalToFollowElemContent>
+    </PortalToFollowElem>
+  )
+}
+export default memo(ParamsConfig)
@@ -119,6 +119,8 @@ const Config: FC = () => {
     setTextToSpeech: (value) => {
       setTextToSpeechConfig(produce(textToSpeechConfig, (draft: TextToSpeechConfig) => {
        draft.enabled = value
+        draft.voice = textToSpeechConfig?.voice
+        draft.language = textToSpeechConfig?.language
      }))
     },
     citation: citationConfig.enabled,
@@ -245,6 +247,7 @@ const Config: FC = () => {
               {(isAgent && isChatApp) && (
                 <AgentTools />
               )}
 
               <ConfigVision />
+
               {/* Chat History */}
@@ -61,6 +61,11 @@ const TextGenerationItem: FC<TextGenerationItemProps> = ({
     sensitive_word_avoidance: moderationConfig,
     external_data_tools: externalDataToolsConfig,
     more_like_this: moreLikeThisConfig,
+    text_to_speech: {
+      enabled: false,
+      voice: '',
+      language: '',
+    },
     agent_mode: {
       enabled: false,
       tools: [],
@@ -213,9 +213,6 @@ const Debug: FC<IDebug> = ({
     const contextVar = modelConfig.configs.prompt_variables.find(item => item.is_context_var)?.key
 
     const postModelConfig: BackendModelConfig = {
-      text_to_speech: {
-        enabled: false,
-      },
       pre_prompt: !isAdvancedMode ? modelConfig.configs.prompt_template : '',
       prompt_type: promptMode,
       chat_prompt_config: {},
@@ -234,6 +231,11 @@ const Debug: FC<IDebug> = ({
         mode: modelConfig.mode,
         completion_params: completionParams as any,
       },
+      text_to_speech: {
+        enabled: false,
+        voice: '',
+        language: '',
+      },
       agent_mode: {
         enabled: false,
         tools: [],
@@ -19,6 +19,7 @@ const TextToSpeech: FC = () => {
         <div className='text-xs text-gray-500'>{t('appDebug.feature.textToSpeech.resDes')}</div>
       }
       noBodySpacing
+      isShowTextToSpeech={true}
     />
   )
 }
@@ -30,6 +30,7 @@ import type {
   MoreLikeThisConfig,
   PromptConfig,
   PromptVariable,
+  TextToSpeechConfig,
 } from '@/models/debug'
 import type { ExternalDataTool } from '@/models/common'
 import type { DataSet } from '@/models/datasets'
@@ -98,8 +99,10 @@ const Configuration: FC = () => {
   const [speechToTextConfig, setSpeechToTextConfig] = useState<MoreLikeThisConfig>({
     enabled: false,
   })
-  const [textToSpeechConfig, setTextToSpeechConfig] = useState<MoreLikeThisConfig>({
+  const [textToSpeechConfig, setTextToSpeechConfig] = useState<TextToSpeechConfig>({
     enabled: false,
+    voice: '',
+    language: '',
   })
   const [citationConfig, setCitationConfig] = useState<MoreLikeThisConfig>({
     enabled: false,
@@ -246,6 +249,8 @@ const Configuration: FC = () => {
     })
     setTextToSpeechConfig(modelConfig.text_to_speech || {
       enabled: false,
+      voice: '',
+      language: '',
     })
     setCitationConfig(modelConfig.retriever_resource || {
       enabled: false,
@@ -73,7 +73,8 @@ const Operation: FC<OperationProps> = ({
           />
         )
       }
-      {!isOpeningStatement && config?.text_to_speech && (
+      {(!isOpeningStatement && config?.text_to_speech.enabled) && (
         <AudioBtn
           value={content}
           className='hidden group-hover:block'
@@ -156,6 +156,8 @@ const DebugConfigurationContext = createContext<IDebugConfiguration>({
   setSpeechToTextConfig: () => { },
   textToSpeechConfig: {
     enabled: false,
+    voice: '',
+    language: '',
   },
   setTextToSpeechConfig: () => { },
   citationConfig: {
@@ -298,6 +298,17 @@ const translation = {
       uploadLimit: 'Upload Limit',
     },
   },
+  voice: {
+    name: 'Voice',
+    description: 'Text to speech voice Settings',
+    settings: 'Settings',
+    voiceSettings: {
+      title: 'Voice Settings',
+      language: 'Language',
+      resolutionTooltip: 'Text-to-speech voice support language。',
+      voice: 'Voice',
+    },
+  },
   openingStatement: {
     title: 'Conversation Opener',
     add: 'Add',
@@ -294,6 +294,17 @@ const translation = {
       uploadLimit: '上传数量限制',
     },
   },
+  voice: {
+    name: '音色',
+    description: '文本转语音音色设置',
+    settings: '设置',
+    voiceSettings: {
+      title: '音色设置',
+      language: '语言',
+      resolutionTooltip: '文本转语音音色支持语言。',
+      voice: '音色',
+    },
+  },
   openingStatement: {
     title: '对话开场白',
     add: '添加开场白',
@@ -122,3 +122,8 @@ export type UpdateOpenAIKeyResponse = ValidateOpenAIKeyResponse
 export type GenerationIntroductionResponse = {
   introduction: string
 }
+
+export type AppVoicesListResponse = [{
+  name: string
+  value: string
+}]
@@ -75,7 +75,11 @@ export type SuggestedQuestionsAfterAnswerConfig = MoreLikeThisConfig
 
 export type SpeechToTextConfig = MoreLikeThisConfig
 
-export type TextToSpeechConfig = MoreLikeThisConfig
+export type TextToSpeechConfig = {
+  enabled: boolean
+  voice?: string
+  language?: string
+}
 
 export type CitationConfig = MoreLikeThisConfig
 
@@ -1,6 +1,6 @@
 import type { Fetcher } from 'swr'
 import { del, get, post } from './base'
-import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app'
+import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, AppVoicesListResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app'
 import type { CommonResponse } from '@/models/common'
 import type { AppMode, ModelConfig } from '@/types/app'
 
@@ -93,3 +93,7 @@ export const updateOpenAIKey: Fetcher<UpdateOpenAIKeyResponse, { url: string; bo
 export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { url: string; body: { prompt_template: string } }> = ({ url, body }) => {
   return post<GenerationIntroductionResponse>(url, { body })
 }
+
+export const fetchAppVoices: Fetcher<AppVoicesListResponse, { url: string }> = ({ url }) => {
+  return get<AppVoicesListResponse>(url)
+}
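Review note: fetchAppVoices pairs with the console endpoint the voice picker queries, /apps/{appId}/text-to-audio/voices?language=..., which returns the AppVoicesListResponse shape defined above. The same contract exercised directly, as a hedged Python sketch (base_url, the /console/api prefix, and bearer auth are assumptions for illustration, not part of this commit):

    # Hedged sketch of the voices endpoint contract; URL prefix and
    # auth scheme are assumptions made for illustration only.
    import requests

    def fetch_app_voices(base_url: str, token: str, app_id: str, language: str) -> list:
        resp = requests.get(
            f'{base_url}/console/api/apps/{app_id}/text-to-audio/voices',
            params={'language': language},
            headers={'Authorization': f'Bearer {token}'},
        )
        resp.raise_for_status()
        return resp.json()  # expected: [{'name': ..., 'value': ...}]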
@@ -155,6 +155,8 @@ export type ModelConfig = {
   }
   text_to_speech: {
     enabled: boolean
+    voice?: string
+    language?: string
   }
   retriever_resource: {
     enabled: boolean