diff --git a/api/controllers/console/app/audio.py b/api/controllers/console/app/audio.py index d95b3d03c2..a401763776 100644 --- a/api/controllers/console/app/audio.py +++ b/api/controllers/console/app/audio.py @@ -1,7 +1,7 @@ import logging from flask import request -from flask_restful import Resource +from flask_restful import Resource, reqparse from werkzeug.exceptions import InternalServerError import services @@ -23,6 +23,7 @@ from controllers.console.wraps import account_initialization_required from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError from core.model_runtime.errors.invoke import InvokeError from libs.login import login_required +from models.model import AppModelConfig from services.audio_service import AudioService from services.errors.audio import ( AudioTooLargeServiceError, @@ -45,7 +46,9 @@ class ChatMessageAudioApi(Resource): try: response = AudioService.transcript_asr( tenant_id=app_model.tenant_id, - file=file + file=file, + end_user=None, + prompt=app_model.app_model_config.pre_prompt ) return response @@ -71,7 +74,7 @@ class ChatMessageAudioApi(Resource): except ValueError as e: raise e except Exception as e: - logging.exception("internal server error.") + logging.exception(f"internal server error: {str(e)}") raise InternalServerError() @@ -82,10 +85,17 @@ class ChatMessageTextApi(Resource): def post(self, app_id): app_id = str(app_id) app_model = _get_app(app_id, None) + + app_model_config: AppModelConfig = app_model.app_model_config + + if not app_model_config.text_to_speech_dict['enabled']: + raise AppUnavailableError() + try: response = AudioService.transcript_tts( tenant_id=app_model.tenant_id, text=request.form['text'], + voice=app_model.app_model_config.text_to_speech_dict.get('voice'), streaming=False ) @@ -112,9 +122,54 @@ class ChatMessageTextApi(Resource): except ValueError as e: raise e except Exception as e: - logging.exception("internal server error.") + logging.exception(f"internal server error: {str(e)}") + raise InternalServerError() + + +class TextModesApi(Resource): + def get(self, app_id: str): + app_model = _get_app(str(app_id)) + app_model_config: AppModelConfig = app_model.app_model_config + + if not app_model_config.text_to_speech_dict['enabled']: + raise AppUnavailableError() + + try: + parser = reqparse.RequestParser() + parser.add_argument('language', type=str, required=True, location='args') + args = parser.parse_args() + + response = AudioService.transcript_tts_voices( + tenant_id=app_model.tenant_id, + language=args['language'], + ) + + return response + except services.errors.audio.ProviderNotSupportTextToSpeechLanguageServiceError: + raise AppUnavailableError("Text to audio voices language parameter is required.") + except NoAudioUploadedServiceError: + raise NoAudioUploadedError() + except AudioTooLargeServiceError as e: + raise AudioTooLargeError(str(e)) + except UnsupportedAudioTypeServiceError: + raise UnsupportedAudioTypeError() + except ProviderNotSupportSpeechToTextServiceError: + raise ProviderNotSupportSpeechToTextError() + except ProviderTokenNotInitError as ex: + raise ProviderNotInitializeError(ex.description) + except QuotaExceededError: + raise ProviderQuotaExceededError() + except ModelCurrentlyNotSupportError: + raise ProviderModelCurrentlyNotSupportError() + except InvokeError as e: + raise CompletionRequestError(e.description) + except ValueError as e: + raise e + except Exception as e: + logging.exception(f"internal server error: {str(e)}") + raise
InternalServerError() api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text') api.add_resource(ChatMessageTextApi, '/apps/<uuid:app_id>/text-to-audio') +api.add_resource(TextModesApi, '/apps/<uuid:app_id>/text-to-audio/voices') diff --git a/api/controllers/console/explore/audio.py b/api/controllers/console/explore/audio.py index d6afee0d63..f957d38174 100644 --- a/api/controllers/console/explore/audio.py +++ b/api/controllers/console/explore/audio.py @@ -85,6 +85,7 @@ class ChatTextApi(InstalledAppResource): response = AudioService.transcript_tts( tenant_id=app_model.tenant_id, text=request.form['text'], + voice=app_model.app_model_config.text_to_speech_dict.get('voice'), streaming=False ) return {'data': response.data.decode('latin1')} diff --git a/api/controllers/service_api/app/audio.py b/api/controllers/service_api/app/audio.py index 574fc55454..d2906b1d6e 100644 --- a/api/controllers/service_api/app/audio.py +++ b/api/controllers/service_api/app/audio.py @@ -86,6 +86,7 @@ class TextApi(AppApiResource): tenant_id=app_model.tenant_id, text=args['text'], end_user=args['user'], + voice=app_model.app_model_config.text_to_speech_dict.get('voice'), streaming=args['streaming'] ) diff --git a/api/controllers/web/audio.py b/api/controllers/web/audio.py index 673aa9ad8c..c628c16606 100644 --- a/api/controllers/web/audio.py +++ b/api/controllers/web/audio.py @@ -68,17 +68,23 @@ class AudioApi(WebApiResource): except ValueError as e: raise e except Exception as e: - logging.exception("internal server error.") + logging.exception(f"internal server error: {str(e)}") raise InternalServerError() class TextApi(WebApiResource): def post(self, app_model: App, end_user): + app_model_config: AppModelConfig = app_model.app_model_config + + if not app_model_config.text_to_speech_dict['enabled']: + raise AppUnavailableError() + try: response = AudioService.transcript_tts( tenant_id=app_model.tenant_id, text=request.form['text'], end_user=end_user.external_user_id, + voice=app_model.app_model_config.text_to_speech_dict.get('voice'), streaming=False ) @@ -105,7 +111,7 @@ class TextApi(WebApiResource): except ValueError as e: raise e except Exception as e: - logging.exception("internal server error.") + logging.exception(f"internal server error: {str(e)}") raise InternalServerError() diff --git a/api/core/application_manager.py b/api/core/application_manager.py index d2f4326b4f..e073eac4b9 100644 --- a/api/core/application_manager.py +++ b/api/core/application_manager.py @@ -28,6 +28,7 @@ from core.entities.application_entities import ( ModelConfigEntity, PromptTemplateEntity, SensitiveWordAvoidanceEntity, + TextToSpeechEntity, ) from core.entities.model_entities import ModelStatus from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError @@ -572,7 +573,11 @@ class ApplicationManager: text_to_speech_dict = copy_app_model_config_dict.get('text_to_speech') if text_to_speech_dict: if 'enabled' in text_to_speech_dict and text_to_speech_dict['enabled']: - properties['text_to_speech'] = True + properties['text_to_speech'] = TextToSpeechEntity( + enabled=text_to_speech_dict.get('enabled'), + voice=text_to_speech_dict.get('voice'), + language=text_to_speech_dict.get('language'), + ) # sensitive word avoidance sensitive_word_avoidance_dict = copy_app_model_config_dict.get('sensitive_word_avoidance') diff --git a/api/core/entities/application_entities.py b/api/core/entities/application_entities.py index d26998ce80..abcf605c92 100644 --- a/api/core/entities/application_entities.py +++
b/api/core/entities/application_entities.py @@ -42,6 +42,7 @@ class AdvancedCompletionPromptTemplateEntity(BaseModel): """ Advanced Completion Prompt Template Entity. """ + class RolePrefixEntity(BaseModel): """ Role Prefix Entity. @@ -57,6 +58,7 @@ class PromptTemplateEntity(BaseModel): """ Prompt Template Entity. """ + class PromptType(Enum): """ Prompt Type. @@ -97,6 +99,7 @@ class DatasetRetrieveConfigEntity(BaseModel): """ Dataset Retrieve Config Entity. """ + class RetrieveStrategy(Enum): """ Dataset Retrieve Strategy. @@ -143,6 +146,15 @@ class SensitiveWordAvoidanceEntity(BaseModel): config: dict[str, Any] = {} +class TextToSpeechEntity(BaseModel): + """ + Text To Speech Entity. + """ + enabled: bool + voice: Optional[str] = None + language: Optional[str] = None + + class FileUploadEntity(BaseModel): """ File Upload Entity. @@ -159,6 +171,7 @@ class AgentToolEntity(BaseModel): tool_name: str tool_parameters: dict[str, Any] = {} + class AgentPromptEntity(BaseModel): """ Agent Prompt Entity. @@ -166,6 +179,7 @@ class AgentPromptEntity(BaseModel): first_prompt: str next_iteration: str + class AgentScratchpadUnit(BaseModel): """ Agent First Prompt Entity. @@ -182,12 +196,14 @@ class AgentScratchpadUnit(BaseModel): thought: Optional[str] = None action_str: Optional[str] = None observation: Optional[str] = None - action: Optional[Action] = None + action: Optional[Action] = None + class AgentEntity(BaseModel): """ Agent Entity. """ + class Strategy(Enum): """ Agent Strategy. @@ -202,6 +218,7 @@ class AgentEntity(BaseModel): tools: list[AgentToolEntity] = None max_iteration: int = 5 + class AppOrchestrationConfigEntity(BaseModel): """ App Orchestration Config Entity. @@ -219,7 +236,7 @@ class AppOrchestrationConfigEntity(BaseModel): show_retrieve_source: bool = False more_like_this: bool = False speech_to_text: bool = False - text_to_speech: bool = False + text_to_speech: dict = {} sensitive_word_avoidance: Optional[SensitiveWordAvoidanceEntity] = None diff --git a/api/core/model_manager.py b/api/core/model_manager.py index 8e36ab7ee8..aa16cf866f 100644 --- a/api/core/model_manager.py +++ b/api/core/model_manager.py @@ -99,7 +99,8 @@ class ModelInstance: user=user ) - def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, top_n: Optional[int] = None, + def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, + top_n: Optional[int] = None, user: Optional[str] = None) \ -> RerankResult: """ @@ -166,13 +167,15 @@ class ModelInstance: user=user ) - def invoke_tts(self, content_text: str, streaming: bool, user: Optional[str] = None) \ + def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \ -> str: """ - Invoke large language model + Invoke tts model :param content_text: text content to be translated + :param tenant_id: user tenant id :param user: unique user id + :param voice: model timbre :param streaming: output is streaming :return: text for given audio file """ @@ -185,9 +188,28 @@ class ModelInstance: credentials=self.credentials, content_text=content_text, user=user, + tenant_id=tenant_id, + voice=voice, streaming=streaming ) + def get_tts_voices(self, language: str) -> list: + """ + Get tts model voices for the given language + + :param language: tts language + :return: tts model voices + """ + if not isinstance(self.model_type_instance, TTSModel): + raise Exception("Model type instance is not TTSModel") + + self.model_type_instance
= cast(TTSModel, self.model_type_instance) + return self.model_type_instance.get_tts_model_voices( + model=self.model, + credentials=self.credentials, + language=language + ) + class ModelManager: def __init__(self) -> None: diff --git a/api/core/model_runtime/docs/en_US/schema.md b/api/core/model_runtime/docs/en_US/schema.md index 9606579e1c..61cd2c32d4 100644 --- a/api/core/model_runtime/docs/en_US/schema.md +++ b/api/core/model_runtime/docs/en_US/schema.md @@ -48,6 +48,10 @@ - `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`) - `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`) - `default_voice` (string) default voice, e.g.:alloy,echo,fable,onyx,nova,shimmer(available for model type `tts`) + - `voices` (list) List of available voices. (available for model type `tts`) + - `mode` (string) Voice identifier, used as the voice value. (available for model type `tts`) + - `name` (string) Voice display name. (available for model type `tts`) + - `language` (string) Languages supported by the voice. (available for model type `tts`) - `word_limit` (int) Single conversion word limit, paragraphwise by default(available for model type `tts`) - `audio_type` (string) Support audio file extension format, e.g.:mp3,wav(available for model type `tts`) - `max_workers` (int) Number of concurrent workers supporting text and audio conversion(available for model type`tts`) diff --git a/api/core/model_runtime/docs/zh_Hans/schema.md b/api/core/model_runtime/docs/zh_Hans/schema.md index 1eab541d24..55202a1a80 100644 --- a/api/core/model_runtime/docs/zh_Hans/schema.md +++ b/api/core/model_runtime/docs/zh_Hans/schema.md @@ -48,7 +48,11 @@ - `max_chunks` (int) 最大分块数量 (模型类型 `text-embedding ` `moderation` 可用) - `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用) - `supported_file_extensions` (string) 支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用) - - `default_voice` (string) 缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用) + - `default_voice` (string) 缺省音色,必选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用) + - `voices` (list) 可选音色列表。(模型类型 `tts` 可用) + - `mode` (string) 音色标识。(模型类型 `tts` 可用) + - `name` (string) 音色显示名称。(模型类型 `tts` 可用) + - `language` (string) 音色支持的语言。(模型类型 `tts` 可用) - `word_limit` (int) 单次转换字数限制,默认按段落分段(模型类型 `tts` 可用) - `audio_type` (string) 支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用) - `max_workers` (int) 支持文字音频转换并发任务数(模型类型 `tts` 可用) diff --git a/api/core/model_runtime/entities/model_entities.py b/api/core/model_runtime/entities/model_entities.py index ebde3ec85b..e35be27f86 100644 --- a/api/core/model_runtime/entities/model_entities.py +++ b/api/core/model_runtime/entities/model_entities.py @@ -127,6 +127,7 @@ class ModelPropertyKey(Enum): SUPPORTED_FILE_EXTENSIONS = "supported_file_extensions" MAX_CHARACTERS_PER_CHUNK = "max_characters_per_chunk" DEFAULT_VOICE = "default_voice" + VOICES = "voices" WORD_LIMIT = "word_limit" AUDOI_TYPE = "audio_type" MAX_WORKERS = "max_workers" diff --git a/api/core/model_runtime/model_providers/__base/tts_model.py b/api/core/model_runtime/model_providers/__base/tts_model.py index ff20cf7b9f..77be02978c 100644 --- a/api/core/model_runtime/model_providers/__base/tts_model.py +++ b/api/core/model_runtime/model_providers/__base/tts_model.py @@ -15,29 +15,37 @@ class TTSModel(AIModel): """ model_type: ModelType = ModelType.TTS - def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): + def invoke(self, model: str, tenant_id:
str, credentials: dict, content_text: str, voice: str, streaming: bool, + user: Optional[str] = None): """ Invoke large language model :param model: model name + :param tenant_id: user tenant id :param credentials: model credentials + :param voice: model timbre :param content_text: text content to be translated :param streaming: output is streaming :param user: unique user id :return: translated audio file """ try: - return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text) + self._is_ffmpeg_installed() + return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, + content_text=content_text, voice=voice, tenant_id=tenant_id) except Exception as e: raise self._transform_invoke_error(e) @abstractmethod - def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): + def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, + user: Optional[str] = None): """ Invoke large language model :param model: model name + :param tenant_id: user tenant id :param credentials: model credentials + :param voice: model timbre :param content_text: text content to be translated :param streaming: output is streaming :param user: unique user id @@ -45,7 +53,22 @@ class TTSModel(AIModel): """ raise NotImplementedError - def _get_model_voice(self, model: str, credentials: dict) -> any: + def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list: + """ + Get voices for the given tts model + + :param language: tts language + :param model: model name + :param credentials: model credentials + :return: voices list + """ + model_schema = self.get_model_schema(model, credentials) + + if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties: + voices = model_schema.model_properties[ModelPropertyKey.VOICES] + return [{'name': d['name'], 'value': d['mode']} for d in voices if language and language in d.get('language')] + + return [] + + def _get_model_default_voice(self, model: str, credentials: dict) -> any: """ Get voice for given tts model diff --git a/api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml b/api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml index aa7ed537a4..bc3c44beb7 100644 --- a/api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml +++ b/api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml @@ -1,7 +1,31 @@ model: tts-1-hd model_type: tts model_properties: default_voice: 'alloy' + voices: + - mode: 'alloy' + name: 'Alloy' + language: ['zh-CN', 'en-US'] + - mode: 'echo' + name: 'Echo' + language: ['zh-CN', 'en-US'] + - mode: 'fable' + name: 'Fable' + language: ['zh-CN', 'en-US'] + - mode: 'onyx' + name: 'Onyx' + language: ['zh-CN', 'en-US'] + - mode: 'nova' + name: 'Nova' + language: ['zh-CN', 'en-US'] + - mode: 'shimmer' + name: 'Shimmer' + language: ['zh-CN', 'en-US'] word_limit: 120 audio_type: 'mp3' max_workers: 5 +pricing: + input: '0.03' + output: '0' + unit: '0.001' + currency: USD diff --git a/api/core/model_runtime/model_providers/openai/tts/tts-1.yaml b/api/core/model_runtime/model_providers/openai/tts/tts-1.yaml index 96f54a7340..e52602d1a1 100644 --- a/api/core/model_runtime/model_providers/openai/tts/tts-1.yaml +++ b/api/core/model_runtime/model_providers/openai/tts/tts-1.yaml @@ -2,6 +2,30 @@ model: tts-1 model_type: tts model_properties: default_voice: 'alloy' + voices: + - mode: 'alloy' + name:
'Alloy' + language: ['zh-CN', 'en-US'] + - mode: 'echo' + name: 'Echo' + language: ['zh-CN', 'en-US'] + - mode: 'fable' + name: 'Fable' + language: ['zh-CN', 'en-US'] + - mode: 'onyx' + name: 'Onyx' + language: ['zh-CN', 'en-US'] + - mode: 'nova' + name: 'Nova' + language: ['zh-CN', 'en-US'] + - mode: 'shimmer' + name: 'Shimmer' + language: ['zh-CN', 'en-US'] word_limit: 120 audio_type: 'mp3' max_workers: 5 +pricing: + input: '0.015' + output: '0' + unit: '0.001' + currency: USD diff --git a/api/core/model_runtime/model_providers/openai/tts/tts.py b/api/core/model_runtime/model_providers/openai/tts/tts.py index b3e66c1223..269760ab64 100644 --- a/api/core/model_runtime/model_providers/openai/tts/tts.py +++ b/api/core/model_runtime/model_providers/openai/tts/tts.py @@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError from core.model_runtime.errors.validate import CredentialsValidateFailedError from core.model_runtime.model_providers.__base.tts_model import TTSModel from core.model_runtime.model_providers.openai._common import _CommonOpenAI +from extensions.ext_storage import storage class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): """ Model class for OpenAI Speech to text model. """ - def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any: + + def _invoke(self, model: str, tenant_id: str, credentials: dict, + content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any: """ _invoke text2speech model :param model: model name + :param tenant_id: user tenant id :param credentials: model credentials :param content_text: text content to be translated + :param voice: model timbre :param streaming: output is streaming :param user: unique user id :return: text translated to audio file """ - self._is_ffmpeg_installed() audio_type = self._get_model_audio_type(model, credentials) + if not voice: + voice = self._get_model_default_voice(model, credentials) if streaming: return Response(stream_with_context(self._tts_invoke_streaming(model=model, credentials=credentials, content_text=content_text, - user=user)), + tenant_id=tenant_id, + voice=voice)), status=200, mimetype=f'audio/{audio_type}') else: - return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user) + return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice) def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: """ @@ -52,91 +59,96 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel): self._tts_invoke( model=model, credentials=credentials, - content_text='Hello world!', - user=user + content_text='Hello Dify!', + voice=self._get_model_default_voice(model, credentials), ) except Exception as ex: raise CredentialsValidateFailedError(str(ex)) - def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response: + def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response: """ _tts_invoke text2speech model :param model: model name :param credentials: model credentials :param content_text: text content to be translated - :param user: unique user id + :param voice: model timbre :return: text translated to audio file """ audio_type = self._get_model_audio_type(model, credentials) word_limit = self._get_model_word_limit(model, credentials) max_workers = self._get_model_workers_limit(model, credentials) - try: 
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) audio_bytes_list = list() # Create a thread pool and map the function to the list of sentences with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(self._process_sentence, sentence, model, credentials) for sentence - in sentences] + futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice, + credentials=credentials) for sentence in sentences] for future in futures: try: - audio_bytes_list.append(future.result()) + if future.result(): + audio_bytes_list.append(future.result()) except Exception as ex: raise InvokeBadRequestError(str(ex)) - audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in - audio_bytes_list if audio_bytes] - combined_segment = reduce(lambda x, y: x + y, audio_segments) - buffer: BytesIO = BytesIO() - combined_segment.export(buffer, format=audio_type) - buffer.seek(0) - return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") + if len(audio_bytes_list) > 0: + audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in + audio_bytes_list if audio_bytes] + combined_segment = reduce(lambda x, y: x + y, audio_segments) + buffer: BytesIO = BytesIO() + combined_segment.export(buffer, format=audio_type) + buffer.seek(0) + return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") except Exception as ex: raise InvokeBadRequestError(str(ex)) # Todo: To improve the streaming function - def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any: + def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str, + voice: str) -> any: """ _tts_invoke_streaming text2speech model :param model: model name + :param tenant_id: user tenant id :param credentials: model credentials :param content_text: text content to be translated - :param user: unique user id + :param voice: model timbre :return: text translated to audio file """ # transform credentials to kwargs for model instance credentials_kwargs = self._to_credential_kwargs(credentials) - voice_name = self._get_model_voice(model, credentials) + if not voice: + voice = self._get_model_default_voice(model, credentials) word_limit = self._get_model_word_limit(model, credentials) audio_type = self._get_model_audio_type(model, credentials) tts_file_id = self._get_file_name(content_text) - file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}' + file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}' try: client = OpenAI(**credentials_kwargs) sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) for sentence in sentences: - response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip()) - response.stream_to_file(file_path) + response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip()) + storage.save(file_path, response.read()) except Exception as ex: raise InvokeBadRequestError(str(ex)) - def _process_sentence(self, sentence: str, model: str, credentials: dict): + def _process_sentence(self, sentence: str, model: str, + voice: str, credentials: dict): """ _tts_invoke openai text2speech model api :param model: model name :param credentials: model credentials + :param voice: model timbre :param
sentence: text content to be translated :return: text translated to audio file """ # transform credentials to kwargs for model instance credentials_kwargs = self._to_credential_kwargs(credentials) - voice_name = self._get_model_voice(model, credentials) - client = OpenAI(**credentials_kwargs) - response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip()) + response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip()) if isinstance(response.read(), bytes): return response.read() diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml index 8746fb9f02..7627dfd0be 100644 --- a/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml +++ b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml @@ -1,7 +1,134 @@ model: tts-1 model_type: tts model_properties: - default_voice: 'sambert-zhiru-v1' # 音色参考 https://help.aliyun.com/zh/dashscope/model-list 配置 + default_voice: 'sambert-zhiru-v1' + voices: + - mode: "sambert-zhinan-v1" + name: "知楠(广告男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiqi-v1" + name: "知琪(温柔女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhichu-v1" + name: "知厨(新闻播报)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhide-v1" + name: "知德(新闻男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhijia-v1" + name: "知佳(标准女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiru-v1" + name: "知茹(新闻女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiqian-v1" + name: "知倩(配音解说、新闻播报)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhixiang-v1" + name: "知祥(配音解说)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiwei-v1" + name: "知薇(萝莉女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhihao-v1" + name: "知浩(咨询男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhijing-v1" + name: "知婧(严厉女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiming-v1" + name: "知茗(诙谐男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhimo-v1" + name: "知墨(情感男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhina-v1" + name: "知娜(浙普女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhishu-v1" + name: "知树(资讯男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhistella-v1" + name: "知莎(知性女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiting-v1" + name: "知婷(电台女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhixiao-v1" + name: "知笑(资讯女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiya-v1" + name: "知雅(严厉女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiye-v1" + name: "知晔(青年男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiying-v1" + name: "知颖(软萌童声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhiyuan-v1" + name: "知媛(知心姐姐)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhigui-v1" + name: "知柜(直播女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhishuo-v1" + name: "知硕(自然男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhimiao-emo-v1" + name: "知妙(多种情感女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhimao-v1" + name: "知猫(直播女声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhilun-v1" + name: "知伦(悬疑解说)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhifei-v1" + name: "知飞(激昂解说)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-zhida-v1" + name: "知达(标准男声)" + language: [ "zh-CN", "en-US" ] + - mode: "sambert-camila-v1" + name: "Camila(西班牙语女声)" + language: [ "es-ES" ] + - mode: 
"sambert-perla-v1" + name: "Perla(意大利语女声)" + language: [ "it-IT" ] + - mode: "sambert-indah-v1" + name: "Indah(印尼语女声)" + language: [ "id-ID" ] + - mode: "sambert-clara-v1" + name: "Clara(法语女声)" + language: [ "fr-FR" ] + - mode: "sambert-hanna-v1" + name: "Hanna(德语女声)" + language: [ "de-DE" ] + - mode: "sambert-beth-v1" + name: "Beth(咨询女声)" + language: [ "en-US" ] + - mode: "sambert-betty-v1" + name: "Betty(客服女声)" + language: [ "en-US" ] + - mode: "sambert-cally-v1" + name: "Cally(自然女声)" + language: [ "en-US" ] + - mode: "sambert-cindy-v1" + name: "Cindy(对话女声)" + language: [ "en-US" ] + - mode: "sambert-eva-v1" + name: "Eva(陪伴女声)" + language: [ "en-US" ] + - mode: "sambert-donna-v1" + name: "Donna(教育女声)" + language: [ "en-US" ] + - mode: "sambert-brian-v1" + name: "Brian(客服男声)" + language: [ "en-US" ] + - mode: "sambert-waan-v1" + name: "Waan(泰语女声)" + language: [ "th-TH" ] word_limit: 120 audio_type: 'mp3' max_workers: 5 diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts.py b/api/core/model_runtime/model_providers/tongyi/tts/tts.py index 3e1608944b..1b670baff7 100644 --- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py +++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py @@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError from core.model_runtime.errors.validate import CredentialsValidateFailedError from core.model_runtime.model_providers.__base.tts_model import TTSModel from core.model_runtime.model_providers.tongyi._common import _CommonTongyi +from extensions.ext_storage import storage class TongyiText2SpeechModel(_CommonTongyi, TTSModel): """ Model class for Tongyi Speech to text model. """ - def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any: + + def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool, + user: Optional[str] = None) -> any: """ _invoke text2speech model :param model: model name + :param tenant_id: user tenant id :param credentials: model credentials + :param voice: model timbre :param content_text: text content to be translated :param streaming: output is streaming :param user: unique user id :return: text translated to audio file """ - self._is_ffmpeg_installed() audio_type = self._get_model_audio_type(model, credentials) + if not voice: + voice = self._get_model_default_voice(model, credentials) if streaming: return Response(stream_with_context(self._tts_invoke_streaming(model=model, credentials=credentials, content_text=content_text, - user=user)), + voice=voice, + tenant_id=tenant_id)), status=200, mimetype=f'audio/{audio_type}') else: - return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user) + return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice) def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None: """ @@ -52,91 +59,96 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel): self._tts_invoke( model=model, credentials=credentials, - content_text='Hello world!', - user=user + content_text='Hello Dify!', + voice=self._get_model_default_voice(model, credentials), ) except Exception as ex: raise CredentialsValidateFailedError(str(ex)) - def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response: + def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> 
Response: """ _tts_invoke text2speech model :param model: model name :param credentials: model credentials + :param voice: model timbre :param content_text: text content to be translated - :param user: unique user id :return: text translated to audio file """ audio_type = self._get_model_audio_type(model, credentials) word_limit = self._get_model_word_limit(model, credentials) max_workers = self._get_model_workers_limit(model, credentials) - try: sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) audio_bytes_list = list() # Create a thread pool and map the function to the list of sentences with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: - futures = [executor.submit(self._process_sentence, model=model, sentence=sentence, - credentials=credentials, audio_type=audio_type) for sentence in sentences] + futures = [executor.submit(self._process_sentence, sentence=sentence, + credentials=credentials, voice=voice, audio_type=audio_type) for sentence in + sentences] for future in futures: try: - audio_bytes_list.append(future.result()) + if future.result(): + audio_bytes_list.append(future.result()) except Exception as ex: raise InvokeBadRequestError(str(ex)) - audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in - audio_bytes_list if audio_bytes] - combined_segment = reduce(lambda x, y: x + y, audio_segments) - buffer: BytesIO = BytesIO() - combined_segment.export(buffer, format=audio_type) - buffer.seek(0) - return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") + if len(audio_bytes_list) > 0: + audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in + audio_bytes_list if audio_bytes] + combined_segment = reduce(lambda x, y: x + y, audio_segments) + buffer: BytesIO = BytesIO() + combined_segment.export(buffer, format=audio_type) + buffer.seek(0) + return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}") except Exception as ex: raise InvokeBadRequestError(str(ex)) # Todo: To improve the streaming function - def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any: + def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str, + voice: str) -> any: """ _tts_invoke_streaming text2speech model :param model: model name + :param tenant_id: user tenant id :param credentials: model credentials + :param voice: model timbre :param content_text: text content to be translated - :param user: unique user id :return: text translated to audio file """ - # transform credentials to kwargs for model instance dashscope.api_key = credentials.get('dashscope_api_key') - voice_name = self._get_model_voice(model, credentials) word_limit = self._get_model_word_limit(model, credentials) audio_type = self._get_model_audio_type(model, credentials) + tts_file_id = self._get_file_name(content_text) + file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}' try: sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit)) for sentence in sentences: - response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), + response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000, + text=sentence.strip(), format=audio_type, word_timestamp_enabled=True, phoneme_timestamp_enabled=True) if isinstance(response.get_audio_data(), 
bytes): - return response.get_audio_data() + storage.save(file_path, response.get_audio_data()) except Exception as ex: raise InvokeBadRequestError(str(ex)) - def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str): + @staticmethod + def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str): """ _tts_invoke Tongyi text2speech model api - :param model: model name :param credentials: model credentials :param sentence: text content to be translated + :param voice: model timbre :param audio_type: audio file type :return: text translated to audio file """ - # transform credentials to kwargs for model instance dashscope.api_key = credentials.get('dashscope_api_key') - voice_name = self._get_model_voice(model, credentials) - - response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type) + response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000, + text=sentence.strip(), + format=audio_type) if isinstance(response.get_audio_data(), bytes): return response.get_audio_data() diff --git a/api/services/app_model_config_service.py b/api/services/app_model_config_service.py index 9c367a429e..2851624ba1 100644 --- a/api/services/app_model_config_service.py +++ b/api/services/app_model_config_service.py @@ -98,7 +98,9 @@ class AppModelConfigService: # text_to_speech if 'text_to_speech' not in config or not config["text_to_speech"]: config["text_to_speech"] = { - "enabled": False + "enabled": False, + "voice": "", + "language": "" } if not isinstance(config["text_to_speech"], dict): @@ -106,6 +108,8 @@ if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]: config["text_to_speech"]["enabled"] = False + config["text_to_speech"]["voice"] = "" + config["text_to_speech"]["language"] = "" if not isinstance(config["text_to_speech"]["enabled"], bool): raise ValueError("enabled in text_to_speech must be of boolean type") diff --git a/api/services/audio_service.py b/api/services/audio_service.py index ba6bf1ab6f..0161fde7bb 100644 --- a/api/services/audio_service.py +++ b/api/services/audio_service.py @@ -13,14 +13,14 @@ from services.errors.audio import ( UnsupportedAudioTypeServiceError, ) -FILE_SIZE = 15 +FILE_SIZE = 30 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr'] class AudioService: @classmethod - def transcript_asr(cls, tenant_id: str, file: FileStorage, end_user: Optional[str] = None): + def transcript_asr(cls, tenant_id: str, file: FileStorage, prompt: Optional[str] = None, end_user: Optional[str] = None): if file is None: raise NoAudioUploadedServiceError() @@ -49,7 +49,7 @@ return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)} @classmethod - def transcript_tts(cls, tenant_id: str, text: str, streaming: bool, end_user: Optional[str] = None): + def transcript_tts(cls, tenant_id: str, text: str, voice: str, streaming: bool, end_user: Optional[str] = None): model_manager = ModelManager() model_instance = model_manager.get_default_model_instance( tenant_id=tenant_id, @@ -59,6 +59,21 @@ raise ProviderNotSupportTextToSpeechServiceError() try: - return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming) + return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming, tenant_id=tenant_id, voice=voice) +
except Exception as e: + raise e + + @classmethod + def transcript_tts_voices(cls, tenant_id: str, language: str): + model_manager = ModelManager() + model_instance = model_manager.get_default_model_instance( + tenant_id=tenant_id, + model_type=ModelType.TTS + ) + if model_instance is None: + raise ProviderNotSupportTextToSpeechServiceError() + + try: + return model_instance.get_tts_voices(language) except Exception as e: raise e diff --git a/api/services/errors/audio.py b/api/services/errors/audio.py index 091ce36588..4005cbfcd7 100644 --- a/api/services/errors/audio.py +++ b/api/services/errors/audio.py @@ -16,3 +16,7 @@ class ProviderNotSupportSpeechToTextServiceError(Exception): pass class ProviderNotSupportTextToSpeechServiceError(Exception): pass + + +class ProviderNotSupportTextToSpeechLanguageServiceError(Exception): + pass diff --git a/web/app/components/app/configuration/base/feature-panel/index.tsx b/web/app/components/app/configuration/base/feature-panel/index.tsx index 5bc3734f60..ff21c7cb08 100644 --- a/web/app/components/app/configuration/base/feature-panel/index.tsx +++ b/web/app/components/app/configuration/base/feature-panel/index.tsx @@ -2,6 +2,7 @@ import type { FC, ReactNode } from 'react' import React from 'react' import cn from 'classnames' +import ParamsConfig from '@/app/components/app/configuration/config-voice/param-config' export type IFeaturePanelProps = { className?: string @@ -12,6 +13,7 @@ isFocus?: boolean noBodySpacing?: boolean children?: ReactNode + isShowTextToSpeech?: boolean } const FeaturePanel: FC<IFeaturePanelProps> = ({ @@ -23,6 +25,7 @@ const FeaturePanel: FC<IFeaturePanelProps> = ({ isFocus, noBodySpacing, children, + isShowTextToSpeech, }) => { return (
    <div className={cn(className)}>
      <div className='flex justify-between items-center'>
        <div className='flex items-center h-8 space-x-1 shrink-0'>
          {headerIcon && <div className='flex items-center justify-center w-6 h-6'>{headerIcon}</div>}
          <div className='text-sm font-semibold text-gray-800'>{title}</div>
        </div>
        <div className='flex gap-2 items-center h-8 shrink-0'>
-         {headerRight}
+         {isShowTextToSpeech
+           ? (
+             <div className='flex items-center'>
+               <ParamsConfig />
+             </div>
+           )
+           : headerRight}
        </div>
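The popover wired into the feature-panel header above loads its options from the `/apps/<uuid:app_id>/text-to-audio/voices` route registered earlier in this patch. A minimal sketch of exercising that endpoint directly; the host, app id, and bearer token are placeholder assumptions, while the route, the required `language` query argument, and the response shape follow this diff:

    import requests

    API_BASE = 'http://localhost:5001/console/api'  # placeholder console API host
    APP_ID = 'your-app-id'                          # placeholder app id

    resp = requests.get(
        f'{API_BASE}/apps/{APP_ID}/text-to-audio/voices',
        params={'language': 'zh-CN'},  # required query argument, see TextModesApi
        headers={'Authorization': 'Bearer your-console-token'},  # placeholder credential
    )
    resp.raise_for_status()
    # Expected shape, per get_tts_model_voices():
    # [{'name': 'Alloy', 'value': 'alloy'}, {'name': 'Echo', 'value': 'echo'}, ...]
    print(resp.json())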
diff --git a/web/app/components/app/configuration/config-voice/param-config-content.tsx b/web/app/components/app/configuration/config-voice/param-config-content.tsx
new file mode 100644
index 0000000000..bf6459df41
--- /dev/null
+++ b/web/app/components/app/configuration/config-voice/param-config-content.tsx
@@ -0,0 +1,187 @@
+'use client'
+import useSWR from 'swr'
+import type { FC } from 'react'
+import { useContext } from 'use-context-selector'
+import React, { Fragment } from 'react'
+import classNames from 'classnames'
+import { usePathname } from 'next/navigation'
+import { useTranslation } from 'react-i18next'
+import { Listbox, Transition } from '@headlessui/react'
+import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
+import type { Item } from '@/app/components/base/select'
+import ConfigContext from '@/context/debug-configuration'
+import { fetchAppVoices } from '@/service/apps'
+import Tooltip from '@/app/components/base/tooltip'
+import { HelpCircle } from '@/app/components/base/icons/src/vender/line/general'
+
+const VoiceParamConfig: FC = () => {
+  const { t } = useTranslation()
+  const pathname = usePathname()
+  const matched = pathname.match(/\/app\/([^/]+)/)
+  const appId = (matched?.length && matched[1]) ? matched[1] : ''
+
+  const LanguageItems = [
+    { value: 'zh-CN', name: '中文' },
+    { value: 'en-US', name: '英语' },
+    { value: 'de-DE', name: '德语' },
+    { value: 'fr-FR', name: '法语' },
+    { value: 'es-ES', name: '西班牙语' },
+    { value: 'it-IT', name: '意大利语' },
+    { value: 'th-TH', name: '泰语' },
+    { value: 'id-ID', name: '印尼语' },
+  ]
+  const {
+    textToSpeechConfig,
+    setTextToSpeechConfig,
+  } = useContext(ConfigContext)
+
+  const languageItem = LanguageItems.find(item => item.value === textToSpeechConfig.language)
+  const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
+
+  const voiceItems = useSWR({ url: `/apps/${appId}/text-to-audio/voices?language=${languageItem ? languageItem.value : 'zh-CN'}` }, fetchAppVoices).data
+  const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
+
+  return (
+    <div>
+      <div className='leading-6 text-base font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.title')}</div>
+      <div className='pt-3 space-y-6'>
+        <div>
+          <div className='mb-2 flex items-center space-x-1'>
+            <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+            <Tooltip htmlContent={<div className='w-[180px]'>
+              {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
+                <div key={item}>{item}</div>
+              ))}
+            </div>} selector='config-resolution-tooltip'>
+              <HelpCircle className='w-[14px] h-[14px] text-gray-400' />
+            </Tooltip>
+          </div>
+          <Listbox
+            value={languageItem}
+            onChange={(value: Item) => {
+              setTextToSpeechConfig({
+                ...textToSpeechConfig,
+                language: String(value.value),
+              })
+            }}
+          >
+            <div className='relative h-9'>
+              <Listbox.Button className='w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'>
+                <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
+                  {languageItem?.name ?? localLanguagePlaceholder}
+                </span>
+                <span className='pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2'>
+                  <ChevronDownIcon className='h-5 w-5 text-gray-400' aria-hidden='true' />
+                </span>
+              </Listbox.Button>
+              <Transition
+                as={Fragment}
+                leave='transition ease-in duration-100'
+                leaveFrom='opacity-100'
+                leaveTo='opacity-0'
+              >
+                <Listbox.Options className='absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm'>
+                  {LanguageItems.map((item: Item) => (
+                    <Listbox.Option
+                      key={item.value}
+                      className={({ active }) =>
+                        `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : ''
+                        }`
+                      }
+                      value={item}
+                      disabled={false}
+                    >
+                      {({ /* active, */ selected }) => (
+                        <>
+                          <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
+                          {(selected || item.value === textToSpeechConfig.language) && (
+                            <span className='absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700'>
+                              <CheckIcon className='h-5 w-5' aria-hidden='true' />
+                            </span>
+                          )}
+                        </>
+                      )}
+                    </Listbox.Option>
+                  ))}
+                </Listbox.Options>
+              </Transition>
+            </div>
+          </Listbox>
+        </div>
+
+        <div>
+          <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+          <Listbox
+            value={voiceItem}
+            onChange={(value: Item) => {
+              setTextToSpeechConfig({
+                ...textToSpeechConfig,
+                voice: String(value.value),
+              })
+            }}
+          >
+            <div className='relative h-9'>
+              <Listbox.Button className='w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'>
+                <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>
+                  {voiceItem?.name ?? localVoicePlaceholder}
+                </span>
+                <span className='pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2'>
+                  <ChevronDownIcon className='h-5 w-5 text-gray-400' aria-hidden='true' />
+                </span>
+              </Listbox.Button>
+              <Transition
+                as={Fragment}
+                leave='transition ease-in duration-100'
+                leaveFrom='opacity-100'
+                leaveTo='opacity-0'
+              >
+                <Listbox.Options className='absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm'>
+                  {voiceItems?.map((item: Item) => (
+                    <Listbox.Option
+                      key={item.value}
+                      className={({ active }) =>
+                        `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : ''
+                        }`
+                      }
+                      value={item}
+                      disabled={false}
+                    >
+                      {({ /* active, */ selected }) => (
+                        <>
+                          <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
+                          {(selected || item.value === textToSpeechConfig.voice) && (
+                            <span className='absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700'>
+                              <CheckIcon className='h-5 w-5' aria-hidden='true' />
+                            </span>
+                          )}
+                        </>
+                      )}
+                    </Listbox.Option>
+                  ))}
+                </Listbox.Options>
+              </Transition>
+            </div>
+          </Listbox>
+        </div>
+      </div>
+    </div>
+  )
+}
+
+export default React.memo(VoiceParamConfig)
diff --git a/web/app/components/app/configuration/config-voice/param-config.tsx b/web/app/components/app/configuration/config-voice/param-config.tsx
new file mode 100644
index 0000000000..5ea0a32907
--- /dev/null
+++ b/web/app/components/app/configuration/config-voice/param-config.tsx
@@ -0,0 +1,41 @@
+'use client'
+import type { FC } from 'react'
+import { memo, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import cn from 'classnames'
+import VoiceParamConfig from './param-config-content'
+import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
+import {
+  PortalToFollowElem,
+  PortalToFollowElemContent,
+  PortalToFollowElemTrigger,
+} from '@/app/components/base/portal-to-follow-elem'
+
+const ParamsConfig: FC = () => {
+  const { t } = useTranslation()
+  const [open, setOpen] = useState(false)
+
+  return (
+    <PortalToFollowElem
+      open={open}
+      onOpenChange={setOpen}
+      placement='bottom-end'
+      offset={{
+        mainAxis: 4,
+      }}
+    >
+      <PortalToFollowElemTrigger onClick={() => setOpen(v => !v)}>
+        <div className={cn('flex items-center rounded-md h-7 px-3 space-x-1 text-gray-700 cursor-pointer hover:bg-gray-200', open && 'bg-gray-200')}>
+          <Settings01 className='w-3.5 h-3.5' />
+          <div className='ml-1 leading-[18px] text-xs font-medium'>{t('appDebug.voice.settings')}</div>
+        </div>
+      </PortalToFollowElemTrigger>
+      <PortalToFollowElemContent style={{ zIndex: 50 }}>
+        <div className='w-80 sm:w-[412px] p-4 bg-white rounded-lg border-[0.5px] border-gray-200 shadow-lg space-y-3'>
+          <VoiceParamConfig />
+        </div>
+      </PortalToFollowElemContent>
+    </PortalToFollowElem>
+ ) +} +export default memo(ParamsConfig) diff --git a/web/app/components/app/configuration/config/index.tsx b/web/app/components/app/configuration/config/index.tsx index 3d1da56d83..6f16ace26e 100644 --- a/web/app/components/app/configuration/config/index.tsx +++ b/web/app/components/app/configuration/config/index.tsx @@ -119,6 +119,8 @@ const Config: FC = () => { setTextToSpeech: (value) => { setTextToSpeechConfig(produce(textToSpeechConfig, (draft: TextToSpeechConfig) => { draft.enabled = value + draft.voice = textToSpeechConfig?.voice + draft.language = textToSpeechConfig?.language })) }, citation: citationConfig.enabled, @@ -245,6 +247,7 @@ const Config: FC = () => { {(isAgent && isChatApp) && ( )} + {/* Chat History */} diff --git a/web/app/components/app/configuration/debug/debug-with-multiple-model/text-generation-item.tsx b/web/app/components/app/configuration/debug/debug-with-multiple-model/text-generation-item.tsx index 9a6f79320d..4de9a4e328 100644 --- a/web/app/components/app/configuration/debug/debug-with-multiple-model/text-generation-item.tsx +++ b/web/app/components/app/configuration/debug/debug-with-multiple-model/text-generation-item.tsx @@ -61,6 +61,11 @@ const TextGenerationItem: FC = ({ sensitive_word_avoidance: moderationConfig, external_data_tools: externalDataToolsConfig, more_like_this: moreLikeThisConfig, + text_to_speech: { + enabled: false, + voice: '', + language: '', + }, agent_mode: { enabled: false, tools: [], diff --git a/web/app/components/app/configuration/debug/index.tsx b/web/app/components/app/configuration/debug/index.tsx index 04be614349..194bc1f8b5 100644 --- a/web/app/components/app/configuration/debug/index.tsx +++ b/web/app/components/app/configuration/debug/index.tsx @@ -213,9 +213,6 @@ const Debug: FC = ({ const contextVar = modelConfig.configs.prompt_variables.find(item => item.is_context_var)?.key const postModelConfig: BackendModelConfig = { - text_to_speech: { - enabled: false, - }, pre_prompt: !isAdvancedMode ? modelConfig.configs.prompt_template : '', prompt_type: promptMode, chat_prompt_config: {}, @@ -234,6 +231,11 @@ const Debug: FC = ({ mode: modelConfig.mode, completion_params: completionParams as any, }, + text_to_speech: { + enabled: false, + voice: '', + language: '', + }, agent_mode: { enabled: false, tools: [], diff --git a/web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx b/web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx index d3f5562df7..8e09379e7a 100644 --- a/web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx +++ b/web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx @@ -19,6 +19,7 @@ const TextToSpeech: FC = () => {
      headerRight={
        <div className='text-xs text-gray-500'>{t('appDebug.feature.textToSpeech.resDes')}</div>
      }
      noBodySpacing
+     isShowTextToSpeech={true}
    />
  )
}
diff --git a/web/app/components/app/configuration/index.tsx b/web/app/components/app/configuration/index.tsx index 01b3c312c2..a2ac27f495 100644 --- a/web/app/components/app/configuration/index.tsx +++ b/web/app/components/app/configuration/index.tsx @@ -30,6 +30,7 @@ import type { MoreLikeThisConfig, PromptConfig, PromptVariable, + TextToSpeechConfig, } from '@/models/debug' import type { ExternalDataTool } from '@/models/common' import type { DataSet } from '@/models/datasets' @@ -98,8 +99,10 @@ const Configuration: FC = () => { const [speechToTextConfig, setSpeechToTextConfig] = useState({ enabled: false, }) - const [textToSpeechConfig, setTextToSpeechConfig] = useState({ + const [textToSpeechConfig, setTextToSpeechConfig] = useState<TextToSpeechConfig>({ enabled: false, + voice: '', + language: '', }) const [citationConfig, setCitationConfig] = useState({ enabled: false, @@ -246,6 +249,8 @@ const Configuration: FC = () => { }) setTextToSpeechConfig(modelConfig.text_to_speech || { enabled: false, + voice: '', + language: '', }) setCitationConfig(modelConfig.retriever_resource || { enabled: false, diff --git a/web/app/components/base/chat/chat/answer/operation.tsx b/web/app/components/base/chat/chat/answer/operation.tsx index 9d9cf2735e..1f7c57ae75 100644 --- a/web/app/components/base/chat/chat/answer/operation.tsx +++ b/web/app/components/base/chat/chat/answer/operation.tsx @@ -73,7 +73,8 @@ const Operation: FC<OperationProps> = ({ /> ) } - {!isOpeningStatement && config?.text_to_speech && ( + + {(!isOpeningStatement && config?.text_to_speech?.enabled) && ( <AudioBtn value={content} className='hidden group-hover:block' /> )} diff --git a/web/context/debug-configuration.ts b/web/context/debug-configuration.ts --- a/web/context/debug-configuration.ts +++ b/web/context/debug-configuration.ts const DebugConfigurationContext = createContext<IDebugConfiguration>({ setSpeechToTextConfig: () => { }, textToSpeechConfig: { enabled: false, + voice: '', + language: '', }, setTextToSpeechConfig: () => { }, citationConfig: { diff --git a/web/i18n/lang/app-debug.en.ts b/web/i18n/lang/app-debug.en.ts index da89eb97d9..088d76154e 100644 --- a/web/i18n/lang/app-debug.en.ts +++ b/web/i18n/lang/app-debug.en.ts @@ -298,6 +298,17 @@ const translation = { uploadLimit: 'Upload Limit', }, }, + voice: { + name: 'Voice', + description: 'Text-to-speech voice settings', + settings: 'Settings', + voiceSettings: { + title: 'Voice Settings', + language: 'Language', + resolutionTooltip: 'Languages supported by the text-to-speech voice.', + voice: 'Voice', + }, + }, openingStatement: { title: 'Conversation Opener', add: 'Add', diff --git a/web/i18n/lang/app-debug.zh.ts b/web/i18n/lang/app-debug.zh.ts index 4d329db694..7cd61cf63e 100644 --- a/web/i18n/lang/app-debug.zh.ts +++ b/web/i18n/lang/app-debug.zh.ts @@ -294,6 +294,17 @@ const translation = { uploadLimit: '上传数量限制', }, }, + voice: { + name: '音色', + description: '文本转语音音色设置', + settings: '设置', + voiceSettings: { + title: '音色设置', + language: '语言', + resolutionTooltip: '文本转语音音色支持的语言。', + voice: '音色', + }, + }, openingStatement: { title: '对话开场白', add: '添加开场白', diff --git a/web/models/app.ts b/web/models/app.ts index 898584b5df..020e100719 100644 --- a/web/models/app.ts +++ b/web/models/app.ts @@ -122,3 +122,8 @@ export type UpdateOpenAIKeyResponse = ValidateOpenAIKeyResponse export type GenerationIntroductionResponse = { introduction: string } + +export type AppVoicesListResponse = Array<{ + name: string + value: string +}> diff --git a/web/models/debug.ts b/web/models/debug.ts index 3cf34820e4..a341e209b4 100644 --- a/web/models/debug.ts +++ b/web/models/debug.ts @@ -75,7 +75,11 @@ export type SuggestedQuestionsAfterAnswerConfig = MoreLikeThisConfig export type SpeechToTextConfig = MoreLikeThisConfig -export type TextToSpeechConfig = MoreLikeThisConfig +export type
TextToSpeechConfig = { + enabled: boolean + voice?: string + language?: string +} export type CitationConfig = MoreLikeThisConfig diff --git a/web/service/apps.ts b/web/service/apps.ts index c4ff2c1cce..fde8128754 100644 --- a/web/service/apps.ts +++ b/web/service/apps.ts @@ -1,6 +1,6 @@ import type { Fetcher } from 'swr' import { del, get, post } from './base' -import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app' +import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, AppVoicesListResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app' import type { CommonResponse } from '@/models/common' import type { AppMode, ModelConfig } from '@/types/app' @@ -93,3 +93,7 @@ export const updateOpenAIKey: Fetcher = ({ url, body }) => { return post(url, { body }) } + +export const fetchAppVoices: Fetcher = ({ url }) => { + return get(url) +} diff --git a/web/types/app.ts b/web/types/app.ts index 32b14d3d46..440ee7288a 100644 --- a/web/types/app.ts +++ b/web/types/app.ts @@ -155,6 +155,8 @@ export type ModelConfig = { } text_to_speech: { enabled: boolean + voice?: string + language?: string } retriever_resource: { enabled: boolean
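Taken together, the voice selected in the new UI flows from the controllers through `AudioService.transcript_tts` into `ModelInstance.invoke_tts`, and each provider's `_invoke` falls back to the model's `default_voice` when no voice is configured. A minimal sketch of that service-level flow, assuming a workspace with a default TTS model; the tenant id and text are placeholders, while `get_default_model_instance`, `get_tts_voices`, and `invoke_tts` are used as defined in this diff:

    from core.model_manager import ModelManager
    from core.model_runtime.entities.model_entities import ModelType

    tenant_id = 'your-tenant-id'  # placeholder

    # Resolve the workspace's default TTS model, as AudioService does.
    model_instance = ModelManager().get_default_model_instance(
        tenant_id=tenant_id,
        model_type=ModelType.TTS,
    )

    # New in this patch: list the voices the model offers for a language.
    voices = model_instance.get_tts_voices(language='en-US')
    voice = voices[0]['value'] if voices else None  # None triggers the default_voice fallback in _invoke()

    audio = model_instance.invoke_tts(
        content_text='Hello Dify!',
        tenant_id=tenant_id,
        voice=voice,
        streaming=False,  # non-streaming returns a Flask Response with the combined audio
    )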