diff --git a/api/core/model_runtime/model_providers/__base/tts_model.py b/api/core/model_runtime/model_providers/__base/tts_model.py
index c3f3b65fa4..f0edcff1a7 100644
--- a/api/core/model_runtime/model_providers/__base/tts_model.py
+++ b/api/core/model_runtime/model_providers/__base/tts_model.py
@@ -1,8 +1,13 @@
+import uuid
+import hashlib
+import subprocess
 from abc import abstractmethod
 from typing import Optional
 
+from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.model_providers.__base.ai_model import AIModel
+from core.model_runtime.entities.model_entities import ModelPropertyKey
 
 
 class TTSModel(AIModel):
@@ -40,3 +45,95 @@ class TTSModel(AIModel):
         :return: translated audio file
         """
         raise NotImplementedError
+
+    def _get_model_voice(self, model: str, credentials: dict) -> any:
+        """
+        Get voice for given tts model
+
+        :param model: model name
+        :param credentials: model credentials
+        :return: voice
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
+
+    def _get_model_audio_type(self, model: str, credentials: dict) -> str:
+        """
+        Get audio type for given tts model
+
+        :param model: model name
+        :param credentials: model credentials
+        :return: audio type
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
+
+    def _get_model_word_limit(self, model: str, credentials: dict) -> int:
+        """
+        Get word limit for given tts model
+        :return: word limit
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
+
+    def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
+        """
+        Get audio max workers for given tts model
+        :return: max workers
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
+
+    @staticmethod
+    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
+        if delimiters is None:
+            delimiters = set('。!?;\n')
+
+        buf = []
+        word_count = 0
+        for char in text:
+            buf.append(char)
+            if char in delimiters:
+                if word_count >= limit:
+                    yield ''.join(buf)
+                    buf = []
+                    word_count = 0
+                else:
+                    word_count += 1
+            else:
+                word_count += 1
+
+        if buf:
+            yield ''.join(buf)
+
+    @staticmethod
+    def _is_ffmpeg_installed():
+        try:
+            output = subprocess.check_output("ffmpeg -version", shell=True)
+            if "ffmpeg version" in output.decode("utf-8"):
+                return True
+            else:
+                raise InvokeBadRequestError("ffmpeg is not installed, "
+                                            "details: https://docs.dify.ai/getting-started/install-self-hosted"
+                                            "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
+        except Exception:
+            raise InvokeBadRequestError("ffmpeg is not installed, "
+                                        "details: https://docs.dify.ai/getting-started/install-self-hosted"
+                                        "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
+
+    @staticmethod
+    def _get_file_name(file_content: str) -> str:
+        hash_object = hashlib.sha256(file_content.encode())
+        hex_digest = hash_object.hexdigest()
+
+        namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
+        unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
+        return str(unique_uuid)
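Note on the base-class helper above: `_split_text_into_sentences` buffers characters and only yields a chunk once it hits a delimiter ('。', '!', '?', ';' or '\n') after at least `limit` buffered characters, so synthesis requests always end on sentence boundaries. A minimal, self-contained sketch of that behavior; the function body mirrors the diff, while the sample text and limit are invented:

```python
def split_text_into_sentences(text: str, limit: int, delimiters=None):
    # Same logic as TTSModel._split_text_into_sentences in the diff above:
    # accumulate characters, and close a chunk only on a delimiter that
    # arrives after at least `limit` buffered characters.
    if delimiters is None:
        delimiters = set('。!?;\n')

    buf = []
    word_count = 0
    for char in text:
        buf.append(char)
        if char in delimiters:
            if word_count >= limit:
                yield ''.join(buf)
                buf = []
                word_count = 0
            else:
                word_count += 1
        else:
            word_count += 1

    if buf:
        yield ''.join(buf)


if __name__ == '__main__':
    print(list(split_text_into_sentences('第一句。第二句。第三句。', limit=4)))
    # ['第一句。第二句。', '第三句。'] -- the first delimiter arrives before
    # the limit is reached, so the first two sentences travel as one chunk
```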
diff --git a/api/core/model_runtime/model_providers/openai/tts/tts.py b/api/core/model_runtime/model_providers/openai/tts/tts.py
index 6e14ebbda4..013392eb3a 100644
--- a/api/core/model_runtime/model_providers/openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/openai/tts/tts.py
@@ -1,18 +1,13 @@
-import uuid
-import hashlib
-import subprocess
 from io import BytesIO
 from typing import Optional
 from functools import reduce
 from pydub import AudioSegment
 
-from core.model_runtime.entities.model_entities import ModelPropertyKey
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
-from typing_extensions import Literal
 from flask import Response, stream_with_context
 from openai import OpenAI
 import concurrent.futures
@@ -22,9 +17,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
     """
     Model class for OpenAI Speech to text model.
     """
-
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
-                user: Optional[str] = None) -> any:
+    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
         """
         _invoke text2speech model
 
@@ -65,7 +58,7 @@
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
         """
         _tts_invoke text2speech model
 
@@ -104,8 +97,7 @@
             raise InvokeBadRequestError(str(ex))
 
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
-                              user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
         """
         _tts_invoke_streaming text2speech model
 
@@ -131,84 +123,6 @@
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
 
-    def _get_model_voice(self, model: str, credentials: dict) -> Literal[
-        "alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
-        """
-        Get voice for given tts model
-
-        :param model: model name
-        :param credentials: model credentials
-        :return: voice
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
-
-    def _get_model_audio_type(self, model: str, credentials: dict) -> str:
-        """
-        Get audio type for given tts model
-
-        :param model: model name
-        :param credentials: model credentials
-        :return: voice
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
-
-    def _get_model_word_limit(self, model: str, credentials: dict) -> int:
-        """
-        Get audio type for given tts model
-        :return: audio type
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
-
-    def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
-        """
-        Get audio max workers for given tts model
-        :return: audio type
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
-
-    @staticmethod
-    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
-        if delimiters is None:
-            delimiters = set('。!?;\n')
-
-        buf = []
-        word_count = 0
-        for char in text:
-            buf.append(char)
-            if char in delimiters:
-                if word_count >= limit:
-                    yield ''.join(buf)
-                    buf = []
-                    word_count = 0
-                else:
-                    word_count += 1
-            else:
-                word_count += 1
-
-        if buf:
-            yield ''.join(buf)
-
-    @staticmethod
-    def _get_file_name(file_content: str) -> str:
-        hash_object = hashlib.sha256(file_content.encode())
-        hex_digest = hash_object.hexdigest()
-
-        namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
-        unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
-        return str(unique_uuid)
-
     def _process_sentence(self, sentence: str, model: str, credentials: dict):
         """
         _tts_invoke openai text2speech model api
@@ -226,18 +140,3 @@
         response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
         if isinstance(response.read(), bytes):
             return response.read()
-
-    @staticmethod
-    def _is_ffmpeg_installed():
-        try:
-            output = subprocess.check_output("ffmpeg -version", shell=True)
-            if "ffmpeg version" in output.decode("utf-8"):
-                return True
-            else:
-                raise InvokeBadRequestError("ffmpeg is not installed, "
-                                            "details: https://docs.dify.ai/getting-started/install-self-hosted"
-                                            "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
-        except Exception:
-            raise InvokeBadRequestError("ffmpeg is not installed, "
-                                        "details: https://docs.dify.ai/getting-started/install-self-hosted"
-                                        "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
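With those helpers hoisted into `TTSModel`, the OpenAI provider above is reduced to its provider-specific synthesis calls. A schematic sketch of what a new provider now has to supply; the class name and body are hypothetical, not part of this PR:

```python
from typing import Optional

from core.model_runtime.model_providers.__base.tts_model import TTSModel


class ExampleText2SpeechModel(TTSModel):
    """Hypothetical provider, for illustration only."""

    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
                user: Optional[str] = None) -> any:
        # ffmpeg check, voice/limit lookup and sentence splitting are inherited
        self._is_ffmpeg_installed()
        word_limit = self._get_model_word_limit(model, credentials)
        for sentence in self._split_text_into_sentences(content_text, word_limit):
            ...  # provider-specific synthesis call per sentence
```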
diff --git a/api/core/model_runtime/model_providers/tongyi/_common.py b/api/core/model_runtime/model_providers/tongyi/_common.py
new file mode 100644
index 0000000000..dfc0102666
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/_common.py
@@ -0,0 +1,23 @@
+from core.model_runtime.errors.invoke import InvokeError
+
+
+class _CommonTongyi:
+    @staticmethod
+    def _to_credential_kwargs(credentials: dict) -> dict:
+        credentials_kwargs = {
+            "dashscope_api_key": credentials['dashscope_api_key'],
+        }
+
+        return credentials_kwargs
+
+    @property
+    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
+        """
+        Map model invoke error to unified error
+        The key is the error type thrown to the caller
+        The value is the error type thrown by the model,
+        which needs to be converted into a unified error type for the caller.
+
+        :return: Invoke error mapping
+        """
+        pass
diff --git a/api/core/model_runtime/model_providers/tongyi/tongyi.yaml b/api/core/model_runtime/model_providers/tongyi/tongyi.yaml
index 786d687ad2..500fd6e045 100644
--- a/api/core/model_runtime/model_providers/tongyi/tongyi.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/tongyi.yaml
@@ -16,6 +16,7 @@ help:
   en_US: https://dashscope.console.aliyun.com/api-key_management
 supported_model_types:
   - llm
+  - tts
 configurate_methods:
   - predefined-model
 provider_credential_schema:
diff --git a/api/core/model_runtime/model_providers/tongyi/tts/__init__.py b/api/core/model_runtime/model_providers/tongyi/tts/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
new file mode 100644
index 0000000000..e1b213ad28
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
@@ -0,0 +1,7 @@
+model: tts-1
+model_type: tts
+model_properties:
+  default_voice: 'sambert-zhiru-v1'  # for available voices, see https://help.aliyun.com/zh/dashscope/model-list
+  word_limit: 120
+  audio_type: 'mp3'
+  max_workers: 5
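The `model_properties` declared in tts-1.yaml above are what the new base-class getters read back at runtime. A hedged sketch of that mapping; `describe_tts_model` is illustrative only and assumes `get_model_schema` resolves the predefined tts-1.yaml:

```python
from core.model_runtime.model_providers.__base.tts_model import TTSModel


def describe_tts_model(tts: TTSModel, credentials: dict) -> dict:
    # Each key mirrors one model_properties entry from tts-1.yaml.
    return {
        'voice': tts._get_model_voice('tts-1', credentials),                # 'sambert-zhiru-v1'
        'audio_type': tts._get_model_audio_type('tts-1', credentials),      # 'mp3'
        'word_limit': tts._get_model_word_limit('tts-1', credentials),      # 120
        'max_workers': tts._get_model_workers_limit('tts-1', credentials),  # 5
    }
```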
diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts.py b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
new file mode 100644
index 0000000000..1dd307e28f
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@@ -0,0 +1,142 @@
+from io import BytesIO
+from typing import Optional
+from functools import reduce
+from pydub import AudioSegment
+
+from core.model_runtime.errors.validate import CredentialsValidateFailedError
+from core.model_runtime.errors.invoke import InvokeBadRequestError
+from core.model_runtime.model_providers.__base.tts_model import TTSModel
+from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
+
+import dashscope
+from flask import Response, stream_with_context
+import concurrent.futures
+
+
+class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
+    """
+    Model class for Tongyi text-to-speech model.
+    """
+    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+        """
+        _invoke text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param content_text: text content to be translated
+        :param streaming: output is streaming
+        :param user: unique user id
+        :return: text translated to audio file
+        """
+        self._is_ffmpeg_installed()
+        audio_type = self._get_model_audio_type(model, credentials)
+        if streaming:
+            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
+                                                                            credentials=credentials,
+                                                                            content_text=content_text,
+                                                                            user=user)),
+                            status=200, mimetype=f'audio/{audio_type}')
+        else:
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+
+    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
+        """
+        validate credentials text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param user: unique user id
+        :return: text translated to audio file
+        """
+        try:
+            self._tts_invoke(
+                model=model,
+                credentials=credentials,
+                content_text='Hello world!',
+                user=user
+            )
+        except Exception as ex:
+            raise CredentialsValidateFailedError(str(ex))
+
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+        """
+        _tts_invoke text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param content_text: text content to be translated
+        :param user: unique user id
+        :return: text translated to audio file
+        """
+        audio_type = self._get_model_audio_type(model, credentials)
+        word_limit = self._get_model_word_limit(model, credentials)
+        max_workers = self._get_model_workers_limit(model, credentials)
+
+        try:
+            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            audio_bytes_list = list()
+
+            # Create a thread pool and map the function to the list of sentences
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
+                                           credentials=credentials, audio_type=audio_type) for sentence in sentences]
+                for future in futures:
+                    try:
+                        audio_bytes_list.append(future.result())
+                    except Exception as ex:
+                        raise InvokeBadRequestError(str(ex))
+
+            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
+                              audio_bytes_list if audio_bytes]
+            combined_segment = reduce(lambda x, y: x + y, audio_segments)
+            buffer: BytesIO = BytesIO()
+            combined_segment.export(buffer, format=audio_type)
+            buffer.seek(0)
+            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
+        except Exception as ex:
+            raise InvokeBadRequestError(str(ex))
+
+    # Todo: To improve the streaming function
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+        """
+        _tts_invoke_streaming text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param content_text: text content to be translated
+        :param user: unique user id
+        :return: text translated to audio file
+        """
+        # transform credentials to kwargs for model instance
+        dashscope.api_key = credentials.get('dashscope_api_key')
+        voice_name = self._get_model_voice(model, credentials)
+        word_limit = self._get_model_word_limit(model, credentials)
+        audio_type = self._get_model_audio_type(model, credentials)
+        try:
+            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            for sentence in sentences:
+                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
+                                                                      format=audio_type, word_timestamp_enabled=True,
+                                                                      phoneme_timestamp_enabled=True)
+                if isinstance(response.get_audio_data(), bytes):
+                    return response.get_audio_data()
+        except Exception as ex:
+            raise InvokeBadRequestError(str(ex))
+
+    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
+        """
+        _tts_invoke Tongyi text2speech model api
+
+        :param model: model name
+        :param credentials: model credentials
+        :param sentence: text content to be translated
+        :param audio_type: audio file type
+        :return: text translated to audio file
+        """
+        # transform credentials to kwargs for model instance
+        dashscope.api_key = credentials.get('dashscope_api_key')
+        voice_name = self._get_model_voice(model, credentials)
+
+        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
+        if isinstance(response.get_audio_data(), bytes):
+            return response.get_audio_data()
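Both providers' `_tts_invoke` follow the same fan-out/merge pattern: synthesize sentence chunks concurrently, then splice the results with pydub. A standalone sketch of that pattern with a stub in place of the DashScope/OpenAI call; it needs ffmpeg on the PATH for pydub's MP3 codec:

```python
import concurrent.futures
from functools import reduce
from io import BytesIO

from pydub import AudioSegment
from pydub.generators import Sine


def fake_synthesize(sentence: str) -> bytes:
    """Stand-in for SpeechSynthesizer.call(): 0.2 s of tone per sentence."""
    buf = BytesIO()
    Sine(440).to_audio_segment(duration=200).export(buf, format='mp3')
    return buf.getvalue()


sentences = ['第一句。', '第二句。', '第三句。']

# Fan out: one synthesis task per sentence, order preserved by map()
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    chunks = list(executor.map(fake_synthesize, sentences))

# Merge: decode each chunk and concatenate the segments in order
segments = [AudioSegment.from_file(BytesIO(b), format='mp3') for b in chunks if b]
combined = reduce(lambda x, y: x + y, segments)

out = BytesIO()
combined.export(out, format='mp3')
print(f'{len(out.getvalue())} bytes of merged audio')
```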
diff --git a/web/app/components/develop/template/template.en.mdx b/web/app/components/develop/template/template.en.mdx
index 2ac9aecdea..f3e1e2fe0f 100644
--- a/web/app/components/develop/template/template.en.mdx
+++ b/web/app/components/develop/template/template.en.mdx
@@ -495,7 +495,7 @@ The text generation application offers non-session support and is ideal for tran
   />
 
-  Text to speech, only supports openai model.
+  Text to speech.
 
   ### Request Body
 
 
diff --git a/web/app/components/develop/template/template.zh.mdx b/web/app/components/develop/template/template.zh.mdx
index 2c5322c922..c3591836b5 100644
--- a/web/app/components/develop/template/template.zh.mdx
+++ b/web/app/components/develop/template/template.zh.mdx
@@ -458,7 +458,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
   />
 
-  文字转语音,仅支持 openai 模型。
+  文字转语音。
 
   ### Request Body
 
 
diff --git a/web/app/components/develop/template/template_chat.en.mdx b/web/app/components/develop/template/template_chat.en.mdx
index f6532510c7..69f6b9074b 100644
--- a/web/app/components/develop/template/template_chat.en.mdx
+++ b/web/app/components/develop/template/template_chat.en.mdx
@@ -845,7 +845,7 @@ Chat applications support session persistence, allowing previous chat history to
   />
 
-  Text to speech, only supports openai model.
+  Text to speech.
 
   ### Request Body
 
 
diff --git a/web/app/components/develop/template/template_chat.zh.mdx b/web/app/components/develop/template/template_chat.zh.mdx
index b87a89b825..7f3a18832e 100644
--- a/web/app/components/develop/template/template_chat.zh.mdx
+++ b/web/app/components/develop/template/template_chat.zh.mdx
@@ -917,7 +917,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
   />
 
-  文字转语音,仅支持 openai 模型。
+  文字转语音。
 
  ### Request Body
 
 
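For completeness, a client-side sketch of the endpoint these template updates document. The path, payload fields and base URL are assumptions drawn from the surrounding Dify API docs rather than from this diff:

```python
import requests

API_BASE = 'https://api.dify.ai/v1'  # assumed default; self-hosted installs differ
API_KEY = 'app-xxxxxxxx'             # placeholder application API key

resp = requests.post(
    f'{API_BASE}/text-to-speech',
    headers={'Authorization': f'Bearer {API_KEY}'},
    json={'text': '你好,Dify!', 'user': 'user-123'},
    timeout=60,
)
resp.raise_for_status()

# The response body is the synthesized audio; its format follows the model's
# audio_type property (mp3 for the tts-1.yaml above).
with open('speech.mp3', 'wb') as f:
    f.write(resp.content)
```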