diff --git a/api/controllers/console/explore/audio.py b/api/controllers/console/explore/audio.py index 651cdf16b5..784c0c6330 100644 --- a/api/controllers/console/explore/audio.py +++ b/api/controllers/console/explore/audio.py @@ -32,6 +32,7 @@ class ChatAudioApi(InstalledAppResource): response = AudioService.transcript_asr( tenant_id=app_model.tenant_id, file=file, + end_user=None ) return response diff --git a/api/controllers/service_api/app/audio.py b/api/controllers/service_api/app/audio.py index 3e642b69d3..9c5ae9a836 100644 --- a/api/controllers/service_api/app/audio.py +++ b/api/controllers/service_api/app/audio.py @@ -66,6 +66,7 @@ class TextApi(AppApiResource): parser = reqparse.RequestParser() parser.add_argument('text', type=str, required=True, nullable=False, location='json') parser.add_argument('user', type=str, required=True, nullable=False, location='json') + parser.add_argument('streaming', type=bool, required=False, nullable=False, location='json') args = parser.parse_args() try: @@ -73,7 +74,7 @@ class TextApi(AppApiResource): tenant_id=app_model.tenant_id, text=args['text'], end_user=args['user'], - streaming=False + streaming=args['streaming'] ) return response diff --git a/api/controllers/web/audio.py b/api/controllers/web/audio.py index 310374a256..44ca7b660a 100644 --- a/api/controllers/web/audio.py +++ b/api/controllers/web/audio.py @@ -31,6 +31,7 @@ class AudioApi(WebApiResource): response = AudioService.transcript_asr( tenant_id=app_model.tenant_id, file=file, + end_user=end_user ) return response diff --git a/api/core/model_runtime/README.md b/api/core/model_runtime/README.md index eba1f3f2d0..d7748a8c3c 100644 --- a/api/core/model_runtime/README.md +++ b/api/core/model_runtime/README.md @@ -13,6 +13,7 @@ This module provides the interface for invoking and authenticating various model - `Text Embedding Model` - Text Embedding, pre-computed tokens capability - `Rerank Model` - Segment Rerank capability - `Speech-to-text Model` - Speech to 
text capability + - `Text-to-speech Model` - Text to speech capability - `Moderation` - Moderation capability - Model provider display diff --git a/api/core/model_runtime/README_CN.md b/api/core/model_runtime/README_CN.md index d9000845c6..6950cdc0c7 100644 --- a/api/core/model_runtime/README_CN.md +++ b/api/core/model_runtime/README_CN.md @@ -13,6 +13,7 @@ - `Text Embedidng Model` - 文本 Embedding ,预计算 tokens 能力 - `Rerank Model` - 分段 Rerank 能力 - `Speech-to-text Model` - 语音转文本能力 + - `Text-to-speech Model` - 文本转语音能力 - `Moderation` - Moderation 能力 - 模型供应商展示 diff --git a/api/core/model_runtime/docs/en_US/interfaces.md b/api/core/model_runtime/docs/en_US/interfaces.md index 3f6463a4ef..dc70bfad17 100644 --- a/api/core/model_runtime/docs/en_US/interfaces.md +++ b/api/core/model_runtime/docs/en_US/interfaces.md @@ -299,9 +299,7 @@ Inherit the `__base.speech2text_model.Speech2TextModel` base class and implement - Invoke Invocation ```python - def _invoke(self, model: str, credentials: dict, - file: IO[bytes], user: Optional[str] = None) \ - -> str: + def _invoke(self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None) -> str: """ Invoke large language model @@ -331,6 +329,46 @@ Inherit the `__base.speech2text_model.Speech2TextModel` base class and implement The string after speech-to-text conversion. 
+### Text2speech +​+Inherit the `__base.text2speech_model.Text2SpeechModel` base class and implement the following interfaces: + +- Invoke Invocation + + ```python + def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): + """ + Invoke text-to-speech model + + :param model: model name + :param credentials: model credentials + :param content_text: text content to be translated + :param streaming: output is streaming + :param user: unique user id + :return: translated audio file + """ + ``` + + - Parameters: + + - `model` (string) Model name + + - `credentials` (object) Credential information + + The parameters of credential information are defined by either the `provider_credential_schema` or `model_credential_schema` in the provider's YAML configuration file. Inputs such as `api_key` are included. + + - `content_text` (string) The text content that needs to be converted + + - `streaming` (bool) Whether to stream output + + - `user` (string) [optional] Unique identifier of the user + + This can help the provider monitor and detect abusive behavior. + + - Returns: + + Text converted speech stream. + ### Moderation Inherit the `__base.moderation_model.ModerationModel` base class and implement the following interfaces: diff --git a/api/core/model_runtime/docs/en_US/provider_scale_out.md b/api/core/model_runtime/docs/en_US/provider_scale_out.md index 1d012c5a29..d93a5426b5 100644 --- a/api/core/model_runtime/docs/en_US/provider_scale_out.md +++ b/api/core/model_runtime/docs/en_US/provider_scale_out.md @@ -94,6 +94,7 @@ The currently supported model types are as follows: - `text_embedding` Text Embedding model - `rerank` Rerank model - `speech2text` Speech to text +- `tts` Text to speech - `moderation` Moderation Continuing with `Anthropic` as an example, since `Anthropic` only supports LLM, we create a `module` named `llm` in `model_providers.anthropic`.
diff --git a/api/core/model_runtime/docs/en_US/schema.md b/api/core/model_runtime/docs/en_US/schema.md index 1dfc93c356..9606579e1c 100644 --- a/api/core/model_runtime/docs/en_US/schema.md +++ b/api/core/model_runtime/docs/en_US/schema.md @@ -47,6 +47,10 @@ - `max_chunks` (int) Maximum number of chunks (available for model types `text-embedding`, `moderation`) - `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`) - `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`) + - `default_voice` (string) default voice, e.g.:alloy,echo,fable,onyx,nova,shimmer(available for model type `tts`) + - `word_limit` (int) Single conversion word limit, paragraphwise by default(available for model type `tts`) + - `audio_type` (string) Support audio file extension format, e.g.:mp3,wav(available for model type `tts`) + - `max_workers` (int) Number of concurrent workers supporting text and audio conversion(available for model type`tts`) - `max_characters_per_chunk` (int) Maximum characters per chunk (available for model type `moderation`) - `parameter_rules` (array[[ParameterRule](#ParameterRule)]) [optional] Model invocation parameter rules - `pricing` ([PriceConfig](#PriceConfig)) [optional] Pricing information @@ -58,6 +62,7 @@ - `text-embedding` Text Embedding model - `rerank` Rerank model - `speech2text` Speech to text +- `tts` Text to speech - `moderation` Moderation ### ConfigurateMethod diff --git a/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md b/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md index 97b3720e8c..ccf78d0cdb 100644 --- a/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md +++ b/api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md @@ -23,6 +23,7 @@ - `text_embedding` 文本 Embedding 模型 - `rerank` Rerank 模型 - `speech2text` 语音转文字 +- `tts` 文字转语音 - `moderation` 审查 
`Xinference`支持`LLM`和`Text Embedding`和Rerank,那么我们开始编写`xinference.yaml`。 diff --git a/api/core/model_runtime/docs/zh_Hans/interfaces.md b/api/core/model_runtime/docs/zh_Hans/interfaces.md index ce70da1859..5bd505a0ee 100644 --- a/api/core/model_runtime/docs/zh_Hans/interfaces.md +++ b/api/core/model_runtime/docs/zh_Hans/interfaces.md @@ -369,6 +369,46 @@ class XinferenceProvider(Provider): 语音转换后的字符串。 +### Text2speech + +继承 `__base.text2speech_model.Text2SpeechModel` 基类,实现以下接口: + +- Invoke 调用 + + ```python + def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None): + """ + Invoke text-to-speech model + + :param model: model name + :param credentials: model credentials + :param content_text: text content to be translated + :param streaming: output is streaming + :param user: unique user id + :return: translated audio file + """ + ``` + + - 参数: + + - `model` (string) 模型名称 + + - `credentials` (object) 凭据信息 + + 凭据信息的参数由供应商 YAML 配置文件的 `provider_credential_schema` 或 `model_credential_schema` 定义,传入如:`api_key` 等。 + + - `content_text` (string) 需要转换的文本内容 + + - `streaming` (bool) 是否进行流式输出 + + - `user` (string) [optional] 用户的唯一标识符 + + 可以帮助供应商监控和检测滥用行为。 + + - 返回: + + 文本转换后的语音流。 + ### Moderation 继承 `__base.moderation_model.ModerationModel` 基类,实现以下接口: diff --git a/api/core/model_runtime/docs/zh_Hans/predefined_model_scale_out.md b/api/core/model_runtime/docs/zh_Hans/predefined_model_scale_out.md index 5f4e84680d..c90fb577ca 100644 --- a/api/core/model_runtime/docs/zh_Hans/predefined_model_scale_out.md +++ b/api/core/model_runtime/docs/zh_Hans/predefined_model_scale_out.md @@ -10,6 +10,7 @@ - `text_embedding` 文本 Embedding 模型 - `rerank` Rerank 模型 - `speech2text` 语音转文字 +- `tts` 文字转语音 - `moderation` 审查 依旧以 `Anthropic` 为例,`Anthropic` 仅支持 LLM,因此在 `model_providers.anthropic` 创建一个 `llm` 为名称的 `module`。 diff --git a/api/core/model_runtime/docs/zh_Hans/schema.md b/api/core/model_runtime/docs/zh_Hans/schema.md index 155750c067..1eab541d24
100644 --- a/api/core/model_runtime/docs/zh_Hans/schema.md +++ b/api/core/model_runtime/docs/zh_Hans/schema.md @@ -48,6 +48,10 @@ - `max_chunks` (int) 最大分块数量 (模型类型 `text-embedding ` `moderation` 可用) - `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用) - `supported_file_extensions` (string) 支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用) + - `default_voice` (string) 缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用) + - `word_limit` (int) 单次转换字数限制,默认按段落分段(模型类型 `tts` 可用) + - `audio_type` (string) 支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用) + - `max_workers` (int) 支持文字音频转换并发任务数(模型类型 `tts` 可用) - `max_characters_per_chunk` (int) 每块最大字符数 (模型类型 `moderation` 可用) - `parameter_rules` (array[[ParameterRule](#ParameterRule)]) [optional] 模型调用参数规则 - `pricing` ([PriceConfig](#PriceConfig)) [optional] 价格信息 @@ -59,6 +63,7 @@ - `text-embedding` 文本 Embedding 模型 - `rerank` Rerank 模型 - `speech2text` 语音转文字 +- `tts` 文字转语音 - `moderation` 审查 ### ConfigurateMethod diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml index e1b213ad28..8746fb9f02 100644 --- a/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml +++ b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml @@ -5,3 +5,8 @@ model_properties: word_limit: 120 audio_type: 'mp3' max_workers: 5 +pricing: + input: '1' + output: '0' + unit: '0.0001' + currency: RMB diff --git a/api/requirements.txt b/api/requirements.txt index 5df1af40ae..c5043d3bee 100644 --- a/api/requirements.txt +++ b/api/requirements.txt @@ -62,7 +62,6 @@ bs4~=0.0.1 markdown~=3.5.1 google-generativeai~=0.3.2 httpx[socks]~=0.24.1 -pydub~=0.25.1 matplotlib~=3.8.2 yfinance~=0.2.35 pydub~=0.25.1 diff --git a/api/services/audio_service.py b/api/services/audio_service.py index 44aac41880..bbd5bfb52b 100644 --- a/api/services/audio_service.py +++ b/api/services/audio_service.py @@ -56,7 +56,6 @@ class AudioService: raise ProviderNotSupportTextToSpeechServiceError() 
try: - audio_response = model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming) - return audio_response + return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming) except Exception as e: raise e