diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py index 53d530706..ed5765c1c 100644 --- a/api/apps/llm_app.py +++ b/api/apps/llm_app.py @@ -20,7 +20,7 @@ from api.utils.api_utils import server_error_response, get_data_error_result, va from api.db import StatusEnum, LLMType from api.db.db_models import TenantLLM from api.utils.api_utils import get_json_result -from rag.llm import EmbeddingModel, ChatModel, RerankModel,CvModel +from rag.llm import EmbeddingModel, ChatModel, RerankModel, CvModel, TTSModel import requests import ast @@ -142,6 +142,10 @@ def add_llm(): llm_name = req["llm_name"] api_key = '{' + f'"yiyan_ak": "{req.get("yiyan_ak", "")}", ' \ f'"yiyan_sk": "{req.get("yiyan_sk", "")}"' + '}' + elif factory == "Fish Audio": + llm_name = req["llm_name"] + api_key = '{' + f'"fish_audio_ak": "{req.get("fish_audio_ak", "")}", ' \ + f'"fish_audio_refid": "{req.get("fish_audio_refid", "59cb5986671546eaa6ca8ae6f29f6d22")}"' + '}' else: llm_name = req["llm_name"] api_key = req.get("api_key","xxxxxxxxxxxxxxx") @@ -215,6 +219,15 @@ def add_llm(): pass except Exception as e: msg += f"\nFail to access model({llm['llm_name']})." + str(e) + elif llm["model_type"] == LLMType.TTS: + mdl = TTSModel[factory]( + key=llm["api_key"], model_name=llm["llm_name"], base_url=llm["api_base"] + ) + try: + for resp in mdl.transcription("Hello~ Ragflower!"): + pass + except RuntimeError as e: + msg += f"\nFail to access model({llm['llm_name']})." 
+ str(e) else: # TODO: check other type of models pass diff --git a/api/apps/user_app.py b/api/apps/user_app.py index 1535446d3..6ac8576e3 100644 --- a/api/apps/user_app.py +++ b/api/apps/user_app.py @@ -410,7 +410,7 @@ def tenant_info(): @manager.route("/set_tenant_info", methods=["POST"]) @login_required -@validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id") +@validate_request("tenant_id", "asr_id", "embd_id", "img2txt_id", "llm_id", "tts_id") def set_tenant_info(): req = request.json try: diff --git a/api/db/__init__.py b/api/db/__init__.py index 03bf00fec..a24726cd2 100644 --- a/api/db/__init__.py +++ b/api/db/__init__.py @@ -55,6 +55,7 @@ class LLMType(StrEnum): SPEECH2TEXT = 'speech2text' IMAGE2TEXT = 'image2text' RERANK = 'rerank' + TTS = 'tts' class ChatStyle(StrEnum): diff --git a/api/db/db_models.py b/api/db/db_models.py index c8c1dd0ad..04bef2e83 100644 --- a/api/db/db_models.py +++ b/api/db/db_models.py @@ -449,6 +449,11 @@ class Tenant(DataBaseModel): null=False, help_text="default rerank model ID", index=True) + tts_id = CharField( + max_length=256, + null=True, + help_text="default tts model ID", + index=True) parser_ids = CharField( max_length=256, null=False, @@ -958,6 +963,13 @@ def migrate_db(): ) except Exception as e: pass + try: + migrate( + migrator.add_column("tenant","tts_id", + CharField(max_length=256,null=True,help_text="default tts model ID",index=True)) + ) + except Exception as e: + pass try: migrate( migrator.add_column('api_4_conversation', 'source', diff --git a/api/db/services/llm_service.py b/api/db/services/llm_service.py index 15cf545c0..c49669f80 100644 --- a/api/db/services/llm_service.py +++ b/api/db/services/llm_service.py @@ -15,7 +15,7 @@ # from api.db.services.user_service import TenantService from api.settings import database_logger -from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel +from rag.llm import EmbeddingModel, CvModel, ChatModel, RerankModel, Seq2txtModel, 
TTSModel from api.db import LLMType from api.db.db_models import DB, UserTenant from api.db.db_models import LLMFactories, LLM, TenantLLM @@ -75,6 +75,8 @@ class TenantLLMService(CommonService): mdlnm = tenant.llm_id if not llm_name else llm_name elif llm_type == LLMType.RERANK: mdlnm = tenant.rerank_id if not llm_name else llm_name + elif llm_type == LLMType.TTS: + mdlnm = tenant.tts_id if not llm_name else llm_name else: assert False, "LLM type error" @@ -127,6 +129,14 @@ class TenantLLMService(CommonService): model_config["api_key"], model_config["llm_name"], lang, base_url=model_config["api_base"] ) + if llm_type == LLMType.TTS: + if model_config["llm_factory"] not in TTSModel: + return + return TTSModel[model_config["llm_factory"]]( + model_config["api_key"], + model_config["llm_name"], + base_url=model_config["api_base"], + ) @classmethod @DB.connection_context() @@ -144,7 +154,9 @@ class TenantLLMService(CommonService): elif llm_type == LLMType.CHAT.value: mdlnm = tenant.llm_id if not llm_name else llm_name elif llm_type == LLMType.RERANK: - mdlnm = tenant.llm_id if not llm_name else llm_name + mdlnm = tenant.rerank_id if not llm_name else llm_name + elif llm_type == LLMType.TTS: + mdlnm = tenant.tts_id if not llm_name else llm_name else: assert False, "LLM type error" diff --git a/conf/llm_factories.json b/conf/llm_factories.json index 4af341ee2..289dcffa1 100644 --- a/conf/llm_factories.json +++ b/conf/llm_factories.json @@ -3214,6 +3214,13 @@ "tags": "LLM", "status": "1", "llm": [] + }, + { + "name": "Fish Audio", + "logo": "", + "tags": "TTS", + "status": "1", + "llm": [] } ] } diff --git a/rag/llm/__init__.py b/rag/llm/__init__.py index 589e7d85d..73fad7cdd 100644 --- a/rag/llm/__init__.py +++ b/rag/llm/__init__.py @@ -18,6 +18,7 @@ from .chat_model import * from .cv_model import * from .rerank_model import * from .sequence2txt_model import * +from .tts_model import * EmbeddingModel = { "Ollama": OllamaEmbed, @@ -129,3 +130,7 @@ Seq2txtModel = { 
"Azure-OpenAI": AzureSeq2txt, "Xinference": XinferenceSeq2txt } + +TTSModel = { + "Fish Audio": FishAudioTTS +} \ No newline at end of file diff --git a/rag/llm/tts_model.py b/rag/llm/tts_model.py new file mode 100644 index 000000000..731725e54 --- /dev/null +++ b/rag/llm/tts_model.py @@ -0,0 +1,94 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +from typing import Annotated, Literal +from abc import ABC +import httpx +import ormsgpack +from pydantic import BaseModel, conint +from rag.utils import num_tokens_from_string +import json + + +class ServeReferenceAudio(BaseModel): + audio: bytes + text: str + + +class ServeTTSRequest(BaseModel): + text: str + chunk_length: Annotated[int, conint(ge=100, le=300, strict=True)] = 200 + # Audio format + format: Literal["wav", "pcm", "mp3"] = "mp3" + mp3_bitrate: Literal[64, 128, 192] = 128 + # References audios for in-context learning + references: list[ServeReferenceAudio] = [] + # Reference id + # For example, if you want use https://fish.audio/m/7f92f8afb8ec43bf81429cc1c9199cb1/ + # Just pass 7f92f8afb8ec43bf81429cc1c9199cb1 + reference_id: str | None = None + # Normalize text for en & zh, this increase stability for numbers + normalize: bool = True + # Balance mode will reduce latency to 300ms, but may decrease stability + latency: Literal["normal", "balanced"] = "normal" + + +class Base(ABC): + def __init__(self, key, model_name, base_url): + pass + + 
def transcription(self, audio): + pass + + +class FishAudioTTS(Base): + def __init__(self, key, model_name, base_url="https://api.fish.audio/v1/tts"): + if not base_url: + base_url = "https://api.fish.audio/v1/tts" + key = json.loads(key) + self.headers = { + "api-key": key.get("fish_audio_ak"), + "content-type": "application/msgpack", + } + self.ref_id = key.get("fish_audio_refid") + self.base_url = base_url + + def transcription(self, text): + from http import HTTPStatus + + request = request = ServeTTSRequest(text=text, reference_id=self.ref_id) + + with httpx.Client() as client: + try: + with client.stream( + method="POST", + url=self.base_url, + content=ormsgpack.packb( + request, option=ormsgpack.OPT_SERIALIZE_PYDANTIC + ), + headers=self.headers, + timeout=None, + ) as response: + if response.status_code == HTTPStatus.OK: + for chunk in response.iter_bytes(): + yield chunk + else: + response.raise_for_status() + + yield num_tokens_from_string(text) + + except httpx.HTTPStatusError as e: + raise RuntimeError(f"**ERROR**: {e}") diff --git a/requirements.txt b/requirements.txt index ee98fd1f4..094e20151 100644 --- a/requirements.txt +++ b/requirements.txt @@ -47,6 +47,7 @@ openai==1.12.0 opencv_python==4.9.0.80 opencv_python_headless==4.9.0.80 openpyxl==3.1.2 +ormsgpack==1.5.0 pandas==2.2.2 pdfplumber==0.10.4 peewee==3.17.1 diff --git a/requirements_arm.txt b/requirements_arm.txt index f96c98fc5..57e795cfd 100644 --- a/requirements_arm.txt +++ b/requirements_arm.txt @@ -74,6 +74,7 @@ ollama==0.1.9 openai==1.12.0 opencv-python==4.9.0.80 openpyxl==3.1.2 +ormsgpack==1.5.0 packaging==23.2 pandas==2.2.1 pdfminer.six==20221105 diff --git a/web/src/assets/svg/llm/fish-audio.svg b/web/src/assets/svg/llm/fish-audio.svg new file mode 100644 index 000000000..ec44029a9 --- /dev/null +++ b/web/src/assets/svg/llm/fish-audio.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/web/src/constants/knowledge.ts b/web/src/constants/knowledge.ts index 586a15bcc..fe82e5e59 
100644 --- a/web/src/constants/knowledge.ts +++ b/web/src/constants/knowledge.ts @@ -48,6 +48,7 @@ export enum LlmModelType { Image2text = 'image2text', Speech2text = 'speech2text', Rerank = 'rerank', + TTS = 'tts', } export enum KnowledgeSearchParams { diff --git a/web/src/hooks/llm-hooks.ts b/web/src/hooks/llm-hooks.ts index a270c151a..7fba686e4 100644 --- a/web/src/hooks/llm-hooks.ts +++ b/web/src/hooks/llm-hooks.ts @@ -87,6 +87,7 @@ export const useSelectLlmOptionsByModelType = () => { LlmModelType.Speech2text, ), [LlmModelType.Rerank]: groupOptionsByModelType(LlmModelType.Rerank), + [LlmModelType.TTS]: groupOptionsByModelType(LlmModelType.TTS), }; }; diff --git a/web/src/interfaces/database/knowledge.ts b/web/src/interfaces/database/knowledge.ts index 017ab6b1a..3fc103678 100644 --- a/web/src/interfaces/database/knowledge.ts +++ b/web/src/interfaces/database/knowledge.ts @@ -71,6 +71,7 @@ export interface ITenantInfo { tenant_id: string; chat_id: string; speech2text_id: string; + tts_id: string; } export interface IChunk { diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index 093687216..538dae4a5 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -490,6 +490,9 @@ The above is the content you need to summarize.`, 'The default ASR model all the newly created knowledgebase will use. 
Use this model to translate voices to corresponding text.', rerankModel: 'Rerank Model', rerankModelTip: `The default rerank model is used to rerank chunks retrieved by users' questions.`, + ttsModel: 'TTS Model', + ttsModelTip: + 'The default TTS model will be used to generate speech during conversations upon request.', workspace: 'Workspace', upgrade: 'Upgrade', addLlmTitle: 'Add LLM', @@ -502,6 +505,7 @@ The above is the content you need to summarize.`, baseUrlNameMessage: 'Please input your base url!', vision: 'Does it support Vision?', ollamaLink: 'How to integrate {{name}}', + FishAudioLink: 'How to use FishAudio', volcModelNameMessage: 'Please input your model name!', addEndpointID: 'EndpointID of the model', endpointIDMessage: 'Please input your EndpointID of the model', @@ -533,6 +537,13 @@ The above is the content you need to summarize.`, yiyanAKMessage: 'Please input your API KEY', addyiyanSK: 'yiyan Secret KEY', yiyanSKMessage: 'Please input your Secret KEY', + FishAudioModelNameMessage: + 'Please give your speech synthesis model a name', + addFishAudioAK: 'Fish Audio API KEY', + addFishAudioAKMessage: 'Please input your API KEY', + addFishAudioRefID: 'FishAudio Refrence ID', + addFishAudioRefIDMessage: + 'Please input the Reference ID (leave blank to use the default model).', }, message: { registered: 'Registered!', diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts index 55adfc7db..62777e66d 100644 --- a/web/src/locales/zh-traditional.ts +++ b/web/src/locales/zh-traditional.ts @@ -443,6 +443,8 @@ export default { systemModelSettings: '系統模型設置', chatModel: '聊天模型', chatModelTip: '所有新創建的知識庫都會使用默認的聊天LLM。', + ttsModel: '語音合成模型', + ttsModelTip: '默認的tts模型會被用於在對話過程中請求語音生成時使用。', embeddingModel: '嵌入模型', embeddingModelTip: '所有新創建的知識庫都將使用的默認嵌入模型。', img2txtModel: 'img2Txt模型', @@ -465,6 +467,7 @@ export default { modelTypeMessage: '請輸入模型類型!', baseUrlNameMessage: '請輸入基礎 Url!', ollamaLink: '如何集成 {{name}}', + FishAudioLink: '如何使用Fish 
Audio', volcModelNameMessage: '請輸入模型名稱!', addEndpointID: '模型 EndpointID', endpointIDMessage: '請輸入模型對應的EndpointID', @@ -496,6 +499,10 @@ export default { yiyanAKMessage: '請輸入 API KEY', addyiyanSK: '一言 Secret KEY', yiyanSKMessage: '請輸入 Secret KEY', + addFishAudioAK: 'Fish Audio API KEY', + addFishAudioAKMessage: '請輸入 API KEY', + addFishAudioRefID: 'FishAudio Refrence ID', + addFishAudioRefIDMessage: '請輸入引用模型的ID(留空表示使用默認模型)', }, message: { registered: '註冊成功', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 72b966629..2275488be 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -460,6 +460,8 @@ export default { systemModelSettings: '系统模型设置', chatModel: '聊天模型', chatModelTip: '所有新创建的知识库都会使用默认的聊天LLM。', + ttsModel: 'TTS模型', + ttsModelTip: '默认的tts模型会被用于在对话过程中请求语音生成时使用', embeddingModel: '嵌入模型', embeddingModelTip: '所有新创建的知识库都将使用的默认嵌入模型。', img2txtModel: 'Img2txt模型', @@ -482,6 +484,7 @@ export default { modelTypeMessage: '请输入模型类型!', baseUrlNameMessage: '请输入基础 Url!', ollamaLink: '如何集成 {{name}}', + FishAudioLink: '如何使用Fish Audio', volcModelNameMessage: '请输入模型名称!', addEndpointID: '模型 EndpointID', endpointIDMessage: '请输入模型对应的EndpointID', @@ -513,6 +516,10 @@ export default { yiyanAKMessage: '请输入 API KEY', addyiyanSK: '一言 Secret KEY', yiyanSKMessage: '请输入 Secret KEY', + addFishAudioAK: 'Fish Audio API KEY', + FishAudioAKMessage: '请输入 API KEY', + addFishAudioRefID: 'FishAudio Refrence ID', + FishAudioRefIDMessage: '请输入引用模型的ID(留空表示使用默认模型)', }, message: { registered: '注册成功', diff --git a/web/src/pages/user-setting/setting-model/constant.ts b/web/src/pages/user-setting/setting-model/constant.ts index 443e9fb3e..4cb4ed9b3 100644 --- a/web/src/pages/user-setting/setting-model/constant.ts +++ b/web/src/pages/user-setting/setting-model/constant.ts @@ -35,6 +35,7 @@ export const IconMap = { 'Tencent Hunyuan': 'hunyuan', 'XunFei Spark': 'spark', BaiduYiyan: 'yiyan', + 'Fish Audio': 'fish-audio', }; export const BedrockRegionList = [ diff --git 
a/web/src/pages/user-setting/setting-model/fish-audio-modal/index.tsx b/web/src/pages/user-setting/setting-model/fish-audio-modal/index.tsx new file mode 100644 index 000000000..af82f9123 --- /dev/null +++ b/web/src/pages/user-setting/setting-model/fish-audio-modal/index.tsx @@ -0,0 +1,101 @@ +import { useTranslate } from '@/hooks/common-hooks'; +import { IModalProps } from '@/interfaces/common'; +import { IAddLlmRequestBody } from '@/interfaces/request/llm'; +import { Flex, Form, Input, Modal, Select, Space } from 'antd'; +import omit from 'lodash/omit'; + +type FieldType = IAddLlmRequestBody & { + fish_audio_ak: string; + fish_audio_refid: string; +}; + +const { Option } = Select; + +const FishAudioModal = ({ + visible, + hideModal, + onOk, + loading, + llmFactory, +}: IModalProps & { llmFactory: string }) => { + const [form] = Form.useForm(); + + const { t } = useTranslate('setting'); + + const handleOk = async () => { + const values = await form.validateFields(); + const modelType = values.model_type; + + const data = { + ...omit(values), + model_type: modelType, + llm_factory: llmFactory, + }; + console.info(data); + + onOk?.(data); + }; + + return ( + { + return ( + + + {t('FishAudioLink')} + + {originNode} + + ); + }} + confirmLoading={loading} + > +
+        <Form.Item<FieldType>
+          label={t('modelType')}
+          name="model_type"
+          initialValue={'tts'}
+          rules={[{ required: true, message: t('modelTypeMessage') }]}
+        >
+          <Select placeholder={t('modelTypeMessage')}>
+            <Option value="tts">tts</Option>
+          </Select>
+        </Form.Item>
+        <Form.Item<FieldType>
+          label={t('modelName')}
+          name="llm_name"
+          rules={[{ required: true, message: t('FishAudioModelNameMessage') }]}
+        >
+          <Input placeholder={t('modelNameMessage')} />
+        </Form.Item>
+        <Form.Item<FieldType>
+          label={t('addFishAudioAK')}
+          name="fish_audio_ak"
+          rules={[{ required: true, message: t('FishAudioAKMessage') }]}
+        >
+          <Input placeholder={t('FishAudioAKMessage')} />
+        </Form.Item>
+        <Form.Item<FieldType>
+          label={t('addFishAudioRefID')}
+          name="fish_audio_refid"
+          rules={[{ required: false, message: t('FishAudioRefIDMessage') }]}
+        >
+          <Input placeholder={t('FishAudioRefIDMessage')} />
+        </Form.Item>
+      </Form>
+    </Modal>
+ ); +}; + +export default FishAudioModal; diff --git a/web/src/pages/user-setting/setting-model/hooks.ts b/web/src/pages/user-setting/setting-model/hooks.ts index 00665d12b..68391fc45 100644 --- a/web/src/pages/user-setting/setting-model/hooks.ts +++ b/web/src/pages/user-setting/setting-model/hooks.ts @@ -244,6 +244,33 @@ export const useSubmityiyan = () => { }; }; +export const useSubmitFishAudio = () => { + const { addLlm, loading } = useAddLlm(); + const { + visible: FishAudioAddingVisible, + hideModal: hideFishAudioAddingModal, + showModal: showFishAudioAddingModal, + } = useSetModalState(); + + const onFishAudioAddingOk = useCallback( + async (payload: IAddLlmRequestBody) => { + const ret = await addLlm(payload); + if (ret === 0) { + hideFishAudioAddingModal(); + } + }, + [hideFishAudioAddingModal, addLlm], + ); + + return { + FishAudioAddingLoading: loading, + onFishAudioAddingOk, + FishAudioAddingVisible, + hideFishAudioAddingModal, + showFishAudioAddingModal, + }; +}; + export const useSubmitBedrock = () => { const { addLlm, loading } = useAddLlm(); const { diff --git a/web/src/pages/user-setting/setting-model/index.tsx b/web/src/pages/user-setting/setting-model/index.tsx index 1536a5a11..809c8b799 100644 --- a/web/src/pages/user-setting/setting-model/index.tsx +++ b/web/src/pages/user-setting/setting-model/index.tsx @@ -30,10 +30,12 @@ import { isLocalLlmFactory } from '../utils'; import ApiKeyModal from './api-key-modal'; import BedrockModal from './bedrock-modal'; import { IconMap } from './constant'; +import FishAudioModal from './fish-audio-modal'; import { useHandleDeleteLlm, useSubmitApiKey, useSubmitBedrock, + useSubmitFishAudio, useSubmitHunyuan, useSubmitOllama, useSubmitSpark, @@ -98,7 +100,8 @@ const ModelCard = ({ item, clickApiKey }: IModelCardProps) => { item.name === 'VolcEngine' || item.name === 'Tencent Hunyuan' || item.name === 'XunFei Spark' || - item.name === 'BaiduYiyan' + item.name === 'BaiduYiyan' || + item.name === 'Fish Audio' ? 
t('addTheModel') : 'API-Key'} @@ -196,6 +199,14 @@ const UserSettingModel = () => { yiyanAddingLoading, } = useSubmityiyan(); + const { + FishAudioAddingVisible, + hideFishAudioAddingModal, + showFishAudioAddingModal, + onFishAudioAddingOk, + FishAudioAddingLoading, + } = useSubmitFishAudio(); + const { bedrockAddingLoading, onBedrockAddingOk, @@ -211,6 +222,7 @@ const UserSettingModel = () => { 'Tencent Hunyuan': showHunyuanAddingModal, 'XunFei Spark': showSparkAddingModal, BaiduYiyan: showyiyanAddingModal, + 'Fish Audio': showFishAudioAddingModal, }), [ showBedrockAddingModal, @@ -218,6 +230,7 @@ const UserSettingModel = () => { showHunyuanAddingModal, showSparkAddingModal, showyiyanAddingModal, + showFishAudioAddingModal, ], ); @@ -350,6 +363,13 @@ const UserSettingModel = () => { loading={yiyanAddingLoading} llmFactory={'BaiduYiyan'} > + label={t('addSparkAPIPassword')} name="spark_api_password" - rules={[{ required: true, message: t('SparkPasswordMessage') }]} + rules={[{ required: true, message: t('SparkAPIPasswordMessage') }]} > - + diff --git a/web/src/pages/user-setting/setting-model/system-model-setting-modal/index.tsx b/web/src/pages/user-setting/setting-model/system-model-setting-modal/index.tsx index 8bc65e8fc..e6517e4ea 100644 --- a/web/src/pages/user-setting/setting-model/system-model-setting-modal/index.tsx +++ b/web/src/pages/user-setting/setting-model/system-model-setting-modal/index.tsx @@ -83,6 +83,13 @@ const SystemModelSettingModal = ({ > + );