From 351bbdb36c21c19180ceb12e17b8f0468e8633aa Mon Sep 17 00:00:00 2001
From: Pawel Ochman
Date: Tue, 17 Sep 2024 08:47:30 +0100
Subject: [PATCH 1/6] Added Azure speech service option (UI)

---
 src/lib/components/admin/Settings/Audio.svelte | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte
index 1c114c9dd..5a9e91271 100644
--- a/src/lib/components/admin/Settings/Audio.svelte
+++ b/src/lib/components/admin/Settings/Audio.svelte
@@ -224,6 +224,7 @@
+
@@ -252,6 +253,17 @@
 />
+ {:else if TTS_ENGINE === 'azurespeechservice'}
+
+
+ +
+
{/if}
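The next patch wires this new "azurespeechservice" engine option to Microsoft's Speech SDK. As a point of reference, a minimal standalone version of the SDK flow it uses looks roughly like the sketch below; the subscription key, region, and output file are placeholders rather than values taken from the series, and the calls simply mirror the ones in the diff that follows.

    # Sketch of the SDK-based synthesis used in PATCH 2/6 (later replaced by raw
    # REST calls in PATCH 3/6). Requires: pip install azure-cognitiveservices-speech
    # The key, region, and output path are placeholders.
    import azure.cognitiveservices.speech as speechsdk

    speech_config = speechsdk.SpeechConfig(subscription="<azure-speech-key>", region="uksouth")
    audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False, filename="speech.wav")

    synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
    result = synthesizer.speak_text("Hello from Open WebUI")

    if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
        print("Audio written to speech.wav")
    else:
        print(f"Synthesis failed: {result.reason}")
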
From d6b68f405e4383ea95e20c2bd2bf60b415f316e8 Mon Sep 17 00:00:00 2001
From: Pawel Ochman
Date: Tue, 17 Sep 2024 09:13:10 +0100
Subject: [PATCH 2/6] added azure speech service support

---
 backend/open_webui/apps/audio/main.py | 24 ++++++++++++++++++++++++
 backend/requirements.txt | 2 ++
 2 files changed, 26 insertions(+)

diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py
index 8f643ffd3..54b5e7d79 100644
--- a/backend/open_webui/apps/audio/main.py
+++ b/backend/open_webui/apps/audio/main.py
@@ -301,6 +301,30 @@ async def speech(request: Request, user=Depends(get_verified_user)):
                 detail=error_detail,
             )
 
+    elif app.state.config.TTS_ENGINE == "azurespeechservice":
+        payload = None
+        try:
+            payload = json.loads(body.decode("utf-8"))
+        except Exception as e:
+            log.exception(e)
+            raise HTTPException(status_code=400, detail="Invalid JSON payload")
+
+        import azure.cognitiveservices.speech as speechsdk
+
+        config = speechsdk.SpeechConfig(subscription=app.state.config.TTS_API_KEY, region="uksouth")
+        speaker_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False, filename=str(file_path))
+
+        client = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=speaker_config)
+        result = client.speak_text(payload["input"])
+
+        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+            return FileResponse(file_path)
+        else:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Error synthesizing speech - {result.reason}")
+
+
 @app.post("/transcriptions")
 def transcribe(
diff --git a/backend/requirements.txt b/backend/requirements.txt
index ba1252f56..6fa289b0a 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -79,6 +79,8 @@ extract_msg
 pydub
 duckduckgo-search~=6.2.11
 
+azure-cognitiveservices-speech==1.40.0
+
 ## Tests
 docker~=7.1.0
 pytest~=8.3.2

From eacb69074e1f5c2a71fa09c8f5079c7ffef7743d Mon Sep 17 00:00:00 2001
From: Pawel Ochman
Date: Wed, 18 Sep 2024 12:24:55 +0100
Subject: [PATCH 3/6] remove dependency and migrate to raw rest calls

---
 backend/open_webui/apps/audio/main.py | 27 ++++++++++++++++++++-------
 backend/requirements.txt | 2 --
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py
index 54b5e7d79..bf6ff15e6 100644
--- a/backend/open_webui/apps/audio/main.py
+++ b/backend/open_webui/apps/audio/main.py
@@ -309,20 +309,33 @@ async def speech(request: Request, user=Depends(get_verified_user)):
             log.exception(e)
             raise HTTPException(status_code=400, detail="Invalid JSON payload")
 
-        import azure.cognitiveservices.speech as speechsdk
+        region = "uksouth"
+        language = "en-GB-SoniaNeural"
+        locale = "en-GB"
+        output_format = "audio-24khz-160kbitrate-mono-mp3"
+        url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"
 
-        config = speechsdk.SpeechConfig(subscription=app.state.config.TTS_API_KEY, region="uksouth")
-        speaker_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=False, filename=str(file_path))
+        headers = {
+            'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY,
+            'Content-Type': 'application/ssml+xml',
+            'X-Microsoft-OutputFormat': output_format
+        }
 
-        client = speechsdk.SpeechSynthesizer(speech_config=config, audio_config=speaker_config)
-        result = client.speak_text(payload["input"])
+        data = f"""
+            {payload["input"]}
+            """
 
-        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
+        response = requests.post(url, headers=headers, data=data)
+
+        if response.status_code == 200:
+            with open(file_path, "wb") as f:
+                f.write(response.content)
             return FileResponse(file_path)
         else:
+            log.error(f"Error synthesizing speech - {response.reason}")
             raise HTTPException(
                 status_code=500,
-                detail=f"Error synthesizing speech - {result.reason}")
+                detail=f"Error synthesizing speech - {response.reason}")
diff --git a/backend/requirements.txt b/backend/requirements.txt
index 6fa289b0a..ba1252f56 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -79,8 +79,6 @@ extract_msg
 pydub
 duckduckgo-search~=6.2.11
 
-azure-cognitiveservices-speech==1.40.0
-
 ## Tests
 docker~=7.1.0
 pytest~=8.3.2

From 4d9677e8082737f9b78e0103e0df409587f7cd81 Mon Sep 17 00:00:00 2001
From: Pawel Ochman
Date: Wed, 18 Sep 2024 14:13:42 +0100
Subject: [PATCH 4/6] Update configuration page, expose all Azure settings through ENV variables

---
 backend/open_webui/apps/audio/main.py | 37 ++++-
 backend/open_webui/config.py | 12 ++
 .../components/admin/Settings/Audio.svelte | 133 ++++++++++++------
 3 files changed, 138 insertions(+), 44 deletions(-)

diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py
index bf6ff15e6..0d389daf2 100644
--- a/backend/open_webui/apps/audio/main.py
+++ b/backend/open_webui/apps/audio/main.py
@@ -19,6 +19,8 @@ from open_webui.config import (
     AUDIO_TTS_OPENAI_API_KEY,
     AUDIO_TTS_SPLIT_ON,
     AUDIO_TTS_VOICE,
+    AUDIO_TTS_AZURE_SPEECH_REGION,
+    AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT,
     CACHE_DIR,
     CORS_ALLOW_ORIGIN,
     WHISPER_MODEL,
@@ -62,6 +64,9 @@ app.state.config.TTS_VOICE = AUDIO_TTS_VOICE
 app.state.config.TTS_API_KEY = AUDIO_TTS_API_KEY
 app.state.config.TTS_SPLIT_ON = AUDIO_TTS_SPLIT_ON
 
+app.state.config.TTS_AZURE_SPEECH_REGION = AUDIO_TTS_AZURE_SPEECH_REGION
+app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT
+
 # setting device type for whisper model
 whisper_device_type = DEVICE_TYPE if DEVICE_TYPE and DEVICE_TYPE == "cuda" else "cpu"
 log.info(f"whisper_device_type: {whisper_device_type}")
@@ -78,6 +83,8 @@ class TTSConfigForm(BaseModel):
     MODEL: str
     VOICE: str
     SPLIT_ON: str
+    AZURE_SPEECH_REGION: str
+    AZURE_SPEECH_OUTPUT_FORMAT: str
 
 
 class STTConfigForm(BaseModel):
@@ -130,6 +137,8 @@ async def get_audio_config(user=Depends(get_admin_user)):
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
             "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
+            "AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -151,6 +160,8 @@ async def update_audio_config(
     app.state.config.TTS_MODEL = form_data.tts.MODEL
     app.state.config.TTS_VOICE = form_data.tts.VOICE
     app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON
+    app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION
+    app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT
 
     app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL
     app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY
@@ -166,6 +177,8 @@ async def update_audio_config(
             "MODEL": app.state.config.TTS_MODEL,
             "VOICE": app.state.config.TTS_VOICE,
             "SPLIT_ON": app.state.config.TTS_SPLIT_ON,
+            "AZURE_SPEECH_REGION": app.state.config.TTS_AZURE_SPEECH_REGION,
+            "AZURE_SPEECH_OUTPUT_FORMAT": app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT,
         },
         "stt": {
             "OPENAI_API_BASE_URL": app.state.config.STT_OPENAI_API_BASE_URL,
@@ -309,10 +322,33 @@ async
def speech(request: Request, user=Depends(get_verified_user)): log.exception(e) raise HTTPException(status_code=400, detail="Invalid JSON payload") - region = "uksouth" - language = "en-GB-SoniaNeural" - locale = "en-GB" - output_format = "audio-24khz-160kbitrate-mono-mp3" + region = app.state.config.TTS_AZURE_SPEECH_REGION + language = app.state.config.TTS_VOICE + locale = "-".join(app.state.config.TTS_VOICE.split("-")[:1]) + output_format = app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1" headers = { @@ -515,6 +528,22 @@ def get_available_voices() -> dict: except Exception: # Avoided @lru_cache with exception pass + elif app.state.config.TTS_ENGINE == "azurespeechservice": + try: + region = app.state.config.TTS_AZURE_SPEECH_REGION + url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list" + headers = { + 'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY + } + + response = requests.get(url, headers=headers) + response.raise_for_status() + voices = response.json() + for voice in voices: + ret[voice['ShortName']] = f"{voice['DisplayName']} ({voice['ShortName']})" + except requests.RequestException as e: + log.error(f"Error fetching voices: {str(e)}") + return ret diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py index 439e82e43..c7c78b8e6 100644 --- a/backend/open_webui/config.py +++ b/backend/open_webui/config.py @@ -1472,3 +1472,15 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig( "audio.tts.split_on", os.getenv("AUDIO_TTS_SPLIT_ON", "punctuation"), ) + +AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig( + "AUDIO_TTS_AZURE_SPEECH_REGION", + "audio.tts.azure_speech_region", + os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "uksouth"), +) + +AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig( + "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", + "audio.tts.azure_speech_output_format", + os.getenv("AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", 'audio-24khz-160kbitrate-mono-mp3'), +) diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 5a9e91271..15db5a62d 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -31,6 +31,8 @@ let TTS_MODEL = ''; let TTS_VOICE = ''; let TTS_SPLIT_ON: TTS_RESPONSE_SPLIT = TTS_RESPONSE_SPLIT.PUNCTUATION; + let TTS_AZURE_SPEECH_REGION = ''; + let TTS_AZURE_SPEECH_OUTPUT_FORMAT = ''; let STT_OPENAI_API_BASE_URL = ''; let STT_OPENAI_API_KEY = ''; @@ -87,7 +89,9 @@ ENGINE: TTS_ENGINE, MODEL: TTS_MODEL, VOICE: TTS_VOICE, - SPLIT_ON: TTS_SPLIT_ON + SPLIT_ON: TTS_SPLIT_ON, + AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION, + AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT, }, stt: { OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL, @@ -120,6 +124,9 @@ TTS_SPLIT_ON = res.tts.SPLIT_ON || TTS_RESPONSE_SPLIT.PUNCTUATION; + TTS_AZURE_SPEECH_OUTPUT_FORMAT = res.tts.AZURE_SPEECH_OUTPUT_FORMAT; + TTS_AZURE_SPEECH_REGION = res.tts.AZURE_SPEECH_REGION; + STT_OPENAI_API_BASE_URL = res.stt.OPENAI_API_BASE_URL; STT_OPENAI_API_KEY = res.stt.OPENAI_API_KEY; @@ -262,6 +269,12 @@ bind:value={TTS_API_KEY} required /> + {/if} @@ -330,48 +343,88 @@ - {:else if TTS_ENGINE === 'elevenlabs'} -
-
-
{$i18n.t('TTS Voice')}
-
-
- - - - {#each voices as voice} - - {/each} - + {:else if TTS_ENGINE === 'elevenlabs'} +
+
+
{$i18n.t('TTS Voice')}
+
+
+ + + + {#each voices as voice} + + {/each} + +
+
+
+
+
{$i18n.t('TTS Model')}
+
+
+ + + + {#each models as model} + +
+
+
+
+ {:else if TTS_ENGINE === 'azurespeechservice'} +
+
+
{$i18n.t('TTS Voice')}
+
+
+ + + + {#each voices as voice} + + {/each} + +
+
+
+
+
+ {$i18n.t('Output format')} + + {$i18n.t('Available list')} + +
+
+
+ +
-
-
{$i18n.t('TTS Model')}
-
-
- - - - {#each models as model} - -
-
-
-
- {/if} + {/if}
From afa42dd2e4266fb4c371b403a6b73839deea4366 Mon Sep 17 00:00:00 2001 From: "Timothy J. Baek" Date: Thu, 19 Sep 2024 02:40:54 +0200 Subject: [PATCH 5/6] refac --- backend/open_webui/apps/audio/main.py | 28 +-- .../components/admin/Settings/Audio.svelte | 169 +++++++++--------- 2 files changed, 100 insertions(+), 97 deletions(-) diff --git a/backend/open_webui/apps/audio/main.py b/backend/open_webui/apps/audio/main.py index 0d389daf2..0eee533bd 100644 --- a/backend/open_webui/apps/audio/main.py +++ b/backend/open_webui/apps/audio/main.py @@ -161,7 +161,9 @@ async def update_audio_config( app.state.config.TTS_VOICE = form_data.tts.VOICE app.state.config.TTS_SPLIT_ON = form_data.tts.SPLIT_ON app.state.config.TTS_AZURE_SPEECH_REGION = form_data.tts.AZURE_SPEECH_REGION - app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT + app.state.config.TTS_AZURE_SPEECH_OUTPUT_FORMAT = ( + form_data.tts.AZURE_SPEECH_OUTPUT_FORMAT + ) app.state.config.STT_OPENAI_API_BASE_URL = form_data.stt.OPENAI_API_BASE_URL app.state.config.STT_OPENAI_API_KEY = form_data.stt.OPENAI_API_KEY @@ -314,7 +316,7 @@ async def speech(request: Request, user=Depends(get_verified_user)): detail=error_detail, ) - elif app.state.config.TTS_ENGINE == "azurespeechservice": + elif app.state.config.TTS_ENGINE == "azure": payload = None try: payload = json.loads(body.decode("utf-8")) @@ -329,9 +331,9 @@ async def speech(request: Request, user=Depends(get_verified_user)): url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1" headers = { - 'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY, - 'Content-Type': 'application/ssml+xml', - 'X-Microsoft-OutputFormat': output_format + "Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY, + "Content-Type": "application/ssml+xml", + "X-Microsoft-OutputFormat": output_format, } data = f""" @@ -347,9 +349,8 @@ async def speech(request: Request, user=Depends(get_verified_user)): else: log.error(f"Error synthesizing speech - {response.reason}") raise HTTPException( - status_code=500, - detail=f"Error synthesizing speech - {response.reason}") - + status_code=500, detail=f"Error synthesizing speech - {response.reason}" + ) @app.post("/transcriptions") @@ -528,23 +529,22 @@ def get_available_voices() -> dict: except Exception: # Avoided @lru_cache with exception pass - elif app.state.config.TTS_ENGINE == "azurespeechservice": + elif app.state.config.TTS_ENGINE == "azure": try: region = app.state.config.TTS_AZURE_SPEECH_REGION url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list" - headers = { - 'Ocp-Apim-Subscription-Key': app.state.config.TTS_API_KEY - } + headers = {"Ocp-Apim-Subscription-Key": app.state.config.TTS_API_KEY} response = requests.get(url, headers=headers) response.raise_for_status() voices = response.json() for voice in voices: - ret[voice['ShortName']] = f"{voice['DisplayName']} ({voice['ShortName']})" + ret[voice["ShortName"]] = ( + f"{voice['DisplayName']} ({voice['ShortName']})" + ) except requests.RequestException as e: log.error(f"Error fetching voices: {str(e)}") - return ret diff --git a/src/lib/components/admin/Settings/Audio.svelte b/src/lib/components/admin/Settings/Audio.svelte index 15db5a62d..040bc5e1a 100644 --- a/src/lib/components/admin/Settings/Audio.svelte +++ b/src/lib/components/admin/Settings/Audio.svelte @@ -91,7 +91,7 @@ VOICE: TTS_VOICE, SPLIT_ON: TTS_SPLIT_ON, AZURE_SPEECH_REGION: TTS_AZURE_SPEECH_REGION, - AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT, + 
AZURE_SPEECH_OUTPUT_FORMAT: TTS_AZURE_SPEECH_OUTPUT_FORMAT }, stt: { OPENAI_API_BASE_URL: STT_OPENAI_API_BASE_URL, @@ -231,7 +231,7 @@ - +
@@ -260,7 +260,7 @@ />
- {:else if TTS_ENGINE === 'azurespeechservice'} + {:else if TTS_ENGINE === 'azure'}
-
+ {/if}
@@ -343,88 +343,91 @@ - {:else if TTS_ENGINE === 'elevenlabs'} -
-
-
{$i18n.t('TTS Voice')}
-
-
- - - - {#each voices as voice} - - {/each} - -
-
-
-
-
{$i18n.t('TTS Model')}
-
-
- - - - {#each models as model} - -
-
-
-
- {:else if TTS_ENGINE === 'azurespeechservice'} -
-
-
{$i18n.t('TTS Voice')}
-
-
- - - - {#each voices as voice} - - {/each} - -
-
-
-
-
- {$i18n.t('Output format')} - - {$i18n.t('Available list')} - -
-
-
- -
+ {:else if TTS_ENGINE === 'elevenlabs'} +
+
+
{$i18n.t('TTS Voice')}
+
+
+ + + + {#each voices as voice} + + {/each} +
- {/if} +
+
{$i18n.t('TTS Model')}
+
+
+ + + + {#each models as model} + +
+
+
+
+ {:else if TTS_ENGINE === 'azure'} +
+
+
{$i18n.t('TTS Voice')}
+
+
+ + + + {#each voices as voice} + + {/each} + +
+
+
+
+
+ {$i18n.t('Output format')} + + {$i18n.t('Available list')} + +
+
+
+ +
+
+
+
+ {/if}
From b4f1a0b5a6c58984bd8162f3497902a0f89e3a13 Mon Sep 17 00:00:00 2001
From: "Timothy J. Baek"
Date: Thu, 19 Sep 2024 02:42:24 +0200
Subject: [PATCH 6/6] refac

---
 backend/open_webui/config.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/backend/open_webui/config.py b/backend/open_webui/config.py
index c7c78b8e6..7ad10ccdc 100644
--- a/backend/open_webui/config.py
+++ b/backend/open_webui/config.py
@@ -1475,12 +1475,14 @@ AUDIO_TTS_SPLIT_ON = PersistentConfig(
 
 AUDIO_TTS_AZURE_SPEECH_REGION = PersistentConfig(
     "AUDIO_TTS_AZURE_SPEECH_REGION",
-    "audio.tts.azure_speech_region",
-    os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "uksouth"),
+    "audio.tts.azure.speech_region",
+    os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus"),
 )
 
 AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT = PersistentConfig(
     "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT",
-    "audio.tts.azure_speech_output_format",
-    os.getenv("AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", 'audio-24khz-160kbitrate-mono-mp3'),
+    "audio.tts.azure.speech_output_format",
+    os.getenv(
+        "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", "audio-24khz-160kbitrate-mono-mp3"
+    ),
 )
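Taken together, the series leaves the backend calling Azure's REST endpoints directly, selected through the "azure" engine value and driven by the new AUDIO_TTS_AZURE_SPEECH_REGION and AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT settings. The standalone sketch below approximates that final flow for verifying a key and region outside Open WebUI; it assumes AUDIO_TTS_API_KEY and AUDIO_TTS_VOICE follow the same environment-variable naming pattern as the two settings added here, and the SSML wrapper is an illustrative assumption rather than markup copied from the patches.

    # Hedged, standalone approximation of the final REST flow (voices list +
    # synthesis). Endpoint URLs and headers come from the patches above; the
    # sample voice, fallback key value, and SSML wrapper are assumptions.
    import os
    import requests

    api_key = os.getenv("AUDIO_TTS_API_KEY", "<azure-speech-key>")
    region = os.getenv("AUDIO_TTS_AZURE_SPEECH_REGION", "eastus")
    output_format = os.getenv(
        "AUDIO_TTS_AZURE_SPEECH_OUTPUT_FORMAT", "audio-24khz-160kbitrate-mono-mp3"
    )
    voice = os.getenv("AUDIO_TTS_VOICE", "en-US-JennyNeural")

    # Voice list, as get_available_voices() fetches it for the "azure" engine.
    voices_url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/voices/list"
    voices = requests.get(voices_url, headers={"Ocp-Apim-Subscription-Key": api_key}).json()
    print(f"{len(voices)} voices available, e.g. {voices[0]['ShortName']}")

    # Synthesis, as the /speech handler performs it after the REST migration.
    tts_url = f"https://{region}.tts.speech.microsoft.com/cognitiveservices/v1"
    headers = {
        "Ocp-Apim-Subscription-Key": api_key,
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": output_format,
    }
    locale = "-".join(voice.split("-")[:2])  # e.g. "en-US" from "en-US-JennyNeural"
    ssml = (
        f'<speak version="1.0" xml:lang="{locale}">'
        f'<voice xml:lang="{locale}" name="{voice}">Hello from Open WebUI</voice>'
        "</speak>"
    )

    response = requests.post(tts_url, headers=headers, data=ssml.encode("utf-8"))
    response.raise_for_status()
    with open("speech.mp3", "wb") as f:
        f.write(response.content)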