feat: compress audio

Co-Authored-By: Beck Bekmyradov <47065940+bekmuradov@users.noreply.github.com>
This commit is contained in:
Timothy J. Baek 2024-09-30 00:30:12 +02:00
parent 8206c47a47
commit 7152af949b
3 changed files with 139 additions and 97 deletions

View File

@ -5,6 +5,8 @@ import os
import uuid import uuid
from functools import lru_cache from functools import lru_cache
from pathlib import Path from pathlib import Path
from pydub import AudioSegment
from pydub.silence import split_on_silence
import requests import requests
from open_webui.config import ( from open_webui.config import (
@ -35,7 +37,12 @@ from fastapi import Depends, FastAPI, File, HTTPException, Request, UploadFile,
from fastapi.middleware.cors import CORSMiddleware from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse from fastapi.responses import FileResponse
from pydantic import BaseModel from pydantic import BaseModel
from open_webui.utils.utils import get_admin_user, get_current_user, get_verified_user from open_webui.utils.utils import get_admin_user, get_verified_user
# Constants
# Size threshold for uploaded audio: the transcription endpoint compares the
# saved file's size on disk against MAX_FILE_SIZE and compresses the audio
# (then splits it on silence if it is still too large) when it is exceeded.
# NOTE(review): 25 MB presumably mirrors an external STT upload limit — confirm.
MAX_FILE_SIZE_MB = 25
MAX_FILE_SIZE = MAX_FILE_SIZE_MB * 1024 * 1024 # Convert MB to bytes
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
log.setLevel(SRC_LOG_LEVELS["AUDIO"]) log.setLevel(SRC_LOG_LEVELS["AUDIO"])
@ -353,10 +360,103 @@ async def speech(request: Request, user=Depends(get_verified_user)):
) )
def transcribe(file_path):
    """Transcribe the audio file at ``file_path`` with the configured STT engine.

    The engine is selected by ``app.state.config.STT_ENGINE``:

    * ``""``       -- local transcription through ``faster_whisper``.
    * ``"openai"`` -- upload to the ``/audio/transcriptions`` endpoint under
      ``STT_OPENAI_API_BASE_URL``.

    On success the result is also written as JSON next to the audio file, as
    ``<dir>/<name>.json`` where ``<name>`` is the file name up to its first
    dot.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        For the local engine, ``{"text": <stripped transcript>}``. For the
        OpenAI engine, the decoded JSON body of the response. ``None`` when
        ``STT_ENGINE`` matches neither value (no branch is taken).

    Raises:
        Exception: When the OpenAI request or the handling of its response
            fails. The message is ``"External: ..."`` when the server returned
            a JSON error payload, otherwise a generic connection error.
    """
    print("transcribe", file_path)
    filename = os.path.basename(file_path)
    file_dir = os.path.dirname(file_path)
    # Everything before the first dot names the transcript file.
    file_id = filename.split(".")[0]

    if app.state.config.STT_ENGINE == "":
        # Imported lazily so the dependency is only loaded for local STT.
        from faster_whisper import WhisperModel

        whisper_kwargs = {
            "model_size_or_path": WHISPER_MODEL,
            "device": whisper_device_type,
            "compute_type": "int8",
            "download_root": WHISPER_MODEL_DIR,
            "local_files_only": not WHISPER_MODEL_AUTO_UPDATE,
        }

        log.debug(f"whisper_kwargs: {whisper_kwargs}")

        try:
            model = WhisperModel(**whisper_kwargs)
        except Exception:
            # Initialization failed with the configured settings; retry while
            # allowing the model files to be downloaded.
            log.warning(
                "WhisperModel initialization failed, attempting download with local_files_only=False"
            )
            whisper_kwargs["local_files_only"] = False
            model = WhisperModel(**whisper_kwargs)

        segments, info = model.transcribe(file_path, beam_size=5)
        log.info(
            "Detected language '%s' with probability %f"
            % (info.language, info.language_probability)
        )

        transcript = "".join([segment.text for segment in list(segments)])
        data = {"text": transcript.strip()}

        # save the transcript to a json file
        transcript_file = f"{file_dir}/{file_id}.json"
        with open(transcript_file, "w") as f:
            json.dump(data, f)

        print(data)
        return data
    elif app.state.config.STT_ENGINE == "openai":
        if is_mp4_audio(file_path):
            print("is_mp4_audio")
            os.rename(file_path, file_path.replace(".wav", ".mp4"))
            # Convert MP4 audio file to WAV format
            convert_mp4_to_wav(file_path.replace(".wav", ".mp4"), file_path)

        headers = {"Authorization": f"Bearer {app.state.config.STT_OPENAI_API_KEY}"}
        data = {"model": app.state.config.STT_MODEL}

        # Open the upload inside a context manager so the handle is always
        # released. The open stays outside the ``try`` so a missing file still
        # surfaces as its own error instead of a "connection error".
        with open(file_path, "rb") as audio_file:
            files = {"file": (filename, audio_file)}
            print(files, data)

            r = None
            try:
                r = requests.post(
                    url=f"{app.state.config.STT_OPENAI_API_BASE_URL}/audio/transcriptions",
                    headers=headers,
                    files=files,
                    data=data,
                )

                r.raise_for_status()
                data = r.json()

                # save the transcript to a json file
                transcript_file = f"{file_dir}/{file_id}.json"
                with open(transcript_file, "w") as f:
                    json.dump(data, f)

                print(data)
                return data
            except Exception as e:
                log.exception(e)
                error_detail = "Open WebUI: Server Connection Error"
                if r is not None:
                    try:
                        res = r.json()
                        if "error" in res:
                            error_detail = f"External: {res['error']['message']}"
                    except Exception:
                        error_detail = f"External: {e}"

                # A bare string cannot be raised (that is itself a TypeError);
                # wrap the message so callers receive the real detail.
                raise Exception(error_detail) from e
@app.post("/transcriptions") @app.post("/transcriptions")
def transcribe( def transcription(
file: UploadFile = File(...), file: UploadFile = File(...),
user=Depends(get_current_user), user=Depends(get_verified_user),
): ):
log.info(f"file.content_type: {file.content_type}") log.info(f"file.content_type: {file.content_type}")
@ -368,111 +468,53 @@ def transcribe(
try: try:
ext = file.filename.split(".")[-1] ext = file.filename.split(".")[-1]
id = uuid.uuid4() id = uuid.uuid4()
filename = f"{id}.{ext}" filename = f"{id}.{ext}"
contents = file.file.read()
file_dir = f"{CACHE_DIR}/audio/transcriptions" file_dir = f"{CACHE_DIR}/audio/transcriptions"
os.makedirs(file_dir, exist_ok=True) os.makedirs(file_dir, exist_ok=True)
file_path = f"{file_dir}/{filename}" file_path = f"{file_dir}/{filename}"
print(filename)
contents = file.file.read()
with open(file_path, "wb") as f: with open(file_path, "wb") as f:
f.write(contents) f.write(contents)
f.close()
if app.state.config.STT_ENGINE == "": try:
from faster_whisper import WhisperModel if os.path.getsize(file_path) > MAX_FILE_SIZE: # file is bigger than 25MB
log.debug(f"File size is larger than {MAX_FILE_SIZE_MB}MB")
audio = AudioSegment.from_file(file_path)
audio = audio.set_frame_rate(16000).set_channels(1) # Compress audio
compressed_path = f"{file_dir}/{id}_compressed.opus"
audio.export(compressed_path, format="opus", bitrate="32k")
log.debug(f"Compressed audio to {compressed_path}")
file_path = compressed_path
whisper_kwargs = { if (
"model_size_or_path": WHISPER_MODEL, os.path.getsize(file_path) > MAX_FILE_SIZE
"device": whisper_device_type, ): # Still larger than 25MB after compression
"compute_type": "int8", chunks = split_on_silence(
"download_root": WHISPER_MODEL_DIR, audio, min_silence_len=500, silence_thresh=-40
"local_files_only": not WHISPER_MODEL_AUTO_UPDATE, )
} texts = []
for i, chunk in enumerate(chunks):
log.debug(f"whisper_kwargs: {whisper_kwargs}") chunk_file_path = f"{file_dir}/{id}_chunk{i}.{ext}"
chunk.export(chunk_file_path, format=ext)
try: text = transcribe(chunk_file_path)
model = WhisperModel(**whisper_kwargs) texts.append(text)
except Exception: data = {"text": " ".join(texts)}
log.warning( else:
"WhisperModel initialization failed, attempting download with local_files_only=False" data = transcribe(file_path)
) else:
whisper_kwargs["local_files_only"] = False data = transcribe(file_path)
model = WhisperModel(**whisper_kwargs)
segments, info = model.transcribe(file_path, beam_size=5)
log.info(
"Detected language '%s' with probability %f"
% (info.language, info.language_probability)
)
transcript = "".join([segment.text for segment in list(segments)])
data = {"text": transcript.strip()}
# save the transcript to a json file
transcript_file = f"{file_dir}/{id}.json"
with open(transcript_file, "w") as f:
json.dump(data, f)
print(data)
return data return data
except Exception as e:
elif app.state.config.STT_ENGINE == "openai": log.exception(e)
if is_mp4_audio(file_path): raise HTTPException(
print("is_mp4_audio") status_code=status.HTTP_400_BAD_REQUEST,
os.rename(file_path, file_path.replace(".wav", ".mp4")) detail=ERROR_MESSAGES.DEFAULT(e),
# Convert MP4 audio file to WAV format )
convert_mp4_to_wav(file_path.replace(".wav", ".mp4"), file_path)
headers = {"Authorization": f"Bearer {app.state.config.STT_OPENAI_API_KEY}"}
files = {"file": (filename, open(file_path, "rb"))}
data = {"model": app.state.config.STT_MODEL}
print(files, data)
r = None
try:
r = requests.post(
url=f"{app.state.config.STT_OPENAI_API_BASE_URL}/audio/transcriptions",
headers=headers,
files=files,
data=data,
)
r.raise_for_status()
data = r.json()
# save the transcript to a json file
transcript_file = f"{file_dir}/{id}.json"
with open(transcript_file, "w") as f:
json.dump(data, f)
print(data)
return data
except Exception as e:
log.exception(e)
error_detail = "Open WebUI: Server Connection Error"
if r is not None:
try:
res = r.json()
if "error" in res:
error_detail = f"External: {res['error']['message']}"
except Exception:
error_detail = f"External: {e}"
raise HTTPException(
status_code=r.status_code if r != None else 500,
detail=error_detail,
)
except Exception as e: except Exception as e:
log.exception(e) log.exception(e)

View File

@ -700,7 +700,7 @@
childrenIds: [], childrenIds: [],
role: 'user', role: 'user',
content: userPrompt, content: userPrompt,
files: chatFiles.length > 0 ? chatFiles : undefined, files: _files.length > 0 ? _files : undefined,
timestamp: Math.floor(Date.now() / 1000), // Unix epoch timestamp: Math.floor(Date.now() / 1000), // Unix epoch
models: selectedModels models: selectedModels
}; };

View File

@ -54,7 +54,7 @@
</div> </div>
<div> <div>
<div class="flex flex-col md:flex-row gap-1 justify-between w-full"> <div class="flex flex-col items-center md:flex-row gap-1 justify-between w-full">
<div class=" flex flex-wrap text-sm gap-1 text-gray-500"> <div class=" flex flex-wrap text-sm gap-1 text-gray-500">
{#if file.size} {#if file.size}
<div class="capitalize shrink-0">{formatFileSize(file.size)}</div> <div class="capitalize shrink-0">{formatFileSize(file.size)}</div>