From b92504bebcc483cded2af4381c0e4a5eba27a394 Mon Sep 17 00:00:00 2001
From: Tao Wang <74752235+taowang1993@users.noreply.github.com>
Date: Fri, 18 Oct 2024 03:10:33 -0700
Subject: [PATCH] Added Llama 3.2 Vision Models Speech2Text Models for Groq
 (#9479)

---
 .../model_providers/groq/groq.yaml            |  1 +
 .../llm/llama-3.2-11b-vision-preview.yaml     | 26 ++++++++++++++++
 .../llm/llama-3.2-90b-vision-preview.yaml     | 26 ++++++++++++++++
 .../groq/speech2text/__init__.py              |  0
 .../distil-whisper-large-v3-en.yaml           |  5 ++++
 .../groq/speech2text/speech2text.py           | 30 +++++++++++++++++++
 .../speech2text/whisper-large-v3-turbo.yaml   |  5 ++++
 .../groq/speech2text/whisper-large-v3.yaml    |  5 ++++
 8 files changed, 98 insertions(+)
 create mode 100644 api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/__init__.py
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml

diff --git a/api/core/model_runtime/model_providers/groq/groq.yaml b/api/core/model_runtime/model_providers/groq/groq.yaml
index db17cc8bdd..d6534e1bf1 100644
--- a/api/core/model_runtime/model_providers/groq/groq.yaml
+++ b/api/core/model_runtime/model_providers/groq/groq.yaml
@@ -18,6 +18,7 @@ help:
     en_US: https://console.groq.com/
 supported_model_types:
   - llm
+  - speech2text
 configurate_methods:
   - predefined-model
 provider_credential_schema:
diff --git a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
new file mode 100644
index 0000000000..5632218797
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
@@ -0,0 +1,26 @@
+model: llama-3.2-11b-vision-preview
+label:
+  zh_Hans: Llama 3.2 11B Vision (Preview)
+  en_US: Llama 3.2 11B Vision (Preview)
+model_type: llm
+features:
+  - agent-thought
+  - vision
+model_properties:
+  mode: chat
+  context_size: 131072
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: top_p
+    use_template: top_p
+  - name: max_tokens
+    use_template: max_tokens
+    default: 512
+    min: 1
+    max: 8192
+pricing:
+  input: '0.05'
+  output: '0.1'
+  unit: '0.000001'
+  currency: USD
diff --git a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
new file mode 100644
index 0000000000..e7b93101e8
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
@@ -0,0 +1,26 @@
+model: llama-3.2-90b-vision-preview
+label:
+  zh_Hans: Llama 3.2 90B Vision (Preview)
+  en_US: Llama 3.2 90B Vision (Preview)
+model_type: llm
+features:
+  - agent-thought
+  - vision
+model_properties:
+  mode: chat
+  context_size: 131072
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: top_p
+    use_template: top_p
+  - name: max_tokens
+    use_template: max_tokens
+    default: 512
+    min: 1
+    max: 8192
+pricing:
+  input: '0.05'  # NOTE(review): pricing block is identical to the 11B model - confirm against Groq's price list
+  output: '0.1'
+  unit: '0.000001'
+  currency: USD
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/__init__.py b/api/core/model_runtime/model_providers/groq/speech2text/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml b/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
new file mode 100644
index 0000000000..202d006a66
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
@@ -0,0 +1,5 @@
+model: distil-whisper-large-v3-en
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py b/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
new file mode 100644
index 0000000000..75feeb9cb9
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
@@ -0,0 +1,30 @@
+from typing import IO, Optional
+
+from core.model_runtime.model_providers.openai_api_compatible.speech2text.speech2text import OAICompatSpeech2TextModel
+
+
+class GroqSpeech2TextModel(OAICompatSpeech2TextModel):
+    """
+    Model class for Groq Speech to text model.
+    """
+
+    def _invoke(self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None) -> str:
+        """
+        Invoke speech2text model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param file: audio file
+        :param user: unique user id
+        :return: text for given audio file
+        """
+        self._add_custom_parameters(credentials)  # set the Groq endpoint_url before delegating to the parent
+        return super()._invoke(model, credentials, file, user)  # forward user instead of silently dropping it
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
+        self._add_custom_parameters(credentials)  # set the Groq endpoint_url, then let the parent validate
+        return super().validate_credentials(model, credentials)
+
+    @classmethod
+    def _add_custom_parameters(cls, credentials: dict) -> None:
+        credentials["endpoint_url"] = "https://api.groq.com/openai/v1"  # mutates the caller's dict in place
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
new file mode 100644
index 0000000000..3882a3f4f2
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
@@ -0,0 +1,5 @@
+model: whisper-large-v3-turbo
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
new file mode 100644
index 0000000000..ed02477d70
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
@@ -0,0 +1,5 @@
+model: whisper-large-v3
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm