Added Llama 3.2 Vision Models Speech2Text Models for Groq (#9479)

2025-08-14 04:25:59 +08:00 · 2024-10-18 03:10:33 -07:00 · 2024-10-18 03:10:33 -07:00 · b92504bebc
commit b92504bebc
parent e0846792d2
8 changed files with 98 additions and 0 deletions
--- a/api/core/model_runtime/model_providers/groq/groq.yaml
+++ b/api/core/model_runtime/model_providers/groq/groq.yaml
@@ -18,6 +18,7 @@ help:
    en_US: https://console.groq.com/
 supported_model_types:
  - llm
+  - speech2text
 configurate_methods:
  - predefined-model
 provider_credential_schema:
--- a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
@@ -0,0 +1,26 @@
+model: llama-3.2-11b-vision-preview
+label:
+  zh_Hans: Llama 3.2 11B Vision (Preview)
+  en_US: Llama 3.2 11B Vision (Preview)
+model_type: llm
+features:
+  - agent-thought
+  - vision
+model_properties:
+  mode: chat
+  context_size: 131072
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: top_p
+    use_template: top_p
+  - name: max_tokens
+    use_template: max_tokens
+    default: 512
+    min: 1
+    max: 8192
+pricing:
+  input: '0.05'
+  output: '0.1'
+  unit: '0.000001'
+  currency: USD
--- a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
@@ -0,0 +1,26 @@
+model: llama-3.2-90b-vision-preview
+label:
+  zh_Hans: Llama 3.2 90B Vision (Preview)
+  en_US: Llama 3.2 90B Vision (Preview)
+model_type: llm
+features:
+  - agent-thought
+  - vision
+model_properties:
+  mode: chat
+  context_size: 131072
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: top_p
+    use_template: top_p
+  - name: max_tokens
+    use_template: max_tokens
+    default: 512
+    min: 1
+    max: 8192
+pricing:
+  input: '0.05'
+  output: '0.1'
+  unit: '0.000001'
+  currency: USD
--- a/api/core/model_runtime/model_providers/groq/speech2text/__init__.py
+++ b/api/core/model_runtime/model_providers/groq/speech2text/__init__.py
--- a/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
+++ b/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
@@ -0,0 +1,5 @@
+model: distil-whisper-large-v3-en
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
--- a/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
+++ b/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
@@ -0,0 +1,30 @@
+from typing import IO, Optional
+
+from core.model_runtime.model_providers.openai_api_compatible.speech2text.speech2text import OAICompatSpeech2TextModel
+
+
+class GroqSpeech2TextModel(OAICompatSpeech2TextModel):
+    """
+    Groq speech-to-text model; reuses the OpenAI-API-compatible implementation with Groq's endpoint URL.
+    """
+
+    def _invoke(self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None) -> str:
+        """
+        Invoke speech2text model after setting Groq's endpoint URL on the credentials
+
+        :param model: model name
+        :param credentials: model credentials; its ``endpoint_url`` key is overwritten in place
+        :param file: audio file
+        :param user: unique user id (accepted here but not forwarded to the parent ``_invoke``)
+        :return: text for given audio file
+        """
+        self._add_custom_parameters(credentials)
+        return super()._invoke(model, credentials, file)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
+        self._add_custom_parameters(credentials)  # set Groq's endpoint_url before delegating to the parent check
+        return super().validate_credentials(model, credentials)
+
+    @classmethod
+    def _add_custom_parameters(cls, credentials: dict) -> None:
+        credentials["endpoint_url"] = "https://api.groq.com/openai/v1"  # mutates the caller's dict in place
--- a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
@@ -0,0 +1,5 @@
+model: whisper-large-v3-turbo
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
--- a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
@@ -0,0 +1,5 @@
+model: whisper-large-v3
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm