Added Llama 3.2 Vision Models and Speech2Text Models for Groq (#9479)

2025-08-14 05:16:00 +08:00 · 2024-10-18 03:10:33 -07:00 · 2024-10-18 03:10:33 -07:00 · b92504bebc
commit b92504bebc
parent e0846792d2
8 changed files with 98 additions and 0 deletions
--- a/api/core/model_runtime/model_providers/groq/groq.yaml
+++ b/api/core/model_runtime/model_providers/groq/groq.yaml
@@ -18,6 +18,7 @@ help:
    en_US: https://console.groq.com/
 supported_model_types:
  - llm
  - speech2text
 configurate_methods:
  - predefined-model
 provider_credential_schema:
--- a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
@@ -0,0 +1,26 @@
 # Model entry for llama-3.2-11b-vision-preview: an `llm`-type model in chat mode
 # that declares the agent-thought and vision features.
 model: llama-3.2-11b-vision-preview
 label:
  zh_Hans: Llama 3.2 11B Vision (Preview)
  en_US: Llama 3.2 11B Vision (Preview)
 model_type: llm
 features:
  - agent-thought
  - vision
 model_properties:
  mode: chat
  # 131072 = 128 * 1024
  context_size: 131072
 parameter_rules:
  # temperature and top_p take everything from their shared templates.
  - name: temperature
    use_template: temperature
  - name: top_p
    use_template: top_p
  # max_tokens starts from the shared template and overrides default/min/max below.
  - name: max_tokens
    use_template: max_tokens
    default: 512
    min: 1
    max: 8192
 pricing:
  # NOTE(review): presumably each price is multiplied by `unit` (0.000001), i.e.
  # quoted in USD per million tokens — confirm against the pricing schema.
  input: '0.05'
  output: '0.1'
  unit: '0.000001'
  currency: USD
--- a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
@@ -0,0 +1,26 @@
 # Model entry for llama-3.2-90b-vision-preview: an `llm`-type model in chat mode
 # that declares the agent-thought and vision features.
 model: llama-3.2-90b-vision-preview
 label:
  zh_Hans: Llama 3.2 90B Vision (Preview)
  en_US: Llama 3.2 90B Vision (Preview)
 model_type: llm
 features:
  - agent-thought
  - vision
 model_properties:
  mode: chat
  # 131072 = 128 * 1024
  context_size: 131072
 parameter_rules:
  # temperature and top_p take everything from their shared templates.
  - name: temperature
    use_template: temperature
  - name: top_p
    use_template: top_p
  # max_tokens starts from the shared template and overrides default/min/max below.
  - name: max_tokens
    use_template: max_tokens
    default: 512
    min: 1
    max: 8192
 pricing:
  # NOTE(review): presumably each price is multiplied by `unit` (0.000001), i.e.
  # quoted in USD per million tokens — confirm against the pricing schema.
  # NOTE(review): input/output prices here equal those in
  # llama-3.2-11b-vision-preview.yaml; verify against the provider's published
  # pricing that they were not carried over unchanged by mistake.
  input: '0.05'
  output: '0.1'
  unit: '0.000001'
  currency: USD
--- a/api/core/model_runtime/model_providers/groq/speech2text/__init__.py
+++ b/api/core/model_runtime/model_providers/groq/speech2text/__init__.py
--- a/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
+++ b/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
@@ -0,0 +1,5 @@
 # Model entry for distil-whisper-large-v3-en, registered as a speech2text model.
 model: distil-whisper-large-v3-en
 model_type: speech2text
 model_properties:
  # NOTE(review): the unit of file_upload_limit is not visible here — presumably
  # megabytes; if so, 1 looks low for audio uploads. Confirm against the schema
  # and the provider's documented upload limit.
  file_upload_limit: 1
  # Comma-separated list of accepted audio/video container extensions.
  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
--- a/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
+++ b/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
@@ -0,0 +1,30 @@
 from typing import IO, Optional
 from core.model_runtime.model_providers.openai_api_compatible.speech2text.speech2text import OAICompatSpeech2TextModel
 class GroqSpeech2TextModel(OAICompatSpeech2TextModel):
     """
     Model class for Groq Speech to text model.

     Each public entry point first pins ``credentials["endpoint_url"]`` to Groq's
     OpenAI-compatible base URL and then delegates to
     ``OAICompatSpeech2TextModel``.
     """

     def _invoke(self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None) -> str:
         """
         Invoke speech2text model

         :param model: model name
         :param credentials: model credentials; ``endpoint_url`` is overwritten in place
         :param file: audio file
         :param user: unique user id
         :return: text for given audio file
         """
         self._add_custom_parameters(credentials)
         # Pass ``user`` through so the parent implementation receives the
         # caller's user id instead of always seeing its default.
         return super()._invoke(model, credentials, file, user=user)

     def validate_credentials(self, model: str, credentials: dict) -> None:
         """
         Validate credentials against the Groq endpoint

         :param model: model name
         :param credentials: model credentials; ``endpoint_url`` is overwritten in place
         :return: whatever the parent ``validate_credentials`` returns
         """
         self._add_custom_parameters(credentials)
         return super().validate_credentials(model, credentials)

     @classmethod
     def _add_custom_parameters(cls, credentials: dict) -> None:
         """
         Set the Groq OpenAI-compatible base URL on ``credentials``

         The dict is mutated in place; any existing ``endpoint_url`` is replaced.

         :param credentials: model credentials
         """
         credentials["endpoint_url"] = "https://api.groq.com/openai/v1"
--- a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
@@ -0,0 +1,5 @@
 # Model entry for whisper-large-v3-turbo, registered as a speech2text model.
 model: whisper-large-v3-turbo
 model_type: speech2text
 model_properties:
  # NOTE(review): the unit of file_upload_limit is not visible here — presumably
  # megabytes; if so, 1 looks low for audio uploads. Confirm against the schema
  # and the provider's documented upload limit.
  file_upload_limit: 1
  # Comma-separated list of accepted audio/video container extensions.
  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
--- a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
@@ -0,0 +1,5 @@
 # Model entry for whisper-large-v3, registered as a speech2text model.
 model: whisper-large-v3
 model_type: speech2text
 model_properties:
  # NOTE(review): the unit of file_upload_limit is not visible here — presumably
  # megabytes; if so, 1 looks low for audio uploads. Confirm against the schema
  # and the provider's documented upload limit.
  file_upload_limit: 1
  # Comma-separated list of accepted audio/video container extensions.
  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm