From b92504bebcc483cded2af4381c0e4a5eba27a394 Mon Sep 17 00:00:00 2001
From: Tao Wang <74752235+taowang1993@users.noreply.github.com>
Date: Fri, 18 Oct 2024 03:10:33 -0700
Subject: [PATCH] Added Llama 3.2 Vision Models and Speech2Text Models for Groq
 (#9479)

---
 .../model_providers/groq/groq.yaml            |  1 +
 .../llm/llama-3.2-11b-vision-preview.yaml     | 26 ++++++++++++++++
 .../llm/llama-3.2-90b-vision-preview.yaml     | 26 ++++++++++++++++
 .../groq/speech2text/__init__.py              |  0
 .../distil-whisper-large-v3-en.yaml           |  5 ++++
 .../groq/speech2text/speech2text.py           | 30 +++++++++++++++++++
 .../speech2text/whisper-large-v3-turbo.yaml   |  5 ++++
 .../groq/speech2text/whisper-large-v3.yaml    |  5 ++++
 8 files changed, 98 insertions(+)
 create mode 100644 api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/__init__.py
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
 create mode 100644 api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml

diff --git a/api/core/model_runtime/model_providers/groq/groq.yaml b/api/core/model_runtime/model_providers/groq/groq.yaml
index db17cc8bdd..d6534e1bf1 100644
--- a/api/core/model_runtime/model_providers/groq/groq.yaml
+++ b/api/core/model_runtime/model_providers/groq/groq.yaml
@@ -18,6 +18,7 @@ help:
     en_US: https://console.groq.com/
 supported_model_types:
   - llm
+  - speech2text
 configurate_methods:
   - predefined-model
 provider_credential_schema:
diff --git a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
new file mode 100644
index 0000000000..5632218797
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-11b-vision-preview.yaml
@@ -0,0 +1,26 @@
+model: llama-3.2-11b-vision-preview
+label:
+  zh_Hans: Llama 3.2 11B Vision (Preview)
+  en_US: Llama 3.2 11B Vision (Preview)
+model_type: llm
+features:
+  - agent-thought
+  - vision
+model_properties:
+  mode: chat
+  context_size: 131072
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: top_p
+    use_template: top_p
+  - name: max_tokens
+    use_template: max_tokens
+    default: 512
+    min: 1
+    max: 8192
+pricing:
+  input: '0.05'
+  output: '0.1'
+  unit: '0.000001'
+  currency: USD
diff --git a/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
new file mode 100644
index 0000000000..e7b93101e8
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/llm/llama-3.2-90b-vision-preview.yaml
@@ -0,0 +1,26 @@
+model: llama-3.2-90b-vision-preview
+label:
+  zh_Hans: Llama 3.2 90B Vision (Preview)
+  en_US: Llama 3.2 90B Vision (Preview)
+model_type: llm
+features:
+  - agent-thought
+  - vision
+model_properties:
+  mode: chat
+  context_size: 131072
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+  - name: top_p
+    use_template: top_p
+  - name: max_tokens
+    use_template: max_tokens
+    default: 512
+    min: 1
+    max: 8192
+pricing:
+  input: '0.05'  # NOTE(review): same price as the 11B model's YAML — confirm for 90B
+  output: '0.1'
+  unit: '0.000001'
+  currency: USD
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/__init__.py b/api/core/model_runtime/model_providers/groq/speech2text/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml b/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
new file mode 100644
index 0000000000..202d006a66
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/distil-whisper-large-v3-en.yaml
@@ -0,0 +1,5 @@
+model: distil-whisper-large-v3-en
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py b/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
new file mode 100644
index 0000000000..75feeb9cb9
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/speech2text.py
@@ -0,0 +1,30 @@
+from typing import IO, Optional
+
+from core.model_runtime.model_providers.openai_api_compatible.speech2text.speech2text import OAICompatSpeech2TextModel
+
+
+class GroqSpeech2TextModel(OAICompatSpeech2TextModel):
+    """
+    Model class for Groq Speech to text model.
+    """
+
+    def _invoke(self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None) -> str:
+        """
+        Invoke speech2text model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param file: audio file
+        :param user: unique user id
+        :return: text for given audio file
+        """
+        self._add_custom_parameters(credentials)
+        return super()._invoke(model, credentials, file)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
+        self._add_custom_parameters(credentials)
+        return super().validate_credentials(model, credentials)
+
+    @classmethod
+    def _add_custom_parameters(cls, credentials: dict) -> None:
+        credentials["endpoint_url"] = "https://api.groq.com/openai/v1"
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
new file mode 100644
index 0000000000..3882a3f4f2
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3-turbo.yaml
@@ -0,0 +1,5 @@
+model: whisper-large-v3-turbo
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm
diff --git a/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
new file mode 100644
index 0000000000..ed02477d70
--- /dev/null
+++ b/api/core/model_runtime/model_providers/groq/speech2text/whisper-large-v3.yaml
@@ -0,0 +1,5 @@
+model: whisper-large-v3
+model_type: speech2text
+model_properties:
+  file_upload_limit: 1
+  supported_file_extensions: flac,mp3,mp4,mpeg,mpga,m4a,ogg,wav,webm