feat: add support of speech2text function for OpenAI-API-compatible and Siliconflow (#7197)

2024-08-12 20:29:02 +08:00 · 2024-08-12 21:38:59 +08:00 · 2024-08-12 21:38:59 +08:00 · a12ddc47e7
commit a12ddc47e7
parent 57ce8449b0
10 changed files with 231 additions and 0 deletions
--- a/api/core/model_runtime/model_providers/openai_api_compatible/openai_api_compatible.yaml
+++ b/api/core/model_runtime/model_providers/openai_api_compatible/openai_api_compatible.yaml
@ -7,6 +7,7 @@ description:
 supported_model_types:
  - llm
  - text-embedding
  - speech2text
 configurate_methods:
  - customizable-model
 model_credential_schema:
@ -61,6 +62,22 @@ model_credential_schema:
        zh_Hans: 模型上下文长度
        en_US: Model context size
      required: true
      show_on:
        - variable: __model_type
          value: llm
      type: text-input
      default: '4096'
      placeholder:
        zh_Hans: 在此输入您的模型上下文长度
        en_US: Enter your Model context size
    - variable: context_size
      label:
        zh_Hans: 模型上下文长度
        en_US: Model context size
      required: true
      show_on:
        - variable: __model_type
          value: text-embedding
      type: text-input
      default: '4096'
      placeholder:
--- a/api/core/model_runtime/model_providers/openai_api_compatible/speech2text/init.py
+++ b/api/core/model_runtime/model_providers/openai_api_compatible/speech2text/init.py
--- a/api/core/model_runtime/model_providers/openai_api_compatible/speech2text/speech2text.py
+++ b/api/core/model_runtime/model_providers/openai_api_compatible/speech2text/speech2text.py
@ -0,0 +1,63 @@
 from typing import IO, Optional
 from urllib.parse import urljoin
 import requests
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.speech2text_model import Speech2TextModel
 from core.model_runtime.model_providers.openai_api_compatible._common import _CommonOAI_API_Compat
 class OAICompatSpeech2TextModel(_CommonOAI_API_Compat, Speech2TextModel):
     """
     Model class for OpenAI Compatible Speech to text model.

     Sends the audio file as a multipart form upload to the
     ``audio/transcriptions`` route below the configured ``endpoint_url``
     and returns the ``text`` field of the JSON response.
     """

     # (connect, read) timeout in seconds passed to ``requests.post``.
     # Without a timeout ``requests`` waits forever on a stalled endpoint;
     # the read timeout is generous because transcription can be slow.
     _REQUEST_TIMEOUT = (10, 300)

     def _invoke(
         self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None
     ) -> str:
         """
         Invoke speech2text model

         :param model: model name
         :param credentials: model credentials; reads ``endpoint_url``
             (required) and ``api_key`` (optional)
         :param file: audio file
         :param user: unique user id (accepted for interface compatibility;
             not sent to the endpoint)
         :return: text for given audio file
         :raises InvokeBadRequestError: if ``endpoint_url`` is missing or the
             endpoint answers with a non-200 status code
         """
         headers = {}

         # The Authorization header is only sent when a key is configured.
         api_key = credentials.get("api_key")
         if api_key:
             headers["Authorization"] = f"Bearer {api_key}"

         endpoint_url = credentials.get("endpoint_url")
         if not endpoint_url:
             # Fail with a clear message instead of AttributeError on None.
             raise InvokeBadRequestError("endpoint_url is required in credentials")

         # urljoin replaces the last path segment unless the base ends with
         # "/", so normalise the base first to keep e.g. the "/v1" prefix.
         if not endpoint_url.endswith("/"):
             endpoint_url += "/"
         endpoint_url = urljoin(endpoint_url, "audio/transcriptions")

         payload = {"model": model}
         files = [("file", file)]
         response = requests.post(
             endpoint_url,
             headers=headers,
             data=payload,
             files=files,
             timeout=self._REQUEST_TIMEOUT,
         )

         if response.status_code != 200:
             raise InvokeBadRequestError(response.text)

         response_data = response.json()
         return response_data["text"]

     def validate_credentials(self, model: str, credentials: dict) -> None:
         """
         Validate model credentials

         Runs one real transcription of the bundled demo audio file; any
         failure is reported as invalid credentials.

         :param model: model name
         :param credentials: model credentials
         :return:
         :raises CredentialsValidateFailedError: if the demo invocation fails
             for any reason
         """
         try:
             audio_file_path = self._get_demo_file_path()

             with open(audio_file_path, "rb") as audio_file:
                 self._invoke(model, credentials, audio_file)
         except Exception as ex:
             # Chain the cause so the original traceback is preserved.
             raise CredentialsValidateFailedError(str(ex)) from ex
--- a/api/core/model_runtime/model_providers/siliconflow/siliconflow.py
+++ b/api/core/model_runtime/model_providers/siliconflow/siliconflow.py
@ -6,6 +6,7 @@ from core.model_runtime.model_providers.__base.model_provider import ModelProvid
 logger = logging.getLogger(__name__)
 class SiliconflowProvider(ModelProvider):
    def validate_provider_credentials(self, credentials: dict) -> None:
--- a/api/core/model_runtime/model_providers/siliconflow/siliconflow.yaml
+++ b/api/core/model_runtime/model_providers/siliconflow/siliconflow.yaml
@ -16,6 +16,7 @@ help:
 supported_model_types:
  - llm
  - text-embedding
  - speech2text
 configurate_methods:
  - predefined-model
 provider_credential_schema:
--- a/api/core/model_runtime/model_providers/siliconflow/speech2text/init.py
+++ b/api/core/model_runtime/model_providers/siliconflow/speech2text/init.py
--- a/api/core/model_runtime/model_providers/siliconflow/speech2text/sense-voice-small.yaml
+++ b/api/core/model_runtime/model_providers/siliconflow/speech2text/sense-voice-small.yaml
@ -0,0 +1,5 @@
 # Predefined speech-to-text model entry for the Siliconflow provider.
 # NOTE(review): `model` is presumably the identifier sent as the "model"
 # form field of the transcription request — confirm against the loader.
 model: iic/SenseVoiceSmall
 model_type: speech2text
 model_properties:
  # NOTE(review): unit is not stated in this file (presumably MB) — confirm.
  file_upload_limit: 1
  # Comma-separated list of accepted audio file extensions.
  supported_file_extensions: mp3,wav
--- a/api/core/model_runtime/model_providers/siliconflow/speech2text/speech2text.py
+++ b/api/core/model_runtime/model_providers/siliconflow/speech2text/speech2text.py
@ -0,0 +1,32 @@
 from typing import IO, Optional
 from core.model_runtime.model_providers.openai_api_compatible.speech2text.speech2text import OAICompatSpeech2TextModel
 class SiliconflowSpeech2TextModel(OAICompatSpeech2TextModel):
     """
     Model class for Siliconflow Speech to text model.

     Reuses the OpenAI-compatible implementation and pins the endpoint URL
     to the Siliconflow API before every call.
     """

     def _invoke(
         self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None
     ) -> str:
         """
         Invoke speech2text model

         :param model: model name
         :param credentials: model credentials; ``endpoint_url`` is
             overwritten in place with the Siliconflow endpoint
         :param file: audio file
         :param user: unique user id
         :return: text for given audio file
         """
         self._add_custom_parameters(credentials)
         # Forward ``user`` instead of silently dropping it, so the parent
         # implementation sees the same arguments the caller passed.
         return super()._invoke(model, credentials, file, user)

     def validate_credentials(self, model: str, credentials: dict) -> None:
         """
         Validate model credentials against the Siliconflow endpoint

         :param model: model name
         :param credentials: model credentials; ``endpoint_url`` is
             overwritten in place with the Siliconflow endpoint
         :return:
         """
         self._add_custom_parameters(credentials)
         return super().validate_credentials(model, credentials)

     @classmethod
     def _add_custom_parameters(cls, credentials: dict) -> None:
         """
         Set ``endpoint_url`` in *credentials* to the Siliconflow API base URL

         Mutates the given dict; any existing ``endpoint_url`` is replaced.

         :param credentials: model credentials
         :return:
         """
         credentials["endpoint_url"] = "https://api.siliconflow.cn/v1"
--- a/api/tests/integration_tests/model_runtime/openai_api_compatible/test_speech2text.py
+++ b/api/tests/integration_tests/model_runtime/openai_api_compatible/test_speech2text.py
@ -0,0 +1,59 @@
 import os
 import pytest
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.openai_api_compatible.speech2text.speech2text import (
    OAICompatSpeech2TextModel,
 )
 def test_validate_credentials():
     """Reject a bogus key, then accept the key taken from OPENAI_API_KEY."""
     speech_model = OAICompatSpeech2TextModel()

     # An invalid key must surface as a credentials validation failure.
     bad_credentials = {
         "api_key": "invalid_key",
         "endpoint_url": "https://api.openai.com/v1/",
     }
     with pytest.raises(CredentialsValidateFailedError):
         speech_model.validate_credentials(model="whisper-1", credentials=bad_credentials)

     # A real key from the environment must validate without raising.
     good_credentials = {
         "api_key": os.environ.get("OPENAI_API_KEY"),
         "endpoint_url": "https://api.openai.com/v1/",
     }
     speech_model.validate_credentials(model="whisper-1", credentials=good_credentials)
 def test_invoke_model():
     """Transcribe the shared sample audio and compare with the known text."""
     speech_model = OAICompatSpeech2TextModel()

     # The sample lives in the "assets" directory next to this test package.
     package_dir = os.path.dirname(os.path.abspath(__file__))
     sample_path = os.path.join(os.path.dirname(package_dir), "assets", "audio.mp3")

     request_credentials = {
         "api_key": os.environ.get("OPENAI_API_KEY"),
         "endpoint_url": "https://api.openai.com/v1/",
     }

     with open(sample_path, "rb") as audio_file:
         result = speech_model.invoke(
             model="whisper-1",
             credentials=request_credentials,
             file=audio_file,
             user="abc-123",
         )

         assert isinstance(result, str)
         assert result == '1, 2, 3, 4, 5, 6, 7, 8, 9, 10'
--- a/api/tests/integration_tests/model_runtime/siliconflow/test_speech2text.py
+++ b/api/tests/integration_tests/model_runtime/siliconflow/test_speech2text.py
@ -0,0 +1,53 @@
 import os
 import pytest
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.siliconflow.speech2text.speech2text import SiliconflowSpeech2TextModel
 def test_validate_credentials():
     """Reject a bogus key, then accept the key taken from API_KEY."""
     speech_model = SiliconflowSpeech2TextModel()

     # An invalid key must surface as a credentials validation failure.
     bad_credentials = {"api_key": "invalid_key"}
     with pytest.raises(CredentialsValidateFailedError):
         speech_model.validate_credentials(
             model="iic/SenseVoiceSmall",
             credentials=bad_credentials,
         )

     # A real key from the environment must validate without raising.
     good_credentials = {"api_key": os.environ.get("API_KEY")}
     speech_model.validate_credentials(
         model="iic/SenseVoiceSmall",
         credentials=good_credentials,
     )
 def test_invoke_model():
     """Transcribe the shared sample audio and compare with the known text."""
     speech_model = SiliconflowSpeech2TextModel()

     # The sample lives in the "assets" directory next to this test package.
     package_dir = os.path.dirname(os.path.abspath(__file__))
     sample_path = os.path.join(os.path.dirname(package_dir), "assets", "audio.mp3")

     request_credentials = {"api_key": os.environ.get("API_KEY")}

     with open(sample_path, "rb") as audio_file:
         result = speech_model.invoke(
             model="iic/SenseVoiceSmall",
             credentials=request_credentials,
             file=audio_file,
         )

         assert isinstance(result, str)
         assert result == '1,2,3,4,5,6,7,8,9,10.'