# dify/api/tests/integration_tests/model_runtime/gpustack/test_llm.py

import os
from collections.abc import Generator

import pytest

from core.model_runtime.entities.llm_entities import (
    LLMResult,
    LLMResultChunk,
    LLMResultChunkDelta,
)
from core.model_runtime.entities.message_entities import (
    AssistantPromptMessage,
    PromptMessageTool,
    SystemPromptMessage,
    UserPromptMessage,
)
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.gpustack.llm.llm import GPUStackLanguageModel


def test_validate_credentials_for_chat_model():
    """Check credential validation for a chat-mode model.

    Bogus endpoint/key values must raise ``CredentialsValidateFailedError``;
    the values read from ``GPUSTACK_SERVER_URL`` / ``GPUSTACK_API_KEY`` are
    expected to validate without raising.
    """
    llm = GPUStackLanguageModel()
    model_name = "llama-3.2-1b-instruct"

    bad_credentials = {
        "endpoint_url": "invalid_url",
        "api_key": "invalid_api_key",
        "mode": "chat",
    }
    with pytest.raises(CredentialsValidateFailedError):
        llm.validate_credentials(model=model_name, credentials=bad_credentials)

    # Server location and key are supplied by the integration environment.
    env_credentials = {
        "endpoint_url": os.environ.get("GPUSTACK_SERVER_URL"),
        "api_key": os.environ.get("GPUSTACK_API_KEY"),
        "mode": "chat",
    }
    llm.validate_credentials(model=model_name, credentials=env_credentials)


def test_invoke_completion_model():
    """Invoke the model in completion mode (non-streaming).

    The call must return an ``LLMResult`` whose message content is non-empty
    and whose usage reports a positive total token count.
    """
    llm = GPUStackLanguageModel()

    credentials = {
        "endpoint_url": os.environ.get("GPUSTACK_SERVER_URL"),
        "api_key": os.environ.get("GPUSTACK_API_KEY"),
        "mode": "completion",
    }
    prompt = [UserPromptMessage(content="ping")]
    sampling = {"temperature": 0.7, "top_p": 1.0, "max_tokens": 10}

    result = llm.invoke(
        model="llama-3.2-1b-instruct",
        credentials=credentials,
        prompt_messages=prompt,
        model_parameters=sampling,
        stop=[],
        stream=False,
        user="abc-123",
    )

    assert isinstance(result, LLMResult)
    content = result.message.content
    assert len(content) > 0
    total_tokens = result.usage.total_tokens
    assert total_tokens > 0


def test_invoke_chat_model():
    """Invoke the model in chat mode (non-streaming).

    The call must return an ``LLMResult`` whose message content is non-empty
    and whose usage reports a positive total token count.
    """
    llm = GPUStackLanguageModel()

    credentials = {
        "endpoint_url": os.environ.get("GPUSTACK_SERVER_URL"),
        "api_key": os.environ.get("GPUSTACK_API_KEY"),
        "mode": "chat",
    }
    prompt = [UserPromptMessage(content="ping")]
    sampling = {"temperature": 0.7, "top_p": 1.0, "max_tokens": 10}

    result = llm.invoke(
        model="llama-3.2-1b-instruct",
        credentials=credentials,
        prompt_messages=prompt,
        model_parameters=sampling,
        stop=[],
        stream=False,
        user="abc-123",
    )

    assert isinstance(result, LLMResult)
    content = result.message.content
    assert len(content) > 0
    total_tokens = result.usage.total_tokens
    assert total_tokens > 0


def test_invoke_stream_chat_model():
    """Invoke the model in chat mode with streaming enabled.

    The call must return a generator. Every yielded item must be an
    ``LLMResultChunk`` carrying an ``LLMResultChunkDelta`` with an
    ``AssistantPromptMessage``; chunks without a ``finish_reason`` must have
    non-empty content. The stream must also yield at least one chunk, so an
    empty generator cannot pass the per-chunk checks vacuously.
    """
    model = GPUStackLanguageModel()

    response = model.invoke(
        model="llama-3.2-1b-instruct",
        credentials={
            "endpoint_url": os.environ.get("GPUSTACK_SERVER_URL"),
            "api_key": os.environ.get("GPUSTACK_API_KEY"),
            "mode": "chat",
        },
        prompt_messages=[UserPromptMessage(content="Hello World!")],
        model_parameters={"temperature": 0.7, "top_p": 1.0, "max_tokens": 10},
        stop=["you"],
        stream=True,
        user="abc-123",
    )

    assert isinstance(response, Generator)

    chunk_count = 0
    for chunk in response:
        chunk_count += 1
        assert isinstance(chunk, LLMResultChunk)
        assert isinstance(chunk.delta, LLMResultChunkDelta)
        assert isinstance(chunk.delta.message, AssistantPromptMessage)
        # Only chunks without a finish_reason are required to carry content.
        if chunk.delta.finish_reason is None:
            assert len(chunk.delta.message.content) > 0

    # Without this, a stream that yields nothing would pass silently.
    assert chunk_count > 0, "streaming response yielded no chunks"


def test_get_num_tokens():
    """Check the token counts reported by ``get_num_tokens``.

    Two prompts are counted: a system + user message pair with one tool
    definition (expected 80 tokens) and a lone user message (expected 10).
    """

    def _chat_credentials():
        # Build a fresh dict per call so the two invocations share no state.
        return {
            "endpoint_url": os.environ.get("GPUSTACK_SERVER_URL"),
            "api_key": os.environ.get("GPUSTACK_API_KEY"),
            "mode": "chat",
        }

    llm = GPUStackLanguageModel()

    # Case 1: system + user messages together with a tool definition.
    weather_tool = PromptMessageTool(
        name="get_current_weather",
        description="Get the current weather in a given location",
        parameters={
            "type": "object",
            "properties": {
                "location": {
                    "type": "string",
                    "description": "The city and state e.g. San Francisco, CA",
                },
                "unit": {"type": "string", "enum": ["c", "f"]},
            },
            "required": ["location"],
        },
    )
    with_tools = llm.get_num_tokens(
        model="????",
        credentials=_chat_credentials(),
        prompt_messages=[
            SystemPromptMessage(content="You are a helpful AI assistant."),
            UserPromptMessage(content="Hello World!"),
        ],
        tools=[weather_tool],
    )

    assert isinstance(with_tools, int)
    assert with_tools == 80

    # Case 2: a single user message and no tools.
    plain = llm.get_num_tokens(
        model="????",
        credentials=_chat_credentials(),
        prompt_messages=[UserPromptMessage(content="Hello World!")],
    )

    assert isinstance(plain, int)
    assert plain == 10