From 34bf2877c8fe566379bac166afee92078e67de73 Mon Sep 17 00:00:00 2001
From: takatost
Date: Fri, 12 Jan 2024 19:19:12 +0800
Subject: [PATCH] fix: tongyi stream generate not incremental and add qwen max models (#2013)

---
 .../model_providers/__base/ai_model.py        | 11 ++--
 .../model_providers/tongyi/llm/llm.py         | 62 ++++++++++++++-----
 .../tongyi/llm/qwen-max-1201.yaml             | 57 +++++++++++++++++
 .../tongyi/llm/qwen-max-longcontext.yaml      | 57 +++++++++++++++++
 .../model_providers/tongyi/llm/qwen-max.yaml  | 57 +++++++++++++++++
 .../model_providers/tongyi/llm/qwen-plus.yaml | 12 ++--
 .../tongyi/llm/qwen-turbo.yaml                | 12 ++--
 api/requirements.txt                          |  2 +-
 8 files changed, 239 insertions(+), 31 deletions(-)
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml

diff --git a/api/core/model_runtime/model_providers/__base/ai_model.py b/api/core/model_runtime/model_providers/__base/ai_model.py
index 87ffc5896d..11f9a7a6fb 100644
--- a/api/core/model_runtime/model_providers/__base/ai_model.py
+++ b/api/core/model_runtime/model_providers/__base/ai_model.py
@@ -1,6 +1,4 @@
 import decimal
-import json
-import logging
 import os
 from abc import ABC, abstractmethod
 from typing import Optional
@@ -12,7 +10,6 @@ from core.model_runtime.entities.model_entities import (AIModelEntity, DefaultPa
                                                          PriceConfig, PriceInfo, PriceType)
 from core.model_runtime.errors.invoke import InvokeAuthorizationError, InvokeError
 from core.model_runtime.model_providers.__base.tokenizers.gpt2_tokenzier import GPT2Tokenizer
-from pydantic import ValidationError
 
 
 class AIModel(ABC):
@@ -54,14 +51,16 @@ class AIModel(ABC):
         :param error: model invoke error
         :return: unified error
         """
+        provider_name = self.__class__.__module__.split('.')[-3]
+
         for invoke_error, model_errors in self._invoke_error_mapping.items():
             if isinstance(error, tuple(model_errors)):
                 if invoke_error == InvokeAuthorizationError:
-                    return invoke_error(description="Incorrect model credentials provided, please check and try again. ")
+                    return invoke_error(description=f"[{provider_name}] Incorrect model credentials provided, please check and try again. ")
 
-                return invoke_error(description=f"{invoke_error.description}: {str(error)}")
+                return invoke_error(description=f"[{provider_name}] {invoke_error.description}, {str(error)}")
 
-        return InvokeError(description=f"Error: {str(error)}")
+        return InvokeError(description=f"[{provider_name}] Error: {str(error)}")
 
     def get_price(self, model: str, credentials: dict, price_type: PriceType, tokens: int) -> PriceInfo:
         """
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/llm.py b/api/core/model_runtime/model_providers/tongyi/llm/llm.py
index 5cc05db0fb..033fdd2cc2 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/llm.py
+++ b/api/core/model_runtime/model_providers/tongyi/llm/llm.py
@@ -1,8 +1,8 @@
-from http import HTTPStatus
 from typing import Generator, List, Optional, Union
 
-import dashscope
-from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta
+from dashscope import get_tokenizer
+
+from core.model_runtime.entities.llm_entities import LLMResult, LLMResultChunk, LLMResultChunkDelta, LLMMode
 from core.model_runtime.entities.message_entities import (AssistantPromptMessage, PromptMessage, PromptMessageTool,
                                                            SystemPromptMessage, UserPromptMessage)
 from core.model_runtime.errors.invoke import (InvokeAuthorizationError, InvokeBadRequestError, InvokeConnectionError,
@@ -51,19 +51,12 @@
         :param tools: tools for tool calling
         :return:
         """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
+        tokenizer = get_tokenizer(model)
 
-        response = dashscope.Tokenization.call(
-            model=model,
-            prompt=self._convert_messages_to_prompt(prompt_messages),
-            **credentials_kwargs
-        )
-
-        if response.status_code == HTTPStatus.OK:
-            return response['usage']['input_tokens']
-        else:
-            raise self._invoke_error_mapping[InvokeBadRequestError][0](response['message'])
+        # convert string to token ids
+        tokens = tokenizer.encode(self._convert_messages_to_prompt(prompt_messages))
+
+        return len(tokens)
 
     def validate_credentials(self, model: str, credentials: dict) -> None:
         """
@@ -119,14 +112,22 @@
 
         params = {
             'model': model,
-            'prompt': self._convert_messages_to_prompt(prompt_messages),
             **model_parameters,
             **credentials_kwargs
         }
+
+        mode = self.get_model_mode(model, credentials)
+
+        if mode == LLMMode.CHAT:
+            params['messages'] = self._convert_prompt_messages_to_tongyi_messages(prompt_messages)
+        else:
+            params['prompt'] = self._convert_messages_to_prompt(prompt_messages)
+
         if stream:
             responses = stream_generate_with_retry(
                 client,
                 stream=True,
+                incremental_output=True,
                 **params
             )
 
@@ -267,6 +268,35 @@
         # trim off the trailing ' ' that might come from the "Assistant: "
         return text.rstrip()
 
+    def _convert_prompt_messages_to_tongyi_messages(self, prompt_messages: list[PromptMessage]) -> list[dict]:
+        """
+        Convert prompt messages to tongyi messages
+
+        :param prompt_messages: prompt messages
+        :return: tongyi messages
+        """
+        tongyi_messages = []
+        for prompt_message in prompt_messages:
+            if isinstance(prompt_message, SystemPromptMessage):
+                tongyi_messages.append({
+                    'role': 'system',
+                    'content': prompt_message.content,
+                })
+            elif isinstance(prompt_message, UserPromptMessage):
+                tongyi_messages.append({
+                    'role': 'user',
+                    'content': prompt_message.content,
+                })
+            elif isinstance(prompt_message, AssistantPromptMessage):
+                tongyi_messages.append({
+                    'role': 'assistant',
+                    'content': prompt_message.content,
+                })
+            else:
+                raise ValueError(f"Got unknown type {prompt_message}")
+
+        return tongyi_messages
+
     @property
     def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
         """
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
new file mode 100644
index 0000000000..d7914b7ba1
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
@@ -0,0 +1,57 @@
+model: qwen-max-1201
+label:
+  en_US: qwen-max-1201
+model_type: llm
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    default: 1.0
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected. , the generated results are more certain.
+  - name: top_p
+    use_template: top_p
+    default: 0.8
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold of the kernel sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: max_tokens
+    use_template: max_tokens
+    default: 1500
+    min: 1
+    max: 6000
+    help:
+      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
+      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
+  - name: top_k
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    type: int
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。默认不传递该参数,取值为None或当top_k大于100时,表示不启用top_k策略,此时,仅有top_p策略生效。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated. This parameter is not passed by default. The value is None or when top_k is greater than 100, it means that the top_k policy is not enabled. At this time, only the top_p policy takes effect.
+    required: false
+  - name: seed
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    type: int
+    help:
+      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
+    required: false
+  - name: repetition_penalty
+    label:
+      en_US: Repetition penalty
+    type: float
+    default: 1.1
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+    required: false
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
new file mode 100644
index 0000000000..2e296df185
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
@@ -0,0 +1,57 @@
+model: qwen-max-longcontext
+label:
+  en_US: qwen-max-longcontext
+model_type: llm
+model_properties:
+  mode: chat
+  context_size: 30000
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    default: 1.0
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected. , the generated results are more certain.
+  - name: top_p
+    use_template: top_p
+    default: 0.8
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold of the kernel sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: max_tokens
+    use_template: max_tokens
+    default: 2000
+    min: 1
+    max: 28000
+    help:
+      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
+      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
+  - name: top_k
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    type: int
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。默认不传递该参数,取值为None或当top_k大于100时,表示不启用top_k策略,此时,仅有top_p策略生效。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated. This parameter is not passed by default. The value is None or when top_k is greater than 100, it means that the top_k policy is not enabled. At this time, only the top_p policy takes effect.
+    required: false
+  - name: seed
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    type: int
+    help:
+      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
+    required: false
+  - name: repetition_penalty
+    label:
+      en_US: Repetition penalty
+    type: float
+    default: 1.1
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+    required: false
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml
new file mode 100644
index 0000000000..a611ca7a94
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml
@@ -0,0 +1,57 @@
+model: qwen-max
+label:
+  en_US: qwen-max
+model_type: llm
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    default: 1.0
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说,temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值,使得更多的低概率词被选择,生成结果更加多样化;而较低的temperature值则会增强概率分布的峰值,使得高概率词更容易被选择,生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected. , the generated results are more certain.
+  - name: top_p
+    use_template: top_p
+    default: 0.8
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值,例如,取值为0.8时,仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为(0,1.0),取值越大,生成的随机性越高;取值越低,生成的确定性越高。
+      en_US: The probability threshold of the kernel sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: max_tokens
+    use_template: max_tokens
+    default: 1500
+    min: 1
+    max: 6000
+    help:
+      zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
+      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
+  - name: top_k
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    type: int
+    help:
+      zh_Hans: 生成时,采样候选集的大小。例如,取值为50时,仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大,生成的随机性越高;取值越小,生成的确定性越高。默认不传递该参数,取值为None或当top_k大于100时,表示不启用top_k策略,此时,仅有top_p策略生效。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated. This parameter is not passed by default. The value is None or when top_k is greater than 100, it means that the top_k policy is not enabled. At this time, only the top_p policy takes effect.
+    required: false
+  - name: seed
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    type: int
+    help:
+      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
+    required: false
+  - name: repetition_penalty
+    label:
+      en_US: Repetition penalty
+    type: float
+    default: 1.1
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+    required: false
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
index e74fb7b252..509cc7168c 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
@@ -24,7 +24,7 @@ parameter_rules:
     use_template: max_tokens
     default: 2000
     min: 1
-    max: 2000
+    max: 30000
     help:
       zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
       en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
@@ -42,10 +42,9 @@ parameter_rules:
       zh_Hans: 随机种子
       en_US: Random seed
     type: int
-    default: 1234
     help:
-      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。默认值 1234。
-      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types. Default value 1234.
+      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
     required: false
   - name: repetition_penalty
     label:
@@ -55,3 +54,8 @@ parameter_rules:
     help:
       zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
       en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
+pricing:
+  input: '0.02'
+  output: '0.02'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
index 8507881f48..8d67537c5b 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
@@ -24,7 +24,7 @@ parameter_rules:
     use_template: max_tokens
     default: 1500
     min: 1
-    max: 1500
+    max: 6000
     help:
       zh_Hans: 用于限制模型生成token的数量,max_tokens设置的是生成上限,并不表示一定会生成这么多的token数量。
      en_US: It is used to limit the number of tokens generated by the model. max_tokens sets the upper limit of generation, which does not mean that so many tokens will be generated.
@@ -42,10 +42,9 @@ parameter_rules:
       zh_Hans: 随机种子
       en_US: Random seed
     type: int
-    default: 1234
     help:
-      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。默认值 1234。
-      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types. Default value 1234.
+      zh_Hans: 生成时,随机数的种子,用于控制模型生成的随机性。如果使用相同的种子,每次运行生成的结果都将相同;当需要复现模型的生成结果时,可以使用相同的种子。seed参数支持无符号64位整数类型。
+      en_US: When generating, the random number seed is used to control the randomness of model generation. If you use the same seed, the results generated by each run will be the same; when you need to reproduce the results of the model, you can use the same seed. The seed parameter supports unsigned 64-bit integer types.
     required: false
   - name: repetition_penalty
     label:
@@ -56,3 +55,8 @@ parameter_rules:
       zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
       en_US: Used to control the repetition of model generation. Increasing the repetition_penalty can reduce the repetition of model generation. 1.0 means no punishment.
     required: false
+pricing:
+  input: '0.008'
+  output: '0.008'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/requirements.txt b/api/requirements.txt
index 030c8f7072..40a61386e4 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -44,7 +44,7 @@ readabilipy==0.2.0
 google-search-results==2.4.2
 replicate~=0.22.0
 websocket-client~=1.7.0
-dashscope~=1.13.5
+dashscope[tokenizer]~=1.14.0
 huggingface_hub~=0.16.4
 transformers~=4.31.0
 pandas==1.5.3
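
Below is a minimal standalone sketch (not part of the patch) of the DashScope usage this change moves to: chat-style messages sent with stream=True and incremental_output=True, so each streamed chunk carries only the newly generated text rather than the full text so far, plus local token counting via dashscope.get_tokenizer (the reason the requirement becomes dashscope[tokenizer]). The model name, API key, and prompt are placeholders, and result_format="message" is standard DashScope usage rather than a value confirmed by this diff.

# Sketch only: illustrates the dashscope calling pattern adopted by this patch, not Dify's wrapper code.
from http import HTTPStatus

import dashscope
from dashscope import get_tokenizer

dashscope.api_key = "your-api-key"  # placeholder credential

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Introduce Hangzhou in one sentence."},
]

# Local token counting, replacing the remote dashscope.Tokenization.call used before this patch.
tokenizer = get_tokenizer("qwen-max")
print("prompt tokens:", len(tokenizer.encode(messages[-1]["content"])))

# Incremental streaming: each response in the generator carries only the delta text.
responses = dashscope.Generation.call(
    model="qwen-max",
    messages=messages,
    result_format="message",
    stream=True,
    incremental_output=True,
)

for response in responses:
    if response.status_code != HTTPStatus.OK:
        raise RuntimeError(f"[{response.code}] {response.message}")
    print(response.output.choices[0]["message"]["content"], end="", flush=True)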