From 5b447d61a6ac1c0d4b0499623a5babfe6f8e5c7e Mon Sep 17 00:00:00 2001
From: takatost <takatost@users.noreply.github.com>
Date: Mon, 15 Apr 2024 22:28:32 +0800
Subject: [PATCH] feat: refactor tongyi models (#3496)

---
 api/app.py                                    |   7 +-
 .../app/apps/advanced_chat/app_generator.py   |   3 +
 api/core/app/apps/agent_chat/app_generator.py |   3 +
 api/core/app/apps/chat/app_generator.py       |   3 +
 api/core/app/apps/completion/app_generator.py |   3 +
 api/core/app/apps/workflow/app_generator.py   |   3 +
 .../model_providers/cohere/llm/llm.py         |   2 +-
 .../model_providers/ollama/ollama.yaml        |   4 +-
 .../model_providers/tongyi/llm/_client.py     |  82 -----
 .../model_providers/tongyi/llm/llm.py         | 301 ++++++++++++++----
 .../tongyi/llm/qwen-max-0403.yaml             |  81 +++++
 .../tongyi/llm/qwen-max-1201.yaml             |   6 +-
 .../tongyi/llm/qwen-max-longcontext.yaml      |   6 +-
 .../model_providers/tongyi/llm/qwen-max.yaml  |   6 +-
 .../tongyi/llm/qwen-plus-chat.yaml            |  81 +++++
 .../model_providers/tongyi/llm/qwen-plus.yaml |   4 +-
 .../tongyi/llm/qwen-turbo-chat.yaml           |  81 +++++
 .../tongyi/llm/qwen-turbo.yaml                |   4 +-
 .../tongyi/llm/qwen-vl-max.yaml               |  47 +++
 .../tongyi/llm/qwen-vl-plus.yaml              |  47 +++
 .../tongyi/text_embedding/text_embedding.py   |  22 +-
 .../model_providers/tongyi/tts/tts.py         |   4 +-
 .../triton_inference_server.yaml              |  12 +-
 api/requirements.txt                          |   4 +-
 api/tests/unit_tests/core/rag/__init__.py     |   0
 .../core/rag/datasource/__init__.py           |   0
 .../core/rag/datasource/vdb/__init__.py       |   0
 .../rag/datasource/vdb/milvus/__init__.py     |   0
 .../rag/datasource/vdb/milvus/test_milvus.py} |   0
 29 files changed, 639 insertions(+), 177 deletions(-)
 delete mode 100644 api/core/model_runtime/model_providers/tongyi/llm/_client.py
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-max-0403.yaml
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-plus-chat.yaml
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo-chat.yaml
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
 create mode 100644 api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
 create mode 100644 api/tests/unit_tests/core/rag/__init__.py
 create mode 100644 api/tests/unit_tests/core/rag/datasource/__init__.py
 create mode 100644 api/tests/unit_tests/core/rag/datasource/vdb/__init__.py
 create mode 100644 api/tests/unit_tests/core/rag/datasource/vdb/milvus/__init__.py
 rename api/tests/{unittests/test_model.py => unit_tests/core/rag/datasource/vdb/milvus/test_milvus.py} (100%)

diff --git a/api/app.py b/api/app.py
index a921cbce04..ad91b5636f 100644
--- a/api/app.py
+++ b/api/app.py
@@ -1,7 +1,5 @@
 import os
 
-from werkzeug.exceptions import Unauthorized
-
 if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
     from gevent import monkey
 
@@ -11,10 +9,6 @@ if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
 
     grpc.experimental.gevent.init_gevent()
 
-    import langchain
-
-    langchain.verbose = True
-
 import json
 import logging
 import threading
@@ -24,6 +18,7 @@ import warnings
 from flask import Flask, Response, request
 from flask_cors import CORS
 
+from werkzeug.exceptions import Unauthorized
 from commands import register_commands
 from config import CloudEditionConfig, Config
 from extensions import (
diff --git a/api/core/app/apps/advanced_chat/app_generator.py b/api/core/app/apps/advanced_chat/app_generator.py
index 37e10f4bcf..e5cf585f82 100644
--- a/api/core/app/apps/advanced_chat/app_generator.py
+++ b/api/core/app/apps/advanced_chat/app_generator.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import threading
 import uuid
 from collections.abc import Generator
@@ -189,6 +190,8 @@ class AdvancedChatAppGenerator(MessageBasedAppGenerator):
                 logger.exception("Validation Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except (ValueError, InvokeError) as e:
+                if os.environ.get("DEBUG") and os.environ.get("DEBUG").lower() == 'true':
+                    logger.exception("Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except Exception as e:
                 logger.exception("Unknown Error when generating")
diff --git a/api/core/app/apps/agent_chat/app_generator.py b/api/core/app/apps/agent_chat/app_generator.py
index 632cf4f80a..847d314409 100644
--- a/api/core/app/apps/agent_chat/app_generator.py
+++ b/api/core/app/apps/agent_chat/app_generator.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import threading
 import uuid
 from collections.abc import Generator
@@ -198,6 +199,8 @@ class AgentChatAppGenerator(MessageBasedAppGenerator):
                 logger.exception("Validation Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except (ValueError, InvokeError) as e:
+                if os.environ.get("DEBUG") and os.environ.get("DEBUG").lower() == 'true':
+                    logger.exception("Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except Exception as e:
                 logger.exception("Unknown Error when generating")
diff --git a/api/core/app/apps/chat/app_generator.py b/api/core/app/apps/chat/app_generator.py
index 6bf309ca1b..e67901cca8 100644
--- a/api/core/app/apps/chat/app_generator.py
+++ b/api/core/app/apps/chat/app_generator.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import threading
 import uuid
 from collections.abc import Generator
@@ -195,6 +196,8 @@ class ChatAppGenerator(MessageBasedAppGenerator):
                 logger.exception("Validation Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except (ValueError, InvokeError) as e:
+                if os.environ.get("DEBUG") and os.environ.get("DEBUG").lower() == 'true':
+                    logger.exception("Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except Exception as e:
                 logger.exception("Unknown Error when generating")
diff --git a/api/core/app/apps/completion/app_generator.py b/api/core/app/apps/completion/app_generator.py
index 0e83da3dfd..5f93afcad7 100644
--- a/api/core/app/apps/completion/app_generator.py
+++ b/api/core/app/apps/completion/app_generator.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import threading
 import uuid
 from collections.abc import Generator
@@ -184,6 +185,8 @@ class CompletionAppGenerator(MessageBasedAppGenerator):
                 logger.exception("Validation Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except (ValueError, InvokeError) as e:
+                if os.environ.get("DEBUG") and os.environ.get("DEBUG").lower() == 'true':
+                    logger.exception("Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except Exception as e:
                 logger.exception("Unknown Error when generating")
diff --git a/api/core/app/apps/workflow/app_generator.py b/api/core/app/apps/workflow/app_generator.py
index 759790e7ee..a9b038ab51 100644
--- a/api/core/app/apps/workflow/app_generator.py
+++ b/api/core/app/apps/workflow/app_generator.py
@@ -1,4 +1,5 @@
 import logging
+import os
 import threading
 import uuid
 from collections.abc import Generator
@@ -137,6 +138,8 @@ class WorkflowAppGenerator(BaseAppGenerator):
                 logger.exception("Validation Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except (ValueError, InvokeError) as e:
+                if os.environ.get("DEBUG") and os.environ.get("DEBUG").lower() == 'true':
+                    logger.exception("Error when generating")
                 queue_manager.publish_error(e, PublishFrom.APPLICATION_MANAGER)
             except Exception as e:
                 logger.exception("Unknown Error when generating")
diff --git a/api/core/model_runtime/model_providers/cohere/llm/llm.py b/api/core/model_runtime/model_providers/cohere/llm/llm.py
index cc8e5b0eae..6ace77b813 100644
--- a/api/core/model_runtime/model_providers/cohere/llm/llm.py
+++ b/api/core/model_runtime/model_providers/cohere/llm/llm.py
@@ -602,7 +602,7 @@ class CohereLargeLanguageModel(LargeLanguageModel):
             parameter_definitions = {}
             for p_key, p_val in properties.items():
                 required = False
-                if property in required_properties:
+                if p_key in required_properties:
                     required = True
 
                 desc = p_val['description']
diff --git a/api/core/model_runtime/model_providers/ollama/ollama.yaml b/api/core/model_runtime/model_providers/ollama/ollama.yaml
index 782667fdab..33747753bd 100644
--- a/api/core/model_runtime/model_providers/ollama/ollama.yaml
+++ b/api/core/model_runtime/model_providers/ollama/ollama.yaml
@@ -90,9 +90,9 @@ model_credential_schema:
       options:
         - value: 'true'
           label:
-            en_US: Yes
+            en_US: 'Yes'
             zh_Hans: 是
         - value: 'false'
           label:
-            en_US: No
+            en_US: 'No'
             zh_Hans: 否
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/_client.py b/api/core/model_runtime/model_providers/tongyi/llm/_client.py
deleted file mode 100644
index cfe33558e1..0000000000
--- a/api/core/model_runtime/model_providers/tongyi/llm/_client.py
+++ /dev/null
@@ -1,82 +0,0 @@
-from typing import Any, Optional
-
-from langchain.callbacks.manager import CallbackManagerForLLMRun
-from langchain.llms import Tongyi
-from langchain.llms.tongyi import generate_with_retry, stream_generate_with_retry
-from langchain.schema import Generation, LLMResult
-
-
-class EnhanceTongyi(Tongyi):
-    @property
-    def _default_params(self) -> dict[str, Any]:
-        """Get the default parameters for calling OpenAI API."""
-        normal_params = {
-            "top_p": self.top_p,
-            "api_key": self.dashscope_api_key
-        }
-
-        return {**normal_params, **self.model_kwargs}
-
-    def _generate(
-        self,
-        prompts: list[str],
-        stop: Optional[list[str]] = None,
-        run_manager: Optional[CallbackManagerForLLMRun] = None,
-        **kwargs: Any,
-    ) -> LLMResult:
-        generations = []
-        params: dict[str, Any] = {
-            **{"model": self.model_name},
-            **self._default_params,
-            **kwargs,
-        }
-        if self.streaming:
-            if len(prompts) > 1:
-                raise ValueError("Cannot stream results with multiple prompts.")
-            params["stream"] = True
-            text = ''
-            for stream_resp in stream_generate_with_retry(
-                self, prompt=prompts[0], **params
-            ):
-                if not generations:
-                    current_text = stream_resp["output"]["text"]
-                else:
-                    current_text = stream_resp["output"]["text"][len(text):]
-
-                text = stream_resp["output"]["text"]
-
-                generations.append(
-                    [
-                        Generation(
-                            text=current_text,
-                            generation_info=dict(
-                                finish_reason=stream_resp["output"]["finish_reason"],
-                            ),
-                        )
-                    ]
-                )
-
-                if run_manager:
-                    run_manager.on_llm_new_token(
-                        current_text,
-                        verbose=self.verbose,
-                        logprobs=None,
-                    )
-        else:
-            for prompt in prompts:
-                completion = generate_with_retry(
-                    self,
-                    prompt=prompt,
-                    **params,
-                )
-                generations.append(
-                    [
-                        Generation(
-                            text=completion["output"]["text"],
-                            generation_info=dict(
-                                finish_reason=completion["output"]["finish_reason"],
-                            ),
-                        )
-                    ]
-                )
-        return LLMResult(generations=generations)
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/llm.py b/api/core/model_runtime/model_providers/tongyi/llm/llm.py
index 405f93498e..3d0a80144c 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/llm.py
+++ b/api/core/model_runtime/model_providers/tongyi/llm/llm.py
@@ -1,8 +1,13 @@
+import base64
+import os
+import tempfile
+import uuid
 from collections.abc import Generator
-from typing import Optional, Union
+from http import HTTPStatus
+from typing import Optional, Union, cast
 
-from dashscope import get_tokenizer
-from dashscope.api_entities.dashscope_response import DashScopeAPIResponse
+from dashscope import Generation, MultiModalConversation, get_tokenizer
+from dashscope.api_entities.dashscope_response import GenerationResponse
 from dashscope.common.error import (
     AuthenticationError,
     InvalidParameter,
@@ -11,17 +16,21 @@ from dashscope.common.error import (
     UnsupportedHTTPMethod,
     UnsupportedModel,
 )
-from langchain.llms.tongyi import generate_with_retry, stream_generate_with_retry
 
 from core.model_runtime.callbacks.base_callback import Callback
 from core.model_runtime.entities.llm_entities import LLMMode, LLMResult, LLMResultChunk, LLMResultChunkDelta
 from core.model_runtime.entities.message_entities import (
     AssistantPromptMessage,
+    ImagePromptMessageContent,
     PromptMessage,
+    PromptMessageContentType,
     PromptMessageTool,
     SystemPromptMessage,
+    TextPromptMessageContent,
+    ToolPromptMessage,
     UserPromptMessage,
 )
+from core.model_runtime.entities.model_entities import ModelFeature
 from core.model_runtime.errors.invoke import (
     InvokeAuthorizationError,
     InvokeBadRequestError,
@@ -33,10 +42,9 @@ from core.model_runtime.errors.invoke import (
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
 
-from ._client import EnhanceTongyi
-
 
 class TongyiLargeLanguageModel(LargeLanguageModel):
+    tokenizers = {}
 
     def _invoke(self, model: str, credentials: dict,
                 prompt_messages: list[PromptMessage], model_parameters: dict,
@@ -57,13 +65,13 @@ class TongyiLargeLanguageModel(LargeLanguageModel):
         :return: full response or stream response chunk generator result
         """
         # invoke model
-        return self._generate(model, credentials, prompt_messages, model_parameters, stop, stream, user)
-    
-    def _code_block_mode_wrapper(self, model: str, credentials: dict, 
-                                 prompt_messages: list[PromptMessage], model_parameters: dict, 
-                                 tools: list[PromptMessageTool] | None = None, stop: list[str] | None = None, 
+        return self._generate(model, credentials, prompt_messages, model_parameters, tools, stop, stream, user)
+
+    def _code_block_mode_wrapper(self, model: str, credentials: dict,
+                                 prompt_messages: list[PromptMessage], model_parameters: dict,
+                                 tools: list[PromptMessageTool] | None = None, stop: list[str] | None = None,
                                  stream: bool = True, user: str | None = None, callbacks: list[Callback] = None) \
-                            -> LLMResult | Generator:
+            -> LLMResult | Generator:
         """
         Wrapper for code block mode
         """
@@ -88,7 +96,7 @@ if you are not sure about the structure.
                 stream=stream,
                 user=user
             )
-        
+
         model_parameters.pop("response_format")
         stop = stop or []
         stop.extend(["\n```", "```\n"])
@@ -99,13 +107,13 @@ if you are not sure about the structure.
             # override the system message
             prompt_messages[0] = SystemPromptMessage(
                 content=block_prompts
-                    .replace("{{instructions}}", prompt_messages[0].content)
+                .replace("{{instructions}}", prompt_messages[0].content)
             )
         else:
             # insert the system message
             prompt_messages.insert(0, SystemPromptMessage(
                 content=block_prompts
-                    .replace("{{instructions}}", f"Please output a valid {code_block} object.")
+                .replace("{{instructions}}", f"Please output a valid {code_block} object.")
             ))
 
         mode = self.get_model_mode(model, credentials)
@@ -138,7 +146,7 @@ if you are not sure about the structure.
                 prompt_messages=prompt_messages,
                 input_generator=response
             )
-        
+
         return response
 
     def get_num_tokens(self, model: str, credentials: dict, prompt_messages: list[PromptMessage],
@@ -152,7 +160,14 @@ if you are not sure about the structure.
         :param tools: tools for tool calling
         :return:
         """
-        tokenizer = get_tokenizer(model)
+        if model in ['qwen-turbo-chat', 'qwen-plus-chat']:
+            model = model.replace('-chat', '')
+
+        if model in self.tokenizers:
+            tokenizer = self.tokenizers[model]
+        else:
+            tokenizer = get_tokenizer(model)
+            self.tokenizers[model] = tokenizer
 
         # convert string to token ids
         tokens = tokenizer.encode(self._convert_messages_to_prompt(prompt_messages))
@@ -184,6 +199,7 @@ if you are not sure about the structure.
 
     def _generate(self, model: str, credentials: dict,
                   prompt_messages: list[PromptMessage], model_parameters: dict,
+                  tools: Optional[list[PromptMessageTool]] = None,
                   stop: Optional[list[str]] = None, stream: bool = True,
                   user: Optional[str] = None) -> Union[LLMResult, Generator]:
         """
@@ -192,24 +208,27 @@ if you are not sure about the structure.
         :param model: model name
         :param credentials: credentials
         :param prompt_messages: prompt messages
+        :param tools: tools for tool calling
         :param model_parameters: model parameters
         :param stop: stop words
         :param stream: is stream response
         :param user: unique user id
         :return: full response or stream response chunk generator result
         """
-        extra_model_kwargs = {}
-        if stop:
-            extra_model_kwargs['stop'] = stop
-
         # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
 
-        client = EnhanceTongyi(
-            model_name=model,
-            streaming=stream,
-            dashscope_api_key=credentials_kwargs['api_key'],
-        )
+        mode = self.get_model_mode(model, credentials)
+
+        if model in ['qwen-turbo-chat', 'qwen-plus-chat']:
+            model = model.replace('-chat', '')
+
+        extra_model_kwargs = {}
+        if tools:
+            extra_model_kwargs['tools'] = self._convert_tools(tools)
+
+        if stop:
+            extra_model_kwargs['stop'] = stop
 
         params = {
             'model': model,
@@ -218,30 +237,27 @@ if you are not sure about the structure.
             **extra_model_kwargs,
         }
 
-        mode = self.get_model_mode(model, credentials)
+        model_schema = self.get_model_schema(model, credentials)
+        if ModelFeature.VISION in (model_schema.features or []):
+            params['messages'] = self._convert_prompt_messages_to_tongyi_messages(prompt_messages, rich_content=True)
 
-        if mode == LLMMode.CHAT:
-            params['messages'] = self._convert_prompt_messages_to_tongyi_messages(prompt_messages)
+            response = MultiModalConversation.call(**params, stream=stream)
         else:
-            params['prompt'] = self._convert_messages_to_prompt(prompt_messages)
+            if mode == LLMMode.CHAT:
+                params['messages'] = self._convert_prompt_messages_to_tongyi_messages(prompt_messages)
+            else:
+                params['prompt'] = prompt_messages[0].content.rstrip()
+
+            response = Generation.call(**params,
+                                       result_format='message',
+                                       stream=stream)
 
         if stream:
-            responses = stream_generate_with_retry(
-                client, 
-                stream=True,
-                incremental_output=True,
-                **params
-            )
+            return self._handle_generate_stream_response(model, credentials, response, prompt_messages)
 
-            return self._handle_generate_stream_response(model, credentials, responses, prompt_messages)
-
-        response = generate_with_retry(
-            client,
-            **params,
-        )
         return self._handle_generate_response(model, credentials, response, prompt_messages)
-        
-    def _handle_generate_response(self, model: str, credentials: dict, response: DashScopeAPIResponse,
+
+    def _handle_generate_response(self, model: str, credentials: dict, response: GenerationResponse,
                                   prompt_messages: list[PromptMessage]) -> LLMResult:
         """
         Handle llm response
@@ -254,7 +270,7 @@ if you are not sure about the structure.
         """
         # transform assistant message to prompt message
         assistant_prompt_message = AssistantPromptMessage(
-            content=response.output.text
+            content=response.output.choices[0].message.content,
         )
 
         # transform usage
@@ -270,32 +286,65 @@ if you are not sure about the structure.
 
         return result
 
-    def _handle_generate_stream_response(self, model: str, credentials: dict, responses: Generator,
+    def _handle_generate_stream_response(self, model: str, credentials: dict,
+                                         responses: Generator[GenerationResponse, None, None],
                                          prompt_messages: list[PromptMessage]) -> Generator:
         """
         Handle llm stream response
 
         :param model: model name
         :param credentials: credentials
-        :param response: response
+        :param responses: response
         :param prompt_messages: prompt messages
         :return: llm response chunk generator result
         """
+        full_text = ''
+        tool_calls = []
         for index, response in enumerate(responses):
-            resp_finish_reason = response.output.finish_reason
-            resp_content = response.output.text
-            usage = response.usage
+            if response.status_code != 200 and response.status_code != HTTPStatus.OK:
+                raise ServiceUnavailableError(
+                    f"Failed to invoke model {model}, status code: {response.status_code}, "
+                    f"message: {response.message}"
+                )
 
-            if resp_finish_reason is None and (resp_content is None or resp_content == ''):
-                continue
+            resp_finish_reason = response.output.choices[0].finish_reason
 
-            # transform assistant message to prompt message
-            assistant_prompt_message = AssistantPromptMessage(
-                content=resp_content if resp_content else '',
-            )
+            if resp_finish_reason is not None and resp_finish_reason != 'null':
+                resp_content = response.output.choices[0].message.content
+
+                assistant_prompt_message = AssistantPromptMessage(
+                    content='',
+                )
+
+                if 'tool_calls' in response.output.choices[0].message:
+                    tool_calls = response.output.choices[0].message['tool_calls']
+                elif resp_content:
+                    # special for qwen-vl
+                    if isinstance(resp_content, list):
+                        resp_content = resp_content[0]['text']
+
+                    # transform assistant message to prompt message
+                    assistant_prompt_message.content = resp_content.replace(full_text, '', 1)
+
+                    full_text = resp_content
+
+                if tool_calls:
+                    message_tool_calls = []
+                    for tool_call_obj in tool_calls:
+                        message_tool_call = AssistantPromptMessage.ToolCall(
+                            id=tool_call_obj['function']['name'],
+                            type='function',
+                            function=AssistantPromptMessage.ToolCall.ToolCallFunction(
+                                name=tool_call_obj['function']['name'],
+                                arguments=tool_call_obj['function']['arguments']
+                            )
+                        )
+                        message_tool_calls.append(message_tool_call)
+
+                    assistant_prompt_message.tool_calls = message_tool_calls
 
-            if resp_finish_reason is not None:
                 # transform usage
+                usage = response.usage
                 usage = self._calc_response_usage(model, credentials, usage.input_tokens, usage.output_tokens)
 
                 yield LLMResultChunk(
@@ -309,6 +358,23 @@ if you are not sure about the structure.
                     )
                 )
             else:
+                resp_content = response.output.choices[0].message.content
+                if not resp_content:
+                    if 'tool_calls' in response.output.choices[0].message:
+                        tool_calls = response.output.choices[0].message['tool_calls']
+                    continue
+
+                # special for qwen-vl
+                if isinstance(resp_content, list):
+                    resp_content = resp_content[0]['text']
+
+                # transform assistant message to prompt message
+                assistant_prompt_message = AssistantPromptMessage(
+                    content=resp_content.replace(full_text, '', 1),
+                )
+
+                full_text = resp_content
+
                 yield LLMResultChunk(
                     model=model,
                     prompt_messages=prompt_messages,
@@ -343,11 +409,20 @@ if you are not sure about the structure.
         content = message.content
 
         if isinstance(message, UserPromptMessage):
-            message_text = f"{human_prompt} {content}"
+            if isinstance(content, str):
+                message_text = f"{human_prompt} {content}"
+            else:
+                message_text = ""
+                for sub_message in content:
+                    if sub_message.type == PromptMessageContentType.TEXT:
+                        message_text = f"{human_prompt} {sub_message.data}"
+                        break
         elif isinstance(message, AssistantPromptMessage):
             message_text = f"{ai_prompt} {content}"
         elif isinstance(message, SystemPromptMessage):
             message_text = content
+        elif isinstance(message, ToolPromptMessage):
+            message_text = content
         else:
             raise ValueError(f"Got unknown type {message}")
 
@@ -370,7 +445,8 @@ if you are not sure about the structure.
         # trim off the trailing ' ' that might come from the "Assistant: "
         return text.rstrip()
 
-    def _convert_prompt_messages_to_tongyi_messages(self, prompt_messages: list[PromptMessage]) -> list[dict]:
+    def _convert_prompt_messages_to_tongyi_messages(self, prompt_messages: list[PromptMessage],
+                                                    rich_content: bool = False) -> list[dict]:
         """
         Convert prompt messages to tongyi messages
 
@@ -382,23 +458,118 @@ if you are not sure about the structure.
             if isinstance(prompt_message, SystemPromptMessage):
                 tongyi_messages.append({
                     'role': 'system',
-                    'content': prompt_message.content,
+                    'content': prompt_message.content if not rich_content else [{"text": prompt_message.content}],
                 })
             elif isinstance(prompt_message, UserPromptMessage):
-                tongyi_messages.append({
-                    'role': 'user',
-                    'content': prompt_message.content,
-                })
+                if isinstance(prompt_message.content, str):
+                    tongyi_messages.append({
+                        'role': 'user',
+                        'content': prompt_message.content if not rich_content else [{"text": prompt_message.content}],
+                    })
+                else:
+                    sub_messages = []
+                    for message_content in prompt_message.content:
+                        if message_content.type == PromptMessageContentType.TEXT:
+                            message_content = cast(TextPromptMessageContent, message_content)
+                            sub_message_dict = {
+                                "text": message_content.data
+                            }
+                            sub_messages.append(sub_message_dict)
+                        elif message_content.type == PromptMessageContentType.IMAGE:
+                            message_content = cast(ImagePromptMessageContent, message_content)
+
+                            image_url = message_content.data
+                            if message_content.data.startswith("data:"):
+                                # convert image base64 data to file in /tmp
+                                image_url = self._save_base64_image_to_file(message_content.data)
+
+                            sub_message_dict = {
+                                "image": image_url
+                            }
+                            sub_messages.append(sub_message_dict)
+
+                    # resort sub_messages to ensure text is always at last
+                    sub_messages = sorted(sub_messages, key=lambda x: 'text' in x)
+
+                    tongyi_messages.append({
+                        'role': 'user',
+                        'content': sub_messages
+                    })
             elif isinstance(prompt_message, AssistantPromptMessage):
+                content = prompt_message.content
+                if not content:
+                    content = ' '
                 tongyi_messages.append({
                     'role': 'assistant',
-                    'content': prompt_message.content,
+                    'content': content if not rich_content else [{"text": content}],
+                })
+            elif isinstance(prompt_message, ToolPromptMessage):
+                tongyi_messages.append({
+                    "role": "tool",
+                    "content": prompt_message.content,
+                    "name": prompt_message.tool_call_id
                 })
             else:
                 raise ValueError(f"Got unknown type {prompt_message}")
 
         return tongyi_messages
 
+    def _save_base64_image_to_file(self, base64_image: str) -> str:
+        """
+        Decode a base64 data URL and store the image in the system temp directory
+
+        Expected input: 'data:{upload_file.mime_type};base64,{encoded_string}'
+
+        :param base64_image: image as a base64 data URL
+        :return: "file://" URL of the written image file
+        """
+        segments = base64_image.split(',')
+        # header looks like "data:image/png;base64" -> keep only "image/png"
+        mime_type = segments[0].split(';')[0].split(':')[1]
+        encoded_string = segments[1]
+
+        # random file name; the extension is the mime subtype (e.g. "png")
+        file_name = f"{uuid.uuid4()}.{mime_type.split('/')[1]}"
+        file_path = os.path.join(tempfile.gettempdir(), file_name)
+        with open(file_path, "wb") as image_file:
+            image_file.write(base64.b64decode(encoded_string))
+        return f"file://{file_path}"
+
+    def _convert_tools(self, tools: list[PromptMessageTool]) -> list[dict]:
+        """
+        Convert prompt message tools to DashScope function definitions
+
+        :param tools: tools for tool calling
+        :return: tool definitions in the request format
+        """
+        tool_definitions = []
+        for tool in tools:
+            # `properties` / `required` are optional in a JSON schema
+            properties = tool.parameters.get('properties', {})
+            required_properties = tool.parameters.get('required', [])
+
+            properties_definitions = {}
+            for p_key, p_val in properties.items():
+                desc = p_val.get('description', '')
+                if 'enum' in p_val:
+                    # enum values may be non-string (e.g. int), so stringify before joining
+                    options = ', '.join(str(option) for option in p_val['enum'])
+                    desc += f"; Only accepts one of the following predefined options: [{options}]"
+
+                properties_definitions[p_key] = {'description': desc, 'type': p_val['type']}
+
+            tool_definitions.append({
+                "type": "function",
+                "function": {
+                    "name": tool.name,
+                    "description": tool.description,
+                    "parameters": properties_definitions,
+                    "required": required_properties
+                }
+            })
+
+        return tool_definitions
+
     @property
     def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
         """
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-0403.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-0403.yaml
new file mode 100644
index 0000000000..865c0c8138
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-0403.yaml
@@ -0,0 +1,81 @@
+model: qwen-max-0403
+label:
+  en_US: qwen-max-0403
+model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说，temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值，使得更多的低概率词被选择，生成结果更加多样化；而较低的temperature值则会增强概率分布的峰值，使得高概率词更容易被选择，生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results will be more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 2000
+    min: 1
+    max: 2000
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量，它定义了生成的上限，但不保证每次都会生成到这个数量。
+      en_US: It is used to specify the maximum number of tokens when the model generates content. It defines the upper limit of generation, but does not guarantee that this number will be generated every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated.
+  - name: seed
+    required: false
+    type: int
+    default: 1234
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    help:
+      zh_Hans: 生成时使用的随机数种子，用于控制模型生成内容的随机性。支持无符号64位整数，默认值为 1234。在使用seed时，模型将尽可能生成相同或相似的结果，但目前不保证每次生成的结果完全相同。
+      en_US: The random number seed used during generation, which controls the randomness of the content generated by the model. Supports unsigned 64-bit integers, default value is 1234. When using seed, the model will try its best to generate the same or similar results, but there is currently no guarantee that the results will be exactly the same every time.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the amount of repetition in the generated text. Increasing repetition_penalty reduces repetition in the model's output. 1.0 means no penalty.
+  - name: enable_search
+    type: boolean
+    default: false
+    help:
+      zh_Hans: 模型内置了互联网搜索服务，该参数控制模型在生成文本时是否参考使用互联网搜索结果。启用互联网搜索，模型会将搜索结果作为文本生成过程中的参考信息，但模型会基于其内部逻辑“自行判断”是否使用互联网搜索结果。
+      en_US: The model has a built-in Internet search service. This parameter controls whether the model refers to Internet search results when generating text. When Internet search is enabled, the model will use the search results as reference information in the text generation process, but the model will "judge" whether to use Internet search results based on its internal logic.
+  - name: response_format
+    use_template: response_format
+pricing:
+  input: '0.12'
+  output: '0.12'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
index 691347e701..533d99aa55 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-1201.yaml
@@ -2,6 +2,10 @@ model: qwen-max-1201
 label:
   en_US: qwen-max-1201
 model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
 model_properties:
   mode: chat
   context_size: 8192
@@ -9,7 +13,7 @@ parameter_rules:
   - name: temperature
     use_template: temperature
     type: float
-    default: 0.85
+    default: 0.3
     min: 0.0
     max: 2.0
     help:
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
index 91129d37dd..dbe3ece396 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max-longcontext.yaml
@@ -2,6 +2,10 @@ model: qwen-max-longcontext
 label:
   en_US: qwen-max-longcontext
 model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
 model_properties:
   mode: chat
   context_size: 32768
@@ -9,7 +13,7 @@ parameter_rules:
   - name: temperature
     use_template: temperature
     type: float
-    default: 0.85
+    default: 0.3
     min: 0.0
     max: 2.0
     help:
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml
index 5d6b69f21f..9a0f1afc03 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-max.yaml
@@ -2,6 +2,10 @@ model: qwen-max
 label:
   en_US: qwen-max
 model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
 model_properties:
   mode: chat
   context_size: 8192
@@ -9,7 +13,7 @@ parameter_rules:
   - name: temperature
     use_template: temperature
     type: float
-    default: 0.85
+    default: 0.3
     min: 0.0
     max: 2.0
     help:
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus-chat.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus-chat.yaml
new file mode 100644
index 0000000000..ae3ec0fc04
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus-chat.yaml
@@ -0,0 +1,81 @@
+model: qwen-plus-chat
+label:
+  en_US: qwen-plus-chat
+model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
+model_properties:
+  mode: chat
+  context_size: 32768
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说，temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值，使得更多的低概率词被选择，生成结果更加多样化；而较低的temperature值则会增强概率分布的峰值，使得高概率词更容易被选择，生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results will be more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 1500
+    min: 1
+    max: 1500
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量，它定义了生成的上限，但不保证每次都会生成到这个数量。
+      en_US: It is used to specify the maximum number of tokens when the model generates content. It defines the upper limit of generation, but does not guarantee that this number will be generated every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated.
+  - name: seed
+    required: false
+    type: int
+    default: 1234
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    help:
+      zh_Hans: 生成时使用的随机数种子，用于控制模型生成内容的随机性。支持无符号64位整数，默认值为 1234。在使用seed时，模型将尽可能生成相同或相似的结果，但目前不保证每次生成的结果完全相同。
+      en_US: The random number seed used during generation, which controls the randomness of the content generated by the model. Supports unsigned 64-bit integers, default value is 1234. When using seed, the model will try its best to generate the same or similar results, but there is currently no guarantee that the results will be exactly the same every time.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the amount of repetition in the generated text. Increasing repetition_penalty reduces repetition in the model's output. 1.0 means no penalty.
+  - name: enable_search
+    type: boolean
+    default: false
+    help:
+      zh_Hans: 模型内置了互联网搜索服务，该参数控制模型在生成文本时是否参考使用互联网搜索结果。启用互联网搜索，模型会将搜索结果作为文本生成过程中的参考信息，但模型会基于其内部逻辑“自行判断”是否使用互联网搜索结果。
+      en_US: The model has a built-in Internet search service. This parameter controls whether the model refers to Internet search results when generating text. When Internet search is enabled, the model will use the search results as reference information in the text generation process, but the model will "judge" whether to use Internet search results based on its internal logic.
+  - name: response_format
+    use_template: response_format
+pricing:
+  input: '0.02'
+  output: '0.02'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
index 7c25e8802b..bfa04792a0 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-plus.yaml
@@ -2,6 +2,8 @@ model: qwen-plus
 label:
   en_US: qwen-plus
 model_type: llm
+features:
+  - agent-thought
 model_properties:
   mode: completion
   context_size: 32768
@@ -9,7 +11,7 @@ parameter_rules:
   - name: temperature
     use_template: temperature
     type: float
-    default: 0.85
+    default: 0.3
     min: 0.0
     max: 2.0
     help:
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo-chat.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo-chat.yaml
new file mode 100644
index 0000000000..dc8208fac6
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo-chat.yaml
@@ -0,0 +1,81 @@
+model: qwen-turbo-chat
+label:
+  en_US: qwen-turbo-chat
+model_type: llm
+features:
+  - multi-tool-call
+  - agent-thought
+  - stream-tool-call
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: temperature
+    use_template: temperature
+    type: float
+    default: 0.3
+    min: 0.0
+    max: 2.0
+    help:
+      zh_Hans: 用于控制随机性和多样性的程度。具体来说，temperature值控制了生成文本时对每个候选词的概率分布进行平滑的程度。较高的temperature值会降低概率分布的峰值，使得更多的低概率词被选择，生成结果更加多样化；而较低的temperature值则会增强概率分布的峰值，使得高概率词更容易被选择，生成结果更加确定。
+      en_US: Used to control the degree of randomness and diversity. Specifically, the temperature value controls the degree to which the probability distribution of each candidate word is smoothed when generating text. A higher temperature value will reduce the peak value of the probability distribution, allowing more low-probability words to be selected, and the generated results will be more diverse; while a lower temperature value will enhance the peak value of the probability distribution, making it easier for high-probability words to be selected, and the generated results will be more deterministic.
+  - name: max_tokens
+    use_template: max_tokens
+    type: int
+    default: 1500
+    min: 1
+    max: 1500
+    help:
+      zh_Hans: 用于指定模型在生成内容时token的最大数量，它定义了生成的上限，但不保证每次都会生成到这个数量。
+      en_US: It is used to specify the maximum number of tokens when the model generates content. It defines the upper limit of generation, but does not guarantee that this number will be generated every time.
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated.
+  - name: seed
+    required: false
+    type: int
+    default: 1234
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    help:
+      zh_Hans: 生成时使用的随机数种子，用于控制模型生成内容的随机性。支持无符号64位整数，默认值为 1234。在使用seed时，模型将尽可能生成相同或相似的结果，但目前不保证每次生成的结果完全相同。
+      en_US: The random number seed used during generation, which controls the randomness of the content generated by the model. Supports unsigned 64-bit integers, default value is 1234. When using seed, the model will try its best to generate the same or similar results, but there is currently no guarantee that the results will be exactly the same every time.
+  - name: repetition_penalty
+    required: false
+    type: float
+    default: 1.1
+    label:
+      en_US: Repetition penalty
+    help:
+      zh_Hans: 用于控制模型生成时的重复度。提高repetition_penalty时可以降低模型生成的重复度。1.0表示不做惩罚。
+      en_US: Controls the amount of repetition in the generated text. Increasing repetition_penalty reduces repetition in the model's output. 1.0 means no penalty.
+  - name: enable_search
+    type: boolean
+    default: false
+    help:
+      zh_Hans: 模型内置了互联网搜索服务，该参数控制模型在生成文本时是否参考使用互联网搜索结果。启用互联网搜索，模型会将搜索结果作为文本生成过程中的参考信息，但模型会基于其内部逻辑“自行判断”是否使用互联网搜索结果。
+      en_US: The model has a built-in Internet search service. This parameter controls whether the model refers to Internet search results when generating text. When Internet search is enabled, the model will use the search results as reference information in the text generation process, but the model will "judge" whether to use Internet search results based on its internal logic.
+  - name: response_format
+    use_template: response_format
+pricing:
+  input: '0.008'
+  output: '0.008'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
index 20b46de6f3..140dc68af8 100644
--- a/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-turbo.yaml
@@ -2,6 +2,8 @@ model: qwen-turbo
 label:
   en_US: qwen-turbo
 model_type: llm
+features:
+  - agent-thought
 model_properties:
   mode: completion
   context_size: 8192
@@ -9,7 +11,7 @@ parameter_rules:
   - name: temperature
     use_template: temperature
     type: float
-    default: 0.85
+    default: 0.3
     min: 0.0
     max: 2.0
     help:
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
new file mode 100644
index 0000000000..f917ccaa5d
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-max.yaml
@@ -0,0 +1,47 @@
+model: qwen-vl-max
+label:
+  en_US: qwen-vl-max
+model_type: llm
+features:
+  - vision
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 8192
+parameter_rules:
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated.
+  - name: seed
+    required: false
+    type: int
+    default: 1234
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    help:
+      zh_Hans: 生成时使用的随机数种子，用于控制模型生成内容的随机性。支持无符号64位整数，默认值为 1234。在使用seed时，模型将尽可能生成相同或相似的结果，但目前不保证每次生成的结果完全相同。
+      en_US: The random number seed used during generation, which controls the randomness of the content generated by the model. Supports unsigned 64-bit integers, default value is 1234. When using seed, the model will try its best to generate the same or similar results, but there is currently no guarantee that the results will be exactly the same every time.
+  - name: response_format
+    use_template: response_format
+pricing:
+  input: '0.02'
+  output: '0.02'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
new file mode 100644
index 0000000000..e2dd8c4e57
--- /dev/null
+++ b/api/core/model_runtime/model_providers/tongyi/llm/qwen-vl-plus.yaml
@@ -0,0 +1,47 @@
+model: qwen-vl-plus
+label:
+  en_US: qwen-vl-plus
+model_type: llm
+features:
+  - vision
+  - agent-thought
+model_properties:
+  mode: chat
+  context_size: 32768
+parameter_rules:
+  - name: top_p
+    use_template: top_p
+    type: float
+    default: 0.8
+    min: 0.1
+    max: 0.9
+    help:
+      zh_Hans: 生成过程中核采样方法概率阈值，例如，取值为0.8时，仅保留概率加起来大于等于0.8的最可能token的最小集合作为候选集。取值范围为（0,1.0)，取值越大，生成的随机性越高；取值越低，生成的确定性越高。
+      en_US: The probability threshold of the nucleus sampling method during the generation process. For example, when the value is 0.8, only the smallest set of the most likely tokens with a sum of probabilities greater than or equal to 0.8 is retained as the candidate set. The value range is (0,1.0). The larger the value, the higher the randomness generated; the lower the value, the higher the certainty generated.
+  - name: top_k
+    type: int
+    min: 0
+    max: 99
+    label:
+      zh_Hans: 取样数量
+      en_US: Top k
+    help:
+      zh_Hans: 生成时，采样候选集的大小。例如，取值为50时，仅将单次生成中得分最高的50个token组成随机采样的候选集。取值越大，生成的随机性越高；取值越小，生成的确定性越高。
+      en_US: The size of the sample candidate set when generated. For example, when the value is 50, only the 50 highest-scoring tokens in a single generation form a randomly sampled candidate set. The larger the value, the higher the randomness generated; the smaller the value, the higher the certainty generated.
+  - name: seed
+    required: false
+    type: int
+    default: 1234
+    label:
+      zh_Hans: 随机种子
+      en_US: Random seed
+    help:
+      zh_Hans: 生成时使用的随机数种子，用于控制模型生成内容的随机性。支持无符号64位整数，默认值为 1234。在使用seed时，模型将尽可能生成相同或相似的结果，但目前不保证每次生成的结果完全相同。
+      en_US: The random number seed used during generation, which controls the randomness of the content generated by the model. Supports unsigned 64-bit integers, default value is 1234. When using seed, the model will try its best to generate the same or similar results, but there is currently no guarantee that the results will be exactly the same every time.
+  - name: response_format
+    use_template: response_format
+pricing:
+  input: '0.008'
+  output: '0.008'
+  unit: '0.001'
+  currency: RMB
diff --git a/api/core/model_runtime/model_providers/tongyi/text_embedding/text_embedding.py b/api/core/model_runtime/model_providers/tongyi/text_embedding/text_embedding.py
index a5f3660fb2..c207ffc1e3 100644
--- a/api/core/model_runtime/model_providers/tongyi/text_embedding/text_embedding.py
+++ b/api/core/model_runtime/model_providers/tongyi/text_embedding/text_embedding.py
@@ -37,8 +37,11 @@ class TongyiTextEmbeddingModel(_CommonTongyi, TextEmbeddingModel):
         :return: embeddings result
         """
         credentials_kwargs = self._to_credential_kwargs(credentials)
-        dashscope.api_key = credentials_kwargs["dashscope_api_key"]
-        embeddings, embedding_used_tokens = self.embed_documents(model, texts)
+        embeddings, embedding_used_tokens = self.embed_documents(
+            credentials_kwargs=credentials_kwargs,
+            model=model,
+            texts=texts
+        )
 
         return TextEmbeddingResult(
             embeddings=embeddings,
@@ -74,17 +77,19 @@ class TongyiTextEmbeddingModel(_CommonTongyi, TextEmbeddingModel):
         try:
             # transform credentials to kwargs for model instance
             credentials_kwargs = self._to_credential_kwargs(credentials)
-            dashscope.api_key = credentials_kwargs["dashscope_api_key"]
+
             # call embedding model
-            self.embed_documents(model=model, texts=["ping"])
+            self.embed_documents(credentials_kwargs=credentials_kwargs, model=model, texts=["ping"])
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
     @staticmethod
-    def embed_documents(model: str, texts: list[str]) -> tuple[list[list[float]], int]:
+    def embed_documents(credentials_kwargs: dict, model: str, texts: list[str]) -> tuple[list[list[float]], int]:
         """Call out to Tongyi's embedding endpoint.
 
         Args:
+            credentials_kwargs: The credentials to use for the call.
+            model: The model to use for embedding.
             texts: The list of texts to embed.
 
         Returns:
@@ -93,7 +98,12 @@ class TongyiTextEmbeddingModel(_CommonTongyi, TextEmbeddingModel):
         embeddings = []
         embedding_used_tokens = 0
         for text in texts:
-            response = dashscope.TextEmbedding.call(model=model, input=text, text_type="document")
+            response = dashscope.TextEmbedding.call(
+                api_key=credentials_kwargs["dashscope_api_key"],
+                model=model,
+                input=text,
+                text_type="document"
+            )
             data = response.output["embeddings"][0]
             embeddings.append(data["embedding"])
             embedding_used_tokens += response.usage["total_tokens"]
diff --git a/api/core/model_runtime/model_providers/tongyi/tts/tts.py b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
index 937f469bdf..b00f7c7c93 100644
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@@ -118,7 +118,6 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         :param content_text: text content to be translated
         :return: text translated to audio file
         """
-        dashscope.api_key = credentials.get('dashscope_api_key')
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
         tts_file_id = self._get_file_name(content_text)
@@ -127,6 +126,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             for sentence in sentences:
                 response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                                      api_key=credentials.get('dashscope_api_key'),
                                                                       text=sentence.strip(),
                                                                       format=audio_type, word_timestamp_enabled=True,
                                                                       phoneme_timestamp_enabled=True)
@@ -146,8 +146,8 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         :param audio_type: audio file type
         :return: text translated to audio file
         """
-        dashscope.api_key = credentials.get('dashscope_api_key')
         response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                              api_key=credentials.get('dashscope_api_key'),
                                                               text=sentence.strip(),
                                                               format=audio_type)
         if isinstance(response.get_audio_data(), bytes):
diff --git a/api/core/model_runtime/model_providers/triton_inference_server/triton_inference_server.yaml b/api/core/model_runtime/model_providers/triton_inference_server/triton_inference_server.yaml
index 50a804743d..ca2fad33ad 100644
--- a/api/core/model_runtime/model_providers/triton_inference_server/triton_inference_server.yaml
+++ b/api/core/model_runtime/model_providers/triton_inference_server/triton_inference_server.yaml
@@ -43,7 +43,7 @@ model_credential_schema:
       placeholder:
         zh_Hans: 在此输入您的上下文大小
         en_US: Enter the context size
-      default: 2048
+      default: '2048'
     - variable: completion_type
       label:
         zh_Hans: 补全类型
@@ -69,16 +69,16 @@ model_credential_schema:
         en_US: Stream output
       type: select
       required: true
-      default: true
+      default: 'true'
       placeholder:
         zh_Hans: 是否支持流式输出
         en_US: Whether to support stream output
       options:
         - label:
             zh_Hans: 是
-            en_US: Yes
-          value: true
+            en_US: 'Yes'
+          value: 'true'
         - label:
             zh_Hans: 否
-            en_US: No
-          value: false
+            en_US: 'No'
+          value: 'false'
diff --git a/api/requirements.txt b/api/requirements.txt
index 87fccbf81b..545379dd2e 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -9,7 +9,6 @@ flask-restful~=0.3.10
 flask-cors~=4.0.0
 gunicorn~=21.2.0
 gevent~=23.9.1
-langchain==0.0.250
 openai~=1.13.3
 tiktoken~=0.6.0
 psycopg2-binary~=2.9.6
@@ -47,7 +46,7 @@ google-search-results==2.4.2
 googleapis-common-protos==1.63.0
 replicate~=0.22.0
 websocket-client~=1.7.0
-dashscope[tokenizer]~=1.14.0
+dashscope[tokenizer]~=1.17.0
 huggingface_hub~=0.16.4
 transformers~=4.35.0
 tokenizers~=0.15.0
@@ -79,4 +78,5 @@ azure-storage-blob==12.9.0
 azure-identity==1.15.0
 lxml==5.1.0
 xlrd~=2.0.1
+pydantic~=1.10.0
 pgvecto-rs==0.1.4
diff --git a/api/tests/unit_tests/core/rag/__init__.py b/api/tests/unit_tests/core/rag/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/tests/unit_tests/core/rag/datasource/__init__.py b/api/tests/unit_tests/core/rag/datasource/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/tests/unit_tests/core/rag/datasource/vdb/__init__.py b/api/tests/unit_tests/core/rag/datasource/vdb/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/tests/unit_tests/core/rag/datasource/vdb/milvus/__init__.py b/api/tests/unit_tests/core/rag/datasource/vdb/milvus/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/api/tests/unittests/test_model.py b/api/tests/unit_tests/core/rag/datasource/vdb/milvus/test_milvus.py
similarity index 100%
rename from api/tests/unittests/test_model.py
rename to api/tests/unit_tests/core/rag/datasource/vdb/milvus/test_milvus.py