fix: xinference last token being ignored (#1013)

This commit is contained in:
Uranus 2023-08-25 18:15:05 +08:00 committed by GitHub
parent 915e26527b
commit 2d9616c29c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -3,8 +3,11 @@ from typing import Optional, List, Any, Union, Generator
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms import Xinference
from langchain.llms.utils import enforce_stop_tokens
from xinference.client import RESTfulChatglmCppChatModelHandle, \ from xinference.client import (
RESTfulChatModelHandle, RESTfulGenerateModelHandle RESTfulChatglmCppChatModelHandle,
RESTfulChatModelHandle,
RESTfulGenerateModelHandle,
)
class XinferenceLLM(Xinference):
@ -29,7 +32,9 @@ class XinferenceLLM(Xinference):
model = self.client.get_model(self.model_uid)
if isinstance(model, RESTfulChatModelHandle):
generate_config: "LlamaCppGenerateConfig" = kwargs.get("generate_config", {}) generate_config: "LlamaCppGenerateConfig" = kwargs.get(
"generate_config", {}
)
if stop:
generate_config["stop"] = stop
@ -48,7 +53,9 @@ class XinferenceLLM(Xinference):
completion = model.chat(prompt=prompt, generate_config=generate_config)
return completion["choices"][0]["message"]["content"]
elif isinstance(model, RESTfulGenerateModelHandle):
generate_config: "LlamaCppGenerateConfig" = kwargs.get("generate_config", {}) generate_config: "LlamaCppGenerateConfig" = kwargs.get(
"generate_config", {}
)
if stop:
generate_config["stop"] = stop
@ -65,10 +72,14 @@ class XinferenceLLM(Xinference):
return combined_text_output
else:
completion = model.generate(prompt=prompt, generate_config=generate_config) completion = model.generate(
prompt=prompt, generate_config=generate_config
)
return completion["choices"][0]["text"]
elif isinstance(model, RESTfulChatglmCppChatModelHandle):
generate_config: "ChatglmCppGenerateConfig" = kwargs.get("generate_config", {}) generate_config: "ChatglmCppGenerateConfig" = kwargs.get(
"generate_config", {}
)
if generate_config and generate_config.get("stream"):
combined_text_output = ""
@ -91,11 +102,20 @@ class XinferenceLLM(Xinference):
def _stream_generate(
self,
model: Union["RESTfulGenerateModelHandle", "RESTfulChatModelHandle", "RESTfulChatglmCppChatModelHandle"], model: Union[
"RESTfulGenerateModelHandle",
"RESTfulChatModelHandle",
"RESTfulChatglmCppChatModelHandle",
],
prompt: str,
run_manager: Optional[CallbackManagerForLLMRun] = None,
generate_config: Optional[
Union["LlamaCppGenerateConfig", "PytorchGenerateConfig", "ChatglmCppGenerateConfig"]] = None, Union[
"LlamaCppGenerateConfig",
"PytorchGenerateConfig",
"ChatglmCppGenerateConfig",
]
] = None,
) -> Generator[str, None, None]:
"""
Args:
@ -108,7 +128,9 @@ class XinferenceLLM(Xinference):
Yields:
A string token.
"""
if isinstance(model, (RESTfulChatModelHandle, RESTfulChatglmCppChatModelHandle)): if isinstance(
model, (RESTfulChatModelHandle, RESTfulChatglmCppChatModelHandle)
):
streaming_response = model.chat(
prompt=prompt, generate_config=generate_config
)
@ -123,14 +145,10 @@ class XinferenceLLM(Xinference):
if choices:
choice = choices[0]
if isinstance(choice, dict):
if 'finish_reason' in choice and choice['finish_reason'] \ if "text" in choice:
and choice['finish_reason'] in ['stop', 'length']:
break
if 'text' in choice:
token = choice.get("text", "")
elif 'delta' in choice and 'content' in choice['delta']: elif "delta" in choice and "content" in choice["delta"]:
token = choice.get('delta').get('content') token = choice.get("delta").get("content")
else:
continue
log_probs = choice.get("logprobs")