Patch summary (preamble text before the first "diff --git" line is not part of any hunk):
  * prompts.py: add an instruction line telling the model to answer in the same
    language as the Assistant's latest response.
  * token_buffer_memory.py: stop pruning once a single prompt message remains,
    instead of popping until the list is empty.
NOTE(review): leading indentation of the context lines and the blank context line
at the top of the second hunk were reconstructed from Python syntax and the hunk
line counts -- confirm against the target files before applying.

diff --git a/api/core/llm_generator/prompts.py b/api/core/llm_generator/prompts.py
index 170a28432b..a1737f00c6 100644
--- a/api/core/llm_generator/prompts.py
+++ b/api/core/llm_generator/prompts.py
@@ -64,6 +64,7 @@ User Input:
 SUGGESTED_QUESTIONS_AFTER_ANSWER_INSTRUCTION_PROMPT = (
     "Please help me predict the three most likely questions that human would ask, "
     "and keeping each question under 20 characters.\n"
+    "MAKE SURE your output is the SAME language as the Assistant's latest response(if the main response is written in Chinese, then the language of your output must be using Chinese.)!\n"
     "The output must be an array in JSON format following the specified schema:\n"
     "[\"question1\",\"question2\",\"question3\"]\n"
 )
diff --git a/api/core/memory/token_buffer_memory.py b/api/core/memory/token_buffer_memory.py
index 21f1965e93..b33d4dd7cb 100644
--- a/api/core/memory/token_buffer_memory.py
+++ b/api/core/memory/token_buffer_memory.py
@@ -103,7 +103,7 @@ class TokenBufferMemory:
 
         if curr_message_tokens > max_token_limit:
             pruned_memory = []
-            while curr_message_tokens > max_token_limit and prompt_messages:
+            while curr_message_tokens > max_token_limit and len(prompt_messages) > 1:
                 pruned_memory.append(prompt_messages.pop(0))
                 curr_message_tokens = self.model_instance.get_llm_num_tokens(
                     prompt_messages