Refine search query. (#5235)

### What problem does this PR solve?

#5173 #5214

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2025-07-16 07:11:46 +08:00 · 2025-02-21 18:32:32 +08:00 · 2025-02-21 18:32:32 +08:00 · 3444cb15e3
commit 3444cb15e3
parent 0151d42156
2 changed files with 85 additions and 87 deletions
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@ -876,12 +876,16 @@ def reasoning(chunk_info: dict, question: str, chat_mdl: LLMBundle, embd_mdl: LL
        "Once you have all the information you need, continue your reasoning.\n\n"
        "-- Example --\n"
        "Question: \"Find the minimum number of vertices in a Steiner tree that includes all specified vertices in a given tree.\"\n"
        "Assistant thinking steps:\n"
        "- I need to understand what a Steiner tree is and how to compute the minimum number of vertices required to include all specified vertices in a given tree.\n\n"
        "Assistant:\n"
-        f"{BEGIN_SEARCH_QUERY}Minimum Steiner Tree problem in trees{END_SEARCH_QUERY}\n\n"
+        "  - I need to understand what a Steiner tree is.\n\n" 
-        "(System returns processed information from relevant web pages)\n\n"
+        f"    {BEGIN_SEARCH_QUERY}What's Steiner tree{END_SEARCH_QUERY}\n\n"
-        "Assistant continues reasoning with the new information...\n\n"
+        f"    {BEGIN_SEARCH_RESULT}\n(System returns processed information from relevant web pages)\n{END_SEARCH_RESULT}\n\n"
        "User:\nContinues reasoning with the new information.\n\n"
        "Assistant:\n"
        "  - I need to understand what the difference between minimum number of vertices and edges in the Steiner tree is.\n\n" 
        f"    {BEGIN_SEARCH_QUERY}What's the difference between minimum number of vertices and edges in the Steiner tree{END_SEARCH_QUERY}\n\n"
        f"    {BEGIN_SEARCH_RESULT}\n(System returns processed information from relevant web pages)\n{END_SEARCH_RESULT}\n\n"
        "User:\nContinues reasoning with the new information...\n\n"
        "**Remember**:\n"
        f"- You have a dataset to search, so you just provide a proper search query.\n"
        f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n"
@ -943,7 +947,7 @@ def reasoning(chunk_info: dict, question: str, chat_mdl: LLMBundle, embd_mdl: LL
        query_think = ""
        if msg_hisotry[-1]["role"] != "user":
-            msg_hisotry.append({"role": "user", "content": "Continues reasoning with the new information...\n"})
+            msg_hisotry.append({"role": "user", "content": "Continues reasoning with the new information.\n"})
        for ans in chat_mdl.chat_streamly(reason_prompt, msg_hisotry, {"temperature": 0.7}):
            ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
            if not ans:
@ -954,15 +958,13 @@ def reasoning(chunk_info: dict, question: str, chat_mdl: LLMBundle, embd_mdl: LL
        think += rm_query_tags(query_think)
        all_reasoning_steps.append(query_think)
        msg_hisotry.append({"role": "assistant", "content": query_think})
-        search_query = extract_between(query_think, BEGIN_SEARCH_QUERY, END_SEARCH_QUERY)
+        queries = extract_between(query_think, BEGIN_SEARCH_QUERY, END_SEARCH_QUERY)
-        if not search_query:
+        if not queries:
            if ii > 0:
                break
-            search_query = question
+            queries = [question]
            txt = f"\n{BEGIN_SEARCH_QUERY}{question}{END_SEARCH_QUERY}\n\n"
            think += txt
            msg_hisotry[-1]["content"] += txt
        for search_query in queries:
            logging.info(f"[THINK]Query: {ii}. {search_query}")
            think += f"\n\n> {ii+1}. {search_query}\n\n"
            yield {"answer": think + "</think>", "reference": {}, "audio_binary": None}
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@ -17,7 +17,6 @@
 import logging
 import random
 from collections import Counter
 from typing import Optional
 from rag.utils import num_tokens_from_string
 from . import rag_tokenizer
@ -604,9 +603,6 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):
    return cks, images
-def extract_between(text: str, start_tag: str, end_tag: str) -> Optional[str]:
+def extract_between(text: str, start_tag: str, end_tag: str) -> list[str]:
    pattern = re.escape(start_tag) + r"(.*?)" + re.escape(end_tag)
-    matches = re.findall(pattern, text, flags=re.DOTALL)
+    return re.findall(pattern, text, flags=re.DOTALL)
-    if matches:
-        return matches[-1].strip()
-    return None