Refactoring: Optimization of the Deep Research Module Code Structure (#5959)

This commit refactors the deep research module (deep_research.py). The
major improvements are as follows. The complex thinking and retrieval
logic has been broken down into multiple independent private methods,
which improves code readability and maintainability. Static methods and
class methods have been introduced to simplify the tag-processing logic.
The search and reasoning processes have been reorganized into separate
steps, increasing the modularity of the code and making information
retrieval and processing more flexible. As a result, the refactored code
structure is clearer, and the deep research module is easier to
understand and extend.

### What problem does this PR solve?

It increases the modularity of the code.

### Type of change

- [x] Refactoring

Co-authored-by: wenju.li <wenju.li@deepctr.cn>
This commit is contained in:
liwenju0 2025-03-12 15:34:52 +08:00 committed by GitHub
parent 80f87913bb
commit 870a6e93da
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -36,63 +36,44 @@ class DeepResearcher:
self._kb_retrieve = kb_retrieve
self._kg_retrieve = kg_retrieve
def thinking(self, chunk_info: dict, question: str):
def rm_query_tags(line):
@staticmethod
def _remove_query_tags(text):
"""Remove query tags from text"""
pattern = re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY)
return re.sub(pattern, "", line)
return re.sub(pattern, "", text)
def rm_result_tags(line):
@staticmethod
def _remove_result_tags(text):
"""Remove result tags from text"""
pattern = re.escape(BEGIN_SEARCH_RESULT) + r"(.*?)" + re.escape(END_SEARCH_RESULT)
return re.sub(pattern, "", line)
executed_search_queries = []
msg_hisotry = [{"role": "user", "content": f'Question:\"{question}\"\n'}]
all_reasoning_steps = []
think = "<think>"
for ii in range(MAX_SEARCH_LIMIT + 1):
if ii == MAX_SEARCH_LIMIT - 1:
summary_think = f"\n{BEGIN_SEARCH_RESULT}\nThe maximum search limit is exceeded. You are not allowed to search.\n{END_SEARCH_RESULT}\n"
yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
all_reasoning_steps.append(summary_think)
msg_hisotry.append({"role": "assistant", "content": summary_think})
break
return re.sub(pattern, "", text)
def _generate_reasoning(self, msg_history):
"""Generate reasoning steps"""
query_think = ""
if msg_hisotry[-1]["role"] != "user":
msg_hisotry.append({"role": "user", "content": "Continues reasoning with the new information.\n"})
if msg_history[-1]["role"] != "user":
msg_history.append({"role": "user", "content": "Continues reasoning with the new information.\n"})
else:
msg_hisotry[-1]["content"] += "\n\nContinues reasoning with the new information.\n"
for ans in self.chat_mdl.chat_streamly(REASON_PROMPT, msg_hisotry, {"temperature": 0.7}):
msg_history[-1]["content"] += "\n\nContinues reasoning with the new information.\n"
for ans in self.chat_mdl.chat_streamly(REASON_PROMPT, msg_history, {"temperature": 0.7}):
ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
if not ans:
continue
query_think = ans
yield {"answer": think + rm_query_tags(query_think) + "</think>", "reference": {}, "audio_binary": None}
yield query_think
return query_think
think += rm_query_tags(query_think)
all_reasoning_steps.append(query_think)
def _extract_search_queries(self, query_think, question, step_index):
"""Extract search queries from thinking"""
queries = extract_between(query_think, BEGIN_SEARCH_QUERY, END_SEARCH_QUERY)
if not queries:
if ii > 0:
break
if not queries and step_index == 0:
# If this is the first step and no queries are found, use the original question as the query
queries = [question]
return queries
for search_query in queries:
logging.info(f"[THINK]Query: {ii}. {search_query}")
msg_hisotry.append({"role": "assistant", "content": search_query})
think += f"\n\n> {ii +1}. {search_query}\n\n"
yield {"answer": think + "</think>", "reference": {}, "audio_binary": None}
summary_think = ""
# The search query has been searched in previous steps.
if search_query in executed_search_queries:
summary_think = f"\n{BEGIN_SEARCH_RESULT}\nYou have searched this query. Please refer to previous results.\n{END_SEARCH_RESULT}\n"
yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
all_reasoning_steps.append(summary_think)
msg_hisotry.append({"role": "user", "content": summary_think})
think += summary_think
continue
def _truncate_previous_reasoning(self, all_reasoning_steps):
"""Truncate previous reasoning steps to maintain a reasonable length"""
truncated_prev_reasoning = ""
for i, step in enumerate(all_reasoning_steps):
truncated_prev_reasoning += f"Step {i + 1}: {step}\n\n"
@ -108,41 +89,50 @@ class DeepResearcher:
else:
if truncated_prev_reasoning[-len('\n\n...\n\n'):] != '\n\n...\n\n':
truncated_prev_reasoning += '...\n\n'
truncated_prev_reasoning = truncated_prev_reasoning.strip('\n')
# Retrieval procedure:
# 1. KB search
# 2. Web search (optional)
# 3. KG search (optional)
return truncated_prev_reasoning.strip('\n')
def _retrieve_information(self, search_query):
"""Retrieve information from different sources"""
# Purpose: gather retrieval results for a single search query from up to
# three sources and return them as one dict holding a "chunks" list and a
# "doc_aggs" list.
#
# search_query: passed as `question=` to the KB and KG retrieval callbacks
# and positionally to `Tavily.retrieve_chunks`.
#
# Returns: the `kbinfos` dict built below.
#
# NOTE(review): indentation was lost in this rendering, so the nesting
# described in the comments below is inferred from statement order —
# confirm against the real file.
# 1. Knowledge base retrieval
# When no KB callback is set, start from an empty result so the later
# extend/insert calls still have lists to work on.
kbinfos = self._kb_retrieve(question=search_query) if self._kb_retrieve else {"chunks": [], "doc_aggs": []}
# 2. Web retrieval (if Tavily API is configured)
# Runs only when prompt_config has a truthy "tavily_api_key".
if self.prompt_config.get("tavily_api_key"):
tav = Tavily(self.prompt_config["tavily_api_key"])
tav_res = tav.retrieve_chunks(search_query)
# Web results are appended after the KB results. This extends the lists
# of the dict returned by `_kb_retrieve` in place.
kbinfos["chunks"].extend(tav_res["chunks"])
kbinfos["doc_aggs"].extend(tav_res["doc_aggs"])
# 3. Knowledge graph retrieval (if configured)
# Requires both the "use_kg" setting and a KG callback.
if self.prompt_config.get("use_kg") and self._kg_retrieve:
ck = self._kg_retrieve(question=search_query)
# The KG result is used only when its "content_with_weight" is truthy;
# it is placed at the front of the chunk list, ahead of KB/web chunks.
if ck["content_with_weight"]:
kbinfos["chunks"].insert(0, ck)
# Return the combined results; merging them into the citation state
# (chunk_info) is done separately by `_update_chunk_info`.
return kbinfos
def _update_chunk_info(self, chunk_info, kbinfos):
"""Update chunk information for citations"""
if not chunk_info["chunks"]:
# If this is the first retrieval, use the retrieval results directly
for k in chunk_info.keys():
chunk_info[k] = kbinfos[k]
else:
# Merge newly retrieved information, avoiding duplicates
cids = [c["chunk_id"] for c in chunk_info["chunks"]]
for c in kbinfos["chunks"]:
if c["chunk_id"] in cids:
continue
if c["chunk_id"] not in cids:
chunk_info["chunks"].append(c)
dids = [d["doc_id"] for d in chunk_info["doc_aggs"]]
for d in kbinfos["doc_aggs"]:
if d["doc_id"] in dids:
continue
if d["doc_id"] not in dids:
chunk_info["doc_aggs"].append(d)
think += "\n\n"
def _extract_relevant_info(self, truncated_prev_reasoning, search_query, kbinfos):
"""Extract and summarize relevant information"""
summary_think = ""
for ans in self.chat_mdl.chat_streamly(
RELEVANT_EXTRACTION_PROMPT.format(
prev_reasoning=truncated_prev_reasoning,
@ -156,12 +146,78 @@ class DeepResearcher:
if not ans:
continue
summary_think = ans
yield {"answer": think + rm_result_tags(summary_think) + "</think>", "reference": {}, "audio_binary": None}
yield summary_think
return summary_think
def thinking(self, chunk_info: dict, question: str):
executed_search_queries = []
msg_history = [{"role": "user", "content": f'Question:\"{question}\"\n'}]
all_reasoning_steps = []
think = "<think>"
for step_index in range(MAX_SEARCH_LIMIT + 1):
# Check if the maximum search limit has been reached
if step_index == MAX_SEARCH_LIMIT - 1:
summary_think = f"\n{BEGIN_SEARCH_RESULT}\nThe maximum search limit is exceeded. You are not allowed to search.\n{END_SEARCH_RESULT}\n"
yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
all_reasoning_steps.append(summary_think)
msg_history.append({"role": "assistant", "content": summary_think})
break
# Step 1: Generate reasoning
query_think = ""
for ans in self._generate_reasoning(msg_history):
query_think = ans
yield {"answer": think + self._remove_query_tags(query_think) + "</think>", "reference": {}, "audio_binary": None}
think += self._remove_query_tags(query_think)
all_reasoning_steps.append(query_think)
# Step 2: Extract search queries
queries = self._extract_search_queries(query_think, question, step_index)
if not queries and step_index > 0:
# If not the first step and no queries, end the search process
break
# Process each search query
for search_query in queries:
logging.info(f"[THINK]Query: {step_index}. {search_query}")
msg_history.append({"role": "assistant", "content": search_query})
think += f"\n\n> {step_index + 1}. {search_query}\n\n"
yield {"answer": think + "</think>", "reference": {}, "audio_binary": None}
# Check if the query has already been executed
if search_query in executed_search_queries:
summary_think = f"\n{BEGIN_SEARCH_RESULT}\nYou have searched this query. Please refer to previous results.\n{END_SEARCH_RESULT}\n"
yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
all_reasoning_steps.append(summary_think)
msg_history.append({"role": "user", "content": summary_think})
think += summary_think
continue
executed_search_queries.append(search_query)
# Step 3: Truncate previous reasoning steps
truncated_prev_reasoning = self._truncate_previous_reasoning(all_reasoning_steps)
# Step 4: Retrieve information
kbinfos = self._retrieve_information(search_query)
# Step 5: Update chunk information
self._update_chunk_info(chunk_info, kbinfos)
# Step 6: Extract relevant information
think += "\n\n"
summary_think = ""
for ans in self._extract_relevant_info(truncated_prev_reasoning, search_query, kbinfos):
summary_think = ans
yield {"answer": think + self._remove_result_tags(summary_think) + "</think>", "reference": {}, "audio_binary": None}
all_reasoning_steps.append(summary_think)
msg_hisotry.append(
msg_history.append(
{"role": "user", "content": f"\n\n{BEGIN_SEARCH_RESULT}{summary_think}{END_SEARCH_RESULT}\n\n"})
think += rm_result_tags(summary_think)
logging.info(f"[THINK]Summary: {ii}. {summary_think}")
think += self._remove_result_tags(summary_think)
logging.info(f"[THINK]Summary: {step_index}. {summary_think}")
yield think + "</think>"