Feat: apply LLM to optimize citations. (#5935)

### What problem does this PR solve?

#5905

Summary of the change: when quoting is enabled, the chat flow now appends a `citation_prompt()` to the system prompt so the LLM emits citation markers (`##<chunk-id>$$`) directly in its answer; the previous embedding-similarity `insert_citations` pass is kept as a fallback for answers that contain no markers. Chunks in the knowledge-base prompt are numbered with global IDs so the model has something to cite.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Commit: caecaa7562 (parent: ed11be23bf)
Author: Kevin Hu, 2025-03-11 19:56:21 +08:00, committed by GitHub
5 changed files with 77 additions and 17 deletions

**Iteration component (`IterationItem`):**

```diff
@@ -38,6 +38,10 @@ class IterationItem(ComponentBase, ABC):
         ans = parent.get_input()
         ans = parent._param.delimiter.join(ans["content"]) if "content" in ans else ""
         ans = [a.strip() for a in ans.split(parent._param.delimiter)]
+        if not ans:
+            self._idx = -1
+            return pd.DataFrame()
         df = pd.DataFrame([{"content": ans[self._idx]}])
         self._idx += 1
         if self._idx >= len(ans):
```
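For context, a minimal standalone sketch of what the guard prevents (the class and method names below are simplified stand-ins for `IterationItem`'s real surroundings): without the early return, `ans[self._idx]` would raise an `IndexError` once the parent component yields no content.

```python
import pandas as pd

class IterationSketch:
    """Simplified stand-in for IterationItem's iteration logic."""

    def __init__(self, delimiter: str = ","):
        self.delimiter = delimiter
        self._idx = 0

    def next_item(self, content: str) -> pd.DataFrame:
        parts = [p.strip() for p in content.split(self.delimiter) if p.strip()]
        if not parts:       # the new guard: nothing to iterate over
            self._idx = -1  # mark the iteration as exhausted
            return pd.DataFrame()
        df = pd.DataFrame([{"content": parts[self._idx]}])
        self._idx += 1
        if self._idx >= len(parts):
            self._idx = -1  # wrap up after the last element
        return df

print(IterationSketch().next_item(""))      # empty DataFrame, no IndexError
print(IterationSketch().next_item("a, b"))  # one row with content == "a"
```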

**Deep-research reasoning prompt (`REASON_PROMPT`):** one guidance line is added so the model retries failed searches with tighter keywords.

```diff
@@ -68,6 +68,7 @@ REASON_PROMPT = (
     f"- You have a dataset to search, so you just provide a proper search query.\n"
     f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n"
     "- The language of query MUST be as the same as 'Question' or 'search result'.\n"
+    "- If no helpful information can be found, rewrite the search query with fewer, more precise keywords.\n"
     "- When done searching, continue your reasoning.\n\n"
     'Please answer the following question. You should think step by step to solve it.\n\n'
 )
```
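As a side note, a hypothetical sketch of the marker protocol this prompt sets up. The concrete values of `BEGIN_SEARCH_QUERY` and `END_SEARCH_QUERY` are defined elsewhere in the codebase, so the tokens below are placeholders, not the real constants.

```python
import re

# Placeholder values; the real constants live alongside REASON_PROMPT.
BEGIN_SEARCH_QUERY = "<|begin_search_query|>"
END_SEARCH_QUERY = "<|end_search_query|>"

def extract_search_query(llm_output: str) -> str | None:
    """Return the query the model wrapped in the search markers, if any."""
    m = re.search(
        re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY),
        llm_output,
        flags=re.DOTALL,
    )
    return m.group(1).strip() if m else None

print(extract_search_query(
    "Let me check. <|begin_search_query|>dogecoin price history<|end_search_query|>"
))  # -> "dogecoin price history"
```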

**Dialog chat flow (`chat(dialog, messages, ...)`):**

```diff
@@ -30,7 +30,8 @@ from api import settings
 from rag.app.resume import forbidden_select_fields4resume
 from rag.app.tag import label_question
 from rag.nlp.search import index_name
-from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format
+from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format, \
+    citation_prompt
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.tavily_conn import Tavily
@@ -235,9 +236,12 @@ def chat(dialog, messages, stream=True, **kwargs):
     gen_conf = dialog.llm_setting
     msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}]
+    prompt4citation = ""
+    if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
+        prompt4citation = citation_prompt()
     msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])}
                 for m in messages if m["role"] != "system"])
-    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.97))
+    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.95))
     assert len(msg) >= 2, f"message_fit_in has bug: {msg}"
     prompt = msg[0]["content"]
@@ -256,14 +260,23 @@ def chat(dialog, messages, stream=True, **kwargs):
             think = ans[0] + "</think>"
             answer = ans[1]
         if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
-            answer, idx = retriever.insert_citations(answer,
-                                                     [ck["content_ltks"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     [ck["vector"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     embd_mdl,
-                                                     tkweight=1 - dialog.vector_similarity_weight,
-                                                     vtweight=dialog.vector_similarity_weight)
+            answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)
+            if not re.search(r"##[0-9]+\$\$", answer):
+                answer, idx = retriever.insert_citations(answer,
+                                                         [ck["content_ltks"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         [ck["vector"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         embd_mdl,
+                                                         tkweight=1 - dialog.vector_similarity_weight,
+                                                         vtweight=dialog.vector_similarity_weight)
+            else:
+                idx = set([])
+                for r in re.finditer(r"##([0-9]+)\$\$", answer):
+                    i = int(r.group(1))
+                    if i < len(kbinfos["chunks"]):
+                        idx.add(i)
             idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
             recall_docs = [
                 d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
@@ -298,7 +311,7 @@ def chat(dialog, messages, stream=True, **kwargs):
     if stream:
         last_ans = ""
         answer = ""
-        for ans in chat_mdl.chat_streamly(prompt, msg[1:], gen_conf):
+        for ans in chat_mdl.chat_streamly(prompt+prompt4citation, msg[1:], gen_conf):
             if thought:
                 ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
             answer = ans
@@ -312,7 +325,7 @@ def chat(dialog, messages, stream=True, **kwargs):
             yield {"answer": thought+answer, "reference": {}, "audio_binary": tts(tts_mdl, delta_ans)}
         yield decorate_answer(thought+answer)
     else:
-        answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
+        answer = chat_mdl.chat(prompt+prompt4citation, msg[1:], gen_conf)
         user_content = msg[-1].get("content", "[content not available]")
         logging.debug("User: {}|Assistant: {}".format(user_content, answer))
         res = decorate_answer(answer)
```
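To make the new control flow concrete, here is a compact sketch of the fallback logic (the real code lives inside `decorate_answer`; `chunks` stands in for `kbinfos["chunks"]`): if the model already emitted `##<id>$$` markers, they are trusted and mapped to document IDs; only a marker-free answer goes through the older similarity-based `insert_citations` pass.

```python
import re

def cited_doc_ids(answer: str, chunks: list[dict]) -> set[str]:
    """Map ##<chunk-id>$$ markers in an answer to the documents they cite."""
    idx = set()
    for m in re.finditer(r"##([0-9]+)\$\$", answer):
        i = int(m.group(1))
        if i < len(chunks):  # drop out-of-range IDs the model invented
            idx.add(i)
    return {chunks[i]["doc_id"] for i in idx}

chunks = [{"doc_id": "doc-a"}, {"doc_id": "doc-b"}]
answer = "Dogecoin is his favorite crypto ##0$$ ##1$$. A bogus reference ##9$$."
print(cited_doc_ids(answer, chunks))  # {'doc-a', 'doc-b'}

# An answer with no markers would instead be routed through
# retriever.insert_citations(...), exactly as before this change.
```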

**Knowledge-base prompt and the new `citation_prompt()` (`kb_prompt`):**

```diff
@@ -108,22 +108,63 @@ def kb_prompt(kbinfos, max_tokens):
     docs = {d.id: d.meta_fields for d in docs}
     doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
-    for ck in kbinfos["chunks"][:chunks_num]:
-        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"])
+    for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
+        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + f"ID: {i}\n" + ck["content_with_weight"])
         doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})
     knowledges = []
     for nm, cks_meta in doc2chunks.items():
-        txt = f"Document: {nm} \n"
+        txt = f"\nDocument: {nm} \n"
         for k, v in cks_meta["meta"].items():
             txt += f"{k}: {v}\n"
         txt += "Relevant fragments as following:\n"
         for i, chunk in enumerate(cks_meta["chunks"], 1):
-            txt += f"{i}. {chunk}\n"
+            txt += f"{chunk}\n"
         knowledges.append(txt)
     return knowledges
+
+
+def citation_prompt():
+    return """
+# Citation requirements:
+- Insert CITATIONS in the format '##i$$ ##j$$', where i and j are the IDs of the chunks you are citing, wrapped in '##' and '$$'.
+- Insert the CITATION symbols at the end of a sentence, with NO MORE than 4 citations per sentence.
+- DO NOT insert a CITATION if the content is not from the retrieved chunks.
+
+--- Example START ---
+<SYSTEM>: Here is the knowledge base:
+
+Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
+URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
+ID: 0
+The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it's still his favorite crypto...
+
+Document: Elon Musk's Dogecoin tweet sparks social media frenzy
+ID: 1
+Musk said he is 'willing to serve' D.O.G.E., shorthand for Dogecoin.
+
+Document: Causal effect of Elon Musk tweets on Dogecoin price
+ID: 2
+If you think of Dogecoin, the cryptocurrency based on a meme, you can't help but also think of Elon Musk...
+
+Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
+ID: 3
+The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...
+
+The above is the knowledge base.
+
+<USER>: What's Elon's view on dogecoin?
+
+<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
+Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
+Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
+--- Example END ---
+"""
+
+
 def keyword_extraction(chat_mdl, content, topn=3):
     prompt = f"""
 Role: You're a text analyzer.
```
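A hypothetical mini-version of the numbering scheme, showing how the two halves fit together: `kb_prompt` now stamps every chunk with a global `ID: <n>` line, and those `n` values are exactly what `citation_prompt()` tells the model to echo back as `##n$$` markers. Names below are simplified; only the formatting idea is taken from the diff.

```python
from collections import defaultdict

def format_knowledge(chunks: list[dict]) -> str:
    """Simplified kb_prompt: group chunks by document, keep one global ID per chunk."""
    doc2chunks = defaultdict(list)
    for i, ck in enumerate(chunks):  # i is the citation ID the LLM must reference
        doc2chunks[ck["docnm_kwd"]].append(f"ID: {i}\n{ck['content_with_weight']}")
    parts = []
    for name, cks in doc2chunks.items():
        txt = f"\nDocument: {name}\nRelevant fragments:\n"
        txt += "\n".join(cks) + "\n"
        parts.append(txt)
    return "".join(parts)

chunks = [
    {"docnm_kwd": "musk.md", "content_with_weight": "Dogecoin is his favorite."},
    {"docnm_kwd": "doge.md", "content_with_weight": "Tweets move the price."},
]
print(format_knowledge(chunks))  # chunk 0 in musk.md, chunk 1 in doge.md
```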

**Tavily web-search connector (`Tavily`):**

```diff
@@ -27,7 +27,8 @@ class Tavily:
         try:
             response = self.tavily_client.search(
                 query=query,
-                search_depth="advanced"
+                search_depth="advanced",
+                max_results=6
             )
             return [{"url": res["url"], "title": res["title"], "content": res["content"], "score": res["score"]} for res in response["results"]]
         except Exception as e:
```
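For reference, a minimal usage sketch of the changed call, assuming the `tavily-python` client (the API key is a placeholder): `max_results=6` simply caps how many web hits get merged into the retrieved knowledge.

```python
from tavily import TavilyClient

client = TavilyClient(api_key="tvly-...")  # placeholder key
response = client.search(
    query="Elon Musk Dogecoin",
    search_depth="advanced",
    max_results=6,  # the new cap introduced by this PR
)
for res in response["results"]:
    print(res["score"], res["url"])
```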