Feat: apply LLM to optimize citations. (#5935)
### What problem does this PR solve?

#5905

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
parent: ed11be23bf
commit: caecaa7562
@@ -38,6 +38,10 @@ class IterationItem(ComponentBase, ABC):
         ans = parent.get_input()
         ans = parent._param.delimiter.join(ans["content"]) if "content" in ans else ""
         ans = [a.strip() for a in ans.split(parent._param.delimiter)]
+        if not ans:
+            self._idx = -1
+            return pd.DataFrame()
+
         df = pd.DataFrame([{"content": ans[self._idx]}])
         self._idx += 1
         if self._idx >= len(ans):
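The new guard returns early when the delimiter split yields nothing to iterate, instead of indexing into an empty list. A minimal sketch of the pattern, with names assumed from the hunk rather than taken from the actual component:

import pandas as pd

def next_item(items: list[str], idx: int) -> tuple[pd.DataFrame, int]:
    # Mirrors the guard above: with nothing to iterate, reset the cursor
    # to -1 and return an empty frame instead of raising an IndexError.
    if not items:
        return pd.DataFrame(), -1
    df = pd.DataFrame([{"content": items[idx]}])
    return df, idx + 1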
@@ -68,6 +68,7 @@ REASON_PROMPT = (
     f"- You have a dataset to search, so you just provide a proper search query.\n"
     f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n"
     "- The language of query MUST be as the same as 'Question' or 'search result'.\n"
+    "- If no helpful information can be found, rewrite the search query to be less and precise keywords.\n"
     "- When done searching, continue your reasoning.\n\n"
     'Please answer the following question. You should think step by step to solve it.\n\n'
)
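REASON_PROMPT asks the model to wrap each dataset query between the BEGIN_SEARCH_QUERY and END_SEARCH_QUERY markers so the orchestrating code can pull the queries back out of the model output. A sketch of that extraction, using placeholder marker values (the real constants are defined alongside REASON_PROMPT):

import re

# Placeholder marker values for illustration only; the real constants
# are imported from the module that defines REASON_PROMPT.
BEGIN_SEARCH_QUERY = "<|begin_search_query|>"
END_SEARCH_QUERY = "<|end_search_query|>"

def extract_search_queries(llm_output: str) -> list[str]:
    # Grab every span the model wrapped between the two markers.
    pattern = re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY)
    return [q.strip() for q in re.findall(pattern, llm_output, flags=re.DOTALL)]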
@@ -30,7 +30,8 @@ from api import settings
 from rag.app.resume import forbidden_select_fields4resume
 from rag.app.tag import label_question
 from rag.nlp.search import index_name
-from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format
+from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question, chunks_format, \
+    citation_prompt
 from rag.utils import rmSpace, num_tokens_from_string
 from rag.utils.tavily_conn import Tavily
 
@@ -235,9 +236,12 @@ def chat(dialog, messages, stream=True, **kwargs):
     gen_conf = dialog.llm_setting
 
     msg = [{"role": "system", "content": prompt_config["system"].format(**kwargs)}]
+    prompt4citation = ""
+    if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
+        prompt4citation = citation_prompt()
     msg.extend([{"role": m["role"], "content": re.sub(r"##\d+\$\$", "", m["content"])}
                 for m in messages if m["role"] != "system"])
-    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.97))
+    used_token_count, msg = message_fit_in(msg, int(max_tokens * 0.95))
     assert len(msg) >= 2, f"message_fit_in has bug: {msg}"
     prompt = msg[0]["content"]
 
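Two coupled changes here: the citation instructions (prompt4citation) are built only when retrieved knowledge exists and quoting is enabled, and the history-fitting budget drops from 97% to 95% of max_tokens, presumably to leave headroom for those extra instructions, which are appended to the prompt after fitting. A simplified stand-in for the fitting step, not the actual message_fit_in:

def fit_messages(msg: list[dict], budget: int, count_tokens) -> list[dict]:
    # Simplified stand-in for message_fit_in: drop the oldest non-system
    # turns until the conversation fits the token budget.
    while len(msg) > 2 and sum(count_tokens(m["content"]) for m in msg) > budget:
        msg.pop(1)  # keep msg[0] (system prompt) and the latest turns
    return msg

# With a 0.95 budget, roughly 5% of the context window stays free for the
# citation instructions later sent as prompt + prompt4citation.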
@@ -256,14 +260,23 @@ def chat(dialog, messages, stream=True, **kwargs):
             think = ans[0] + "</think>"
             answer = ans[1]
         if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
-            answer, idx = retriever.insert_citations(answer,
-                                                     [ck["content_ltks"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     [ck["vector"]
-                                                      for ck in kbinfos["chunks"]],
-                                                     embd_mdl,
-                                                     tkweight=1 - dialog.vector_similarity_weight,
-                                                     vtweight=dialog.vector_similarity_weight)
+            answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)
+            if not re.search(r"##[0-9]+\$\$", answer):
+                answer, idx = retriever.insert_citations(answer,
+                                                         [ck["content_ltks"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         [ck["vector"]
+                                                          for ck in kbinfos["chunks"]],
+                                                         embd_mdl,
+                                                         tkweight=1 - dialog.vector_similarity_weight,
+                                                         vtweight=dialog.vector_similarity_weight)
+            else:
+                idx = set([])
+                for r in re.finditer(r"##([0-9]+)\$\$", answer):
+                    i = int(r.group(1))
+                    if i < len(kbinfos["chunks"]):
+                        idx.add(i)
+
         idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
         recall_docs = [
             d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
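This is the core of the feature: if the model already emitted ##<id>$$ markers as instructed by citation_prompt, they are harvested directly; only otherwise does the code fall back to the older embedding-based insert_citations. The ##i$$/##j$$ tokens stripped first are the literal placeholders the model may copy verbatim from the prompt's format description. A standalone sketch of the harvesting branch:

import re

def harvest_citation_ids(answer: str, n_chunks: int) -> set[int]:
    # Strip literal placeholders the model may have copied verbatim
    # from the citation format description.
    answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)
    # Collect every in-range chunk ID cited as ##<id>$$.
    ids = set()
    for m in re.finditer(r"##([0-9]+)\$\$", answer):
        i = int(m.group(1))
        if i < n_chunks:  # ignore hallucinated out-of-range IDs
            ids.add(i)
    return ids

# harvest_citation_ids("Dogecoin is his favorite ##0$$ ##1$$.", 4) -> {0, 1}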
@@ -298,7 +311,7 @@ def chat(dialog, messages, stream=True, **kwargs):
     if stream:
         last_ans = ""
         answer = ""
-        for ans in chat_mdl.chat_streamly(prompt, msg[1:], gen_conf):
+        for ans in chat_mdl.chat_streamly(prompt+prompt4citation, msg[1:], gen_conf):
             if thought:
                 ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
             answer = ans
@@ -312,7 +325,7 @@ def chat(dialog, messages, stream=True, **kwargs):
             yield {"answer": thought+answer, "reference": {}, "audio_binary": tts(tts_mdl, delta_ans)}
         yield decorate_answer(thought+answer)
     else:
-        answer = chat_mdl.chat(prompt, msg[1:], gen_conf)
+        answer = chat_mdl.chat(prompt+prompt4citation, msg[1:], gen_conf)
         user_content = msg[-1].get("content", "[content not available]")
         logging.debug("User: {}|Assistant: {}".format(user_content, answer))
         res = decorate_answer(answer)
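Both call sites now pass prompt + prompt4citation, so the citation rules reach the LLM in the streaming and non-streaming paths alike. The delta bookkeeping visible in the surrounding context lines (last_ans, delta_ans) follows the usual cumulative-stream pattern; a sketch under that assumption:

def stream_deltas(cumulative_chunks):
    # Assumes chat_streamly yields the full answer-so-far on each step,
    # so the piece handed to TTS is the suffix beyond the last yield.
    last_ans = ""
    for ans in cumulative_chunks:
        delta_ans = ans[len(last_ans):]
        last_ans = ans
        yield delta_ans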
@@ -108,22 +108,63 @@ def kb_prompt(kbinfos, max_tokens):
     docs = {d.id: d.meta_fields for d in docs}
 
     doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
-    for ck in kbinfos["chunks"][:chunks_num]:
-        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"])
+    for i, ck in enumerate(kbinfos["chunks"][:chunks_num]):
+        doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + f"ID: {i}\n" + ck["content_with_weight"])
         doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})
 
     knowledges = []
     for nm, cks_meta in doc2chunks.items():
-        txt = f"Document: {nm} \n"
+        txt = f"\nDocument: {nm} \n"
         for k, v in cks_meta["meta"].items():
             txt += f"{k}: {v}\n"
         txt += "Relevant fragments as following:\n"
         for i, chunk in enumerate(cks_meta["chunks"], 1):
-            txt += f"{i}. {chunk}\n"
+            txt += f"{chunk}\n"
         knowledges.append(txt)
     return knowledges
 
 
+def citation_prompt():
+    return """
+
+# Citation requirements:
+- Inserts CITATIONS in format '##i$$ ##j$$' where i,j are the ID of the content you are citing and encapsulated with '##' and '$$'.
+- Inserts the CITATION symbols at the end of a sentence, AND NO MORE than 4 citations.
+- DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
+
+--- Example START ---
+<SYSTEM>: Here is the knowledge base:
+
+Document: Elon Musk Breaks Silence on Crypto, Warns Against Dogecoin ...
+URL: https://blockworks.co/news/elon-musk-crypto-dogecoin
+ID: 0
+The Tesla co-founder advised against going all-in on dogecoin, but Elon Musk said it’s still his favorite crypto...
+
+Document: Elon Musk's Dogecoin tweet sparks social media frenzy
+ID: 1
+Musk said he is 'willing to serve' D.O.G.E. – shorthand for Dogecoin.
+
+Document: Causal effect of Elon Musk tweets on Dogecoin price
+ID: 2
+If you think of Dogecoin — the cryptocurrency based on a meme — you can’t help but also think of Elon Musk...
+
+Document: Elon Musk's Tweet Ignites Dogecoin's Future In Public Services
+ID: 3
+The market is heating up after Elon Musk's announcement about Dogecoin. Is this a new era for crypto?...
+
+The above is the knowledge base.
+
+<USER>: What's the Elon's view on dogecoin?
+
+<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
+Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
+Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
+
+--- Example END ---
+
+"""
+
+
 def keyword_extraction(chat_mdl, content, topn=3):
     prompt = f"""
 Role: You're a text analyzer.
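With the global "ID: <i>" line now prefixed to every chunk, the rendered knowledge base gives the model stable indices to cite, matching the ##i$$ format that citation_prompt demands. A small illustration of what one rendered block could look like, using hypothetical input data:

chunks = [
    {"docnm_kwd": "musk.md", "doc_id": "d1", "url": "https://example.com/a",
     "content_with_weight": "Musk calls Dogecoin his favorite crypto."},
    {"docnm_kwd": "musk.md", "doc_id": "d1",
     "content_with_weight": "His tweets have repeatedly moved the Dogecoin price."},
]
# The kb_prompt loop above would render roughly:
#
# Document: musk.md
# Relevant fragments as following:
# URL: https://example.com/a
# ID: 0
# Musk calls Dogecoin his favorite crypto.
# ID: 1
# His tweets have repeatedly moved the Dogecoin price.
#
# The model can then cite these fragments as ##0$$ and ##1$$, which chat()
# maps back to doc_ids when building the reference list.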
@@ -27,7 +27,8 @@ class Tavily:
         try:
             response = self.tavily_client.search(
                 query=query,
-                search_depth="advanced"
+                search_depth="advanced",
+                max_results=6
             )
             return [{"url": res["url"], "title": res["title"], "content": res["content"], "score": res["score"]} for res in response["results"]]
         except Exception as e:
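Capping max_results bounds how many web hits feed into the prompt alongside the dataset chunks. For reference, a minimal standalone use of the tavily-python client with the same parameters (the API key is a hypothetical placeholder):

from tavily import TavilyClient  # pip install tavily-python

client = TavilyClient(api_key="tvly-...")  # hypothetical key
response = client.search(
    query="Elon Musk Dogecoin",
    search_depth="advanced",
    max_results=6,  # same cap as the hunk above
)
for res in response["results"]:
    print(res["score"], res["url"])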