mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 22:38:58 +08:00
Refa: change citation mark as [ID:n] (#7923)
### What problem does this PR solve?

Change the citation mark to [ID:n], since it is easier for LLMs to follow the instruction :) #7904

### Type of change

- [x] Refactoring
This commit is contained in:
parent
7c098f9fd1
commit
0c562f0a9f
@ -127,6 +127,14 @@ def chat_solo(dialog, messages, stream=True):
|
||||
yield {"answer": answer, "reference": {}, "audio_binary": tts(tts_mdl, answer), "prompt": "", "created_at": time.time()}
|
||||
|
||||
|
||||
BAD_CITATION_PATTERNS = [
|
||||
re.compile(r"\(\s*ID\s*[: ]*\s*(\d+)\s*\)"), # (ID: 12)
|
||||
re.compile(r"\[\s*ID\s*[: ]*\s*(\d+)\s*\]"), # [ID: 12]
|
||||
re.compile(r"【\s*ID\s*[: ]*\s*(\d+)\s*】"), # 【ID: 12】
|
||||
re.compile(r"ref\s*(\d+)", flags=re.IGNORECASE), # ref12、REF 12
|
||||
]
|
||||
|
||||
|
||||
def chat(dialog, messages, stream=True, **kwargs):
|
||||
assert messages[-1]["role"] == "user", "The last content of this conversation is not from user."
|
||||
if not dialog.kb_ids:
|
||||
@ -311,27 +319,22 @@ def chat(dialog, messages, stream=True, **kwargs):
|
||||
return True
|
||||
return False
|
||||
|
||||
def find_and_replace(pattern, group_index=1, repl=lambda i: f"##{i}$$", flags=0):
|
||||
def find_and_replace(pattern, group_index=1, repl=lambda i: f"ID:{i}", flags=0):
|
||||
nonlocal answer
|
||||
for match in re.finditer(pattern, answer, flags=flags):
|
||||
|
||||
def replacement(match):
|
||||
try:
|
||||
i = int(match.group(group_index))
|
||||
if safe_add(i):
|
||||
answer = answer.replace(match.group(0), repl(i))
|
||||
return f"[{repl(i)}]"
|
||||
except Exception:
|
||||
continue
|
||||
pass
|
||||
return match.group(0)
|
||||
|
||||
find_and_replace(r"\(\s*ID:\s*(\d+)\s*\)") # (ID: 12)
|
||||
find_and_replace(r"ID[: ]+(\d+)") # ID: 12, ID 12
|
||||
find_and_replace(r"\$\$(\d+)\$\$") # $$12$$
|
||||
find_and_replace(r"\$\[(\d+)\]\$") # $[12]$
|
||||
find_and_replace(r"\$\$(\d+)\${2,}") # $$12$$$$
|
||||
find_and_replace(r"\$(\d+)\$") # $12$
|
||||
find_and_replace(r"(#{2,})(\d+)(\${2,})", group_index=2) # 2+ # and 2+ $
|
||||
find_and_replace(r"(#{2,})(\d+)(#{1,})", group_index=2) # 2+ # and 1+ #
|
||||
find_and_replace(r"##(\d+)#{2,}") # ##12###
|
||||
find_and_replace(r"【(\d+)】") # 【12】
|
||||
find_and_replace(r"ref\s*(\d+)", flags=re.IGNORECASE) # ref12, ref 12, REF 12
|
||||
answer = re.sub(pattern, replacement, answer, flags=flags)
|
||||
|
||||
for pattern in BAD_CITATION_PATTERNS:
|
||||
find_and_replace(pattern)
|
||||
|
||||
return answer, idx
|
||||
|
||||
@ -346,9 +349,8 @@ def chat(dialog, messages, stream=True, **kwargs):
|
||||
answer = ans[1]
|
||||
|
||||
if knowledges and (prompt_config.get("quote", True) and kwargs.get("quote", True)):
|
||||
answer = re.sub(r"##[ij]\$\$", "", answer, flags=re.DOTALL)
|
||||
idx = set([])
|
||||
if not re.search(r"##[0-9]+\$\$", answer):
|
||||
if not re.search(r"\[ID:([0-9]+)\]", answer):
|
||||
answer, idx = retriever.insert_citations(
|
||||
answer,
|
||||
[ck["content_ltks"] for ck in kbinfos["chunks"]],
|
||||
@ -358,7 +360,7 @@ def chat(dialog, messages, stream=True, **kwargs):
|
||||
vtweight=dialog.vector_similarity_weight,
|
||||
)
|
||||
else:
|
||||
for match in re.finditer(r"##([0-9]+)\$\$", answer):
|
||||
for match in re.finditer(r"\[ID:([0-9]+)\]", answer):
|
||||
i = int(match.group(1))
|
||||
if i < len(kbinfos["chunks"]):
|
||||
idx.add(i)
|
||||
|
@ -245,7 +245,7 @@ class Dealer:
|
||||
for c in cites[i]:
|
||||
if c in seted:
|
||||
continue
|
||||
res += f" ##{c}$$"
|
||||
res += f" [ID:{c}]"
|
||||
seted.add(c)
|
||||
|
||||
return res, seted
|
||||
|
@ -136,16 +136,18 @@ def kb_prompt(kbinfos, max_tokens):
|
||||
|
||||
|
||||
def citation_prompt():
|
||||
print("USE PROMPT", flush=True)
|
||||
return """
|
||||
|
||||
# Citation requirements:
|
||||
- Inserts CITATIONS in format '##i$$ ##j$$' where i,j are the ID of the content you are citing and encapsulated with '##' and '$$'.
|
||||
- Inserts the CITATION symbols at the end of a sentence, AND NO MORE than 4 citations.
|
||||
|
||||
- Use a uniform citation format of like [ID:i] [ID:j], where "i" and "j" are the document ID enclosed in square brackets. Separate multiple IDs with spaces (e.g., [ID:0] [ID:1]).
|
||||
- Citation markers must be placed at the end of a sentence, separated by a space from the final punctuation (e.g., period, question mark). A maximum of 4 citations are allowed per sentence.
|
||||
- DO NOT insert CITATION in the answer if the content is not from retrieved chunks.
|
||||
- DO NOT use standalone Document IDs (e.g., '#ID#').
|
||||
- Under NO circumstances any other citation styles or formats (e.g., '~~i==', '[i]', '(i)', etc.) be used.
|
||||
- Citations ALWAYS the '##i$$' format.
|
||||
- Any failure to adhere to the above rules, including but not limited to incorrect formatting, use of prohibited styles, or unsupported citations, will be considered a error, should skip adding Citation for this sentence.
|
||||
- Citations ALWAYS in the "[ID:i]" format.
|
||||
- STRICTLY prohibit the use of strikethrough symbols (e.g., ~~) or any other non-standard formatting syntax.
|
||||
- Any failure to adhere to the above rules, including but not limited to incorrect formatting, use of prohibited styles, or unsupported citations, will be considered an error, and no citation will be added for that sentence.
|
||||
|
||||
--- Example START ---
|
||||
<SYSTEM>: Here is the knowledge base:
|
||||
@ -171,8 +173,8 @@ The market is heating up after Elon Musk's announcement about Dogecoin. Is this
|
||||
|
||||
<USER>: What's the Elon's view on dogecoin?
|
||||
|
||||
<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency ##0$$ ##1$$.
|
||||
Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services ##3$$.
|
||||
<ASSISTANT>: Musk has consistently expressed his fondness for Dogecoin, often citing its humor and the inclusion of dogs in its branding. He has referred to it as his favorite cryptocurrency [ID:0] [ID:1].
|
||||
Recently, Musk has hinted at potential future roles for Dogecoin. His tweets have sparked speculation about Dogecoin's potential integration into public services [ID:3].
|
||||
Overall, while Musk enjoys Dogecoin and often promotes it, he also warns against over-investing in it, reflecting both his personal amusement and caution regarding its speculative nature.
|
||||
|
||||
--- Example END ---
|
||||
@ -309,6 +311,7 @@ Output: What's the weather in Rochester on {tomorrow}?
|
||||
ans = re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
|
||||
return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"]
|
||||
|
||||
|
||||
def cross_languages(tenant_id, llm_id, query, languages=[]):
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
|
||||
@ -348,11 +351,11 @@ Bonjour le monde ! Parlons de la sécurité de l'IA.
|
||||
###
|
||||
こんにちは世界!AIの安全性について話し合いましょう。
|
||||
"""
|
||||
user_prompt=f"""
|
||||
user_prompt = f"""
|
||||
Input:
|
||||
{query}
|
||||
===
|
||||
{', '.join(languages)}
|
||||
{", ".join(languages)}
|
||||
|
||||
Output:
|
||||
"""
|
||||
|
Loading…
x
Reference in New Issue
Block a user