diff --git a/Dockerfile b/Dockerfile
index 1b231ad62..7c518d411 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -196,6 +196,7 @@ COPY deepdoc deepdoc
COPY rag rag
COPY agent agent
COPY graphrag graphrag
+COPY agentic_reasoning agentic_reasoning
COPY pyproject.toml uv.lock ./
COPY docker/service_conf.yaml.template ./conf/service_conf.yaml.template
diff --git a/agent/component/generate.py b/agent/component/generate.py
index ea12f9040..b9ed75c38 100644
--- a/agent/component/generate.py
+++ b/agent/component/generate.py
@@ -18,10 +18,10 @@ from functools import partial
import pandas as pd
from api.db import LLMType
from api.db.services.conversation_service import structure_answer
-from api.db.services.dialog_service import message_fit_in
from api.db.services.llm_service import LLMBundle
from api import settings
from agent.component.base import ComponentBase, ComponentParamBase
+from rag.prompts import message_fit_in
class GenerateParam(ComponentParamBase):
diff --git a/agent/component/retrieval.py b/agent/component/retrieval.py
index 766035c7a..50c6c7963 100644
--- a/agent/component/retrieval.py
+++ b/agent/component/retrieval.py
@@ -19,11 +19,11 @@ from abc import ABC
import pandas as pd
from api.db import LLMType
-from api.db.services.dialog_service import label_question
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api import settings
from agent.component.base import ComponentBase, ComponentParamBase
+from rag.app.tag import label_question
class RetrievalParam(ComponentParamBase):
diff --git a/agentic_reasoning/__init__.py b/agentic_reasoning/__init__.py
new file mode 100644
index 000000000..1422de46e
--- /dev/null
+++ b/agentic_reasoning/__init__.py
@@ -0,0 +1 @@
+from .deep_research import DeepResearcher as DeepResearcher
\ No newline at end of file
diff --git a/agentic_reasoning/deep_research.py b/agentic_reasoning/deep_research.py
new file mode 100644
index 000000000..f61c3a881
--- /dev/null
+++ b/agentic_reasoning/deep_research.py
@@ -0,0 +1,167 @@
+#
+# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import logging
+import re
+from functools import partial
+from agentic_reasoning.prompts import BEGIN_SEARCH_QUERY, BEGIN_SEARCH_RESULT, END_SEARCH_RESULT, MAX_SEARCH_LIMIT, \
+ END_SEARCH_QUERY, REASON_PROMPT, RELEVANT_EXTRACTION_PROMPT
+from api.db.services.llm_service import LLMBundle
+from rag.nlp import extract_between
+from rag.prompts import kb_prompt
+from rag.utils.tavily_conn import Tavily
+
+
+class DeepResearcher:
+ def __init__(self,
+ chat_mdl: LLMBundle,
+ prompt_config: dict,
+ kb_retrieve: partial = None,
+ kg_retrieve: partial = None
+ ):
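+        # kb_retrieve / kg_retrieve are optional pre-bound retrieval callables (functools.partial);
+        # when either is absent, the corresponding knowledge-base or knowledge-graph search is skipped.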
+ self.chat_mdl = chat_mdl
+ self.prompt_config = prompt_config
+ self._kb_retrieve = kb_retrieve
+ self._kg_retrieve = kg_retrieve
+
+ def thinking(self, chunk_info: dict, question: str):
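+        # Generator: streams partial answers as dicts ({"answer", "reference", "audio_binary"}) while
+        # reasoning and searching, and finally yields the accumulated reasoning text as a plain string.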
+ def rm_query_tags(line):
+ pattern = re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY)
+ return re.sub(pattern, "", line)
+
+ def rm_result_tags(line):
+ pattern = re.escape(BEGIN_SEARCH_RESULT) + r"(.*?)" + re.escape(END_SEARCH_RESULT)
+ return re.sub(pattern, "", line)
+
+ executed_search_queries = []
+ msg_hisotry = [{"role": "user", "content": f'Question:\"{question}\"\n'}]
+ all_reasoning_steps = []
+        think = "<think>"
+ for ii in range(MAX_SEARCH_LIMIT + 1):
+ if ii == MAX_SEARCH_LIMIT - 1:
+ summary_think = f"\n{BEGIN_SEARCH_RESULT}\nThe maximum search limit is exceeded. You are not allowed to search.\n{END_SEARCH_RESULT}\n"
+                yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
+ all_reasoning_steps.append(summary_think)
+ msg_hisotry.append({"role": "assistant", "content": summary_think})
+ break
+
+ query_think = ""
+ if msg_hisotry[-1]["role"] != "user":
+ msg_hisotry.append({"role": "user", "content": "Continues reasoning with the new information.\n"})
+ else:
+ msg_hisotry[-1]["content"] += "\n\nContinues reasoning with the new information.\n"
+ for ans in self.chat_mdl.chat_streamly(REASON_PROMPT, msg_hisotry, {"temperature": 0.7}):
+                ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
+ if not ans:
+ continue
+ query_think = ans
+                yield {"answer": think + rm_query_tags(query_think) + "</think>", "reference": {}, "audio_binary": None}
+
+ think += rm_query_tags(query_think)
+ all_reasoning_steps.append(query_think)
+ queries = extract_between(query_think, BEGIN_SEARCH_QUERY, END_SEARCH_QUERY)
+ if not queries:
+ if ii > 0:
+ break
+ queries = [question]
+
+ for search_query in queries:
+ logging.info(f"[THINK]Query: {ii}. {search_query}")
+ msg_hisotry.append({"role": "assistant", "content": search_query})
+ think += f"\n\n> {ii +1}. {search_query}\n\n"
+                yield {"answer": think + "</think>", "reference": {}, "audio_binary": None}
+
+ summary_think = ""
+ # The search query has been searched in previous steps.
+ if search_query in executed_search_queries:
+ summary_think = f"\n{BEGIN_SEARCH_RESULT}\nYou have searched this query. Please refer to previous results.\n{END_SEARCH_RESULT}\n"
+                    yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
+ all_reasoning_steps.append(summary_think)
+ msg_hisotry.append({"role": "user", "content": summary_think})
+ think += summary_think
+ continue
+
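+                # Compact the reasoning history: keep the first step, the last four steps and any step
+                # carrying search tags; elide the rest so the extraction prompt stays small.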
+ truncated_prev_reasoning = ""
+ for i, step in enumerate(all_reasoning_steps):
+ truncated_prev_reasoning += f"Step {i + 1}: {step}\n\n"
+
+ prev_steps = truncated_prev_reasoning.split('\n\n')
+ if len(prev_steps) <= 5:
+ truncated_prev_reasoning = '\n\n'.join(prev_steps)
+ else:
+ truncated_prev_reasoning = ''
+ for i, step in enumerate(prev_steps):
+ if i == 0 or i >= len(prev_steps) - 4 or BEGIN_SEARCH_QUERY in step or BEGIN_SEARCH_RESULT in step:
+ truncated_prev_reasoning += step + '\n\n'
+ else:
+ if truncated_prev_reasoning[-len('\n\n...\n\n'):] != '\n\n...\n\n':
+ truncated_prev_reasoning += '...\n\n'
+ truncated_prev_reasoning = truncated_prev_reasoning.strip('\n')
+
+ # Retrieval procedure:
+ # 1. KB search
+ # 2. Web search (optional)
+ # 3. KG search (optional)
+ kbinfos = self._kb_retrieve(question=search_query) if self._kb_retrieve else {"chunks": [], "doc_aggs": []}
+
+ if self.prompt_config.get("tavily_api_key"):
+ tav = Tavily(self.prompt_config["tavily_api_key"])
+                    tav_res = tav.retrieve_chunks(search_query)
+ kbinfos["chunks"].extend(tav_res["chunks"])
+ kbinfos["doc_aggs"].extend(tav_res["doc_aggs"])
+ if self.prompt_config.get("use_kg") and self._kg_retrieve:
+ ck = self._kg_retrieve(question=search_query)
+ if ck["content_with_weight"]:
+ kbinfos["chunks"].insert(0, ck)
+
+ # Merge chunk info for citations
+ if not chunk_info["chunks"]:
+ for k in chunk_info.keys():
+ chunk_info[k] = kbinfos[k]
+ else:
+ cids = [c["chunk_id"] for c in chunk_info["chunks"]]
+ for c in kbinfos["chunks"]:
+ if c["chunk_id"] in cids:
+ continue
+ chunk_info["chunks"].append(c)
+ dids = [d["doc_id"] for d in chunk_info["doc_aggs"]]
+ for d in kbinfos["doc_aggs"]:
+ if d["doc_id"] in dids:
+ continue
+ chunk_info["doc_aggs"].append(d)
+
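+                # Ask the LLM to distill the retrieved chunks with respect to the current search query,
+                # streaming the summary back as it is generated.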
+ think += "\n\n"
+ for ans in self.chat_mdl.chat_streamly(
+ RELEVANT_EXTRACTION_PROMPT.format(
+ prev_reasoning=truncated_prev_reasoning,
+ search_query=search_query,
+ document="\n".join(kb_prompt(kbinfos, 4096))
+ ),
+ [{"role": "user",
+ "content": f'Now you should analyze each web page and find helpful information based on the current search query "{search_query}" and previous reasoning steps.'}],
+ {"temperature": 0.7}):
+                    ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
+ if not ans:
+ continue
+ summary_think = ans
+                    yield {"answer": think + rm_result_tags(summary_think) + "</think>", "reference": {}, "audio_binary": None}
+
+ all_reasoning_steps.append(summary_think)
+ msg_hisotry.append(
+ {"role": "user", "content": f"\n\n{BEGIN_SEARCH_RESULT}{summary_think}{END_SEARCH_RESULT}\n\n"})
+ think += rm_result_tags(summary_think)
+ logging.info(f"[THINK]Summary: {ii}. {summary_think}")
+
+        yield think + "</think>"
diff --git a/agentic_reasoning/prompts.py b/agentic_reasoning/prompts.py
new file mode 100644
index 000000000..610409af1
--- /dev/null
+++ b/agentic_reasoning/prompts.py
@@ -0,0 +1,112 @@
+#
+# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
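+# Sentinel tags the reasoning model is instructed to emit around its search queries, and that the
+# framework wraps around retrieved results; DeepResearcher parses and strips them while streaming.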
+BEGIN_SEARCH_QUERY = "<|begin_search_query|>"
+END_SEARCH_QUERY = "<|end_search_query|>"
+BEGIN_SEARCH_RESULT = "<|begin_search_result|>"
+END_SEARCH_RESULT = "<|end_search_result|>"
+MAX_SEARCH_LIMIT = 6
+
+REASON_PROMPT = (
+ "You are a reasoning assistant with the ability to perform dataset searches to help "
+ "you answer the user's question accurately. You have special tools:\n\n"
+ f"- To perform a search: write {BEGIN_SEARCH_QUERY} your query here {END_SEARCH_QUERY}.\n"
+ f"Then, the system will search and analyze relevant content, then provide you with helpful information in the format {BEGIN_SEARCH_RESULT} ...search results... {END_SEARCH_RESULT}.\n\n"
+ f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
+ "Once you have all the information you need, continue your reasoning.\n\n"
+ "-- Example 1 --\n" ########################################
+ "Question: \"Are both the directors of Jaws and Casino Royale from the same country?\"\n"
+ "Assistant:\n"
+ f" {BEGIN_SEARCH_QUERY}Who is the director of Jaws?{END_SEARCH_QUERY}\n\n"
+ "User:\n"
+ f" {BEGIN_SEARCH_RESULT}\nThe director of Jaws is Steven Spielberg...\n{END_SEARCH_RESULT}\n\n"
+ "Continues reasoning with the new information.\n"
+ "Assistant:\n"
+ f" {BEGIN_SEARCH_QUERY}Where is Steven Spielberg from?{END_SEARCH_QUERY}\n\n"
+ "User:\n"
+ f" {BEGIN_SEARCH_RESULT}\nSteven Allan Spielberg is an American filmmaker...\n{END_SEARCH_RESULT}\n\n"
+ "Continues reasoning with the new information...\n\n"
+ "Assistant:\n"
+ f" {BEGIN_SEARCH_QUERY}Who is the director of Casino Royale?{END_SEARCH_QUERY}\n\n"
+ "User:\n"
+ f" {BEGIN_SEARCH_RESULT}\nCasino Royale is a 2006 spy film directed by Martin Campbell...\n{END_SEARCH_RESULT}\n\n"
+ "Continues reasoning with the new information...\n\n"
+ "Assistant:\n"
+ f" {BEGIN_SEARCH_QUERY}Where is Martin Campbell from?{END_SEARCH_QUERY}\n\n"
+ "User:\n"
+ f" {BEGIN_SEARCH_RESULT}\nMartin Campbell (born 24 October 1943) is a New Zealand film and television director...\n{END_SEARCH_RESULT}\n\n"
+ "Continues reasoning with the new information...\n\n"
+ "Assistant:\nIt's enough to answer the question\n"
+
+ "-- Example 2 --\n" #########################################
+ "Question: \"When was the founder of craigslist born?\"\n"
+ "Assistant:\n"
+ f" {BEGIN_SEARCH_QUERY}Who was the founder of craigslist?{END_SEARCH_QUERY}\n\n"
+ "User:\n"
+ f" {BEGIN_SEARCH_RESULT}\nCraigslist was founded by Craig Newmark...\n{END_SEARCH_RESULT}\n\n"
+ "Continues reasoning with the new information.\n"
+ "Assistant:\n"
+ f" {BEGIN_SEARCH_QUERY} When was Craig Newmark born?{END_SEARCH_QUERY}\n\n"
+ "User:\n"
+ f" {BEGIN_SEARCH_RESULT}\nCraig Newmark was born on December 6, 1952...\n{END_SEARCH_RESULT}\n\n"
+ "Continues reasoning with the new information...\n\n"
+ "Assistant:\nIt's enough to answer the question\n"
+ "**Remember**:\n"
+ f"- You have a dataset to search, so you just provide a proper search query.\n"
+ f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n"
+ "- The language of query MUST be as the same as 'Question' or 'search result'.\n"
+ "- When done searching, continue your reasoning.\n\n"
+ 'Please answer the following question. You should think step by step to solve it.\n\n'
+ )
+
+RELEVANT_EXTRACTION_PROMPT = """**Task Instruction:**
+
+ You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question.
+
+ **Guidelines:**
+
+ 1. **Analyze the Searched Web Pages:**
+ - Carefully review the content of each searched web page.
+ - Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question.
+
+ 2. **Extract Relevant Information:**
+ - Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**.
+ - Ensure that the extracted information is accurate and relevant.
+
+ 3. **Output Format:**
+ - **If the web pages provide helpful information for current search query:** Present the information beginning with `**Final Information**` as shown below.
+ - The language of query **MUST BE** as the same as 'Search Query' or 'Web Pages'.\n"
+ **Final Information**
+
+ [Helpful information]
+
+ - **If the web pages do not provide any helpful information for current search query:** Output the following text.
+
+ **Final Information**
+
+ No helpful information found.
+
+ **Inputs:**
+ - **Previous Reasoning Steps:**
+ {prev_reasoning}
+
+ - **Current Search Query:**
+ {search_query}
+
+ - **Searched Web Pages:**
+ {document}
+
+ """
diff --git a/api/apps/api_app.py b/api/apps/api_app.py
index cb260b6b2..533cffaee 100644
--- a/api/apps/api_app.py
+++ b/api/apps/api_app.py
@@ -25,7 +25,7 @@ from api.db import FileType, LLMType, ParserType, FileSource
from api.db.db_models import APIToken, Task, File
from api.db.services import duplicate_name
from api.db.services.api_service import APITokenService, API4ConversationService
-from api.db.services.dialog_service import DialogService, chat, keyword_extraction, label_question
+from api.db.services.dialog_service import DialogService, chat
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
@@ -38,6 +38,8 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge
generate_confirmation_token
from api.utils.file_utils import filename_type, thumbnail
+from rag.app.tag import label_question
+from rag.prompts import keyword_extraction
from rag.utils.storage_factory import STORAGE_IMPL
from api.db.services.canvas_service import UserCanvasService
diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py
index 0de3db753..aff1ba866 100644
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -19,9 +19,10 @@ import json
from flask import request
from flask_login import login_required, current_user
-from api.db.services.dialog_service import keyword_extraction, label_question
from rag.app.qa import rmPrefix, beAdoc
+from rag.app.tag import label_question
from rag.nlp import search, rag_tokenizer
+from rag.prompts import keyword_extraction
from rag.settings import PAGERANK_FLD
from rag.utils import rmSpace
from api.db import LLMType, ParserType
diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py
index 7abe52a4d..eba943a47 100644
--- a/api/apps/conversation_app.py
+++ b/api/apps/conversation_app.py
@@ -25,13 +25,14 @@ from flask import request, Response
from flask_login import login_required, current_user
from api.db import LLMType
-from api.db.services.dialog_service import DialogService, chat, ask, label_question
+from api.db.services.dialog_service import DialogService, chat, ask
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle, TenantService
from api import settings
from api.utils.api_utils import get_json_result
from api.utils.api_utils import server_error_response, get_data_error_result, validate_request
from graphrag.general.mind_map_extractor import MindMapExtractor
+from rag.app.tag import label_question
@manager.route('/set', methods=['POST']) # noqa: F821
diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py
index 0ccb468bd..4a1ff183a 100644
--- a/api/apps/llm_app.py
+++ b/api/apps/llm_app.py
@@ -152,6 +152,7 @@ def add_llm():
elif factory == "Tencent Cloud":
req["api_key"] = apikey_json(["tencent_cloud_sid", "tencent_cloud_sk"])
+ return set_api_key()
elif factory == "Bedrock":
# For Bedrock, due to its special authentication method
diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py
index 166f97049..5d6a8c896 100644
--- a/api/apps/sdk/dify_retrieval.py
+++ b/api/apps/sdk/dify_retrieval.py
@@ -16,11 +16,11 @@
from flask import request, jsonify
from api.db import LLMType
-from api.db.services.dialog_service import label_question
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle
from api import settings
from api.utils.api_utils import validate_request, build_error_result, apikey_required
+from rag.app.tag import label_question
@manager.route('/dify/retrieval', methods=['POST']) # noqa: F821
diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index 98e2c4227..db3242251 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -16,7 +16,6 @@
import pathlib
import datetime
-from api.db.services.dialog_service import keyword_extraction, label_question
from rag.app.qa import rmPrefix, beAdoc
from rag.nlp import rag_tokenizer
from api.db import LLMType, ParserType
@@ -39,6 +38,8 @@ from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.utils.api_utils import construct_json_result, get_parser_config
from rag.nlp import search
+from rag.prompts import keyword_extraction
+from rag.app.tag import label_question
from rag.utils import rmSpace
from rag.utils.storage_factory import STORAGE_IMPL
diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py
index 0a4ab0f0e..6126ef150 100644
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -15,31 +15,23 @@
#
import logging
import binascii
-import os
-import json
import time
-
-import json_repair
+from functools import partial
import re
-from collections import defaultdict
from copy import deepcopy
from timeit import default_timer as timer
-import datetime
-from datetime import timedelta
+from agentic_reasoning import DeepResearcher
from api.db import LLMType, ParserType, StatusEnum
from api.db.db_models import Dialog, DB
from api.db.services.common_service import CommonService
-from api.db.services.document_service import DocumentService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import TenantLLMService, LLMBundle
from api import settings
-from graphrag.utils import get_tags_from_cache, set_tags_to_cache
from rag.app.resume import forbidden_select_fields4resume
-from rag.nlp import extract_between
+from rag.app.tag import label_question
from rag.nlp.search import index_name
-from rag.settings import TAG_FLD
-from rag.utils import rmSpace, num_tokens_from_string, encoder
-from api.utils.file_utils import get_project_base_directory
+from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question
+from rag.utils import rmSpace, num_tokens_from_string
from rag.utils.tavily_conn import Tavily
@@ -69,109 +61,6 @@ class DialogService(CommonService):
return list(chats.dicts())
-def message_fit_in(msg, max_length=4000):
- def count():
- nonlocal msg
- tks_cnts = []
- for m in msg:
- tks_cnts.append(
- {"role": m["role"], "count": num_tokens_from_string(m["content"])})
- total = 0
- for m in tks_cnts:
- total += m["count"]
- return total
-
- c = count()
- if c < max_length:
- return c, msg
-
- msg_ = [m for m in msg[:-1] if m["role"] == "system"]
- if len(msg) > 1:
- msg_.append(msg[-1])
- msg = msg_
- c = count()
- if c < max_length:
- return c, msg
-
- ll = num_tokens_from_string(msg_[0]["content"])
- ll2 = num_tokens_from_string(msg_[-1]["content"])
- if ll / (ll + ll2) > 0.8:
- m = msg_[0]["content"]
- m = encoder.decode(encoder.encode(m)[:max_length - ll2])
- msg[0]["content"] = m
- return max_length, msg
-
- m = msg_[1]["content"]
- m = encoder.decode(encoder.encode(m)[:max_length - ll2])
- msg[1]["content"] = m
- return max_length, msg
-
-
-def llm_id2llm_type(llm_id):
- llm_id, _ = TenantLLMService.split_model_name_and_factory(llm_id)
- fnm = os.path.join(get_project_base_directory(), "conf")
- llm_factories = json.load(open(os.path.join(fnm, "llm_factories.json"), "r"))
- for llm_factory in llm_factories["factory_llm_infos"]:
- for llm in llm_factory["llm"]:
- if llm_id == llm["llm_name"]:
- return llm["model_type"].strip(",")[-1]
-
-
-def kb_prompt(kbinfos, max_tokens):
- knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
- used_token_count = 0
- chunks_num = 0
- for i, c in enumerate(knowledges):
- used_token_count += num_tokens_from_string(c)
- chunks_num += 1
- if max_tokens * 0.97 < used_token_count:
- knowledges = knowledges[:i]
- logging.warning(f"Not all the retrieval into prompt: {i+1}/{len(knowledges)}")
- break
-
- docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]])
- docs = {d.id: d.meta_fields for d in docs}
-
- doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
- for ck in kbinfos["chunks"][:chunks_num]:
- doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"])
- doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})
-
- knowledges = []
- for nm, cks_meta in doc2chunks.items():
- txt = f"Document: {nm} \n"
- for k, v in cks_meta["meta"].items():
- txt += f"{k}: {v}\n"
- txt += "Relevant fragments as following:\n"
- for i, chunk in enumerate(cks_meta["chunks"], 1):
- txt += f"{i}. {chunk}\n"
- knowledges.append(txt)
- return knowledges
-
-
-def label_question(question, kbs):
- tags = None
- tag_kb_ids = []
- for kb in kbs:
- if kb.parser_config.get("tag_kb_ids"):
- tag_kb_ids.extend(kb.parser_config["tag_kb_ids"])
- if tag_kb_ids:
- all_tags = get_tags_from_cache(tag_kb_ids)
- if not all_tags:
- all_tags = settings.retrievaler.all_tags_in_portion(kb.tenant_id, tag_kb_ids)
- set_tags_to_cache(all_tags, tag_kb_ids)
- else:
- all_tags = json.loads(all_tags)
- tag_kbs = KnowledgebaseService.get_by_ids(tag_kb_ids)
- tags = settings.retrievaler.tag_query(question,
- list(set([kb.tenant_id for kb in tag_kbs])),
- tag_kb_ids,
- all_tags,
- kb.parser_config.get("topn_tags", 3)
- )
- return tags
-
-
def chat_solo(dialog, messages, stream=True):
if llm_id2llm_type(dialog.llm_id) == "image2text":
chat_mdl = LLMBundle(dialog.tenant_id, LLMType.IMAGE2TEXT, dialog.llm_id)
@@ -297,7 +186,11 @@ def chat(dialog, messages, stream=True, **kwargs):
knowledges = []
if prompt_config.get("reasoning", False):
- for think in reasoning(kbinfos, " ".join(questions), chat_mdl, embd_mdl, tenant_ids, dialog.kb_ids, prompt_config, MAX_SEARCH_LIMIT=3):
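+        # Delegate multi-step "deep research" reasoning to DeepResearcher, binding the dialog's
+        # retrieval settings into a partial so the reasoner can search the knowledge bases itself.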
+ reasoner = DeepResearcher(chat_mdl,
+ prompt_config,
+ partial(retriever.retrieval, embd_mdl=embd_mdl, tenant_ids=tenant_ids, kb_ids=dialog.kb_ids, page=1, page_size=dialog.top_n, similarity_threshold=0.2, vector_similarity_weight=0.3))
+
+ for think in reasoner.thinking(kbinfos, " ".join(questions)):
if isinstance(think, str):
thought = think
knowledges = [t for t in think.split("\n") if t]
@@ -552,175 +445,6 @@ Please write the SQL, only SQL, without any other explanations or text.
}
-def relevant(tenant_id, llm_id, question, contents: list):
- if llm_id2llm_type(llm_id) == "image2text":
- chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
- else:
- chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
- prompt = """
- You are a grader assessing relevance of a retrieved document to a user question.
- It does not need to be a stringent test. The goal is to filter out erroneous retrievals.
- If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant.
- Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.
- No other words needed except 'yes' or 'no'.
- """
- if not contents:
- return False
- contents = "Documents: \n" + " - ".join(contents)
- contents = f"Question: {question}\n" + contents
- if num_tokens_from_string(contents) >= chat_mdl.max_length - 4:
- contents = encoder.decode(encoder.encode(contents)[:chat_mdl.max_length - 4])
- ans = chat_mdl.chat(prompt, [{"role": "user", "content": contents}], {"temperature": 0.01})
- if ans.lower().find("yes") >= 0:
- return True
- return False
-
-
-def rewrite(tenant_id, llm_id, question):
- if llm_id2llm_type(llm_id) == "image2text":
- chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
- else:
- chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
- prompt = """
- You are an expert at query expansion to generate a paraphrasing of a question.
- I can't retrieval relevant information from the knowledge base by using user's question directly.
- You need to expand or paraphrase user's question by multiple ways such as using synonyms words/phrase,
- writing the abbreviation in its entirety, adding some extra descriptions or explanations,
- changing the way of expression, translating the original question into another language (English/Chinese), etc.
- And return 5 versions of question and one is from translation.
- Just list the question. No other words are needed.
- """
- ans = chat_mdl.chat(prompt, [{"role": "user", "content": question}], {"temperature": 0.8})
- return ans
-
-
-def keyword_extraction(chat_mdl, content, topn=3):
- prompt = f"""
-Role: You're a text analyzer.
-Task: extract the most important keywords/phrases of a given piece of text content.
-Requirements:
- - Summarize the text content, and give top {topn} important keywords/phrases.
- - The keywords MUST be in language of the given piece of text content.
- - The keywords are delimited by ENGLISH COMMA.
- - Keywords ONLY in output.
-
-### Text Content
-{content}
-
-"""
- msg = [
- {"role": "system", "content": prompt},
- {"role": "user", "content": "Output: "}
- ]
- _, msg = message_fit_in(msg, chat_mdl.max_length)
- kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
- if isinstance(kwd, tuple):
- kwd = kwd[0]
-    kwd = re.sub(r"<think>.*</think>", "", kwd, flags=re.DOTALL)
- if kwd.find("**ERROR**") >= 0:
- return ""
- return kwd
-
-
-def question_proposal(chat_mdl, content, topn=3):
- prompt = f"""
-Role: You're a text analyzer.
-Task: propose {topn} questions about a given piece of text content.
-Requirements:
- - Understand and summarize the text content, and propose top {topn} important questions.
- - The questions SHOULD NOT have overlapping meanings.
- - The questions SHOULD cover the main content of the text as much as possible.
- - The questions MUST be in language of the given piece of text content.
- - One question per line.
- - Question ONLY in output.
-
-### Text Content
-{content}
-
-"""
- msg = [
- {"role": "system", "content": prompt},
- {"role": "user", "content": "Output: "}
- ]
- _, msg = message_fit_in(msg, chat_mdl.max_length)
- kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
- if isinstance(kwd, tuple):
- kwd = kwd[0]
-    kwd = re.sub(r"<think>.*</think>", "", kwd, flags=re.DOTALL)
- if kwd.find("**ERROR**") >= 0:
- return ""
- return kwd
-
-
-def full_question(tenant_id, llm_id, messages):
- if llm_id2llm_type(llm_id) == "image2text":
- chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
- else:
- chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
- conv = []
- for m in messages:
- if m["role"] not in ["user", "assistant"]:
- continue
- conv.append("{}: {}".format(m["role"].upper(), m["content"]))
- conv = "\n".join(conv)
- today = datetime.date.today().isoformat()
- yesterday = (datetime.date.today() - timedelta(days=1)).isoformat()
- tomorrow = (datetime.date.today() + timedelta(days=1)).isoformat()
- prompt = f"""
-Role: A helpful assistant
-
-Task and steps:
- 1. Generate a full user question that would follow the conversation.
- 2. If the user's question involves relative date, you need to convert it into absolute date based on the current date, which is {today}. For example: 'yesterday' would be converted to {yesterday}.
-
-Requirements & Restrictions:
- - Text generated MUST be in the same language of the original user's question.
- - If the user's latest question is completely, don't do anything, just return the original question.
- - DON'T generate anything except a refined question.
-
-######################
--Examples-
-######################
-
-# Example 1
-## Conversation
-USER: What is the name of Donald Trump's father?
-ASSISTANT: Fred Trump.
-USER: And his mother?
-###############
-Output: What's the name of Donald Trump's mother?
-
-------------
-# Example 2
-## Conversation
-USER: What is the name of Donald Trump's father?
-ASSISTANT: Fred Trump.
-USER: And his mother?
-ASSISTANT: Mary Trump.
-User: What's her full name?
-###############
-Output: What's the full name of Donald Trump's mother Mary Trump?
-
-------------
-# Example 3
-## Conversation
-USER: What's the weather today in London?
-ASSISTANT: Cloudy.
-USER: What's about tomorrow in Rochester?
-###############
-Output: What's the weather in Rochester on {tomorrow}?
-######################
-
-# Real Data
-## Conversation
-{conv}
-###############
- """
- ans = chat_mdl.chat(prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.2})
-    ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
- return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"]
-
-
def tts(tts_mdl, text):
if not tts_mdl or not text:
return
@@ -796,298 +520,3 @@ def ask(question, kb_ids, tenant_id):
yield decorate_answer(answer)
-def content_tagging(chat_mdl, content, all_tags, examples, topn=3):
- prompt = f"""
-Role: You're a text analyzer.
-
-Task: Tag (put on some labels) to a given piece of text content based on the examples and the entire tag set.
-
-Steps::
- - Comprehend the tag/label set.
- - Comprehend examples which all consist of both text content and assigned tags with relevance score in format of JSON.
- - Summarize the text content, and tag it with top {topn} most relevant tags from the set of tag/label and the corresponding relevance score.
-
-Requirements
- - The tags MUST be from the tag set.
- - The output MUST be in JSON format only, the key is tag and the value is its relevance score.
- - The relevance score must be range from 1 to 10.
- - Keywords ONLY in output.
-
-# TAG SET
-{", ".join(all_tags)}
-
-"""
- for i, ex in enumerate(examples):
- prompt += """
-# Examples {}
-### Text Content
-{}
-
-Output:
-{}
-
- """.format(i, ex["content"], json.dumps(ex[TAG_FLD], indent=2, ensure_ascii=False))
-
- prompt += f"""
-# Real Data
-### Text Content
-{content}
-
-"""
- msg = [
- {"role": "system", "content": prompt},
- {"role": "user", "content": "Output: "}
- ]
- _, msg = message_fit_in(msg, chat_mdl.max_length)
- kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.5})
- if isinstance(kwd, tuple):
- kwd = kwd[0]
-    kwd = re.sub(r"<think>.*</think>", "", kwd, flags=re.DOTALL)
- if kwd.find("**ERROR**") >= 0:
- raise Exception(kwd)
-
- try:
- return json_repair.loads(kwd)
- except json_repair.JSONDecodeError:
- try:
- result = kwd.replace(prompt[:-1], '').replace('user', '').replace('model', '').strip()
- result = '{' + result.split('{')[1].split('}')[0] + '}'
- return json_repair.loads(result)
- except Exception as e:
- logging.exception(f"JSON parsing error: {result} -> {e}")
- raise e
-
-
-def reasoning(chunk_info: dict, question: str, chat_mdl: LLMBundle, embd_mdl: LLMBundle,
- tenant_ids: list[str], kb_ids: list[str], prompt_config, MAX_SEARCH_LIMIT: int = 6,
- top_n: int = 5, similarity_threshold: float = 0.4, vector_similarity_weight: float = 0.3):
- BEGIN_SEARCH_QUERY = "<|begin_search_query|>"
- END_SEARCH_QUERY = "<|end_search_query|>"
- BEGIN_SEARCH_RESULT = "<|begin_search_result|>"
- END_SEARCH_RESULT = "<|end_search_result|>"
-
- def rm_query_tags(line):
- pattern = re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY)
- return re.sub(pattern, "", line)
-
- def rm_result_tags(line):
- pattern = re.escape(BEGIN_SEARCH_RESULT) + r"(.*?)" + re.escape(END_SEARCH_RESULT)
- return re.sub(pattern, "", line)
-
- reason_prompt = (
- "You are a reasoning assistant with the ability to perform dataset searches to help "
- "you answer the user's question accurately. You have special tools:\n\n"
- f"- To perform a search: write {BEGIN_SEARCH_QUERY} your query here {END_SEARCH_QUERY}.\n"
- f"Then, the system will search and analyze relevant content, then provide you with helpful information in the format {BEGIN_SEARCH_RESULT} ...search results... {END_SEARCH_RESULT}.\n\n"
- f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n"
- "Once you have all the information you need, continue your reasoning.\n\n"
- "-- Example 1 --\n" ########################################
- "Question: \"Are both the directors of Jaws and Casino Royale from the same country?\"\n"
- "Assistant:\n"
- f" {BEGIN_SEARCH_QUERY}Who is the director of Jaws?{END_SEARCH_QUERY}\n\n"
- "User:\n"
- f" {BEGIN_SEARCH_RESULT}\nThe director of Jaws is Steven Spielberg...\n{END_SEARCH_RESULT}\n\n"
- "Continues reasoning with the new information.\n"
- "Assistant:\n"
- f" {BEGIN_SEARCH_QUERY}Where is Steven Spielberg from?{END_SEARCH_QUERY}\n\n"
- "User:\n"
- f" {BEGIN_SEARCH_RESULT}\nSteven Allan Spielberg is an American filmmaker...\n{END_SEARCH_RESULT}\n\n"
- "Continues reasoning with the new information...\n\n"
- "Assistant:\n"
- f" {BEGIN_SEARCH_QUERY}Who is the director of Casino Royale?{END_SEARCH_QUERY}\n\n"
- "User:\n"
- f" {BEGIN_SEARCH_RESULT}\nCasino Royale is a 2006 spy film directed by Martin Campbell...\n{END_SEARCH_RESULT}\n\n"
- "Continues reasoning with the new information...\n\n"
- "Assistant:\n"
- f" {BEGIN_SEARCH_QUERY}Where is Martin Campbell from?{END_SEARCH_QUERY}\n\n"
- "User:\n"
- f" {BEGIN_SEARCH_RESULT}\nMartin Campbell (born 24 October 1943) is a New Zealand film and television director...\n{END_SEARCH_RESULT}\n\n"
- "Continues reasoning with the new information...\n\n"
- "Assistant:\nIt's enough to answer the question\n"
-
- "-- Example 2 --\n" #########################################
- "Question: \"When was the founder of craigslist born?\"\n"
- "Assistant:\n"
- f" {BEGIN_SEARCH_QUERY}Who was the founder of craigslist?{END_SEARCH_QUERY}\n\n"
- "User:\n"
- f" {BEGIN_SEARCH_RESULT}\nCraigslist was founded by Craig Newmark...\n{END_SEARCH_RESULT}\n\n"
- "Continues reasoning with the new information.\n"
- "Assistant:\n"
- f" {BEGIN_SEARCH_QUERY} When was Craig Newmark born?{END_SEARCH_QUERY}\n\n"
- "User:\n"
- f" {BEGIN_SEARCH_RESULT}\nCraig Newmark was born on December 6, 1952...\n{END_SEARCH_RESULT}\n\n"
- "Continues reasoning with the new information...\n\n"
- "Assistant:\nIt's enough to answer the question\n"
- "**Remember**:\n"
- f"- You have a dataset to search, so you just provide a proper search query.\n"
- f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n"
- "- The language of query MUST be as the same as 'Question' or 'search result'.\n"
- "- When done searching, continue your reasoning.\n\n"
- 'Please answer the following question. You should think step by step to solve it.\n\n'
- )
-
- relevant_extraction_prompt = """**Task Instruction:**
-
- You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question.
-
- **Guidelines:**
-
- 1. **Analyze the Searched Web Pages:**
- - Carefully review the content of each searched web page.
- - Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question.
-
- 2. **Extract Relevant Information:**
- - Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**.
- - Ensure that the extracted information is accurate and relevant.
-
- 3. **Output Format:**
- - **If the web pages provide helpful information for current search query:** Present the information beginning with `**Final Information**` as shown below.
- - The language of query **MUST BE** as the same as 'Search Query' or 'Web Pages'.\n"
- **Final Information**
-
- [Helpful information]
-
- - **If the web pages do not provide any helpful information for current search query:** Output the following text.
-
- **Final Information**
-
- No helpful information found.
-
- **Inputs:**
- - **Previous Reasoning Steps:**
- {prev_reasoning}
-
- - **Current Search Query:**
- {search_query}
-
- - **Searched Web Pages:**
- {document}
-
- """
-
- executed_search_queries = []
- msg_hisotry = [{"role": "user", "content": f'Question:\"{question}\"\n'}]
- all_reasoning_steps = []
-    think = "<think>"
- for ii in range(MAX_SEARCH_LIMIT + 1):
- if ii == MAX_SEARCH_LIMIT - 1:
- summary_think = f"\n{BEGIN_SEARCH_RESULT}\nThe maximum search limit is exceeded. You are not allowed to search.\n{END_SEARCH_RESULT}\n"
-            yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
- all_reasoning_steps.append(summary_think)
- msg_hisotry.append({"role": "assistant", "content": summary_think})
- break
-
- query_think = ""
- if msg_hisotry[-1]["role"] != "user":
- msg_hisotry.append({"role": "user", "content": "Continues reasoning with the new information.\n"})
- else:
- msg_hisotry[-1]["content"] += "\n\nContinues reasoning with the new information.\n"
- for ans in chat_mdl.chat_streamly(reason_prompt, msg_hisotry, {"temperature": 0.7}):
-            ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
- if not ans:
- continue
- query_think = ans
-            yield {"answer": think + rm_query_tags(query_think) + "</think>", "reference": {}, "audio_binary": None}
-
- think += rm_query_tags(query_think)
- all_reasoning_steps.append(query_think)
- queries = extract_between(query_think, BEGIN_SEARCH_QUERY, END_SEARCH_QUERY)
- if not queries:
- if ii > 0:
- break
- queries = [question]
-
- for search_query in queries:
- logging.info(f"[THINK]Query: {ii}. {search_query}")
- msg_hisotry.append({"role": "assistant", "content": search_query})
- think += f"\n\n> {ii+1}. {search_query}\n\n"
-            yield {"answer": think + "</think>", "reference": {}, "audio_binary": None}
-
- summary_think = ""
- # The search query has been searched in previous steps.
- if search_query in executed_search_queries:
- summary_think = f"\n{BEGIN_SEARCH_RESULT}\nYou have searched this query. Please refer to previous results.\n{END_SEARCH_RESULT}\n"
-                yield {"answer": think + summary_think + "</think>", "reference": {}, "audio_binary": None}
- all_reasoning_steps.append(summary_think)
- msg_hisotry.append({"role": "user", "content": summary_think})
- think += summary_think
- continue
-
- truncated_prev_reasoning = ""
- for i, step in enumerate(all_reasoning_steps):
- truncated_prev_reasoning += f"Step {i + 1}: {step}\n\n"
-
- prev_steps = truncated_prev_reasoning.split('\n\n')
- if len(prev_steps) <= 5:
- truncated_prev_reasoning = '\n\n'.join(prev_steps)
- else:
- truncated_prev_reasoning = ''
- for i, step in enumerate(prev_steps):
- if i == 0 or i >= len(prev_steps) - 4 or BEGIN_SEARCH_QUERY in step or BEGIN_SEARCH_RESULT in step:
- truncated_prev_reasoning += step + '\n\n'
- else:
- if truncated_prev_reasoning[-len('\n\n...\n\n'):] != '\n\n...\n\n':
- truncated_prev_reasoning += '...\n\n'
- truncated_prev_reasoning = truncated_prev_reasoning.strip('\n')
-
- # Retrieval procedure:
- # 1. KB search
- # 2. Web search (optional)
- # 3. KG search (optional)
- kbinfos = settings.retrievaler.retrieval(search_query, embd_mdl, tenant_ids, kb_ids, 1, top_n,
- similarity_threshold,
- vector_similarity_weight
- )
- if prompt_config.get("tavily_api_key", "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"):
- tav = Tavily(prompt_config["tavily_api_key"])
- tav_res = tav.retrieve_chunks(" ".join(search_query))
- kbinfos["chunks"].extend(tav_res["chunks"])
- kbinfos["doc_aggs"].extend(tav_res["doc_aggs"])
- if prompt_config.get("use_kg"):
- ck = settings.kg_retrievaler.retrieval(search_query,
- tenant_ids,
- kb_ids,
- embd_mdl,
- chat_mdl)
- if ck["content_with_weight"]:
- kbinfos["chunks"].insert(0, ck)
-
- # Merge chunk info for citations
- if not chunk_info["chunks"]:
- for k in chunk_info.keys():
- chunk_info[k] = kbinfos[k]
- else:
- cids = [c["chunk_id"] for c in chunk_info["chunks"]]
- for c in kbinfos["chunks"]:
- if c["chunk_id"] in cids:
- continue
- chunk_info["chunks"].append(c)
- dids = [d["doc_id"] for d in chunk_info["doc_aggs"]]
- for d in kbinfos["doc_aggs"]:
- if d["doc_id"] in dids:
- continue
- chunk_info["doc_aggs"].append(d)
-
- think += "\n\n"
- for ans in chat_mdl.chat_streamly(
- relevant_extraction_prompt.format(
- prev_reasoning=truncated_prev_reasoning,
- search_query=search_query,
- document="\n".join(kb_prompt(kbinfos, 4096))
- ),
- [{"role": "user",
- "content": f'Now you should analyze each web page and find helpful information based on the current search query "{search_query}" and previous reasoning steps.'}],
- {"temperature": 0.7}):
-                ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
- if not ans:
- continue
- summary_think = ans
-                yield {"answer": think + rm_result_tags(summary_think) + "</think>", "reference": {}, "audio_binary": None}
-
- all_reasoning_steps.append(summary_think)
- msg_hisotry.append(
- {"role": "user", "content": f"\n\n{BEGIN_SEARCH_RESULT}{summary_think}{END_SEARCH_RESULT}\n\n"})
- think += rm_result_tags(summary_think)
- logging.info(f"[THINK]Summary: {ii}. {summary_think}")
-
-    yield think + "</think>"
diff --git a/rag/app/tag.py b/rag/app/tag.py
index f5c29f2a8..7263bee5b 100644
--- a/rag/app/tag.py
+++ b/rag/app/tag.py
@@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
#
-
+import json
import re
import csv
from copy import deepcopy
@@ -121,6 +121,32 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
"Excel, csv(txt) format files are supported.")
+def label_question(question, kbs):
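+    # Label the question with tags from the KBs' configured tag knowledge bases (tag sets are cached).
+    # Imports are kept local, presumably to avoid circular imports between the rag and api packages.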
+ from api.db.services.knowledgebase_service import KnowledgebaseService
+ from graphrag.utils import get_tags_from_cache, set_tags_to_cache
+ from api import settings
+ tags = None
+ tag_kb_ids = []
+ for kb in kbs:
+ if kb.parser_config.get("tag_kb_ids"):
+ tag_kb_ids.extend(kb.parser_config["tag_kb_ids"])
+ if tag_kb_ids:
+ all_tags = get_tags_from_cache(tag_kb_ids)
+ if not all_tags:
+ all_tags = settings.retrievaler.all_tags_in_portion(kb.tenant_id, tag_kb_ids)
+ set_tags_to_cache(all_tags, tag_kb_ids)
+ else:
+ all_tags = json.loads(all_tags)
+ tag_kbs = KnowledgebaseService.get_by_ids(tag_kb_ids)
+ tags = settings.retrievaler.tag_query(question,
+ list(set([kb.tenant_id for kb in tag_kbs])),
+ tag_kb_ids,
+ all_tags,
+ kb.parser_config.get("topn_tags", 3)
+ )
+ return tags
+
+
if __name__ == "__main__":
import sys
diff --git a/rag/nlp/query.py b/rag/nlp/query.py
index 345f8cffa..d3c2f1432 100644
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@@ -88,7 +88,7 @@ class FulltextQueryer:
tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk]
tks_w = [(tk.strip(), w) for tk, w in tks_w if tk.strip()]
syns = []
- for tk, w in tks_w:
+ for tk, w in tks_w[:256]:
syn = self.syn.lookup(tk)
syn = rag_tokenizer.tokenize(" ".join(syn)).split()
keywords.extend(syn)
diff --git a/rag/prompts.py b/rag/prompts.py
new file mode 100644
index 000000000..e8497fee3
--- /dev/null
+++ b/rag/prompts.py
@@ -0,0 +1,297 @@
+#
+# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+import datetime
+import json
+import logging
+import os
+import re
+from collections import defaultdict
+import json_repair
+from api.db import LLMType
+from api.db.services.document_service import DocumentService
+from api.db.services.llm_service import TenantLLMService, LLMBundle
+from api.utils.file_utils import get_project_base_directory
+from rag.settings import TAG_FLD
+from rag.utils import num_tokens_from_string, encoder
+
+
+def llm_id2llm_type(llm_id):
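+    # Resolve a model name to its model type ("chat", "image2text", ...) via conf/llm_factories.json.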
+ llm_id, _ = TenantLLMService.split_model_name_and_factory(llm_id)
+ fnm = os.path.join(get_project_base_directory(), "conf")
+ llm_factories = json.load(open(os.path.join(fnm, "llm_factories.json"), "r"))
+ for llm_factory in llm_factories["factory_llm_infos"]:
+ for llm in llm_factory["llm"]:
+ if llm_id == llm["llm_name"]:
+ return llm["model_type"].strip(",")[-1]
+
+
+def message_fit_in(msg, max_length=4000):
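+    # Trim the message list to fit max_length tokens: first drop non-system history, keeping only the
+    # system messages plus the latest message, then truncate whichever of the two takes the larger share.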
+ def count():
+ nonlocal msg
+ tks_cnts = []
+ for m in msg:
+ tks_cnts.append(
+ {"role": m["role"], "count": num_tokens_from_string(m["content"])})
+ total = 0
+ for m in tks_cnts:
+ total += m["count"]
+ return total
+
+ c = count()
+ if c < max_length:
+ return c, msg
+
+ msg_ = [m for m in msg[:-1] if m["role"] == "system"]
+ if len(msg) > 1:
+ msg_.append(msg[-1])
+ msg = msg_
+ c = count()
+ if c < max_length:
+ return c, msg
+
+ ll = num_tokens_from_string(msg_[0]["content"])
+ ll2 = num_tokens_from_string(msg_[-1]["content"])
+ if ll / (ll + ll2) > 0.8:
+ m = msg_[0]["content"]
+ m = encoder.decode(encoder.encode(m)[:max_length - ll2])
+ msg[0]["content"] = m
+ return max_length, msg
+
+ m = msg_[1]["content"]
+ m = encoder.decode(encoder.encode(m)[:max_length - ll2])
+ msg[1]["content"] = m
+ return max_length, msg
+
+
+def kb_prompt(kbinfos, max_tokens):
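+    # Format retrieved chunks for the LLM prompt: take as many chunks as fit within max_tokens, then
+    # group them by source document together with that document's meta fields.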
+ knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]]
+ used_token_count = 0
+ chunks_num = 0
+ for i, c in enumerate(knowledges):
+ used_token_count += num_tokens_from_string(c)
+ chunks_num += 1
+ if max_tokens * 0.97 < used_token_count:
+ knowledges = knowledges[:i]
+ logging.warning(f"Not all the retrieval into prompt: {i+1}/{len(knowledges)}")
+ break
+
+ docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]])
+ docs = {d.id: d.meta_fields for d in docs}
+
+ doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []})
+ for ck in kbinfos["chunks"][:chunks_num]:
+ doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"])
+ doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {})
+
+ knowledges = []
+ for nm, cks_meta in doc2chunks.items():
+ txt = f"Document: {nm} \n"
+ for k, v in cks_meta["meta"].items():
+ txt += f"{k}: {v}\n"
+ txt += "Relevant fragments as following:\n"
+ for i, chunk in enumerate(cks_meta["chunks"], 1):
+ txt += f"{i}. {chunk}\n"
+ knowledges.append(txt)
+ return knowledges
+
+
+def keyword_extraction(chat_mdl, content, topn=3):
+ prompt = f"""
+Role: You're a text analyzer.
+Task: extract the most important keywords/phrases of a given piece of text content.
+Requirements:
+ - Summarize the text content, and give top {topn} important keywords/phrases.
+ - The keywords MUST be in language of the given piece of text content.
+ - The keywords are delimited by ENGLISH COMMA.
+ - Keywords ONLY in output.
+
+### Text Content
+{content}
+
+"""
+ msg = [
+ {"role": "system", "content": prompt},
+ {"role": "user", "content": "Output: "}
+ ]
+ _, msg = message_fit_in(msg, chat_mdl.max_length)
+ kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
+ if isinstance(kwd, tuple):
+ kwd = kwd[0]
+    kwd = re.sub(r"<think>.*</think>", "", kwd, flags=re.DOTALL)
+ if kwd.find("**ERROR**") >= 0:
+ return ""
+ return kwd
+
+
+def question_proposal(chat_mdl, content, topn=3):
+ prompt = f"""
+Role: You're a text analyzer.
+Task: propose {topn} questions about a given piece of text content.
+Requirements:
+ - Understand and summarize the text content, and propose top {topn} important questions.
+ - The questions SHOULD NOT have overlapping meanings.
+ - The questions SHOULD cover the main content of the text as much as possible.
+ - The questions MUST be in language of the given piece of text content.
+ - One question per line.
+ - Question ONLY in output.
+
+### Text Content
+{content}
+
+"""
+ msg = [
+ {"role": "system", "content": prompt},
+ {"role": "user", "content": "Output: "}
+ ]
+ _, msg = message_fit_in(msg, chat_mdl.max_length)
+ kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
+ if isinstance(kwd, tuple):
+ kwd = kwd[0]
+    kwd = re.sub(r"<think>.*</think>", "", kwd, flags=re.DOTALL)
+ if kwd.find("**ERROR**") >= 0:
+ return ""
+ return kwd
+
+
+def full_question(tenant_id, llm_id, messages):
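+    # Rewrite the latest user turn into a self-contained question using the conversation history,
+    # converting relative dates to absolute ones; falls back to the original question on error.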
+ if llm_id2llm_type(llm_id) == "image2text":
+ chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)
+ else:
+ chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id)
+ conv = []
+ for m in messages:
+ if m["role"] not in ["user", "assistant"]:
+ continue
+ conv.append("{}: {}".format(m["role"].upper(), m["content"]))
+ conv = "\n".join(conv)
+ today = datetime.date.today().isoformat()
+ yesterday = (datetime.date.today() - datetime.timedelta(days=1)).isoformat()
+ tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat()
+ prompt = f"""
+Role: A helpful assistant
+
+Task and steps:
+ 1. Generate a full user question that would follow the conversation.
+ 2. If the user's question involves relative date, you need to convert it into absolute date based on the current date, which is {today}. For example: 'yesterday' would be converted to {yesterday}.
+
+Requirements & Restrictions:
+ - Text generated MUST be in the same language of the original user's question.
+  - If the user's latest question is already complete, don't do anything, just return the original question.
+ - DON'T generate anything except a refined question.
+
+######################
+-Examples-
+######################
+
+# Example 1
+## Conversation
+USER: What is the name of Donald Trump's father?
+ASSISTANT: Fred Trump.
+USER: And his mother?
+###############
+Output: What's the name of Donald Trump's mother?
+
+------------
+# Example 2
+## Conversation
+USER: What is the name of Donald Trump's father?
+ASSISTANT: Fred Trump.
+USER: And his mother?
+ASSISTANT: Mary Trump.
+User: What's her full name?
+###############
+Output: What's the full name of Donald Trump's mother Mary Trump?
+
+------------
+# Example 3
+## Conversation
+USER: What's the weather today in London?
+ASSISTANT: Cloudy.
+USER: What's about tomorrow in Rochester?
+###############
+Output: What's the weather in Rochester on {tomorrow}?
+######################
+
+# Real Data
+## Conversation
+{conv}
+###############
+ """
+ ans = chat_mdl.chat(prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.2})
+    ans = re.sub(r"<think>.*</think>", "", ans, flags=re.DOTALL)
+ return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"]
+
+
+def content_tagging(chat_mdl, content, all_tags, examples, topn=3):
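+    # Few-shot tagging: ask the LLM to pick the top-n tags from all_tags for the content and return a
+    # JSON mapping of tag -> relevance score (1-10), repaired with json_repair when malformed.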
+ prompt = f"""
+Role: You're a text analyzer.
+
+Task: Tag (put on some labels) to a given piece of text content based on the examples and the entire tag set.
+
+Steps::
+ - Comprehend the tag/label set.
+ - Comprehend examples which all consist of both text content and assigned tags with relevance score in format of JSON.
+ - Summarize the text content, and tag it with top {topn} most relevant tags from the set of tag/label and the corresponding relevance score.
+
+Requirements
+ - The tags MUST be from the tag set.
+ - The output MUST be in JSON format only, the key is tag and the value is its relevance score.
+ - The relevance score must be range from 1 to 10.
+ - Keywords ONLY in output.
+
+# TAG SET
+{", ".join(all_tags)}
+
+"""
+ for i, ex in enumerate(examples):
+ prompt += """
+# Examples {}
+### Text Content
+{}
+
+Output:
+{}
+
+ """.format(i, ex["content"], json.dumps(ex[TAG_FLD], indent=2, ensure_ascii=False))
+
+ prompt += f"""
+# Real Data
+### Text Content
+{content}
+
+"""
+ msg = [
+ {"role": "system", "content": prompt},
+ {"role": "user", "content": "Output: "}
+ ]
+ _, msg = message_fit_in(msg, chat_mdl.max_length)
+ kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.5})
+ if isinstance(kwd, tuple):
+ kwd = kwd[0]
+    kwd = re.sub(r"<think>.*</think>", "", kwd, flags=re.DOTALL)
+ if kwd.find("**ERROR**") >= 0:
+ raise Exception(kwd)
+
+ try:
+ return json_repair.loads(kwd)
+ except json_repair.JSONDecodeError:
+ try:
+ result = kwd.replace(prompt[:-1], '').replace('user', '').replace('model', '').strip()
+ result = '{' + result.split('{')[1].split('}')[0] + '}'
+ return json_repair.loads(result)
+ except Exception as e:
+ logging.exception(f"JSON parsing error: {result} -> {e}")
+ raise e
diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py
index 952ed0825..0dd239742 100644
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -23,6 +23,7 @@ from graphrag.general.index import WithCommunity, WithResolution, Dealer
from graphrag.light.graph_extractor import GraphExtractor as LightKGExt
from graphrag.general.graph_extractor import GraphExtractor as GeneralKGExt
from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
+from rag.prompts import keyword_extraction, question_proposal, content_tagging
CONSUMER_NO = "0" if len(sys.argv) < 2 else sys.argv[1]
CONSUMER_NAME = "task_executor_" + CONSUMER_NO
@@ -49,7 +50,6 @@ import numpy as np
from peewee import DoesNotExist
from api.db import LLMType, ParserType, TaskStatus
-from api.db.services.dialog_service import keyword_extraction, question_proposal, content_tagging
from api.db.services.document_service import DocumentService
from api.db.services.llm_service import LLMBundle
from api.db.services.task_service import TaskService
diff --git a/uv.lock b/uv.lock
index 850cde6e6..d0c120475 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1084,6 +1084,10 @@ name = "datrie"
version = "0.8.2"
source = { registry = "https://mirrors.aliyun.com/pypi/simple" }
sdist = { url = "https://mirrors.aliyun.com/pypi/packages/9d/fe/db74bd405d515f06657f11ad529878fd389576dca4812bea6f98d9b31574/datrie-0.8.2.tar.gz", hash = "sha256:525b08f638d5cf6115df6ccd818e5a01298cd230b2dac91c8ff2e6499d18765d" }
+wheels = [
+ { url = "https://mirrors.aliyun.com/pypi/packages/44/02/53f0cf0bf0cd629ba6c2cc13f2f9db24323459e9c19463783d890a540a96/datrie-0.8.2-pp273-pypy_73-win32.whl", hash = "sha256:b07bd5fdfc3399a6dab86d6e35c72b1dbd598e80c97509c7c7518ab8774d3fda" },
+]
+
[[package]]
name = "decorator"
@@ -4239,6 +4243,10 @@ wheels = [
{ url = "https://mirrors.aliyun.com/pypi/packages/48/7d/0f2b09490b98cc6a902ac15dda8760c568b9c18cfe70e0ef7a16de64d53a/pycryptodomex-3.20.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7a7a8f33a1f1fb762ede6cc9cbab8f2a9ba13b196bfaf7bc6f0b39d2ba315a43" },
{ url = "https://mirrors.aliyun.com/pypi/packages/b0/1c/375adb14b71ee1c8d8232904e928b3e7af5bbbca7c04e4bec94fe8e90c3d/pycryptodomex-3.20.0-cp35-abi3-win32.whl", hash = "sha256:c39778fd0548d78917b61f03c1fa8bfda6cfcf98c767decf360945fe6f97461e" },
{ url = "https://mirrors.aliyun.com/pypi/packages/b2/e8/1b92184ab7e5595bf38000587e6f8cf9556ebd1bf0a583619bee2057afbd/pycryptodomex-3.20.0-cp35-abi3-win_amd64.whl", hash = "sha256:2a47bcc478741b71273b917232f521fd5704ab4b25d301669879e7273d3586cc" },
+
+ { url = "https://mirrors.aliyun.com/pypi/packages/e7/c5/9140bb867141d948c8e242013ec8a8011172233c898dfdba0a2417c3169a/pycryptodomex-3.20.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:1be97461c439a6af4fe1cf8bf6ca5936d3db252737d2f379cc6b2e394e12a458" },
+ { url = "https://mirrors.aliyun.com/pypi/packages/5e/6a/04acb4978ce08ab16890c70611ebc6efd251681341617bbb9e53356dee70/pycryptodomex-3.20.0-pp27-pypy_73-win32.whl", hash = "sha256:19764605feea0df966445d46533729b645033f134baeb3ea26ad518c9fdf212c" },
+
{ url = "https://mirrors.aliyun.com/pypi/packages/eb/df/3f1ea084e43b91e6d2b6b3493cc948864c17ea5d93ff1261a03812fbfd1a/pycryptodomex-3.20.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f2e497413560e03421484189a6b65e33fe800d3bd75590e6d78d4dfdb7accf3b" },
{ url = "https://mirrors.aliyun.com/pypi/packages/c9/f3/83ffbdfa0c8f9154bcd8866895f6cae5a3ec749da8b0840603cf936c4412/pycryptodomex-3.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e48217c7901edd95f9f097feaa0388da215ed14ce2ece803d3f300b4e694abea" },
{ url = "https://mirrors.aliyun.com/pypi/packages/c9/9d/c113e640aaf02af5631ae2686b742aac5cd0e1402b9d6512b1c7ec5ef05d/pycryptodomex-3.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d00fe8596e1cc46b44bf3907354e9377aa030ec4cd04afbbf6e899fc1e2a7781" },