diff --git a/Dockerfile b/Dockerfile index 1b231ad62..7c518d411 100644 --- a/Dockerfile +++ b/Dockerfile @@ -196,6 +196,7 @@ COPY deepdoc deepdoc COPY rag rag COPY agent agent COPY graphrag graphrag +COPY agentic_reasoning agentic_reasoning COPY pyproject.toml uv.lock ./ COPY docker/service_conf.yaml.template ./conf/service_conf.yaml.template diff --git a/agent/component/generate.py b/agent/component/generate.py index ea12f9040..b9ed75c38 100644 --- a/agent/component/generate.py +++ b/agent/component/generate.py @@ -18,10 +18,10 @@ from functools import partial import pandas as pd from api.db import LLMType from api.db.services.conversation_service import structure_answer -from api.db.services.dialog_service import message_fit_in from api.db.services.llm_service import LLMBundle from api import settings from agent.component.base import ComponentBase, ComponentParamBase +from rag.prompts import message_fit_in class GenerateParam(ComponentParamBase): diff --git a/agent/component/retrieval.py b/agent/component/retrieval.py index 766035c7a..50c6c7963 100644 --- a/agent/component/retrieval.py +++ b/agent/component/retrieval.py @@ -19,11 +19,11 @@ from abc import ABC import pandas as pd from api.db import LLMType -from api.db.services.dialog_service import label_question from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle from api import settings from agent.component.base import ComponentBase, ComponentParamBase +from rag.app.tag import label_question class RetrievalParam(ComponentParamBase): diff --git a/agentic_reasoning/__init__.py b/agentic_reasoning/__init__.py new file mode 100644 index 000000000..1422de46e --- /dev/null +++ b/agentic_reasoning/__init__.py @@ -0,0 +1 @@ +from .deep_research import DeepResearcher as DeepResearcher \ No newline at end of file diff --git a/agentic_reasoning/deep_research.py b/agentic_reasoning/deep_research.py new file mode 100644 index 000000000..f61c3a881 --- /dev/null +++ b/agentic_reasoning/deep_research.py @@ -0,0 +1,167 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import logging +import re +from functools import partial +from agentic_reasoning.prompts import BEGIN_SEARCH_QUERY, BEGIN_SEARCH_RESULT, END_SEARCH_RESULT, MAX_SEARCH_LIMIT, \ + END_SEARCH_QUERY, REASON_PROMPT, RELEVANT_EXTRACTION_PROMPT +from api.db.services.llm_service import LLMBundle +from rag.nlp import extract_between +from rag.prompts import kb_prompt +from rag.utils.tavily_conn import Tavily + + +class DeepResearcher: + def __init__(self, + chat_mdl: LLMBundle, + prompt_config: dict, + kb_retrieve: partial = None, + kg_retrieve: partial = None + ): + self.chat_mdl = chat_mdl + self.prompt_config = prompt_config + self._kb_retrieve = kb_retrieve + self._kg_retrieve = kg_retrieve + + def thinking(self, chunk_info: dict, question: str): + def rm_query_tags(line): + pattern = re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY) + return re.sub(pattern, "", line) + + def rm_result_tags(line): + pattern = re.escape(BEGIN_SEARCH_RESULT) + r"(.*?)" + re.escape(END_SEARCH_RESULT) + return re.sub(pattern, "", line) + + executed_search_queries = [] + msg_hisotry = [{"role": "user", "content": f'Question:\"{question}\"\n'}] + all_reasoning_steps = [] + think = "" + for ii in range(MAX_SEARCH_LIMIT + 1): + if ii == MAX_SEARCH_LIMIT - 1: + summary_think = f"\n{BEGIN_SEARCH_RESULT}\nThe maximum search limit is exceeded. You are not allowed to search.\n{END_SEARCH_RESULT}\n" + yield {"answer": think + summary_think + "", "reference": {}, "audio_binary": None} + all_reasoning_steps.append(summary_think) + msg_hisotry.append({"role": "assistant", "content": summary_think}) + break + + query_think = "" + if msg_hisotry[-1]["role"] != "user": + msg_hisotry.append({"role": "user", "content": "Continues reasoning with the new information.\n"}) + else: + msg_hisotry[-1]["content"] += "\n\nContinues reasoning with the new information.\n" + for ans in self.chat_mdl.chat_streamly(REASON_PROMPT, msg_hisotry, {"temperature": 0.7}): + ans = re.sub(r".*", "", ans, flags=re.DOTALL) + if not ans: + continue + query_think = ans + yield {"answer": think + rm_query_tags(query_think) + "", "reference": {}, "audio_binary": None} + + think += rm_query_tags(query_think) + all_reasoning_steps.append(query_think) + queries = extract_between(query_think, BEGIN_SEARCH_QUERY, END_SEARCH_QUERY) + if not queries: + if ii > 0: + break + queries = [question] + + for search_query in queries: + logging.info(f"[THINK]Query: {ii}. {search_query}") + msg_hisotry.append({"role": "assistant", "content": search_query}) + think += f"\n\n> {ii +1}. {search_query}\n\n" + yield {"answer": think + "", "reference": {}, "audio_binary": None} + + summary_think = "" + # The search query has been searched in previous steps. + if search_query in executed_search_queries: + summary_think = f"\n{BEGIN_SEARCH_RESULT}\nYou have searched this query. Please refer to previous results.\n{END_SEARCH_RESULT}\n" + yield {"answer": think + summary_think + "", "reference": {}, "audio_binary": None} + all_reasoning_steps.append(summary_think) + msg_hisotry.append({"role": "user", "content": summary_think}) + think += summary_think + continue + + truncated_prev_reasoning = "" + for i, step in enumerate(all_reasoning_steps): + truncated_prev_reasoning += f"Step {i + 1}: {step}\n\n" + + prev_steps = truncated_prev_reasoning.split('\n\n') + if len(prev_steps) <= 5: + truncated_prev_reasoning = '\n\n'.join(prev_steps) + else: + truncated_prev_reasoning = '' + for i, step in enumerate(prev_steps): + if i == 0 or i >= len(prev_steps) - 4 or BEGIN_SEARCH_QUERY in step or BEGIN_SEARCH_RESULT in step: + truncated_prev_reasoning += step + '\n\n' + else: + if truncated_prev_reasoning[-len('\n\n...\n\n'):] != '\n\n...\n\n': + truncated_prev_reasoning += '...\n\n' + truncated_prev_reasoning = truncated_prev_reasoning.strip('\n') + + # Retrieval procedure: + # 1. KB search + # 2. Web search (optional) + # 3. KG search (optional) + kbinfos = self._kb_retrieve(question=search_query) if self._kb_retrieve else {"chunks": [], "doc_aggs": []} + + if self.prompt_config.get("tavily_api_key"): + tav = Tavily(self.prompt_config["tavily_api_key"]) + tav_res = tav.retrieve_chunks(" ".join(search_query)) + kbinfos["chunks"].extend(tav_res["chunks"]) + kbinfos["doc_aggs"].extend(tav_res["doc_aggs"]) + if self.prompt_config.get("use_kg") and self._kg_retrieve: + ck = self._kg_retrieve(question=search_query) + if ck["content_with_weight"]: + kbinfos["chunks"].insert(0, ck) + + # Merge chunk info for citations + if not chunk_info["chunks"]: + for k in chunk_info.keys(): + chunk_info[k] = kbinfos[k] + else: + cids = [c["chunk_id"] for c in chunk_info["chunks"]] + for c in kbinfos["chunks"]: + if c["chunk_id"] in cids: + continue + chunk_info["chunks"].append(c) + dids = [d["doc_id"] for d in chunk_info["doc_aggs"]] + for d in kbinfos["doc_aggs"]: + if d["doc_id"] in dids: + continue + chunk_info["doc_aggs"].append(d) + + think += "\n\n" + for ans in self.chat_mdl.chat_streamly( + RELEVANT_EXTRACTION_PROMPT.format( + prev_reasoning=truncated_prev_reasoning, + search_query=search_query, + document="\n".join(kb_prompt(kbinfos, 4096)) + ), + [{"role": "user", + "content": f'Now you should analyze each web page and find helpful information based on the current search query "{search_query}" and previous reasoning steps.'}], + {"temperature": 0.7}): + ans = re.sub(r".*", "", ans, flags=re.DOTALL) + if not ans: + continue + summary_think = ans + yield {"answer": think + rm_result_tags(summary_think) + "", "reference": {}, "audio_binary": None} + + all_reasoning_steps.append(summary_think) + msg_hisotry.append( + {"role": "user", "content": f"\n\n{BEGIN_SEARCH_RESULT}{summary_think}{END_SEARCH_RESULT}\n\n"}) + think += rm_result_tags(summary_think) + logging.info(f"[THINK]Summary: {ii}. {summary_think}") + + yield think + "" diff --git a/agentic_reasoning/prompts.py b/agentic_reasoning/prompts.py new file mode 100644 index 000000000..610409af1 --- /dev/null +++ b/agentic_reasoning/prompts.py @@ -0,0 +1,112 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +BEGIN_SEARCH_QUERY = "<|begin_search_query|>" +END_SEARCH_QUERY = "<|end_search_query|>" +BEGIN_SEARCH_RESULT = "<|begin_search_result|>" +END_SEARCH_RESULT = "<|end_search_result|>" +MAX_SEARCH_LIMIT = 6 + +REASON_PROMPT = ( + "You are a reasoning assistant with the ability to perform dataset searches to help " + "you answer the user's question accurately. You have special tools:\n\n" + f"- To perform a search: write {BEGIN_SEARCH_QUERY} your query here {END_SEARCH_QUERY}.\n" + f"Then, the system will search and analyze relevant content, then provide you with helpful information in the format {BEGIN_SEARCH_RESULT} ...search results... {END_SEARCH_RESULT}.\n\n" + f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n" + "Once you have all the information you need, continue your reasoning.\n\n" + "-- Example 1 --\n" ######################################## + "Question: \"Are both the directors of Jaws and Casino Royale from the same country?\"\n" + "Assistant:\n" + f" {BEGIN_SEARCH_QUERY}Who is the director of Jaws?{END_SEARCH_QUERY}\n\n" + "User:\n" + f" {BEGIN_SEARCH_RESULT}\nThe director of Jaws is Steven Spielberg...\n{END_SEARCH_RESULT}\n\n" + "Continues reasoning with the new information.\n" + "Assistant:\n" + f" {BEGIN_SEARCH_QUERY}Where is Steven Spielberg from?{END_SEARCH_QUERY}\n\n" + "User:\n" + f" {BEGIN_SEARCH_RESULT}\nSteven Allan Spielberg is an American filmmaker...\n{END_SEARCH_RESULT}\n\n" + "Continues reasoning with the new information...\n\n" + "Assistant:\n" + f" {BEGIN_SEARCH_QUERY}Who is the director of Casino Royale?{END_SEARCH_QUERY}\n\n" + "User:\n" + f" {BEGIN_SEARCH_RESULT}\nCasino Royale is a 2006 spy film directed by Martin Campbell...\n{END_SEARCH_RESULT}\n\n" + "Continues reasoning with the new information...\n\n" + "Assistant:\n" + f" {BEGIN_SEARCH_QUERY}Where is Martin Campbell from?{END_SEARCH_QUERY}\n\n" + "User:\n" + f" {BEGIN_SEARCH_RESULT}\nMartin Campbell (born 24 October 1943) is a New Zealand film and television director...\n{END_SEARCH_RESULT}\n\n" + "Continues reasoning with the new information...\n\n" + "Assistant:\nIt's enough to answer the question\n" + + "-- Example 2 --\n" ######################################### + "Question: \"When was the founder of craigslist born?\"\n" + "Assistant:\n" + f" {BEGIN_SEARCH_QUERY}Who was the founder of craigslist?{END_SEARCH_QUERY}\n\n" + "User:\n" + f" {BEGIN_SEARCH_RESULT}\nCraigslist was founded by Craig Newmark...\n{END_SEARCH_RESULT}\n\n" + "Continues reasoning with the new information.\n" + "Assistant:\n" + f" {BEGIN_SEARCH_QUERY} When was Craig Newmark born?{END_SEARCH_QUERY}\n\n" + "User:\n" + f" {BEGIN_SEARCH_RESULT}\nCraig Newmark was born on December 6, 1952...\n{END_SEARCH_RESULT}\n\n" + "Continues reasoning with the new information...\n\n" + "Assistant:\nIt's enough to answer the question\n" + "**Remember**:\n" + f"- You have a dataset to search, so you just provide a proper search query.\n" + f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n" + "- The language of query MUST be as the same as 'Question' or 'search result'.\n" + "- When done searching, continue your reasoning.\n\n" + 'Please answer the following question. You should think step by step to solve it.\n\n' + ) + +RELEVANT_EXTRACTION_PROMPT = """**Task Instruction:** + + You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question. + + **Guidelines:** + + 1. **Analyze the Searched Web Pages:** + - Carefully review the content of each searched web page. + - Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question. + + 2. **Extract Relevant Information:** + - Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**. + - Ensure that the extracted information is accurate and relevant. + + 3. **Output Format:** + - **If the web pages provide helpful information for current search query:** Present the information beginning with `**Final Information**` as shown below. + - The language of query **MUST BE** as the same as 'Search Query' or 'Web Pages'.\n" + **Final Information** + + [Helpful information] + + - **If the web pages do not provide any helpful information for current search query:** Output the following text. + + **Final Information** + + No helpful information found. + + **Inputs:** + - **Previous Reasoning Steps:** + {prev_reasoning} + + - **Current Search Query:** + {search_query} + + - **Searched Web Pages:** + {document} + + """ diff --git a/api/apps/api_app.py b/api/apps/api_app.py index cb260b6b2..533cffaee 100644 --- a/api/apps/api_app.py +++ b/api/apps/api_app.py @@ -25,7 +25,7 @@ from api.db import FileType, LLMType, ParserType, FileSource from api.db.db_models import APIToken, Task, File from api.db.services import duplicate_name from api.db.services.api_service import APITokenService, API4ConversationService -from api.db.services.dialog_service import DialogService, chat, keyword_extraction, label_question +from api.db.services.dialog_service import DialogService, chat from api.db.services.document_service import DocumentService, doc_upload_and_parse from api.db.services.file2document_service import File2DocumentService from api.db.services.file_service import FileService @@ -38,6 +38,8 @@ from api.utils.api_utils import server_error_response, get_data_error_result, ge generate_confirmation_token from api.utils.file_utils import filename_type, thumbnail +from rag.app.tag import label_question +from rag.prompts import keyword_extraction from rag.utils.storage_factory import STORAGE_IMPL from api.db.services.canvas_service import UserCanvasService diff --git a/api/apps/chunk_app.py b/api/apps/chunk_app.py index 0de3db753..aff1ba866 100644 --- a/api/apps/chunk_app.py +++ b/api/apps/chunk_app.py @@ -19,9 +19,10 @@ import json from flask import request from flask_login import login_required, current_user -from api.db.services.dialog_service import keyword_extraction, label_question from rag.app.qa import rmPrefix, beAdoc +from rag.app.tag import label_question from rag.nlp import search, rag_tokenizer +from rag.prompts import keyword_extraction from rag.settings import PAGERANK_FLD from rag.utils import rmSpace from api.db import LLMType, ParserType diff --git a/api/apps/conversation_app.py b/api/apps/conversation_app.py index 7abe52a4d..eba943a47 100644 --- a/api/apps/conversation_app.py +++ b/api/apps/conversation_app.py @@ -25,13 +25,14 @@ from flask import request, Response from flask_login import login_required, current_user from api.db import LLMType -from api.db.services.dialog_service import DialogService, chat, ask, label_question +from api.db.services.dialog_service import DialogService, chat, ask from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle, TenantService from api import settings from api.utils.api_utils import get_json_result from api.utils.api_utils import server_error_response, get_data_error_result, validate_request from graphrag.general.mind_map_extractor import MindMapExtractor +from rag.app.tag import label_question @manager.route('/set', methods=['POST']) # noqa: F821 diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py index 0ccb468bd..4a1ff183a 100644 --- a/api/apps/llm_app.py +++ b/api/apps/llm_app.py @@ -152,6 +152,7 @@ def add_llm(): elif factory == "Tencent Cloud": req["api_key"] = apikey_json(["tencent_cloud_sid", "tencent_cloud_sk"]) + return set_api_key() elif factory == "Bedrock": # For Bedrock, due to its special authentication method diff --git a/api/apps/sdk/dify_retrieval.py b/api/apps/sdk/dify_retrieval.py index 166f97049..5d6a8c896 100644 --- a/api/apps/sdk/dify_retrieval.py +++ b/api/apps/sdk/dify_retrieval.py @@ -16,11 +16,11 @@ from flask import request, jsonify from api.db import LLMType -from api.db.services.dialog_service import label_question from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle from api import settings from api.utils.api_utils import validate_request, build_error_result, apikey_required +from rag.app.tag import label_question @manager.route('/dify/retrieval', methods=['POST']) # noqa: F821 diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 98e2c4227..db3242251 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -16,7 +16,6 @@ import pathlib import datetime -from api.db.services.dialog_service import keyword_extraction, label_question from rag.app.qa import rmPrefix, beAdoc from rag.nlp import rag_tokenizer from api.db import LLMType, ParserType @@ -39,6 +38,8 @@ from api.db.services.file_service import FileService from api.db.services.knowledgebase_service import KnowledgebaseService from api.utils.api_utils import construct_json_result, get_parser_config from rag.nlp import search +from rag.prompts import keyword_extraction +from rag.app.tag import label_question from rag.utils import rmSpace from rag.utils.storage_factory import STORAGE_IMPL diff --git a/api/db/services/dialog_service.py b/api/db/services/dialog_service.py index 0a4ab0f0e..6126ef150 100644 --- a/api/db/services/dialog_service.py +++ b/api/db/services/dialog_service.py @@ -15,31 +15,23 @@ # import logging import binascii -import os -import json import time - -import json_repair +from functools import partial import re -from collections import defaultdict from copy import deepcopy from timeit import default_timer as timer -import datetime -from datetime import timedelta +from agentic_reasoning import DeepResearcher from api.db import LLMType, ParserType, StatusEnum from api.db.db_models import Dialog, DB from api.db.services.common_service import CommonService -from api.db.services.document_service import DocumentService from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import TenantLLMService, LLMBundle from api import settings -from graphrag.utils import get_tags_from_cache, set_tags_to_cache from rag.app.resume import forbidden_select_fields4resume -from rag.nlp import extract_between +from rag.app.tag import label_question from rag.nlp.search import index_name -from rag.settings import TAG_FLD -from rag.utils import rmSpace, num_tokens_from_string, encoder -from api.utils.file_utils import get_project_base_directory +from rag.prompts import kb_prompt, message_fit_in, llm_id2llm_type, keyword_extraction, full_question +from rag.utils import rmSpace, num_tokens_from_string from rag.utils.tavily_conn import Tavily @@ -69,109 +61,6 @@ class DialogService(CommonService): return list(chats.dicts()) -def message_fit_in(msg, max_length=4000): - def count(): - nonlocal msg - tks_cnts = [] - for m in msg: - tks_cnts.append( - {"role": m["role"], "count": num_tokens_from_string(m["content"])}) - total = 0 - for m in tks_cnts: - total += m["count"] - return total - - c = count() - if c < max_length: - return c, msg - - msg_ = [m for m in msg[:-1] if m["role"] == "system"] - if len(msg) > 1: - msg_.append(msg[-1]) - msg = msg_ - c = count() - if c < max_length: - return c, msg - - ll = num_tokens_from_string(msg_[0]["content"]) - ll2 = num_tokens_from_string(msg_[-1]["content"]) - if ll / (ll + ll2) > 0.8: - m = msg_[0]["content"] - m = encoder.decode(encoder.encode(m)[:max_length - ll2]) - msg[0]["content"] = m - return max_length, msg - - m = msg_[1]["content"] - m = encoder.decode(encoder.encode(m)[:max_length - ll2]) - msg[1]["content"] = m - return max_length, msg - - -def llm_id2llm_type(llm_id): - llm_id, _ = TenantLLMService.split_model_name_and_factory(llm_id) - fnm = os.path.join(get_project_base_directory(), "conf") - llm_factories = json.load(open(os.path.join(fnm, "llm_factories.json"), "r")) - for llm_factory in llm_factories["factory_llm_infos"]: - for llm in llm_factory["llm"]: - if llm_id == llm["llm_name"]: - return llm["model_type"].strip(",")[-1] - - -def kb_prompt(kbinfos, max_tokens): - knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] - used_token_count = 0 - chunks_num = 0 - for i, c in enumerate(knowledges): - used_token_count += num_tokens_from_string(c) - chunks_num += 1 - if max_tokens * 0.97 < used_token_count: - knowledges = knowledges[:i] - logging.warning(f"Not all the retrieval into prompt: {i+1}/{len(knowledges)}") - break - - docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]]) - docs = {d.id: d.meta_fields for d in docs} - - doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []}) - for ck in kbinfos["chunks"][:chunks_num]: - doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"]) - doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {}) - - knowledges = [] - for nm, cks_meta in doc2chunks.items(): - txt = f"Document: {nm} \n" - for k, v in cks_meta["meta"].items(): - txt += f"{k}: {v}\n" - txt += "Relevant fragments as following:\n" - for i, chunk in enumerate(cks_meta["chunks"], 1): - txt += f"{i}. {chunk}\n" - knowledges.append(txt) - return knowledges - - -def label_question(question, kbs): - tags = None - tag_kb_ids = [] - for kb in kbs: - if kb.parser_config.get("tag_kb_ids"): - tag_kb_ids.extend(kb.parser_config["tag_kb_ids"]) - if tag_kb_ids: - all_tags = get_tags_from_cache(tag_kb_ids) - if not all_tags: - all_tags = settings.retrievaler.all_tags_in_portion(kb.tenant_id, tag_kb_ids) - set_tags_to_cache(all_tags, tag_kb_ids) - else: - all_tags = json.loads(all_tags) - tag_kbs = KnowledgebaseService.get_by_ids(tag_kb_ids) - tags = settings.retrievaler.tag_query(question, - list(set([kb.tenant_id for kb in tag_kbs])), - tag_kb_ids, - all_tags, - kb.parser_config.get("topn_tags", 3) - ) - return tags - - def chat_solo(dialog, messages, stream=True): if llm_id2llm_type(dialog.llm_id) == "image2text": chat_mdl = LLMBundle(dialog.tenant_id, LLMType.IMAGE2TEXT, dialog.llm_id) @@ -297,7 +186,11 @@ def chat(dialog, messages, stream=True, **kwargs): knowledges = [] if prompt_config.get("reasoning", False): - for think in reasoning(kbinfos, " ".join(questions), chat_mdl, embd_mdl, tenant_ids, dialog.kb_ids, prompt_config, MAX_SEARCH_LIMIT=3): + reasoner = DeepResearcher(chat_mdl, + prompt_config, + partial(retriever.retrieval, embd_mdl=embd_mdl, tenant_ids=tenant_ids, kb_ids=dialog.kb_ids, page=1, page_size=dialog.top_n, similarity_threshold=0.2, vector_similarity_weight=0.3)) + + for think in reasoner.thinking(kbinfos, " ".join(questions)): if isinstance(think, str): thought = think knowledges = [t for t in think.split("\n") if t] @@ -552,175 +445,6 @@ Please write the SQL, only SQL, without any other explanations or text. } -def relevant(tenant_id, llm_id, question, contents: list): - if llm_id2llm_type(llm_id) == "image2text": - chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) - else: - chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) - prompt = """ - You are a grader assessing relevance of a retrieved document to a user question. - It does not need to be a stringent test. The goal is to filter out erroneous retrievals. - If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. - Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. - No other words needed except 'yes' or 'no'. - """ - if not contents: - return False - contents = "Documents: \n" + " - ".join(contents) - contents = f"Question: {question}\n" + contents - if num_tokens_from_string(contents) >= chat_mdl.max_length - 4: - contents = encoder.decode(encoder.encode(contents)[:chat_mdl.max_length - 4]) - ans = chat_mdl.chat(prompt, [{"role": "user", "content": contents}], {"temperature": 0.01}) - if ans.lower().find("yes") >= 0: - return True - return False - - -def rewrite(tenant_id, llm_id, question): - if llm_id2llm_type(llm_id) == "image2text": - chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) - else: - chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) - prompt = """ - You are an expert at query expansion to generate a paraphrasing of a question. - I can't retrieval relevant information from the knowledge base by using user's question directly. - You need to expand or paraphrase user's question by multiple ways such as using synonyms words/phrase, - writing the abbreviation in its entirety, adding some extra descriptions or explanations, - changing the way of expression, translating the original question into another language (English/Chinese), etc. - And return 5 versions of question and one is from translation. - Just list the question. No other words are needed. - """ - ans = chat_mdl.chat(prompt, [{"role": "user", "content": question}], {"temperature": 0.8}) - return ans - - -def keyword_extraction(chat_mdl, content, topn=3): - prompt = f""" -Role: You're a text analyzer. -Task: extract the most important keywords/phrases of a given piece of text content. -Requirements: - - Summarize the text content, and give top {topn} important keywords/phrases. - - The keywords MUST be in language of the given piece of text content. - - The keywords are delimited by ENGLISH COMMA. - - Keywords ONLY in output. - -### Text Content -{content} - -""" - msg = [ - {"role": "system", "content": prompt}, - {"role": "user", "content": "Output: "} - ] - _, msg = message_fit_in(msg, chat_mdl.max_length) - kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2}) - if isinstance(kwd, tuple): - kwd = kwd[0] - kwd = re.sub(r".*", "", kwd, flags=re.DOTALL) - if kwd.find("**ERROR**") >= 0: - return "" - return kwd - - -def question_proposal(chat_mdl, content, topn=3): - prompt = f""" -Role: You're a text analyzer. -Task: propose {topn} questions about a given piece of text content. -Requirements: - - Understand and summarize the text content, and propose top {topn} important questions. - - The questions SHOULD NOT have overlapping meanings. - - The questions SHOULD cover the main content of the text as much as possible. - - The questions MUST be in language of the given piece of text content. - - One question per line. - - Question ONLY in output. - -### Text Content -{content} - -""" - msg = [ - {"role": "system", "content": prompt}, - {"role": "user", "content": "Output: "} - ] - _, msg = message_fit_in(msg, chat_mdl.max_length) - kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2}) - if isinstance(kwd, tuple): - kwd = kwd[0] - kwd = re.sub(r".*", "", kwd, flags=re.DOTALL) - if kwd.find("**ERROR**") >= 0: - return "" - return kwd - - -def full_question(tenant_id, llm_id, messages): - if llm_id2llm_type(llm_id) == "image2text": - chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) - else: - chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) - conv = [] - for m in messages: - if m["role"] not in ["user", "assistant"]: - continue - conv.append("{}: {}".format(m["role"].upper(), m["content"])) - conv = "\n".join(conv) - today = datetime.date.today().isoformat() - yesterday = (datetime.date.today() - timedelta(days=1)).isoformat() - tomorrow = (datetime.date.today() + timedelta(days=1)).isoformat() - prompt = f""" -Role: A helpful assistant - -Task and steps: - 1. Generate a full user question that would follow the conversation. - 2. If the user's question involves relative date, you need to convert it into absolute date based on the current date, which is {today}. For example: 'yesterday' would be converted to {yesterday}. - -Requirements & Restrictions: - - Text generated MUST be in the same language of the original user's question. - - If the user's latest question is completely, don't do anything, just return the original question. - - DON'T generate anything except a refined question. - -###################### --Examples- -###################### - -# Example 1 -## Conversation -USER: What is the name of Donald Trump's father? -ASSISTANT: Fred Trump. -USER: And his mother? -############### -Output: What's the name of Donald Trump's mother? - ------------- -# Example 2 -## Conversation -USER: What is the name of Donald Trump's father? -ASSISTANT: Fred Trump. -USER: And his mother? -ASSISTANT: Mary Trump. -User: What's her full name? -############### -Output: What's the full name of Donald Trump's mother Mary Trump? - ------------- -# Example 3 -## Conversation -USER: What's the weather today in London? -ASSISTANT: Cloudy. -USER: What's about tomorrow in Rochester? -############### -Output: What's the weather in Rochester on {tomorrow}? -###################### - -# Real Data -## Conversation -{conv} -############### - """ - ans = chat_mdl.chat(prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.2}) - ans = re.sub(r".*", "", ans, flags=re.DOTALL) - return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"] - - def tts(tts_mdl, text): if not tts_mdl or not text: return @@ -796,298 +520,3 @@ def ask(question, kb_ids, tenant_id): yield decorate_answer(answer) -def content_tagging(chat_mdl, content, all_tags, examples, topn=3): - prompt = f""" -Role: You're a text analyzer. - -Task: Tag (put on some labels) to a given piece of text content based on the examples and the entire tag set. - -Steps:: - - Comprehend the tag/label set. - - Comprehend examples which all consist of both text content and assigned tags with relevance score in format of JSON. - - Summarize the text content, and tag it with top {topn} most relevant tags from the set of tag/label and the corresponding relevance score. - -Requirements - - The tags MUST be from the tag set. - - The output MUST be in JSON format only, the key is tag and the value is its relevance score. - - The relevance score must be range from 1 to 10. - - Keywords ONLY in output. - -# TAG SET -{", ".join(all_tags)} - -""" - for i, ex in enumerate(examples): - prompt += """ -# Examples {} -### Text Content -{} - -Output: -{} - - """.format(i, ex["content"], json.dumps(ex[TAG_FLD], indent=2, ensure_ascii=False)) - - prompt += f""" -# Real Data -### Text Content -{content} - -""" - msg = [ - {"role": "system", "content": prompt}, - {"role": "user", "content": "Output: "} - ] - _, msg = message_fit_in(msg, chat_mdl.max_length) - kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.5}) - if isinstance(kwd, tuple): - kwd = kwd[0] - kwd = re.sub(r".*", "", kwd, flags=re.DOTALL) - if kwd.find("**ERROR**") >= 0: - raise Exception(kwd) - - try: - return json_repair.loads(kwd) - except json_repair.JSONDecodeError: - try: - result = kwd.replace(prompt[:-1], '').replace('user', '').replace('model', '').strip() - result = '{' + result.split('{')[1].split('}')[0] + '}' - return json_repair.loads(result) - except Exception as e: - logging.exception(f"JSON parsing error: {result} -> {e}") - raise e - - -def reasoning(chunk_info: dict, question: str, chat_mdl: LLMBundle, embd_mdl: LLMBundle, - tenant_ids: list[str], kb_ids: list[str], prompt_config, MAX_SEARCH_LIMIT: int = 6, - top_n: int = 5, similarity_threshold: float = 0.4, vector_similarity_weight: float = 0.3): - BEGIN_SEARCH_QUERY = "<|begin_search_query|>" - END_SEARCH_QUERY = "<|end_search_query|>" - BEGIN_SEARCH_RESULT = "<|begin_search_result|>" - END_SEARCH_RESULT = "<|end_search_result|>" - - def rm_query_tags(line): - pattern = re.escape(BEGIN_SEARCH_QUERY) + r"(.*?)" + re.escape(END_SEARCH_QUERY) - return re.sub(pattern, "", line) - - def rm_result_tags(line): - pattern = re.escape(BEGIN_SEARCH_RESULT) + r"(.*?)" + re.escape(END_SEARCH_RESULT) - return re.sub(pattern, "", line) - - reason_prompt = ( - "You are a reasoning assistant with the ability to perform dataset searches to help " - "you answer the user's question accurately. You have special tools:\n\n" - f"- To perform a search: write {BEGIN_SEARCH_QUERY} your query here {END_SEARCH_QUERY}.\n" - f"Then, the system will search and analyze relevant content, then provide you with helpful information in the format {BEGIN_SEARCH_RESULT} ...search results... {END_SEARCH_RESULT}.\n\n" - f"You can repeat the search process multiple times if necessary. The maximum number of search attempts is limited to {MAX_SEARCH_LIMIT}.\n\n" - "Once you have all the information you need, continue your reasoning.\n\n" - "-- Example 1 --\n" ######################################## - "Question: \"Are both the directors of Jaws and Casino Royale from the same country?\"\n" - "Assistant:\n" - f" {BEGIN_SEARCH_QUERY}Who is the director of Jaws?{END_SEARCH_QUERY}\n\n" - "User:\n" - f" {BEGIN_SEARCH_RESULT}\nThe director of Jaws is Steven Spielberg...\n{END_SEARCH_RESULT}\n\n" - "Continues reasoning with the new information.\n" - "Assistant:\n" - f" {BEGIN_SEARCH_QUERY}Where is Steven Spielberg from?{END_SEARCH_QUERY}\n\n" - "User:\n" - f" {BEGIN_SEARCH_RESULT}\nSteven Allan Spielberg is an American filmmaker...\n{END_SEARCH_RESULT}\n\n" - "Continues reasoning with the new information...\n\n" - "Assistant:\n" - f" {BEGIN_SEARCH_QUERY}Who is the director of Casino Royale?{END_SEARCH_QUERY}\n\n" - "User:\n" - f" {BEGIN_SEARCH_RESULT}\nCasino Royale is a 2006 spy film directed by Martin Campbell...\n{END_SEARCH_RESULT}\n\n" - "Continues reasoning with the new information...\n\n" - "Assistant:\n" - f" {BEGIN_SEARCH_QUERY}Where is Martin Campbell from?{END_SEARCH_QUERY}\n\n" - "User:\n" - f" {BEGIN_SEARCH_RESULT}\nMartin Campbell (born 24 October 1943) is a New Zealand film and television director...\n{END_SEARCH_RESULT}\n\n" - "Continues reasoning with the new information...\n\n" - "Assistant:\nIt's enough to answer the question\n" - - "-- Example 2 --\n" ######################################### - "Question: \"When was the founder of craigslist born?\"\n" - "Assistant:\n" - f" {BEGIN_SEARCH_QUERY}Who was the founder of craigslist?{END_SEARCH_QUERY}\n\n" - "User:\n" - f" {BEGIN_SEARCH_RESULT}\nCraigslist was founded by Craig Newmark...\n{END_SEARCH_RESULT}\n\n" - "Continues reasoning with the new information.\n" - "Assistant:\n" - f" {BEGIN_SEARCH_QUERY} When was Craig Newmark born?{END_SEARCH_QUERY}\n\n" - "User:\n" - f" {BEGIN_SEARCH_RESULT}\nCraig Newmark was born on December 6, 1952...\n{END_SEARCH_RESULT}\n\n" - "Continues reasoning with the new information...\n\n" - "Assistant:\nIt's enough to answer the question\n" - "**Remember**:\n" - f"- You have a dataset to search, so you just provide a proper search query.\n" - f"- Use {BEGIN_SEARCH_QUERY} to request a dataset search and end with {END_SEARCH_QUERY}.\n" - "- The language of query MUST be as the same as 'Question' or 'search result'.\n" - "- When done searching, continue your reasoning.\n\n" - 'Please answer the following question. You should think step by step to solve it.\n\n' - ) - - relevant_extraction_prompt = """**Task Instruction:** - - You are tasked with reading and analyzing web pages based on the following inputs: **Previous Reasoning Steps**, **Current Search Query**, and **Searched Web Pages**. Your objective is to extract relevant and helpful information for **Current Search Query** from the **Searched Web Pages** and seamlessly integrate this information into the **Previous Reasoning Steps** to continue reasoning for the original question. - - **Guidelines:** - - 1. **Analyze the Searched Web Pages:** - - Carefully review the content of each searched web page. - - Identify factual information that is relevant to the **Current Search Query** and can aid in the reasoning process for the original question. - - 2. **Extract Relevant Information:** - - Select the information from the Searched Web Pages that directly contributes to advancing the **Previous Reasoning Steps**. - - Ensure that the extracted information is accurate and relevant. - - 3. **Output Format:** - - **If the web pages provide helpful information for current search query:** Present the information beginning with `**Final Information**` as shown below. - - The language of query **MUST BE** as the same as 'Search Query' or 'Web Pages'.\n" - **Final Information** - - [Helpful information] - - - **If the web pages do not provide any helpful information for current search query:** Output the following text. - - **Final Information** - - No helpful information found. - - **Inputs:** - - **Previous Reasoning Steps:** - {prev_reasoning} - - - **Current Search Query:** - {search_query} - - - **Searched Web Pages:** - {document} - - """ - - executed_search_queries = [] - msg_hisotry = [{"role": "user", "content": f'Question:\"{question}\"\n'}] - all_reasoning_steps = [] - think = "" - for ii in range(MAX_SEARCH_LIMIT + 1): - if ii == MAX_SEARCH_LIMIT - 1: - summary_think = f"\n{BEGIN_SEARCH_RESULT}\nThe maximum search limit is exceeded. You are not allowed to search.\n{END_SEARCH_RESULT}\n" - yield {"answer": think + summary_think + "", "reference": {}, "audio_binary": None} - all_reasoning_steps.append(summary_think) - msg_hisotry.append({"role": "assistant", "content": summary_think}) - break - - query_think = "" - if msg_hisotry[-1]["role"] != "user": - msg_hisotry.append({"role": "user", "content": "Continues reasoning with the new information.\n"}) - else: - msg_hisotry[-1]["content"] += "\n\nContinues reasoning with the new information.\n" - for ans in chat_mdl.chat_streamly(reason_prompt, msg_hisotry, {"temperature": 0.7}): - ans = re.sub(r".*", "", ans, flags=re.DOTALL) - if not ans: - continue - query_think = ans - yield {"answer": think + rm_query_tags(query_think) + "", "reference": {}, "audio_binary": None} - - think += rm_query_tags(query_think) - all_reasoning_steps.append(query_think) - queries = extract_between(query_think, BEGIN_SEARCH_QUERY, END_SEARCH_QUERY) - if not queries: - if ii > 0: - break - queries = [question] - - for search_query in queries: - logging.info(f"[THINK]Query: {ii}. {search_query}") - msg_hisotry.append({"role": "assistant", "content": search_query}) - think += f"\n\n> {ii+1}. {search_query}\n\n" - yield {"answer": think + "", "reference": {}, "audio_binary": None} - - summary_think = "" - # The search query has been searched in previous steps. - if search_query in executed_search_queries: - summary_think = f"\n{BEGIN_SEARCH_RESULT}\nYou have searched this query. Please refer to previous results.\n{END_SEARCH_RESULT}\n" - yield {"answer": think + summary_think + "", "reference": {}, "audio_binary": None} - all_reasoning_steps.append(summary_think) - msg_hisotry.append({"role": "user", "content": summary_think}) - think += summary_think - continue - - truncated_prev_reasoning = "" - for i, step in enumerate(all_reasoning_steps): - truncated_prev_reasoning += f"Step {i + 1}: {step}\n\n" - - prev_steps = truncated_prev_reasoning.split('\n\n') - if len(prev_steps) <= 5: - truncated_prev_reasoning = '\n\n'.join(prev_steps) - else: - truncated_prev_reasoning = '' - for i, step in enumerate(prev_steps): - if i == 0 or i >= len(prev_steps) - 4 or BEGIN_SEARCH_QUERY in step or BEGIN_SEARCH_RESULT in step: - truncated_prev_reasoning += step + '\n\n' - else: - if truncated_prev_reasoning[-len('\n\n...\n\n'):] != '\n\n...\n\n': - truncated_prev_reasoning += '...\n\n' - truncated_prev_reasoning = truncated_prev_reasoning.strip('\n') - - # Retrieval procedure: - # 1. KB search - # 2. Web search (optional) - # 3. KG search (optional) - kbinfos = settings.retrievaler.retrieval(search_query, embd_mdl, tenant_ids, kb_ids, 1, top_n, - similarity_threshold, - vector_similarity_weight - ) - if prompt_config.get("tavily_api_key", "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"): - tav = Tavily(prompt_config["tavily_api_key"]) - tav_res = tav.retrieve_chunks(" ".join(search_query)) - kbinfos["chunks"].extend(tav_res["chunks"]) - kbinfos["doc_aggs"].extend(tav_res["doc_aggs"]) - if prompt_config.get("use_kg"): - ck = settings.kg_retrievaler.retrieval(search_query, - tenant_ids, - kb_ids, - embd_mdl, - chat_mdl) - if ck["content_with_weight"]: - kbinfos["chunks"].insert(0, ck) - - # Merge chunk info for citations - if not chunk_info["chunks"]: - for k in chunk_info.keys(): - chunk_info[k] = kbinfos[k] - else: - cids = [c["chunk_id"] for c in chunk_info["chunks"]] - for c in kbinfos["chunks"]: - if c["chunk_id"] in cids: - continue - chunk_info["chunks"].append(c) - dids = [d["doc_id"] for d in chunk_info["doc_aggs"]] - for d in kbinfos["doc_aggs"]: - if d["doc_id"] in dids: - continue - chunk_info["doc_aggs"].append(d) - - think += "\n\n" - for ans in chat_mdl.chat_streamly( - relevant_extraction_prompt.format( - prev_reasoning=truncated_prev_reasoning, - search_query=search_query, - document="\n".join(kb_prompt(kbinfos, 4096)) - ), - [{"role": "user", - "content": f'Now you should analyze each web page and find helpful information based on the current search query "{search_query}" and previous reasoning steps.'}], - {"temperature": 0.7}): - ans = re.sub(r".*", "", ans, flags=re.DOTALL) - if not ans: - continue - summary_think = ans - yield {"answer": think + rm_result_tags(summary_think) + "", "reference": {}, "audio_binary": None} - - all_reasoning_steps.append(summary_think) - msg_hisotry.append( - {"role": "user", "content": f"\n\n{BEGIN_SEARCH_RESULT}{summary_think}{END_SEARCH_RESULT}\n\n"}) - think += rm_result_tags(summary_think) - logging.info(f"[THINK]Summary: {ii}. {summary_think}") - - yield think + "" diff --git a/rag/app/tag.py b/rag/app/tag.py index f5c29f2a8..7263bee5b 100644 --- a/rag/app/tag.py +++ b/rag/app/tag.py @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. # - +import json import re import csv from copy import deepcopy @@ -121,6 +121,32 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): "Excel, csv(txt) format files are supported.") +def label_question(question, kbs): + from api.db.services.knowledgebase_service import KnowledgebaseService + from graphrag.utils import get_tags_from_cache, set_tags_to_cache + from api import settings + tags = None + tag_kb_ids = [] + for kb in kbs: + if kb.parser_config.get("tag_kb_ids"): + tag_kb_ids.extend(kb.parser_config["tag_kb_ids"]) + if tag_kb_ids: + all_tags = get_tags_from_cache(tag_kb_ids) + if not all_tags: + all_tags = settings.retrievaler.all_tags_in_portion(kb.tenant_id, tag_kb_ids) + set_tags_to_cache(all_tags, tag_kb_ids) + else: + all_tags = json.loads(all_tags) + tag_kbs = KnowledgebaseService.get_by_ids(tag_kb_ids) + tags = settings.retrievaler.tag_query(question, + list(set([kb.tenant_id for kb in tag_kbs])), + tag_kb_ids, + all_tags, + kb.parser_config.get("topn_tags", 3) + ) + return tags + + if __name__ == "__main__": import sys diff --git a/rag/nlp/query.py b/rag/nlp/query.py index 345f8cffa..d3c2f1432 100644 --- a/rag/nlp/query.py +++ b/rag/nlp/query.py @@ -88,7 +88,7 @@ class FulltextQueryer: tks_w = [(re.sub(r"^[\+-]", "", tk), w) for tk, w in tks_w if tk] tks_w = [(tk.strip(), w) for tk, w in tks_w if tk.strip()] syns = [] - for tk, w in tks_w: + for tk, w in tks_w[:256]: syn = self.syn.lookup(tk) syn = rag_tokenizer.tokenize(" ".join(syn)).split() keywords.extend(syn) diff --git a/rag/prompts.py b/rag/prompts.py new file mode 100644 index 000000000..e8497fee3 --- /dev/null +++ b/rag/prompts.py @@ -0,0 +1,297 @@ +# +# Copyright 2024 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +import datetime +import json +import logging +import os +import re +from collections import defaultdict +import json_repair +from api.db import LLMType +from api.db.services.document_service import DocumentService +from api.db.services.llm_service import TenantLLMService, LLMBundle +from api.utils.file_utils import get_project_base_directory +from rag.settings import TAG_FLD +from rag.utils import num_tokens_from_string, encoder + + +def llm_id2llm_type(llm_id): + llm_id, _ = TenantLLMService.split_model_name_and_factory(llm_id) + fnm = os.path.join(get_project_base_directory(), "conf") + llm_factories = json.load(open(os.path.join(fnm, "llm_factories.json"), "r")) + for llm_factory in llm_factories["factory_llm_infos"]: + for llm in llm_factory["llm"]: + if llm_id == llm["llm_name"]: + return llm["model_type"].strip(",")[-1] + + +def message_fit_in(msg, max_length=4000): + def count(): + nonlocal msg + tks_cnts = [] + for m in msg: + tks_cnts.append( + {"role": m["role"], "count": num_tokens_from_string(m["content"])}) + total = 0 + for m in tks_cnts: + total += m["count"] + return total + + c = count() + if c < max_length: + return c, msg + + msg_ = [m for m in msg[:-1] if m["role"] == "system"] + if len(msg) > 1: + msg_.append(msg[-1]) + msg = msg_ + c = count() + if c < max_length: + return c, msg + + ll = num_tokens_from_string(msg_[0]["content"]) + ll2 = num_tokens_from_string(msg_[-1]["content"]) + if ll / (ll + ll2) > 0.8: + m = msg_[0]["content"] + m = encoder.decode(encoder.encode(m)[:max_length - ll2]) + msg[0]["content"] = m + return max_length, msg + + m = msg_[1]["content"] + m = encoder.decode(encoder.encode(m)[:max_length - ll2]) + msg[1]["content"] = m + return max_length, msg + + +def kb_prompt(kbinfos, max_tokens): + knowledges = [ck["content_with_weight"] for ck in kbinfos["chunks"]] + used_token_count = 0 + chunks_num = 0 + for i, c in enumerate(knowledges): + used_token_count += num_tokens_from_string(c) + chunks_num += 1 + if max_tokens * 0.97 < used_token_count: + knowledges = knowledges[:i] + logging.warning(f"Not all the retrieval into prompt: {i+1}/{len(knowledges)}") + break + + docs = DocumentService.get_by_ids([ck["doc_id"] for ck in kbinfos["chunks"][:chunks_num]]) + docs = {d.id: d.meta_fields for d in docs} + + doc2chunks = defaultdict(lambda: {"chunks": [], "meta": []}) + for ck in kbinfos["chunks"][:chunks_num]: + doc2chunks[ck["docnm_kwd"]]["chunks"].append((f"URL: {ck['url']}\n" if "url" in ck else "") + ck["content_with_weight"]) + doc2chunks[ck["docnm_kwd"]]["meta"] = docs.get(ck["doc_id"], {}) + + knowledges = [] + for nm, cks_meta in doc2chunks.items(): + txt = f"Document: {nm} \n" + for k, v in cks_meta["meta"].items(): + txt += f"{k}: {v}\n" + txt += "Relevant fragments as following:\n" + for i, chunk in enumerate(cks_meta["chunks"], 1): + txt += f"{i}. {chunk}\n" + knowledges.append(txt) + return knowledges + + +def keyword_extraction(chat_mdl, content, topn=3): + prompt = f""" +Role: You're a text analyzer. +Task: extract the most important keywords/phrases of a given piece of text content. +Requirements: + - Summarize the text content, and give top {topn} important keywords/phrases. + - The keywords MUST be in language of the given piece of text content. + - The keywords are delimited by ENGLISH COMMA. + - Keywords ONLY in output. + +### Text Content +{content} + +""" + msg = [ + {"role": "system", "content": prompt}, + {"role": "user", "content": "Output: "} + ] + _, msg = message_fit_in(msg, chat_mdl.max_length) + kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2}) + if isinstance(kwd, tuple): + kwd = kwd[0] + kwd = re.sub(r".*", "", kwd, flags=re.DOTALL) + if kwd.find("**ERROR**") >= 0: + return "" + return kwd + + +def question_proposal(chat_mdl, content, topn=3): + prompt = f""" +Role: You're a text analyzer. +Task: propose {topn} questions about a given piece of text content. +Requirements: + - Understand and summarize the text content, and propose top {topn} important questions. + - The questions SHOULD NOT have overlapping meanings. + - The questions SHOULD cover the main content of the text as much as possible. + - The questions MUST be in language of the given piece of text content. + - One question per line. + - Question ONLY in output. + +### Text Content +{content} + +""" + msg = [ + {"role": "system", "content": prompt}, + {"role": "user", "content": "Output: "} + ] + _, msg = message_fit_in(msg, chat_mdl.max_length) + kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2}) + if isinstance(kwd, tuple): + kwd = kwd[0] + kwd = re.sub(r".*", "", kwd, flags=re.DOTALL) + if kwd.find("**ERROR**") >= 0: + return "" + return kwd + + +def full_question(tenant_id, llm_id, messages): + if llm_id2llm_type(llm_id) == "image2text": + chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id) + else: + chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_id) + conv = [] + for m in messages: + if m["role"] not in ["user", "assistant"]: + continue + conv.append("{}: {}".format(m["role"].upper(), m["content"])) + conv = "\n".join(conv) + today = datetime.date.today().isoformat() + yesterday = (datetime.date.today() - datetime.timedelta(days=1)).isoformat() + tomorrow = (datetime.date.today() + datetime.timedelta(days=1)).isoformat() + prompt = f""" +Role: A helpful assistant + +Task and steps: + 1. Generate a full user question that would follow the conversation. + 2. If the user's question involves relative date, you need to convert it into absolute date based on the current date, which is {today}. For example: 'yesterday' would be converted to {yesterday}. + +Requirements & Restrictions: + - Text generated MUST be in the same language of the original user's question. + - If the user's latest question is completely, don't do anything, just return the original question. + - DON'T generate anything except a refined question. + +###################### +-Examples- +###################### + +# Example 1 +## Conversation +USER: What is the name of Donald Trump's father? +ASSISTANT: Fred Trump. +USER: And his mother? +############### +Output: What's the name of Donald Trump's mother? + +------------ +# Example 2 +## Conversation +USER: What is the name of Donald Trump's father? +ASSISTANT: Fred Trump. +USER: And his mother? +ASSISTANT: Mary Trump. +User: What's her full name? +############### +Output: What's the full name of Donald Trump's mother Mary Trump? + +------------ +# Example 3 +## Conversation +USER: What's the weather today in London? +ASSISTANT: Cloudy. +USER: What's about tomorrow in Rochester? +############### +Output: What's the weather in Rochester on {tomorrow}? +###################### + +# Real Data +## Conversation +{conv} +############### + """ + ans = chat_mdl.chat(prompt, [{"role": "user", "content": "Output: "}], {"temperature": 0.2}) + ans = re.sub(r".*", "", ans, flags=re.DOTALL) + return ans if ans.find("**ERROR**") < 0 else messages[-1]["content"] + + +def content_tagging(chat_mdl, content, all_tags, examples, topn=3): + prompt = f""" +Role: You're a text analyzer. + +Task: Tag (put on some labels) to a given piece of text content based on the examples and the entire tag set. + +Steps:: + - Comprehend the tag/label set. + - Comprehend examples which all consist of both text content and assigned tags with relevance score in format of JSON. + - Summarize the text content, and tag it with top {topn} most relevant tags from the set of tag/label and the corresponding relevance score. + +Requirements + - The tags MUST be from the tag set. + - The output MUST be in JSON format only, the key is tag and the value is its relevance score. + - The relevance score must be range from 1 to 10. + - Keywords ONLY in output. + +# TAG SET +{", ".join(all_tags)} + +""" + for i, ex in enumerate(examples): + prompt += """ +# Examples {} +### Text Content +{} + +Output: +{} + + """.format(i, ex["content"], json.dumps(ex[TAG_FLD], indent=2, ensure_ascii=False)) + + prompt += f""" +# Real Data +### Text Content +{content} + +""" + msg = [ + {"role": "system", "content": prompt}, + {"role": "user", "content": "Output: "} + ] + _, msg = message_fit_in(msg, chat_mdl.max_length) + kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.5}) + if isinstance(kwd, tuple): + kwd = kwd[0] + kwd = re.sub(r".*", "", kwd, flags=re.DOTALL) + if kwd.find("**ERROR**") >= 0: + raise Exception(kwd) + + try: + return json_repair.loads(kwd) + except json_repair.JSONDecodeError: + try: + result = kwd.replace(prompt[:-1], '').replace('user', '').replace('model', '').strip() + result = '{' + result.split('{')[1].split('}')[0] + '}' + return json_repair.loads(result) + except Exception as e: + logging.exception(f"JSON parsing error: {result} -> {e}") + raise e diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 952ed0825..0dd239742 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -23,6 +23,7 @@ from graphrag.general.index import WithCommunity, WithResolution, Dealer from graphrag.light.graph_extractor import GraphExtractor as LightKGExt from graphrag.general.graph_extractor import GraphExtractor as GeneralKGExt from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache +from rag.prompts import keyword_extraction, question_proposal, content_tagging CONSUMER_NO = "0" if len(sys.argv) < 2 else sys.argv[1] CONSUMER_NAME = "task_executor_" + CONSUMER_NO @@ -49,7 +50,6 @@ import numpy as np from peewee import DoesNotExist from api.db import LLMType, ParserType, TaskStatus -from api.db.services.dialog_service import keyword_extraction, question_proposal, content_tagging from api.db.services.document_service import DocumentService from api.db.services.llm_service import LLMBundle from api.db.services.task_service import TaskService diff --git a/uv.lock b/uv.lock index 850cde6e6..d0c120475 100644 --- a/uv.lock +++ b/uv.lock @@ -1084,6 +1084,10 @@ name = "datrie" version = "0.8.2" source = { registry = "https://mirrors.aliyun.com/pypi/simple" } sdist = { url = "https://mirrors.aliyun.com/pypi/packages/9d/fe/db74bd405d515f06657f11ad529878fd389576dca4812bea6f98d9b31574/datrie-0.8.2.tar.gz", hash = "sha256:525b08f638d5cf6115df6ccd818e5a01298cd230b2dac91c8ff2e6499d18765d" } +wheels = [ + { url = "https://mirrors.aliyun.com/pypi/packages/44/02/53f0cf0bf0cd629ba6c2cc13f2f9db24323459e9c19463783d890a540a96/datrie-0.8.2-pp273-pypy_73-win32.whl", hash = "sha256:b07bd5fdfc3399a6dab86d6e35c72b1dbd598e80c97509c7c7518ab8774d3fda" }, +] + [[package]] name = "decorator" @@ -4239,6 +4243,10 @@ wheels = [ { url = "https://mirrors.aliyun.com/pypi/packages/48/7d/0f2b09490b98cc6a902ac15dda8760c568b9c18cfe70e0ef7a16de64d53a/pycryptodomex-3.20.0-cp35-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:7a7a8f33a1f1fb762ede6cc9cbab8f2a9ba13b196bfaf7bc6f0b39d2ba315a43" }, { url = "https://mirrors.aliyun.com/pypi/packages/b0/1c/375adb14b71ee1c8d8232904e928b3e7af5bbbca7c04e4bec94fe8e90c3d/pycryptodomex-3.20.0-cp35-abi3-win32.whl", hash = "sha256:c39778fd0548d78917b61f03c1fa8bfda6cfcf98c767decf360945fe6f97461e" }, { url = "https://mirrors.aliyun.com/pypi/packages/b2/e8/1b92184ab7e5595bf38000587e6f8cf9556ebd1bf0a583619bee2057afbd/pycryptodomex-3.20.0-cp35-abi3-win_amd64.whl", hash = "sha256:2a47bcc478741b71273b917232f521fd5704ab4b25d301669879e7273d3586cc" }, + + { url = "https://mirrors.aliyun.com/pypi/packages/e7/c5/9140bb867141d948c8e242013ec8a8011172233c898dfdba0a2417c3169a/pycryptodomex-3.20.0-pp27-pypy_73-manylinux2010_x86_64.whl", hash = "sha256:1be97461c439a6af4fe1cf8bf6ca5936d3db252737d2f379cc6b2e394e12a458" }, + { url = "https://mirrors.aliyun.com/pypi/packages/5e/6a/04acb4978ce08ab16890c70611ebc6efd251681341617bbb9e53356dee70/pycryptodomex-3.20.0-pp27-pypy_73-win32.whl", hash = "sha256:19764605feea0df966445d46533729b645033f134baeb3ea26ad518c9fdf212c" }, + { url = "https://mirrors.aliyun.com/pypi/packages/eb/df/3f1ea084e43b91e6d2b6b3493cc948864c17ea5d93ff1261a03812fbfd1a/pycryptodomex-3.20.0-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:f2e497413560e03421484189a6b65e33fe800d3bd75590e6d78d4dfdb7accf3b" }, { url = "https://mirrors.aliyun.com/pypi/packages/c9/f3/83ffbdfa0c8f9154bcd8866895f6cae5a3ec749da8b0840603cf936c4412/pycryptodomex-3.20.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e48217c7901edd95f9f097feaa0388da215ed14ce2ece803d3f300b4e694abea" }, { url = "https://mirrors.aliyun.com/pypi/packages/c9/9d/c113e640aaf02af5631ae2686b742aac5cd0e1402b9d6512b1c7ec5ef05d/pycryptodomex-3.20.0-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d00fe8596e1cc46b44bf3907354e9377aa030ec4cd04afbbf6e899fc1e2a7781" },