refine mindmap (#1817)

### What problem does this PR solve?

#1594
### Type of change

- [x] Refactoring
This commit is contained in:
Kevin Hu 2024-08-06 09:24:53 +08:00 committed by GitHub
parent 5650442b0b
commit 3fd7db40ea
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 39 additions and 24 deletions

View File

@ -21,6 +21,7 @@ from typing import List
import networkx as nx import networkx as nx
from api.db import LLMType from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from api.db.services.user_service import TenantService
from graphrag.community_reports_extractor import CommunityReportsExtractor from graphrag.community_reports_extractor import CommunityReportsExtractor
from graphrag.entity_resolution import EntityResolution from graphrag.entity_resolution import EntityResolution
from graphrag.graph_extractor import GraphExtractor from graphrag.graph_extractor import GraphExtractor
@ -30,6 +31,11 @@ from rag.utils import num_tokens_from_string
def be_children(obj: dict, keyset:set): def be_children(obj: dict, keyset:set):
if isinstance(obj, str):
obj = [obj]
if isinstance(obj, list):
for i in obj: keyset.add(i)
return [{"id": i, "children":[]} for i in obj]
arr = [] arr = []
for k,v in obj.items(): for k,v in obj.items():
k = re.sub(r"\*+", "", k) k = re.sub(r"\*+", "", k)
@ -65,7 +71,8 @@ def graph_merge(g1, g2):
def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]): def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, entity_types=["organization", "person", "location", "event", "time"]):
llm_bdl = LLMBundle(tenant_id, LLMType.CHAT) _, tenant = TenantService.get_by_id(tenant_id)
llm_bdl = LLMBundle(tenant_id, LLMType.CHAT, tenant.llm_id)
ext = GraphExtractor(llm_bdl) ext = GraphExtractor(llm_bdl)
left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024 left_token_count = llm_bdl.max_length - ext.prompt_token_count - 1024
left_token_count = max(llm_bdl.max_length * 0.8, left_token_count) left_token_count = max(llm_bdl.max_length * 0.8, left_token_count)

View File

@ -13,7 +13,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import collections
import logging
import re
import logging import logging
import traceback import traceback
from concurrent.futures import ThreadPoolExecutor from concurrent.futures import ThreadPoolExecutor
@ -65,7 +67,7 @@ class MindMapExtractor:
try: try:
exe = ThreadPoolExecutor(max_workers=12) exe = ThreadPoolExecutor(max_workers=12)
threads = [] threads = []
token_count = self._llm.max_length * 0.7 token_count = max(self._llm.max_length * 0.8, self._llm.max_length-512)
texts = [] texts = []
res = [] res = []
cnt = 0 cnt = 0
@ -122,6 +124,19 @@ class MindMapExtractor:
continue continue
return data return data
def _todict(self, layer:collections.OrderedDict):
to_ret = layer
if isinstance(layer, collections.OrderedDict):
to_ret = dict(layer)
try:
for key, value in to_ret.items():
to_ret[key] = self._todict(value)
except AttributeError:
pass
return self._list_to_kv(to_ret)
def _process_document( def _process_document(
self, text: str, prompt_variables: dict[str, str] self, text: str, prompt_variables: dict[str, str]
) -> str: ) -> str:
@ -132,6 +147,7 @@ class MindMapExtractor:
text = perform_variable_replacements(self._mind_map_prompt, variables=variables) text = perform_variable_replacements(self._mind_map_prompt, variables=variables)
gen_conf = {"temperature": 0.5} gen_conf = {"temperature": 0.5}
response = self._llm.chat(text, [], gen_conf) response = self._llm.chat(text, [], gen_conf)
response = re.sub(r"```[^\n]*", "", response)
print(response) print(response)
print("---------------------------------------------------\n", markdown_to_json.dictify(response)) print("---------------------------------------------------\n", self._todict(markdown_to_json.dictify(response)))
return dict(markdown_to_json.dictify(response)) return self._todict(markdown_to_json.dictify(response))

View File

@ -14,28 +14,20 @@
# limitations under the License. # limitations under the License.
# #
MIND_MAP_EXTRACTION_PROMPT = """ MIND_MAP_EXTRACTION_PROMPT = """
- Role: You're a talented text processor. - Role: You're a talented text processor to summarize a piece of text into a mind map.
- Step of task: - Step of task:
1. Generate a title for user's 'TEXT' 1. Generate a title for user's 'TEXT'
2. Classify the 'TEXT' into sections as you see fit. 2. Classify the 'TEXT' into sections of a mind map.
3. If the subject matter is really complex, split them into sub-sections. 3. If the subject matter is really complex, split them into sub-sections and sub-subsections.
4. Add a short content summary of the bottom level section.
- Output requirement: - Output requirement:
- In language of - Always try to maximize the number of sub-sections.
- In the language of 'Text'
- MUST IN FORMAT OF MARKDOWN - MUST IN FORMAT OF MARKDOWN
Output:
## <Title>
<Section Name>
<Section Name>
<Subsection Name>
<Subsection Name>
<Section Name>
<Subsection Name>
-TEXT- -TEXT-
{input_text} {input_text}
Output:
""" """