From 2452c5624f75077abd91f80601edc768f8c3e04b Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Mon, 5 Aug 2024 15:57:33 +0800
Subject: [PATCH] remove duplicated key in mind map (#1809)

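### What problem does this PR solve?

The generated mind map could contain the same key more than once.
`be_children` now receives a shared `keyset` and skips any key (after
stripping `*` runs) that has already been emitted, so each node id
appears only once in the tree.

This also fixes the single-root case in `build_knowlege_graph_chunks`,
which indexed `list(mg.items())[1]` on a one-item dict instead of taking
the value of the first item (`list(mg.items())[0][1]`), and makes
`add_chunk` in `naive_merge` normalize a falsy `pos` to an empty string.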

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 graphrag/index.py   | 15 ++++++++++-----
 rag/nlp/__init__.py |  1 +
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/graphrag/index.py b/graphrag/index.py
index e0b5a1993..6c0a96e8f 100644
--- a/graphrag/index.py
+++ b/graphrag/index.py
@@ -29,14 +29,15 @@ from rag.nlp import rag_tokenizer
 from rag.utils import num_tokens_from_string
 
 
-def be_children(obj: dict):
+def be_children(obj: dict, keyset:set):
     arr = []
     for k,v in obj.items():
         k = re.sub(r"\*+", "", k)
-        if not k :continue
+        if not k or k in keyset:continue
+        keyset.add(k)
         arr.append({
             "id": k,
-            "children": be_children(v) if isinstance(v, dict) else []
+            "children": be_children(v, keyset) if isinstance(v, dict) else []
         })
     return arr
 
@@ -142,8 +143,12 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
     mg = mindmap(_chunks).output
     if not len(mg.keys()): return chunks
 
-    if len(mg.keys()) > 1: md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]}
-    else: md_map = {"id": re.sub(r"\*+", "", list(mg.keys())[0]), "children": be_children(list(mg.items())[1])}
+    if len(mg.keys()) > 1:
+        keyset = set([re.sub(r"\*+", "", k) for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)])
+        md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v, keyset)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]}
+    else:
+        k = re.sub(r"\*+", "", list(mg.keys())[0])
+        md_map = {"id": k, "children": be_children(list(mg.items())[0][1], set([k]))}
     print(json.dumps(md_map, ensure_ascii=False, indent=2))
     chunks.append(
         {
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index af954f962..d82295e88 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -483,6 +483,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。；！？"):
     def add_chunk(t, pos):
         nonlocal cks, tk_nums, delimiter
         tnum = num_tokens_from_string(t)
+        if not pos: pos = ""
         if tnum < 8:
             pos = ""
         # Ensure that the length of the merged chunk does not exceed chunk_token_num