From 2452c5624f75077abd91f80601edc768f8c3e04b Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 5 Aug 2024 15:57:33 +0800 Subject: [PATCH] remove duplicated key in mind map (#1809) ### What problem does this PR solve? ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- graphrag/index.py | 15 ++++++++++----- rag/nlp/__init__.py | 1 + 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/graphrag/index.py b/graphrag/index.py index e0b5a1993..6c0a96e8f 100644 --- a/graphrag/index.py +++ b/graphrag/index.py @@ -29,14 +29,15 @@ from rag.nlp import rag_tokenizer from rag.utils import num_tokens_from_string -def be_children(obj: dict): +def be_children(obj: dict, keyset:set): arr = [] for k,v in obj.items(): k = re.sub(r"\*+", "", k) - if not k :continue + if not k or k in keyset:continue + keyset.add(k) arr.append({ "id": k, - "children": be_children(v) if isinstance(v, dict) else [] + "children": be_children(v, keyset) if isinstance(v, dict) else [] }) return arr @@ -142,8 +143,12 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent mg = mindmap(_chunks).output if not len(mg.keys()): return chunks - if len(mg.keys()) > 1: md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]} - else: md_map = {"id": re.sub(r"\*+", "", list(mg.keys())[0]), "children": be_children(list(mg.items())[1])} + if len(mg.keys()) > 1: + keyset = set([re.sub(r"\*+", "", k) for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]) + md_map = {"id": "root", "children": [{"id": re.sub(r"\*+", "", k), "children": be_children(v, keyset)} for k,v in mg.items() if isinstance(v, dict) and re.sub(r"\*+", "", k)]} + else: + k = re.sub(r"\*+", "", list(mg.keys())[0]) + md_map = {"id": k, "children": be_children(list(mg.items())[0][1], set([k]))} print(json.dumps(md_map, ensure_ascii=False, indent=2)) chunks.append( { diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index af954f962..d82295e88 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -483,6 +483,7 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"): def add_chunk(t, pos): nonlocal cks, tk_nums, delimiter tnum = num_tokens_from_string(t) + if not pos: pos = "" if tnum < 8: pos = "" # Ensure that the length of the merged chunk does not exceed chunk_token_num