diff --git a/graphrag/index.py b/graphrag/index.py index a60834719..e0b5a1993 100644 --- a/graphrag/index.py +++ b/graphrag/index.py @@ -45,7 +45,7 @@ def graph_merge(g1, g2): g = g2.copy() for n, attr in g1.nodes(data=True): if n not in g2.nodes(): - g2.add_node(n, **attr) + g.add_node(n, **attr) continue g.nodes[n]["weight"] += 1 @@ -75,7 +75,7 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent cnt = 0 threads = [] exe = ThreadPoolExecutor(max_workers=12) - for i in range(len(chunks[:512])): + for i in range(len(chunks)): tkn_cnt = num_tokens_from_string(chunks[i]) if cnt+tkn_cnt >= left_token_count and texts: threads.append(exe.submit(ext, texts, {"entity_types": entity_types})) diff --git a/rag/app/knowledge_graph.py b/rag/app/knowledge_graph.py index a8775f9cc..9f47769b0 100644 --- a/rag/app/knowledge_graph.py +++ b/rag/app/knowledge_graph.py @@ -13,7 +13,7 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, eng = lang.lower() == "english" parser_config["layout_recognize"] = False - sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, parser_config=parser_config) + sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, callback=callback ,parser_config=parser_config) chunks = build_knowlege_graph_chunks(tenant_id, sections, callback, parser_config.get("entity_types", ["organization", "person", "location", "event", "time"]) ) @@ -27,4 +27,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000, doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) chunks.extend(tokenize_chunks(sections, doc, eng)) - return chunks \ No newline at end of file + return chunks