Fix graphrag callback (#1806)

### What problem does this PR solve?

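Addresses #1800: passes `callback` through to `naive.chunk`, adds missing nodes to the copied graph in `graph_merge`, and removes the 512-chunk cap in `build_knowlege_graph_chunks`.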

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
H 2024-08-05 14:44:54 +08:00 committed by GitHub
parent 3da3260eb5
commit d2213141e0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 4 additions and 4 deletions

View File

@ -45,7 +45,7 @@ def graph_merge(g1, g2):
g = g2.copy()
for n, attr in g1.nodes(data=True):
if n not in g2.nodes():
g2.add_node(n, **attr)
g.add_node(n, **attr)
continue
g.nodes[n]["weight"] += 1
@ -75,7 +75,7 @@ def build_knowlege_graph_chunks(tenant_id: str, chunks: List[str], callback, ent
cnt = 0
threads = []
exe = ThreadPoolExecutor(max_workers=12)
for i in range(len(chunks[:512])):
for i in range(len(chunks)):
tkn_cnt = num_tokens_from_string(chunks[i])
if cnt+tkn_cnt >= left_token_count and texts:
threads.append(exe.submit(ext, texts, {"entity_types": entity_types}))

View File

@ -13,7 +13,7 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
eng = lang.lower() == "english"
parser_config["layout_recognize"] = False
sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, parser_config=parser_config)
sections = naive.chunk(filename, binary, from_page=from_page, to_page=to_page, section_only=True, callback=callback ,parser_config=parser_config)
chunks = build_knowlege_graph_chunks(tenant_id, sections, callback,
parser_config.get("entity_types", ["organization", "person", "location", "event", "time"])
)
@ -27,4 +27,4 @@ def chunk(filename, binary, tenant_id, from_page=0, to_page=100000,
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
chunks.extend(tokenize_chunks(sections, doc, eng))
return chunks
return chunks