From 6f30397bb52e12863700074366f395b4c7ade922 Mon Sep 17 00:00:00 2001
From: Kevin Hu
Date: Mon, 27 Jan 2025 18:35:18 +0800
Subject: [PATCH] Infinity adapt to graphrag. (#4663)

### What problem does this PR solve?

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 conf/infinity_mapping.json   | 3 +--
 deepdoc/parser/pdf_parser.py | 8 +++-----
 graphrag/utils.py            | 6 +++---
 3 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json
index 85d415974..f71ad65f7 100644
--- a/conf/infinity_mapping.json
+++ b/conf/infinity_mapping.json
@@ -32,12 +32,11 @@
   "pagerank_fea": {"type": "integer", "default": 0},
   "tag_feas": {"type": "varchar", "default": ""},
 
-  "important_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "from_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "to_entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "entity_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"},
-  "source_id": {"type": "varchar", "default": ""},
+  "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace"},
   "n_hop_with_weight": {"type": "varchar", "default": ""},
   "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace"}
 }
diff --git a/deepdoc/parser/pdf_parser.py b/deepdoc/parser/pdf_parser.py
index 114339fa5..990c0355c 100644
--- a/deepdoc/parser/pdf_parser.py
+++ b/deepdoc/parser/pdf_parser.py
@@ -956,8 +956,6 @@ class RAGFlowPdfParser:
             fnm, str) else pdfplumber.open(BytesIO(fnm))
         self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
                             enumerate(self.pdf.pages[page_from:page_to])]
-        self.page_images_x2 = [p.to_image(resolution=72 * zoomin * 2).annotated for i, p in
-                               enumerate(self.pdf.pages[page_from:page_to])]
         try:
             self.page_chars = [[{**c, 'top': c['top'], 'bottom': c['bottom']} for c in page.dedupe_chars().chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
         except Exception as e:
@@ -997,7 +995,7 @@ class RAGFlowPdfParser:
             self.is_english = False
 
         # st = timer()
-        for i, img in enumerate(self.page_images_x2):
+        for i, img in enumerate(self.page_images):
             chars = self.page_chars[i] if not self.is_english else []
             self.mean_height.append(
                 np.median(sorted([c["height"] for c in chars])) if chars else 0
@@ -1005,7 +1003,7 @@ class RAGFlowPdfParser:
             self.mean_width.append(
                 np.median(sorted([c["width"] for c in chars])) if chars else 8
             )
-            self.page_cum_height.append(img.size[1] / zoomin/2)
+            self.page_cum_height.append(img.size[1] / zoomin)
             j = 0
             while j + 1 < len(chars):
                 if chars[j]["text"] and chars[j + 1]["text"] \
@@ -1015,7 +1013,7 @@ class RAGFlowPdfParser:
                         chars[j]["text"] += " "
                 j += 1
 
-            self.__ocr(i + 1, img, chars, zoomin*2)
+            self.__ocr(i + 1, img, chars, zoomin)
             if callback and i % 6 == 5:
                 callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")
         # print("OCR:", timer()-st)
diff --git a/graphrag/utils.py b/graphrag/utils.py
index 2fbe18d88..1c715072b 100644
--- a/graphrag/utils.py
+++ b/graphrag/utils.py
@@ -284,7 +284,7 @@ def set_entity(tenant_id, kb_id, embd_mdl, ent_name, meta):
             logging.exception(f"Fail to embed entity: {e}")
     if ebd is not None:
         chunk["q_%d_vec" % len(ebd)] = ebd
-    settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id))
+    settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)
 
 
 def get_relation(tenant_id, kb_id, from_ent_name, to_ent_name, size=1):
@@ -347,7 +347,7 @@ def set_relation(tenant_id, kb_id, embd_mdl, from_ent_name, to_ent_name, meta):
             logging.exception(f"Fail to embed entity relation: {e}")
     if ebd is not None:
         chunk["q_%d_vec" % len(ebd)] = ebd
-    settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id))
+    settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)
 
 
 def get_graph(tenant_id, kb_id):
@@ -382,7 +382,7 @@ def set_graph(tenant_id, kb_id, graph, docids):
         settings.docStoreConn.update({"knowledge_graph_kwd": "graph"}, chunk,
                                      search.index_name(tenant_id), kb_id)
     else:
-        settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id))
+        settings.docStoreConn.insert([{"id": chunk_id(chunk), **chunk}], search.index_name(tenant_id), kb_id)
 
 
 def is_continuous_subsequence(subseq, seq):
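
For context on the `graphrag/utils.py` changes: all three call sites now forward the knowledge-base id to `docStoreConn.insert`, so the Infinity backend can scope graph chunks (entities, relations, the serialized graph) to the per-KB table rather than only the tenant index. Below is a minimal usage sketch of that call shape; the import paths are assumed to mirror the module names referenced in the patch, and the tenant/KB ids and chunk fields are placeholders, not values from this PR.

```python
# Sketch of the kb-scoped insert used by set_entity / set_relation / set_graph.
# Import paths are assumptions mirroring the names used in the diff;
# every concrete value below is a placeholder for illustration.
from api import settings
from rag.nlp import search
from graphrag.utils import chunk_id

tenant_id = "tenant_xyz"   # hypothetical tenant id
kb_id = "kb_123"           # knowledge-base id, now passed through to the doc store

chunk = {
    "knowledge_graph_kwd": "entity",   # illustrative graph chunk
    "entity_kwd": "SOME_ENTITY",
    "kb_id": kb_id,
    "content_with_weight": "{}",       # serialized metadata placeholder
}

settings.docStoreConn.insert(
    [{"id": chunk_id(chunk), **chunk}],   # same payload shape as in the patch
    search.index_name(tenant_id),         # tenant-level index / table name
    kb_id,                                # new third argument: scopes the write to this KB
)
```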