From 321a280031e89baf540c09b80f3f5e4009c3d72b Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 13 May 2025 14:30:36 +0800 Subject: [PATCH] Feat: add image preview to retrieval test. (#7610) ### What problem does this PR solve? #7608 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- conf/infinity_mapping.json | 4 +++- rag/app/manual.py | 4 +++- rag/app/picture.py | 3 ++- rag/app/presentation.py | 1 + rag/app/qa.py | 8 ++++++-- rag/nlp/__init__.py | 5 ++++- rag/nlp/search.py | 3 ++- 7 files changed, 21 insertions(+), 7 deletions(-) diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json index 1fc084c22..f6772852c 100644 --- a/conf/infinity_mapping.json +++ b/conf/infinity_mapping.json @@ -38,5 +38,7 @@ "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, "n_hop_with_weight": {"type": "varchar", "default": ""}, - "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} + "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}, + + "doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"} } diff --git a/rag/app/manual.py b/rag/app/manual.py index a05670a0e..e73b13c0a 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -265,7 +265,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res = tokenize_table(tbls, doc, eng) for text, image in ti_list: d = copy.deepcopy(doc) - d['image'] = image + if image: + d['image'] = image + d["doc_type_kwd"] = "image" tokenize(d, text, eng) res.append(d) return res diff --git a/rag/app/picture.py b/rag/app/picture.py index f5fb3ae72..db9bb3729 100644 --- a/rag/app/picture.py +++ b/rag/app/picture.py @@ -36,7 +36,8 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs): doc = { "docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)), - "image": img + "image": img, + "doc_type_kwd": "image" } bxs = ocr(np.array(img)) txt = "\n".join([t[0] for _, t in bxs if t[0]]) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 741470d5e..d3e4f021f 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -115,6 +115,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, d = copy.deepcopy(doc) pn += from_page d["image"] = img + d["doc_type_kwd"] = "image" d["page_num_int"] = [pn + 1] d["top_int"] = [0] d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])] diff --git a/rag/app/qa.py b/rag/app/qa.py index 986618989..7ce0afabc 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -271,7 +271,9 @@ def beAdocPdf(d, q, a, eng, image, poss): [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) d["content_ltks"] = rag_tokenizer.tokenize(q) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - d["image"] = image + if image: + d["image"] = image + d["doc_type_kwd"] = "image" add_positions(d, poss) return d @@ -283,7 +285,9 @@ def beAdocDocx(d, q, a, eng, image, row_num=-1): [qprefix + rmPrefix(q), aprefix + rmPrefix(a)]) d["content_ltks"] = rag_tokenizer.tokenize(q) d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"]) - d["image"] = image + if image: + d["image"] = image + d["doc_type_kwd"] = "image" if row_num >= 0: d["top_int"] = [row_num] return d diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py index a1edb3fe7..5b0d4ff36 100644 --- a/rag/nlp/__init__.py +++ b/rag/nlp/__init__.py @@ -301,6 +301,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10): d["content_with_weight"] = rows if img: d["image"] = img + d["doc_type_kwd"] = "image" if poss: add_positions(d, poss) res.append(d) @@ -310,7 +311,9 @@ def tokenize_table(tbls, doc, eng, batch_size=10): d = copy.deepcopy(doc) r = de.join(rows[i:i + batch_size]) tokenize(d, r, eng) - d["image"] = img + if img: + d["image"] = img + d["doc_type_kwd"] = "image" add_positions(d, poss) res.append(d) return res diff --git a/rag/nlp/search.py b/rag/nlp/search.py index c37b956f6..cf024a381 100644 --- a/rag/nlp/search.py +++ b/rag/nlp/search.py @@ -83,7 +83,7 @@ class Dealer: src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int", "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd", - "question_kwd", "question_tks", + "question_kwd", "question_tks", "doc_type_kwd", "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD]) kwds = set([]) @@ -417,6 +417,7 @@ class Dealer: "term_similarity": tsim[i], "vector": chunk.get(vector_column, zero_vector), "positions": position_int, + "doc_type_kwd": chunk.get("doc_type_kwd", "") } if highlight and sres.highlight: if id in sres.highlight: