From 321a280031e89baf540c09b80f3f5e4009c3d72b Mon Sep 17 00:00:00 2001
From: Kevin Hu <kevinhu.sh@gmail.com>
Date: Tue, 13 May 2025 14:30:36 +0800
Subject: [PATCH] Feat: add image preview to retrieval test. (#7610)

### What problem does this PR solve?

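Related issue: #7608

Chunks that carry an image are now tagged with `doc_type_kwd = "image"`
(manual, picture, presentation and Q&A chunkers, plus `tokenize_table`),
and the retrieval results return `doc_type_kwd`, so the retrieval test can
tell which chunks have an image to preview. The new field is also added
to the Infinity mapping.

Note: in the manual and Q&A chunkers and in the batched path of
`tokenize_table`, the `image` key is now set only when an image is
actually present; previously it was always assigned.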

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 conf/infinity_mapping.json | 4 +++-
 rag/app/manual.py          | 4 +++-
 rag/app/picture.py         | 3 ++-
 rag/app/presentation.py    | 1 +
 rag/app/qa.py              | 8 ++++++--
 rag/nlp/__init__.py        | 5 ++++-
 rag/nlp/search.py          | 3 ++-
 7 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/conf/infinity_mapping.json b/conf/infinity_mapping.json
index 1fc084c22..f6772852c 100644
--- a/conf/infinity_mapping.json
+++ b/conf/infinity_mapping.json
@@ -38,5 +38,7 @@
 	"entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
 	"source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
 	"n_hop_with_weight": {"type": "varchar", "default": ""},
-	"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
+	"removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
+
+	"doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
 }
diff --git a/rag/app/manual.py b/rag/app/manual.py
index a05670a0e..e73b13c0a 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -265,7 +265,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         res = tokenize_table(tbls, doc, eng)
         for text, image in ti_list:
             d = copy.deepcopy(doc)
-            d['image'] = image
+            if image:
+                d['image'] = image
+                d["doc_type_kwd"] = "image"
             tokenize(d, text, eng)
             res.append(d)
         return res
diff --git a/rag/app/picture.py b/rag/app/picture.py
index f5fb3ae72..db9bb3729 100644
--- a/rag/app/picture.py
+++ b/rag/app/picture.py
@@ -36,7 +36,8 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
-        "image": img
+        "image": img,
+        "doc_type_kwd": "image"
     }
     bxs = ocr(np.array(img))
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
diff --git a/rag/app/presentation.py b/rag/app/presentation.py
index 741470d5e..d3e4f021f 100644
--- a/rag/app/presentation.py
+++ b/rag/app/presentation.py
@@ -115,6 +115,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             d = copy.deepcopy(doc)
             pn += from_page
             d["image"] = img
+            d["doc_type_kwd"] = "image"
             d["page_num_int"] = [pn + 1]
             d["top_int"] = [0]
             d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
diff --git a/rag/app/qa.py b/rag/app/qa.py
index 986618989..7ce0afabc 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -271,7 +271,9 @@ def beAdocPdf(d, q, a, eng, image, poss):
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     add_positions(d, poss)
     return d
 
@@ -283,7 +285,9 @@ def beAdocDocx(d, q, a, eng, image, row_num=-1):
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     if row_num >= 0:
         d["top_int"] = [row_num]
     return d
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index a1edb3fe7..5b0d4ff36 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -301,6 +301,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d["content_with_weight"] = rows
             if img:
                 d["image"] = img
+                d["doc_type_kwd"] = "image"
             if poss:
                 add_positions(d, poss)
             res.append(d)
@@ -310,7 +311,9 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d = copy.deepcopy(doc)
             r = de.join(rows[i:i + batch_size])
             tokenize(d, r, eng)
-            d["image"] = img
+            if img:
+                d["image"] = img
+                d["doc_type_kwd"] = "image"
             add_positions(d, poss)
             res.append(d)
     return res
diff --git a/rag/nlp/search.py b/rag/nlp/search.py
index c37b956f6..cf024a381 100644
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@@ -83,7 +83,7 @@ class Dealer:
         src = req.get("fields",
                       ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
                        "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
-                       "question_kwd", "question_tks",
+                       "question_kwd", "question_tks", "doc_type_kwd",
                        "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
         kwds = set([])
 
@@ -417,6 +417,7 @@ class Dealer:
                 "term_similarity": tsim[i],
                 "vector": chunk.get(vector_column, zero_vector),
                 "positions": position_int,
+                "doc_type_kwd": chunk.get("doc_type_kwd", "")
             }
             if highlight and sres.highlight:
                 if id in sres.highlight: