# Feat: add image preview to retrieval test. (#7610)

### What problem does this PR solve?

#7608

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

Commit 321a280031 (parent 5c9025918a)
```diff
@@ -38,5 +38,7 @@
     "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
     "n_hop_with_weight": {"type": "varchar", "default": ""},
-    "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
+    "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
+    "doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
 }
```
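For context, `whitespace-#` is the same analyzer the neighboring `*_kwd` keyword fields use, so `doc_type_kwd` is matched as a single keyword token. A sketch of what a chunk row might look like with the new field; the keys are taken from this commit's hunks, the values are illustrative:

```python
# Illustrative chunk payload; field names come from this commit's diffs,
# the concrete values are made up for the example.
chunk = {
    "docnm_kwd": "report.pdf",
    "removed_kwd": "",
    "doc_type_kwd": "image",  # new: tags chunks that carry an image
}
```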
```diff
@@ -265,7 +265,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     res = tokenize_table(tbls, doc, eng)
     for text, image in ti_list:
         d = copy.deepcopy(doc)
-        d['image'] = image
+        if image:
+            d['image'] = image
+            d["doc_type_kwd"] = "image"
         tokenize(d, text, eng)
         res.append(d)
     return res
```
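The `if image:` guard introduced here reappears in the `beAdocPdf`, `beAdocDocx`, and `tokenize_table` hunks below. A hypothetical helper (not part of the codebase) that captures the repeated pattern:

```python
# Hypothetical helper, not in the codebase: attach the image and mark the
# chunk as an image chunk only when an image actually exists, so text-only
# chunks never carry a stale image or a doc_type_kwd tag.
def tag_image(d: dict, image) -> dict:
    if image:
        d["image"] = image
        d["doc_type_kwd"] = "image"
    return d
```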
```diff
@@ -36,7 +36,8 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
-        "image": img
+        "image": img,
+        "doc_type_kwd": "image"
     }
     bxs = ocr(np.array(img))
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
```
```diff
@@ -115,6 +115,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         d = copy.deepcopy(doc)
         pn += from_page
         d["image"] = img
+        d["doc_type_kwd"] = "image"
         d["page_num_int"] = [pn + 1]
         d["top_int"] = [0]
         d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
```
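The tuple stored in `position_int` above packs the page number together with the image extents. A hedged reading of its layout; the `(page, left, right, top, bottom)` ordering is an assumption inferred from the values used, not something the diff confirms:

```python
from PIL import Image

# Assumed ordering: (page, left, right, top, bottom). The values written in
# the hunk above are (pn + 1, 0, width, 0, height), i.e. the full image area.
img = Image.new("RGB", (640, 480))
pn = 0
page, left, right, top, bottom = (pn + 1, 0, img.size[0], 0, img.size[1])
print(page, left, right, top, bottom)  # 1 0 640 0 480
```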
```diff
@@ -271,7 +271,9 @@ def beAdocPdf(d, q, a, eng, image, poss):
                          [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     add_positions(d, poss)
     return d
```
```diff
@@ -283,7 +285,9 @@ def beAdocDocx(d, q, a, eng, image, row_num=-1):
                          [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     if row_num >= 0:
         d["top_int"] = [row_num]
     return d
```
```diff
@@ -301,6 +301,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d["content_with_weight"] = rows
             if img:
                 d["image"] = img
+                d["doc_type_kwd"] = "image"
             if poss:
                 add_positions(d, poss)
             res.append(d)
@@ -310,7 +311,9 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
         d = copy.deepcopy(doc)
         r = de.join(rows[i:i + batch_size])
         tokenize(d, r, eng)
-        d["image"] = img
+        if img:
+            d["image"] = img
+            d["doc_type_kwd"] = "image"
         add_positions(d, poss)
         res.append(d)
     return res
```
```diff
@@ -83,7 +83,7 @@ class Dealer:
         src = req.get("fields",
                       ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
                        "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
-                       "question_kwd", "question_tks",
+                       "question_kwd", "question_tks", "doc_type_kwd",
                        "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
         kwds = set([])
```
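Because this list is only the default for `req.get("fields", ...)`, callers that pass their own `fields` bypass it and must now request the new field themselves. A sketch of the assumed request shape:

```python
# Assumed request shape: overriding "fields" replaces the default list above,
# so "doc_type_kwd" has to be asked for explicitly to reach the caller.
req = {
    "question": "quarterly revenue table",
    "fields": ["docnm_kwd", "img_id", "content_with_weight", "doc_type_kwd"],
}
```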
```diff
@@ -417,6 +417,7 @@ class Dealer:
                 "term_similarity": tsim[i],
                 "vector": chunk.get(vector_column, zero_vector),
                 "positions": position_int,
+                "doc_type_kwd": chunk.get("doc_type_kwd", "")
             }
             if highlight and sres.highlight:
                 if id in sres.highlight:
```
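With `doc_type_kwd` now present on every search hit, the retrieval-test view can branch on it to decide whether a hit is rendered as text or as an image preview. A minimal consumer-side sketch; the `render_hit` function, the hit shapes, and the image URL pattern are assumptions for illustration, not the actual front-end code:

```python
# Hypothetical consumer of the search hits built above; only "doc_type_kwd"
# and the field names come from the diff, everything else is illustrative.
def render_hit(chunk: dict) -> str:
    if chunk.get("doc_type_kwd") == "image":
        # img_id is part of the default field list fetched by Dealer;
        # the URL pattern here is an assumption, not a documented endpoint.
        return f"<img src='/v1/document/image/{chunk.get('img_id', '')}'>"
    return chunk.get("content_with_weight", "")

hits = [
    {"doc_type_kwd": "image", "img_id": "abc123"},
    {"doc_type_kwd": "", "content_with_weight": "plain text chunk"},
]
for h in hits:
    print(render_hit(h))
```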