Feat: add image preview to retrieval test. (#7610)

### What problem does this PR solve?

#7608

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

commit 321a280031 (parent 5c9025918a)

@@ -38,5 +38,7 @@
   "entity_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
   "source_id": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
   "n_hop_with_weight": {"type": "varchar", "default": ""},
-  "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
+  "removed_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"},
+
+  "doc_type_kwd": {"type": "varchar", "default": "", "analyzer": "whitespace-#"}
 }
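
The new `doc_type_kwd` entry follows the same shape as the other `*_kwd` keyword fields above. As a quick sanity check, here is a minimal sketch that loads the mapping and confirms the field is declared as a whitespace-analyzed varchar; the file path is an assumption, not taken from this diff.

```python
import json

# Hypothetical path; point this at wherever the mapping shown above lives.
MAPPING_PATH = "conf/infinity_mapping.json"

with open(MAPPING_PATH, encoding="utf-8") as f:
    mapping = json.load(f)

field = mapping.get("doc_type_kwd")
assert field is not None, "doc_type_kwd missing from mapping"
# Expect the same shape as removed_kwd / entity_type_kwd above.
assert field["type"] == "varchar"
assert field.get("analyzer") == "whitespace-#"
print("doc_type_kwd mapping:", field)
```
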
@@ -265,7 +265,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     res = tokenize_table(tbls, doc, eng)
     for text, image in ti_list:
         d = copy.deepcopy(doc)
-        d['image'] = image
+        if image:
+            d['image'] = image
+            d["doc_type_kwd"] = "image"
         tokenize(d, text, eng)
         res.append(d)
     return res
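
The same three-line guard recurs in several parsers in this commit: attach the image and tag the chunk only when an image actually exists. A minimal, self-contained sketch of the pattern (the helper name is mine, not part of the diff):

```python
import copy

def with_optional_image(doc: dict, image) -> dict:
    """Hypothetical helper mirroring the guard used across the parsers:
    copy the base chunk, and only when an image exists attach it and
    mark the chunk type so the UI knows it can show a preview."""
    d = copy.deepcopy(doc)
    if image:
        d["image"] = image
        d["doc_type_kwd"] = "image"
    return d

# Text-only chunks keep no image key and no doc_type_kwd.
base = {"docnm_kwd": "report.pdf"}
print(with_optional_image(base, None))           # {'docnm_kwd': 'report.pdf'}
print(with_optional_image(base, "<PIL image>"))  # adds image + doc_type_kwd
```
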
@@ -36,7 +36,8 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
-        "image": img
+        "image": img,
+        "doc_type_kwd": "image"
     }
     bxs = ocr(np.array(img))
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
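
Here the picture parser tags every OCR chunk as an image chunk unconditionally, since an image is always present. One detail from the context lines: the title tokens come from the filename with its extension stripped. A quick standalone check of that regex (plain `re`, nothing RAGFlow-specific):

```python
import re

def strip_extension(filename: str) -> str:
    # Same pattern as the title_tks line above: drop a trailing
    # ".<letters>" extension, leave everything else untouched.
    return re.sub(r"\.[a-zA-Z]+$", "", filename)

print(strip_extension("diagram.PNG"))     # diagram
print(strip_extension("archive.tar.gz"))  # archive.tar
print(strip_extension("no_extension"))    # no_extension
```
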
@@ -115,6 +115,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         d = copy.deepcopy(doc)
         pn += from_page
         d["image"] = img
+        d["doc_type_kwd"] = "image"
         d["page_num_int"] = [pn + 1]
         d["top_int"] = [0]
         d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
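
For page images the position tuple packs the page number and the image bounds. A small sketch of what `(pn + 1, 0, img.size[0], 0, img.size[1])` evaluates to, assuming `img` is a PIL image (so `Image.size` is `(width, height)`) and reading the tuple as `(page, left, right, top, bottom)`; that reading is my interpretation of the context lines, not stated in the diff:

```python
from PIL import Image

img = Image.new("RGB", (1280, 720))  # stand-in for a rendered page
pn, from_page = 0, 0
pn += from_page

position = (pn + 1, 0, img.size[0], 0, img.size[1])
# (page, left, right, top, bottom) -> (1, 0, 1280, 0, 720)
print(position)
```
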
@@ -271,7 +271,9 @@ def beAdocPdf(d, q, a, eng, image, poss):
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     add_positions(d, poss)
     return d
 
@@ -283,7 +285,9 @@ def beAdocDocx(d, q, a, eng, image, row_num=-1):
         [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
     d["content_ltks"] = rag_tokenizer.tokenize(q)
     d["content_sm_ltks"] = rag_tokenizer.fine_grained_tokenize(d["content_ltks"])
-    d["image"] = image
+    if image:
+        d["image"] = image
+        d["doc_type_kwd"] = "image"
     if row_num >= 0:
         d["top_int"] = [row_num]
     return d
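
With the guard in both `beAdocPdf` and `beAdocDocx`, a Q&A chunk without an accompanying image simply never gets an `image` or `doc_type_kwd` key instead of carrying an explicit null. A tiny sketch of what that means for downstream checks (plain dicts, no RAGFlow imports):

```python
def describe(chunk: dict) -> str:
    # Consumers can branch on key presence instead of testing for None.
    if chunk.get("doc_type_kwd") == "image":
        return "image-backed chunk, preview available"
    return "text-only chunk"

print(describe({"content_ltks": "what is ragflow"}))
print(describe({"content_ltks": "what is ragflow",
                "image": "<PIL image>",
                "doc_type_kwd": "image"}))
```
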
@@ -301,6 +301,7 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d["content_with_weight"] = rows
             if img:
                 d["image"] = img
+                d["doc_type_kwd"] = "image"
            if poss:
                add_positions(d, poss)
            res.append(d)

@@ -310,7 +311,9 @@ def tokenize_table(tbls, doc, eng, batch_size=10):
             d = copy.deepcopy(doc)
             r = de.join(rows[i:i + batch_size])
             tokenize(d, r, eng)
-            d["image"] = img
+            if img:
+                d["image"] = img
+                d["doc_type_kwd"] = "image"
             add_positions(d, poss)
             res.append(d)
     return res
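
In this batched branch of `tokenize_table`, rows are grouped `batch_size` at a time and joined with a delimiter (`de` in the context above) before tokenizing. A standalone sketch of that slicing, with a made-up delimiter since `de` is defined outside the hunk:

```python
rows = [f"row {i}" for i in range(7)]
batch_size = 3
de = "; "  # placeholder; the real delimiter is set outside this hunk

batches = [de.join(rows[i:i + batch_size]) for i in range(0, len(rows), batch_size)]
for r in batches:
    print(r)
# row 0; row 1; row 2
# row 3; row 4; row 5
# row 6
```
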
@@ -83,7 +83,7 @@ class Dealer:
         src = req.get("fields",
                       ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd", "position_int",
                        "doc_id", "page_num_int", "top_int", "create_timestamp_flt", "knowledge_graph_kwd",
-                       "question_kwd", "question_tks",
+                       "question_kwd", "question_tks", "doc_type_kwd",
                        "available_int", "content_with_weight", PAGERANK_FLD, TAG_FLD])
         kwds = set([])
 
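
`doc_type_kwd` is added to the default field list, so the search layer fetches it from the doc store whenever the caller does not pass its own `fields`. A minimal sketch of that defaulting behaviour (plain dict; the request shape is assumed):

```python
DEFAULT_FIELDS = ["docnm_kwd", "content_ltks", "img_id", "doc_type_kwd"]  # abridged

def resolve_fields(req: dict) -> list:
    # Mirrors src = req.get("fields", [...]) above: an explicit list wins,
    # otherwise the defaults (now including doc_type_kwd) are used.
    return req.get("fields", DEFAULT_FIELDS)

print(resolve_fields({}))                         # defaults, with doc_type_kwd
print(resolve_fields({"fields": ["docnm_kwd"]}))  # caller-provided list only
```

Note that a caller supplying its own field list would still need to include `doc_type_kwd` explicitly to get it back.
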
@@ -417,6 +417,7 @@ class Dealer:
                 "term_similarity": tsim[i],
                 "vector": chunk.get(vector_column, zero_vector),
                 "positions": position_int,
+                "doc_type_kwd": chunk.get("doc_type_kwd", "")
             }
             if highlight and sres.highlight:
                 if id in sres.highlight:
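
Each retrieval hit now carries `doc_type_kwd` (defaulting to an empty string), which is what lets the retrieval-test UI decide whether to offer an image preview. A sketch of a consumer picking out the previewable hits; the hit shape follows the dict built above, while the `img_id` handling is an assumption about how a front end might load the actual picture:

```python
hits = [
    {"doc_id": "a1", "doc_type_kwd": "image", "img_id": "kb1-chunk7", "term_similarity": 0.82},
    {"doc_id": "b2", "doc_type_kwd": "", "term_similarity": 0.91},
]

previewable = [h for h in hits if h.get("doc_type_kwd") == "image"]
for h in previewable:
    # A real client would resolve img_id to an image URL or blob here.
    print(f"show preview for {h['doc_id']} using img_id={h.get('img_id')}")
```
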