refine presentation parser (#110)
Commit: 436c52bbc5
Parent: 2d7c9080f4
@@ -212,14 +212,17 @@ def chat(dialog, messages, **kwargs):
     if "max_tokens" in gen_conf:
         gen_conf["max_tokens"] = min(gen_conf["max_tokens"], llm.max_tokens - used_token_count)
     answer = chat_mdl.chat(prompt_config["system"].format(**kwargs), msg, gen_conf)
+    stat_logger.info("User: {}|Assistant: {}".format(msg[-1]["content"], answer))
 
     if knowledges:
-        answer = retrievaler.insert_citations(answer,
+        answer, idx = retrievaler.insert_citations(answer,
                                               [ck["content_ltks"] for ck in kbinfos["chunks"]],
                                               [ck["vector"] for ck in kbinfos["chunks"]],
                                               embd_mdl,
                                               tkweight=1 - dialog.vector_similarity_weight,
                                               vtweight=dialog.vector_similarity_weight)
+        idx = set([kbinfos["chunks"][int(i)]["doc_id"] for i in idx])
+        kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in idx]
     for c in kbinfos["chunks"]:
         if c.get("vector"): del c["vector"]
     return {"answer": answer, "reference": kbinfos}
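Note: this hunk logs each exchange and narrows the returned references to documents the answer actually cites. insert_citations now also returns the indices of the cited chunks; those are mapped to doc_ids and used to filter kbinfos["doc_aggs"]. A minimal sketch of the filtering step, on hypothetical chunk data:

    # Hypothetical data; idx stands in for the chunk indices insert_citations returns.
    kbinfos = {
        "chunks": [{"doc_id": "doc_a"}, {"doc_id": "doc_b"}, {"doc_id": "doc_a"}],
        "doc_aggs": [{"doc_id": "doc_a", "count": 2}, {"doc_id": "doc_b", "count": 1}],
    }
    idx = {0, 2}  # only chunks 0 and 2 were cited in the answer

    cited = set(kbinfos["chunks"][int(i)]["doc_id"] for i in idx)
    kbinfos["doc_aggs"] = [d for d in kbinfos["doc_aggs"] if d["doc_id"] in cited]
    print(kbinfos["doc_aggs"])  # [{'doc_id': 'doc_a', 'count': 2}]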
@@ -88,20 +88,25 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
     res = []
     if re.search(r"\.pptx?$", filename, re.IGNORECASE):
         ppt_parser = Ppt()
-        for txt,img in ppt_parser(filename if not binary else binary, from_page, 1000000, callback):
+        for pn, (txt,img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
             d = copy.deepcopy(doc)
+            pn += from_page
             d["image"] = img
-            tokenize(d, txt, ppt_parser.is_english)
+            d["page_num_int"] = [pn+1]
+            d["top_int"] = [0]
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
+            tokenize(d, txt, eng)
             res.append(d)
         return res
     elif re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf()
         for pn, (txt,img) in enumerate(pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)):
             d = copy.deepcopy(doc)
+            pn += from_page
             d["image"] = img
             d["page_num_int"] = [pn+1]
             d["top_int"] = [0]
-            d["position_int"].append((pn + 1, 0, img.size[0], 0, img.size[1]))
+            d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]
             tokenize(d, txt, eng)
             res.append(d)
         return res
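Note: the PPT branch now emits the same per-page metadata as the PDF branch (1-based page number, top offset, and a full-slide bounding box), offsets pn by from_page so numbering stays correct when chunking starts mid-deck, and switches the tokenizer flag from ppt_parser.is_english to the shared eng flag. The PDF branch also fixes a latent bug: d["position_int"].append(...) assumed the key already existed on the deep-copied doc and could raise KeyError, while assigning a fresh one-element list cannot. A sketch of the metadata layout, assuming a PIL image whose .size is (width, height) and a (page, left, right, top, bottom) tuple order:

    from PIL import Image

    img = Image.new("RGB", (960, 540))  # hypothetical rendered slide
    pn = 3                              # 0-based index after the from_page offset

    d = {}
    d["page_num_int"] = [pn + 1]  # 1-based page number
    d["top_int"] = [0]            # the slide starts at the top of the page
    d["position_int"] = [(pn + 1, 0, img.size[0], 0, img.size[1])]  # whole slide as one box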
@@ -243,7 +243,7 @@ class Dealer:
             res += f" ##{c}$$"
             seted.add(c)
 
-        return res
+        return res, seted
 
     def rerank(self, sres, query, tkweight=0.3,
                vtweight=0.7, cfield="content_ltks"):
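Note: insert_citations now returns the set of cited chunk indices (seted) alongside the annotated text; this is what the `answer, idx = retrievaler.insert_citations(...)` unpacking in the first hunk consumes. A hypothetical example of the new return shape:

    # The text carries ##<chunk_idx>$$ markers; the set lists the cited indices.
    answer, cited = "Slides are split per page ##0$$, PDFs per chunk ##2$$", {0, 2}
    assert all(f"##{i}$$" in answer for i in cited)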
@@ -290,7 +290,7 @@ class Dealer:
                 start_idx -= 1
                 if start_idx >= 0:
                     continue
-            if len(ranks["chunks"]) == page_size:
+            if len(ranks["chunks"]) >= page_size:
                 if aggs:
                     continue
                 break
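Note: switching the stop condition from == to >= guards against overshoot. If ranks["chunks"] ever grows past page_size, an equality test never fires again and the loop keeps accumulating; >= still terminates. A toy illustration:

    page_size = 2
    chunks = ["c1", "c2", "c3"]      # hypothetically one past the page size
    print(len(chunks) == page_size)  # False: the old condition is skipped
    print(len(chunks) >= page_size)  # True: the new condition still stops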
@@ -322,7 +322,7 @@ class Dealer:
             if dnm not in ranks["doc_aggs"]:
                 ranks["doc_aggs"][dnm] = {"doc_id": did, "count": 0}
             ranks["doc_aggs"][dnm]["count"] += 1
-        ranks["doc_aggs"] = []#[{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
+        ranks["doc_aggs"] = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]} for k,v in sorted(ranks["doc_aggs"].items(), key=lambda x:x[1]["count"]*-1)]
 
         return ranks
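Note: this re-enables the previously commented-out aggregation, flattening the {doc_name: {doc_id, count}} mapping into a list sorted by citation count, descending. The same comprehension on hypothetical data:

    doc_aggs = {
        "slides.pptx": {"doc_id": "d1", "count": 3},
        "paper.pdf": {"doc_id": "d2", "count": 5},
    }
    flat = [{"doc_name": k, "doc_id": v["doc_id"], "count": v["count"]}
            for k, v in sorted(doc_aggs.items(), key=lambda x: x[1]["count"] * -1)]
    print([d["doc_name"] for d in flat])  # ['paper.pdf', 'slides.pptx']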