Mirror of https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git (synced 2025-08-14 10:15:55 +08:00)
fix position extraction bug (#93)

* fix position extraction bug
* remove delimiter for naive parser

parent fae00827e6
commit 7bfaf0df29
@@ -60,7 +60,8 @@ def list():
         for id in sres.ids:
             d = {
                 "chunk_id": id,
-                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get("content_with_weight", ""),
+                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get(
+                    "content_with_weight", ""),
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),
@@ -68,10 +69,12 @@ def list():
                 "available_int": sres.field[id].get("available_int", 1),
                 "positions": sres.field[id].get("position_int", "").split("\t")
             }
-            poss = []
-            for i in range(0, len(d["positions"]), 5):
-                poss.append([float(d["positions"][i]), float(d["positions"][i+1]), float(d["positions"][i+2]), float(d["positions"][i+3]), float(d["positions"][i+4])])
-            d["positions"] = poss
+            if len(d["positions"]) % 5 == 0:
+                poss = []
+                for i in range(0, len(d["positions"]), 5):
+                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                d["positions"] = poss
             res["chunks"].append(d)
         return get_json_result(data=res)
     except Exception as e:
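Note: `position_int` is stored as one tab-joined string in which every box contributes five values (page, left, right, top, bottom), so the `% 5` guard added here simply skips strings that do not decode into whole boxes. A minimal sketch of the same decoding, using a hypothetical field value:

    # Sketch of the decoding done in list(); the input string is hypothetical.
    position_int = "3\t72.0\t520.5\t100.2\t140.8\t4\t72.0\t519.0\t90.0\t130.5"

    tokens = position_int.split("\t")
    positions = []
    if len(tokens) % 5 == 0:  # guard added by this commit
        for i in range(0, len(tokens), 5):
            # each group of five becomes [page, left, right, top, bottom]
            positions.append([float(t) for t in tokens[i:i + 5]])

    print(positions)
    # [[3.0, 72.0, 520.5, 100.2, 140.8], [4.0, 72.0, 519.0, 90.0, 130.5]]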
@@ -137,10 +140,10 @@ def set():
             return get_data_error_result(retmsg="Document not found!")

         if doc.parser_id == ParserType.QA:
-            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1]
+            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
             if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
             q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
-            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a]))
+            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))

         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ -189,7 +192,8 @@ def create():
         md5 = hashlib.md5()
         md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
         chunck_id = md5.hexdigest()
-        d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), "content_with_weight": req["content_with_weight"]}
+        d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+             "content_with_weight": req["content_with_weight"]}
         d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
         d["important_kwd"] = req.get("important_kwd", [])
         d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
@@ -527,7 +527,7 @@ class Dialog(DataBaseModel):
     tenant_id = CharField(max_length=32, null=False)
     name = CharField(max_length=255, null=True, help_text="dialog application name")
     description = TextField(null=True, help_text="Dialog description")
-    icon = CharField(max_length=16, null=False, help_text="dialog icon")
+    icon = TextField(null=True, help_text="icon base64 string")
     language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
     llm_id = CharField(max_length=32, null=False, help_text="default llm ID")
     llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7,
@@ -35,6 +35,7 @@ class HuParser:
             self.updown_cnt_mdl.set_param({"device": "cuda"})
         self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                        filename="updown_concat_xgb.model"))
+        self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!

@@ -683,7 +684,7 @@ class HuParser:
                                                    "layoutno", "")))

         left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-        poss.append((pn, left, right, top, bott))
+        poss.append((pn+self.page_from, left, right, top, bott))
         return self.page_images[pn] \
             .crop((left * ZM, top * ZM,
                    right * ZM, bott * ZM))
@@ -863,6 +864,7 @@ class HuParser:
         self.garbages = {}
         self.page_cum_height = [0]
         self.page_layout = []
+        self.page_from = page_from
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
@@ -947,7 +949,9 @@ class HuParser:
             left, right, top, bottom = float(left), float(
                 right), float(top), float(bottom)
             poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
-        if not poss: return
+        if not poss:
+            if need_position: return None, None
+            return

         max_width = np.max([right-left for (_, left, right, _, _) in poss])
         GAP = 6
@@ -969,7 +973,8 @@ class HuParser:
                             bottom, self.page_images[pns[0]].size[1])
                         ))
             )
-        positions.append((pns[0], left, right, top, min(
-            bottom, self.page_images[pns[0]].size[1])/ZM))
+        if 0 < ii < len(poss)-1:
+            positions.append((pns[0]+self.page_from, left, right, top, min(
+                bottom, self.page_images[pns[0]].size[1])/ZM))
         bottom -= self.page_images[pns[0]].size[1]
         for pn in pns[1:]:
@@ -980,8 +985,9 @@ class HuParser:
                             self.page_images[pn].size[1])
                         ))
             )
-            positions.append((pn, left, right, 0, min(
-                bottom, self.page_images[pn].size[1]) / ZM))
+            if 0 < ii < len(poss) - 1:
+                positions.append((pn+self.page_from, left, right, 0, min(
+                    bottom, self.page_images[pn].size[1]) / ZM))
             bottom -= self.page_images[pn].size[1]

         if not imgs:
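Note: the `pn + self.page_from` adjustments above, together with the new `self.page_from` attribute, turn page indices that are relative to the parsed slice back into absolute page numbers, so positions extracted by a task that starts mid-document point at the right pages. A small sketch of the offset, with hypothetical values:

    # Sketch: a task that parses pages [page_from, page_to) sees the PDF as a
    # slice, so index 0 of self.page_images is really page page_from.
    page_from = 10                               # hypothetical task offset
    boxes = [(0, 72.0, 520.0, 100.0, 140.0),     # (relative page, left, right, top, bottom)
             (2, 72.0, 519.0, 90.0, 130.0)]

    absolute = [(pn + page_from, l, r, t, b) for pn, l, r, t, b in boxes]
    print(absolute)                              # pages 10 and 12 instead of 0 and 2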
@@ -10,7 +10,7 @@ PY=/root/miniconda3/envs/py11/bin/python

 function task_exe(){
     sleep 60;
-    while [ 1 -eq 1 ];do mpirun -n 2 --allow-run-as-root $PY rag/svr/task_executor.py ; done
+    while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done
 }

 function watch_broker(){
@@ -41,7 +41,7 @@ class Pdf(PdfParser):
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         callback(0.8, "Text extraction finished")

@@ -33,7 +33,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess
         for b in self.boxes:
@@ -40,7 +40,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
         #self._naive_vertical_merge()
@@ -48,7 +48,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)

         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
@@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
             tk_nums[-1] += tnum

     for sec, pos in sections:
+        add_chunk(sec, pos)
+        continue
         s, e = 0, 1
         while e < len(sec):
             if sec[e] in delimiter:
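Note: this hunk is the "remove delimiter for naive parser" part of the commit. `naive_merge` now hands each section to `add_chunk` whole and `continue`s, so the delimiter-based splitting that follows in the loop body is no longer reached. A simplified sketch of the resulting control flow (the real `add_chunk` also balances chunks by token count, which is omitted here):

    # Simplified sketch of naive_merge after this commit.
    def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
        chunks, positions = [], []

        def add_chunk(text, pos):
            chunks.append(text)      # the real helper also merges by token budget
            positions.append(pos)

        for sec, pos in sections:
            add_chunk(sec, pos)
            continue                 # in the real function, delimiter splitting follows and is skipped

        return chunks, positions

    print(naive_merge([("First section. More text.", "0-0"),
                       ("Second section.", "0-1")]))
    # (['First section. More text.', 'Second section.'], ['0-0', '0-1'])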
@@ -83,7 +83,7 @@ class Dealer:
         else:
             s = s.sort(
                 {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
-                {"top_int": {"order": "asc", "unmapped_type": "float"}},
+                {"top_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
                 {"create_time": {"order": "desc", "unmapped_type": "date"}},
                 {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
             )
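Note: a chunk that spans several boxes can carry several `top_int` values, and a sort needs a single key per document, so the added `"mode" : "avg"` (presumably Elasticsearch's sort mode of the same name) asks the engine to order such multi-valued chunks by their average top coordinate rather than an arbitrary one.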
@@ -83,10 +83,10 @@ def dispatch():
             pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
             for s,e in r["parser_config"].get("pages", [(0,100000)]):
                 e = min(e, pages)
-                for p in range(s, e, 10):
+                for p in range(s, e, 5):
                     task = new_task()
                     task["from_page"] = p
-                    task["to_page"] = min(p + 10, e)
+                    task["to_page"] = min(p + 5, e)
                     tsks.append(task)
         else:
             tsks.append(new_task())
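Note: the dispatcher now cuts PDFs into 5-page tasks instead of 10-page ones, which pairs with the `mpirun -n 4` change above to spread parsing across more executor processes. A small sketch of the resulting slicing, with a hypothetical page count:

    # Sketch of the page slicing done by dispatch() after this commit.
    pages = 23                                   # hypothetical document length
    tasks = []
    for s, e in [(0, 100000)]:                   # default parser_config "pages"
        e = min(e, pages)
        for p in range(s, e, 5):
            tasks.append({"from_page": p, "to_page": min(p + 5, e)})

    print(tasks)
    # [{'from_page': 0, 'to_page': 5}, ..., {'from_page': 20, 'to_page': 23}]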