fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
KevinHuSh 2024-03-04 17:08:35 +08:00 committed by GitHub
parent fae00827e6
commit 7bfaf0df29
11 changed files with 34 additions and 22 deletions


@@ -60,7 +60,8 @@ def list():
         for id in sres.ids:
             d = {
                 "chunk_id": id,
-                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get("content_with_weight", ""),
+                "content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get(
+                    "content_with_weight", ""),
                 "doc_id": sres.field[id]["doc_id"],
                 "docnm_kwd": sres.field[id]["docnm_kwd"],
                 "important_kwd": sres.field[id].get("important_kwd", []),
@@ -68,10 +69,12 @@ def list():
                 "available_int": sres.field[id].get("available_int", 1),
                 "positions": sres.field[id].get("position_int", "").split("\t")
             }
-            poss = []
-            for i in range(0, len(d["positions"]), 5):
-                poss.append([float(d["positions"][i]), float(d["positions"][i+1]), float(d["positions"][i+2]), float(d["positions"][i+3]), float(d["positions"][i+4])])
-            d["positions"] = poss
+            if len(d["positions"]) % 5 == 0:
+                poss = []
+                for i in range(0, len(d["positions"]), 5):
+                    poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
+                                 float(d["positions"][i + 3]), float(d["positions"][i + 4])])
+                d["positions"] = poss
             res["chunks"].append(d)
         return get_json_result(data=res)
     except Exception as e:
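
The % 5 guard above is the position-extraction fix itself. position_int stores a
chunk's layout boxes as tab-separated integers in groups of five
(page, left, right, top, bottom). An empty field splits to [""], whose length 1
is not a multiple of 5, so the old code crashed on float(""). A minimal sketch
of the parse, with the five-field layout read off this hunk and hypothetical
values:

    raw = "3\t56\t312\t100\t148"          # one box on page 3
    parts = raw.split("\t")
    if len(parts) % 5 == 0:               # skips "" -> [""] and truncated fields
        boxes = [[float(x) for x in parts[i:i + 5]]
                 for i in range(0, len(parts), 5)]
        # -> [[3.0, 56.0, 312.0, 100.0, 148.0]]
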
@@ -137,10 +140,10 @@ def set():
             return get_data_error_result(retmsg="Document not found!")
         if doc.parser_id == ParserType.QA:
-            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1]
+            arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
             if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
             q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
-            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a]))
+            d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))
         v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
         v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
@@ -189,7 +192,8 @@ def create():
     md5 = hashlib.md5()
     md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
     chunck_id = md5.hexdigest()
-    d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), "content_with_weight": req["content_with_weight"]}
+    d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
+         "content_with_weight": req["content_with_weight"]}
     d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
     d["important_kwd"] = req.get("important_kwd", [])
     d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))


@@ -527,7 +527,7 @@ class Dialog(DataBaseModel):
     tenant_id = CharField(max_length=32, null=False)
     name = CharField(max_length=255, null=True, help_text="dialog application name")
     description = TextField(null=True, help_text="Dialog description")
-    icon = CharField(max_length=16, null=False, help_text="dialog icon")
+    icon = TextField(null=True, help_text="icon base64 string")
     language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
     llm_id = CharField(max_length=32, null=False, help_text="default llm ID")
     llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7,
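
The icon column widens from a 16-character CharField to a nullable TextField
because, per the new help_text, the icon is now stored as a base64 string. A
minimal sketch of producing such a value (file name hypothetical):

    import base64

    with open("icon.png", "rb") as f:
        icon_b64 = base64.b64encode(f.read()).decode("ascii")
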


@@ -35,6 +35,7 @@ class HuParser:
             self.updown_cnt_mdl.set_param({"device": "cuda"})
         self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                                        filename="updown_concat_xgb.model"))
+        self.page_from = 0
         """
         If you have trouble downloading HuggingFace models, -_^ this might help!!
@@ -683,7 +684,7 @@ class HuParser:
                                                    "layoutno", "")))
             left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-            poss.append((pn, left, right, top, bott))
+            poss.append((pn+self.page_from, left, right, top, bott))
             return self.page_images[pn] \
                 .crop((left * ZM, top * ZM,
                        right * ZM, bott * ZM))
@@ -863,6 +864,7 @@ class HuParser:
         self.garbages = {}
         self.page_cum_height = [0]
         self.page_layout = []
+        self.page_from = page_from
         try:
             self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
             self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
@@ -947,7 +949,9 @@ class HuParser:
             left, right, top, bottom = float(left), float(
                 right), float(top), float(bottom)
             poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
-        if not poss: return
+        if not poss:
+            if need_position: return None, None
+            return
         max_width = np.max([right-left for (_, left, right, _, _) in poss])
         GAP = 6
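
The reshaped early return matters because a caller that passes
need_position=True unpacks two values; a bare return would hand it a single
None and raise at the call site. A sketch of the contract, assuming a call
shaped like "img, positions = self.crop(text, need_position=True)":

    if not poss:
        if need_position:
            return None, None   # keeps two-value unpacking safe
        return                  # positionless callers still get a plain None
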
@@ -969,7 +973,8 @@ class HuParser:
                         bottom, self.page_images[pns[0]].size[1])
                     ))
             )
-            positions.append((pns[0], left, right, top, min(
-                bottom, self.page_images[pns[0]].size[1])/ZM))
+            if 0 < ii < len(poss)-1:
+                positions.append((pns[0]+self.page_from, left, right, top, min(
+                    bottom, self.page_images[pns[0]].size[1])/ZM))
             bottom -= self.page_images[pns[0]].size[1]
             for pn in pns[1:]:
@@ -980,8 +985,9 @@ class HuParser:
                                self.page_images[pn].size[1])
                            ))
                 )
-                positions.append((pn, left, right, 0, min(
-                    bottom, self.page_images[pn].size[1]) / ZM))
+                if 0 < ii < len(poss) - 1:
+                    positions.append((pn+self.page_from, left, right, 0, min(
+                        bottom, self.page_images[pn].size[1]) / ZM))
                 bottom -= self.page_images[pn].size[1]
         if not imgs:
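
The hunks above all add the same offset, which is the heart of the position
bug: once a document is parsed in page windows (see the dispatcher change at
the end of this commit), self.page_images is indexed from 0 within the window,
so a recorded page index is only document-absolute after adding page_from. A
sketch with hypothetical numbers:

    page_from = 10                       # window start chosen by the dispatcher
    local_pn = 2                         # index into self.page_images
    absolute_pn = local_pn + page_from   # 12, the page position_int should record
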


@@ -10,7 +10,7 @@ PY=/root/miniconda3/envs/py11/bin/python
 function task_exe(){
     sleep 60;
-    while [ 1 -eq 1 ];do mpirun -n 2 --allow-run-as-root $PY rag/svr/task_executor.py ; done
+    while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done
 }
 function watch_broker(){


@@ -41,7 +41,7 @@ class Pdf(PdfParser):
         self._filter_forpages()
         self._merge_with_same_bullet()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
         callback(0.8, "Text extraction finished")


@@ -33,7 +33,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
         # clean mess
         for b in self.boxes:


@@ -40,7 +40,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.77, "Text merging finished")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
         cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
         #self._naive_vertical_merge()


@@ -48,7 +48,7 @@ class Pdf(PdfParser):
         self._concat_downward(concat_between_pages=False)
         self._filter_forpages()
         callback(0.75, "Text merging finished.")
-        tbls = self._extract_table_figure(True, zoomin, False, True)
+        tbls = self._extract_table_figure(True, zoomin, True, True)
         # clean mess
         if column_width < self.page_images[0].size[0] / zoomin / 2:
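
The same one-flag flip lands in four of the PDF-based parsers. If HuParser's
signature is _extract_table_figure(need_image, ZM, return_html, need_position),
the changed third argument switches extracted tables to HTML output; treat that
reading as an assumption, since the diff only shows positional booleans. A
hypothetical keyword-argument spelling of the new call:

    # assumed parameter names; the diff itself shows only positional booleans
    tbls = self._extract_table_figure(need_image=True, ZM=zoomin,
                                      return_html=True, need_position=True)
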


@@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。"):
         tk_nums[-1] += tnum
     for sec, pos in sections:
+        add_chunk(sec, pos)
+        continue
         s, e = 0, 1
         while e < len(sec):
             if sec[e] in delimiter:
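
This is the "remove delimiter for naive parser" half of the commit: add_chunk
plus continue short-circuits the delimiter-scanning loop below it, so each
section is appended whole and only the token budget decides chunk boundaries.
A minimal sketch of the resulting behavior (names simplified; the whitespace
split stands in for the real tokenizer):

    def naive_merge_sketch(sections, chunk_token_num=128):
        chunks, tk_nums = [""], [0]

        def add_chunk(text, tnum):
            if tk_nums[-1] > chunk_token_num:   # current chunk is full: start a new one
                chunks.append(text)
                tk_nums.append(tnum)
            else:                               # otherwise keep growing it
                chunks[-1] += text
                tk_nums[-1] += tnum

        for sec in sections:
            add_chunk(sec, len(sec.split()))    # whole section, no delimiter splitting
        return chunks
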


@@ -83,7 +83,7 @@ class Dealer:
         else:
             s = s.sort(
                 {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
-                {"top_int": {"order": "asc", "unmapped_type": "float"}},
+                {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
                 {"create_time": {"order": "desc", "unmapped_type": "date"}},
                 {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
             )
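
Adding "mode": "avg" matters because top_int can be multi-valued after the
position fix (one value per box); Elasticsearch needs a reduction mode to turn
several values into a single sort key, and averaging the box tops keeps chunks
ordered by where they sit on the page. The resulting sort clause in raw query
DSL:

    sort_clause = [
        {"page_num_int": {"order": "asc", "unmapped_type": "float"}},
        {"top_int": {"order": "asc", "unmapped_type": "float", "mode": "avg"}},
        {"create_time": {"order": "desc", "unmapped_type": "date"}},
        {"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}},
    ]
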


@@ -83,10 +83,10 @@ def dispatch():
                 pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
                 for s,e in r["parser_config"].get("pages", [(0,100000)]):
                     e = min(e, pages)
-                    for p in range(s, e, 10):
+                    for p in range(s, e, 5):
                         task = new_task()
                         task["from_page"] = p
-                        task["to_page"] = min(p + 10, e)
+                        task["to_page"] = min(p + 5, e)
                         tsks.append(task)
             else:
                 tsks.append(new_task())
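
Halving the window from 10 pages to 5 doubles task granularity, presumably to
match the larger executor pool in the entrypoint change above. A worked example
of the new windowing for a hypothetical 23-page PDF with the default
(0, 100000) range:

    pages, s, e = 23, 0, 100000
    e = min(e, pages)
    tasks = [(p, min(p + 5, e)) for p in range(s, e, 5)]
    # -> [(0, 5), (5, 10), (10, 15), (15, 20), (20, 23)]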