mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 07:05:54 +08:00
fix position extraction bug (#93)
* fix position extraction bug * remove delimiter for naive parser
This commit is contained in:
parent
fae00827e6
commit
7bfaf0df29
@ -60,7 +60,8 @@ def list():
|
||||
for id in sres.ids:
|
||||
d = {
|
||||
"chunk_id": id,
|
||||
"content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get("content_with_weight", ""),
|
||||
"content_with_weight": rmSpace(sres.highlight[id]) if question else sres.field[id].get(
|
||||
"content_with_weight", ""),
|
||||
"doc_id": sres.field[id]["doc_id"],
|
||||
"docnm_kwd": sres.field[id]["docnm_kwd"],
|
||||
"important_kwd": sres.field[id].get("important_kwd", []),
|
||||
@ -68,9 +69,11 @@ def list():
|
||||
"available_int": sres.field[id].get("available_int", 1),
|
||||
"positions": sres.field[id].get("position_int", "").split("\t")
|
||||
}
|
||||
if len(d["positions"]) % 5 == 0:
|
||||
poss = []
|
||||
for i in range(0, len(d["positions"]), 5):
|
||||
poss.append([float(d["positions"][i]), float(d["positions"][i+1]), float(d["positions"][i+2]), float(d["positions"][i+3]), float(d["positions"][i+4])])
|
||||
poss.append([float(d["positions"][i]), float(d["positions"][i + 1]), float(d["positions"][i + 2]),
|
||||
float(d["positions"][i + 3]), float(d["positions"][i + 4])])
|
||||
d["positions"] = poss
|
||||
res["chunks"].append(d)
|
||||
return get_json_result(data=res)
|
||||
@ -137,10 +140,10 @@ def set():
|
||||
return get_data_error_result(retmsg="Document not found!")
|
||||
|
||||
if doc.parser_id == ParserType.QA:
|
||||
arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t)>1]
|
||||
arr = [t for t in re.split(r"[\n\t]", req["content_with_weight"]) if len(t) > 1]
|
||||
if len(arr) != 2: return get_data_error_result(retmsg="Q&A must be separated by TAB/ENTER key.")
|
||||
q, a = rmPrefix(arr[0]), rmPrefix[arr[1]]
|
||||
d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q+a]))
|
||||
d = beAdoc(d, arr[0], arr[1], not any([huqie.is_chinese(t) for t in q + a]))
|
||||
|
||||
v, c = embd_mdl.encode([doc.name, req["content_with_weight"]])
|
||||
v = 0.1 * v[0] + 0.9 * v[1] if doc.parser_id != ParserType.QA else v[1]
|
||||
@ -189,7 +192,8 @@ def create():
|
||||
md5 = hashlib.md5()
|
||||
md5.update((req["content_with_weight"] + req["doc_id"]).encode("utf-8"))
|
||||
chunck_id = md5.hexdigest()
|
||||
d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]), "content_with_weight": req["content_with_weight"]}
|
||||
d = {"id": chunck_id, "content_ltks": huqie.qie(req["content_with_weight"]),
|
||||
"content_with_weight": req["content_with_weight"]}
|
||||
d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
|
||||
d["important_kwd"] = req.get("important_kwd", [])
|
||||
d["important_tks"] = huqie.qie(" ".join(req.get("important_kwd", [])))
|
||||
|
@ -527,7 +527,7 @@ class Dialog(DataBaseModel):
|
||||
tenant_id = CharField(max_length=32, null=False)
|
||||
name = CharField(max_length=255, null=True, help_text="dialog application name")
|
||||
description = TextField(null=True, help_text="Dialog description")
|
||||
icon = CharField(max_length=16, null=False, help_text="dialog icon")
|
||||
icon = TextField(null=True, help_text="icon base64 string")
|
||||
language = CharField(max_length=32, null=True, default="Chinese", help_text="English|Chinese")
|
||||
llm_id = CharField(max_length=32, null=False, help_text="default llm ID")
|
||||
llm_setting = JSONField(null=False, default={"temperature": 0.1, "top_p": 0.3, "frequency_penalty": 0.7,
|
||||
|
@ -35,6 +35,7 @@ class HuParser:
|
||||
self.updown_cnt_mdl.set_param({"device": "cuda"})
|
||||
self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
|
||||
filename="updown_concat_xgb.model"))
|
||||
self.page_from = 0
|
||||
"""
|
||||
If you have trouble downloading HuggingFace models, -_^ this might help!!
|
||||
|
||||
@ -683,7 +684,7 @@ class HuParser:
|
||||
"layoutno", "")))
|
||||
|
||||
left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
|
||||
poss.append((pn, left, right, top, bott))
|
||||
poss.append((pn+self.page_from, left, right, top, bott))
|
||||
return self.page_images[pn] \
|
||||
.crop((left * ZM, top * ZM,
|
||||
right * ZM, bott * ZM))
|
||||
@ -863,6 +864,7 @@ class HuParser:
|
||||
self.garbages = {}
|
||||
self.page_cum_height = [0]
|
||||
self.page_layout = []
|
||||
self.page_from = page_from
|
||||
try:
|
||||
self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
|
||||
self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
|
||||
@ -947,7 +949,9 @@ class HuParser:
|
||||
left, right, top, bottom = float(left), float(
|
||||
right), float(top), float(bottom)
|
||||
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
|
||||
if not poss: return
|
||||
if not poss:
|
||||
if need_position: return None, None
|
||||
return
|
||||
|
||||
max_width = np.max([right-left for (_, left, right, _, _) in poss])
|
||||
GAP = 6
|
||||
@ -969,7 +973,8 @@ class HuParser:
|
||||
bottom, self.page_images[pns[0]].size[1])
|
||||
))
|
||||
)
|
||||
positions.append((pns[0], left, right, top, min(
|
||||
if 0 < ii < len(poss)-1:
|
||||
positions.append((pns[0]+self.page_from, left, right, top, min(
|
||||
bottom, self.page_images[pns[0]].size[1])/ZM))
|
||||
bottom -= self.page_images[pns[0]].size[1]
|
||||
for pn in pns[1:]:
|
||||
@ -980,7 +985,8 @@ class HuParser:
|
||||
self.page_images[pn].size[1])
|
||||
))
|
||||
)
|
||||
positions.append((pn, left, right, 0, min(
|
||||
if 0 < ii < len(poss) - 1:
|
||||
positions.append((pn+self.page_from, left, right, 0, min(
|
||||
bottom, self.page_images[pn].size[1]) / ZM))
|
||||
bottom -= self.page_images[pn].size[1]
|
||||
|
||||
|
@ -10,7 +10,7 @@ PY=/root/miniconda3/envs/py11/bin/python
|
||||
|
||||
function task_exe(){
|
||||
sleep 60;
|
||||
while [ 1 -eq 1 ];do mpirun -n 2 --allow-run-as-root $PY rag/svr/task_executor.py ; done
|
||||
while [ 1 -eq 1 ];do mpirun -n 4 --allow-run-as-root $PY rag/svr/task_executor.py ; done
|
||||
}
|
||||
|
||||
function watch_broker(){
|
||||
|
@ -41,7 +41,7 @@ class Pdf(PdfParser):
|
||||
self._filter_forpages()
|
||||
self._merge_with_same_bullet()
|
||||
callback(0.75, "Text merging finished.")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
callback(0.8, "Text extraction finished")
|
||||
|
||||
|
@ -33,7 +33,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
# clean mess
|
||||
for b in self.boxes:
|
||||
|
@ -40,7 +40,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.77, "Text merging finished")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
cron_logger.info("paddle layouts:".format((timer() - start) / (self.total_page + 0.1)))
|
||||
#self._naive_vertical_merge()
|
||||
|
@ -48,7 +48,7 @@ class Pdf(PdfParser):
|
||||
self._concat_downward(concat_between_pages=False)
|
||||
self._filter_forpages()
|
||||
callback(0.75, "Text merging finished.")
|
||||
tbls = self._extract_table_figure(True, zoomin, False, True)
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
|
||||
# clean mess
|
||||
if column_width < self.page_images[0].size[0] / zoomin / 2:
|
||||
|
@ -246,6 +246,8 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
|
||||
tk_nums[-1] += tnum
|
||||
|
||||
for sec, pos in sections:
|
||||
add_chunk(sec, pos)
|
||||
continue
|
||||
s, e = 0, 1
|
||||
while e < len(sec):
|
||||
if sec[e] in delimiter:
|
||||
|
@ -83,7 +83,7 @@ class Dealer:
|
||||
else:
|
||||
s = s.sort(
|
||||
{"page_num_int": {"order": "asc", "unmapped_type": "float"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float"}},
|
||||
{"top_int": {"order": "asc", "unmapped_type": "float", "mode" : "avg"}},
|
||||
{"create_time": {"order": "desc", "unmapped_type": "date"}},
|
||||
{"create_timestamp_flt": {"order": "desc", "unmapped_type": "float"}}
|
||||
)
|
||||
|
@ -83,10 +83,10 @@ def dispatch():
|
||||
pages = PdfParser.total_page_number(r["name"], MINIO.get(r["kb_id"], r["location"]))
|
||||
for s,e in r["parser_config"].get("pages", [(0,100000)]):
|
||||
e = min(e, pages)
|
||||
for p in range(s, e, 10):
|
||||
for p in range(s, e, 5):
|
||||
task = new_task()
|
||||
task["from_page"] = p
|
||||
task["to_page"] = min(p + 10, e)
|
||||
task["to_page"] = min(p + 5, e)
|
||||
tsks.append(task)
|
||||
else:
|
||||
tsks.append(new_task())
|
||||
|
Loading…
x
Reference in New Issue
Block a user