mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-14 01:25:52 +08:00
let presentation do raptor (#2838)
### What problem does this PR solve?

#2837

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
260d694bbc
commit
b540d41cdc
@ -439,8 +439,9 @@ def change_parser():
|
|||||||
else:
|
else:
|
||||||
return get_json_result(data=True)
|
return get_json_result(data=True)
|
||||||
|
|
||||||
if doc.type == FileType.VISUAL or re.search(
|
if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture")
|
||||||
r"\.(ppt|pptx|pages)$", doc.name):
|
or (re.search(
|
||||||
|
r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")):
|
||||||
return get_data_error_result(retmsg="Not supported yet!")
|
return get_data_error_result(retmsg="Not supported yet!")
|
||||||
|
|
||||||
e = DocumentService.update_by_id(doc.id,
|
e = DocumentService.update_by_id(doc.id,
|
||||||
|
@ -68,6 +68,7 @@ class Excel(ExcelParser):
|
|||||||
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
|
[rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
|
||||||
return res
|
return res
|
||||||
|
|
||||||
|
|
||||||
class Pdf(PdfParser):
|
class Pdf(PdfParser):
|
||||||
def __call__(self, filename, binary=None, from_page=0,
|
def __call__(self, filename, binary=None, from_page=0,
|
||||||
to_page=100000, zoomin=3, callback=None):
|
to_page=100000, zoomin=3, callback=None):
|
||||||
@ -155,6 +156,7 @@ class Pdf(PdfParser):
|
|||||||
if last_q:
|
if last_q:
|
||||||
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
|
qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True)))
|
||||||
return qai_list, tbls
|
return qai_list, tbls
|
||||||
|
|
||||||
def get_tbls_info(self, tbls, tbl_index):
|
def get_tbls_info(self, tbls, tbl_index):
|
||||||
if tbl_index >= len(tbls):
|
if tbl_index >= len(tbls):
|
||||||
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
|
return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', ''
|
||||||
@ -166,10 +168,13 @@ class Pdf(PdfParser):
|
|||||||
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
|
tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
|
||||||
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
|
.format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom)
|
||||||
tbl_text = ''.join(tbls[tbl_index][0][1])
|
tbl_text = ''.join(tbls[tbl_index][0][1])
|
||||||
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text
|
return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag,
|
||||||
|
|
||||||
|
|
||||||
class Docx(DocxParser):
|
class Docx(DocxParser):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def get_picture(self, document, paragraph):
|
def get_picture(self, document, paragraph):
|
||||||
img = paragraph._element.xpath('.//pic:pic')
|
img = paragraph._element.xpath('.//pic:pic')
|
||||||
if not img:
|
if not img:
|
||||||
@ -242,6 +247,7 @@ class Docx(DocxParser):
|
|||||||
tbls.append(((None, html), ""))
|
tbls.append(((None, html), ""))
|
||||||
return qai_list, tbls
|
return qai_list, tbls
|
||||||
|
|
||||||
|
|
||||||
def rmPrefix(txt):
|
def rmPrefix(txt):
|
||||||
return re.sub(
|
return re.sub(
|
||||||
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
|
r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)
|
||||||
@ -258,6 +264,7 @@ def beAdocPdf(d, q, a, eng, image, poss):
|
|||||||
add_positions(d, poss)
|
add_positions(d, poss)
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
def beAdocDocx(d, q, a, eng, image):
|
def beAdocDocx(d, q, a, eng, image):
|
||||||
qprefix = "Question: " if eng else "问题:"
|
qprefix = "Question: " if eng else "问题:"
|
||||||
aprefix = "Answer: " if eng else "回答:"
|
aprefix = "Answer: " if eng else "回答:"
|
||||||
@ -268,6 +275,7 @@ def beAdocDocx(d, q, a, eng, image):
|
|||||||
d["image"] = image
|
d["image"] = image
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
def beAdoc(d, q, a, eng):
|
def beAdoc(d, q, a, eng):
|
||||||
qprefix = "Question: " if eng else "问题:"
|
qprefix = "Question: " if eng else "问题:"
|
||||||
aprefix = "Answer: " if eng else "回答:"
|
aprefix = "Answer: " if eng else "回答:"
|
||||||
@ -282,6 +290,7 @@ def mdQuestionLevel(s):
|
|||||||
match = re.match(r'#*', s)
|
match = re.match(r'#*', s)
|
||||||
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
|
return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
|
||||||
|
|
||||||
|
|
||||||
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Excel and csv(txt) format files are supported.
|
Excel and csv(txt) format files are supported.
|
||||||
|
Loading…
x
Reference in New Issue
Block a user