From b540d41cdceab8217fa7daba80a6e3bdf2703b7f Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Tue, 15 Oct 2024 10:11:09 +0800 Subject: [PATCH] let presentation do raptor (#2838) ### What problem does this PR solve? #2837 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/apps/document_app.py | 5 +++-- rag/app/qa.py | 11 ++++++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 68e05dcf9..dc091eead 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -439,8 +439,9 @@ def change_parser(): else: return get_json_result(data=True) - if doc.type == FileType.VISUAL or re.search( - r"\.(ppt|pptx|pages)$", doc.name): + if ((doc.type == FileType.VISUAL and req["parser_id"] != "picture") + or (re.search( + r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation")): return get_data_error_result(retmsg="Not supported yet!") e = DocumentService.update_by_id(doc.id, diff --git a/rag/app/qa.py b/rag/app/qa.py index 8994b4f0f..fec6e6236 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -68,6 +68,7 @@ class Excel(ExcelParser): [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1]) return res + class Pdf(PdfParser): def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): @@ -155,6 +156,7 @@ class Pdf(PdfParser): if last_q: qai_list.append((last_q, last_a, *self.crop(last_tag, need_position=True))) return qai_list, tbls + def get_tbls_info(self, tbls, tbl_index): if tbl_index >= len(tbls): return 1, 0, 0, 0, 0, '@@0\t0\t0\t0\t0##', '' @@ -166,10 +168,13 @@ class Pdf(PdfParser): tbl_tag = "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ .format(tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom) tbl_text = ''.join(tbls[tbl_index][0][1]) - return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, tbl_text + return tbl_pn, tbl_left, tbl_right, tbl_top, tbl_bottom, tbl_tag, + + class Docx(DocxParser): def __init__(self): pass + def get_picture(self, document, paragraph): img = paragraph._element.xpath('.//pic:pic') if not img: @@ -242,6 +247,7 @@ class Docx(DocxParser): tbls.append(((None, html), "")) return qai_list, tbls + def rmPrefix(txt): return re.sub( r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE) @@ -258,6 +264,7 @@ def beAdocPdf(d, q, a, eng, image, poss): add_positions(d, poss) return d + def beAdocDocx(d, q, a, eng, image): qprefix = "Question: " if eng else "问题:" aprefix = "Answer: " if eng else "回答:" @@ -268,6 +275,7 @@ def beAdocDocx(d, q, a, eng, image): d["image"] = image return d + def beAdoc(d, q, a, eng): qprefix = "Question: " if eng else "问题:" aprefix = "Answer: " if eng else "回答:" @@ -282,6 +290,7 @@ def mdQuestionLevel(s): match = re.match(r'#*', s) return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s) + def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): """ Excel and csv(txt) format files are supported.