From 3c1444ab1995757fe22ff9f3ea8e7f145b723373 Mon Sep 17 00:00:00 2001
From: Zhedong Cen
Date: Thu, 20 Jun 2024 17:03:02 +0800
Subject: [PATCH] Add docx support for manual parser (#1227)

### What problem does this PR solve?

Add docx support for manual parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
---
 rag/app/manual.py   | 255 +++++++++++++++++++++++++++++++-------------
 rag/app/qa.py       |  12 +--
 rag/nlp/__init__.py |   6 ++
 3 files changed, 189 insertions(+), 84 deletions(-)

diff --git a/rag/app/manual.py b/rag/app/manual.py
index 240463e3c..f3a5fa41e 100644
--- a/rag/app/manual.py
+++ b/rag/app/manual.py
@@ -18,10 +18,13 @@
 import copy
 import re
 
 from api.db import ParserType
-from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks
+from io import BytesIO
+from rag.nlp import rag_tokenizer, tokenize, tokenize_table, add_positions, bullets_category, title_frequency, tokenize_chunks, docx_question_level
 from deepdoc.parser import PdfParser, PlainParser
 from rag.utils import num_tokens_from_string
-
+from deepdoc.parser import PdfParser, ExcelParser, DocxParser
+from docx import Document
+from PIL import Image
 
 class Pdf(PdfParser):
     def __init__(self):
@@ -64,6 +67,98 @@ class Pdf(PdfParser):
         return [(b["text"], b.get("layout_no", ""), self.get_position(b, zoomin))
                 for i, b in enumerate(self.boxes)], tbls
 
+class Docx(DocxParser):
+    def __init__(self):
+        pass
+    def get_picture(self, document, paragraph):
+        img = paragraph._element.xpath('.//pic:pic')
+        if not img:
+            return None
+        img = img[0]
+        embed = img.xpath('.//a:blip/@r:embed')[0]
+        related_part = document.part.related_parts[embed]
+        image = related_part.image
+        image = Image.open(BytesIO(image.blob))
+        return image
+    def concat_img(self, img1, img2):
+        if img1 and not img2:
+            return img1
+        if not img1 and img2:
+            return img2
+        if not img1 and not img2:
+            return None
+        width1, height1 = img1.size
+        width2, height2 = img2.size
+
+        new_width = max(width1, width2)
+        new_height = height1 + height2
+        new_image = Image.new('RGB', (new_width, new_height))
+
+        new_image.paste(img1, (0, 0))
+        new_image.paste(img2, (0, height1))
+
+        return new_image
+
+    def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
+        self.doc = Document(
+            filename) if not binary else Document(BytesIO(binary))
+        pn = 0
+        last_answer, last_image = "", None
+        question_stack, level_stack = [], []
+        ti_list = []
+        for p in self.doc.paragraphs:
+            if pn > to_page:
+                break
+            question_level, p_text = 0, ''
+            if from_page <= pn < to_page and p.text.strip():
+                question_level, p_text = docx_question_level(p)
+            if not question_level or question_level > 6: # not a question
+                last_answer = f'{last_answer}\n{p_text}'
+                current_image = self.get_picture(self.doc, p)
+                last_image = self.concat_img(last_image, current_image)
+            else: # is a question
+                if last_answer or last_image:
+                    sum_question = '\n'.join(question_stack)
+                    if sum_question:
+                        ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+                    last_answer, last_image = '', None
+
+                i = question_level
+                while question_stack and i <= level_stack[-1]:
+                    question_stack.pop()
+                    level_stack.pop()
+                question_stack.append(p_text)
+                level_stack.append(question_level)
+            for run in p.runs:
+                if 'lastRenderedPageBreak' in run._element.xml:
+                    pn += 1
+                    continue
+                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
+                    pn += 1
+        if last_answer:
+            sum_question = '\n'.join(question_stack)
+            if sum_question:
+                ti_list.append((f'{sum_question}\n{last_answer}', last_image))
+
+        tbls = []
+        for tb in self.doc.tables:
+            html = "<table>"
+            for r in tb.rows:
+                html += "<tr>"
+                i = 0
+                while i < len(r.cells):
+                    span = 1
+                    c = r.cells[i]
+                    for j in range(i + 1, len(r.cells)):
+                        if c.text == r.cells[j].text:
+                            span += 1
+                            i = j
+                    i += 1
+                    html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
+                html += "</tr>"
+            html += "</table>"
+            tbls.append(((None, html), ""))
+        return ti_list, tbls
 
 def chunk(filename, binary=None, from_page=0, to_page=100000,
           lang="Chinese", callback=None, **kwargs):
@@ -71,7 +166,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     """
         Only pdf is supported.
     """
     pdf_parser = None
-
+    doc = {
+        "docnm_kwd": filename
+    }
+    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
+    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
+    # is it English
+    eng = lang.lower() == "english" # pdf_parser.is_english
     if re.search(r"\.pdf$", filename, re.IGNORECASE):
         pdf_parser = Pdf() if kwargs.get(
             "parser_config", {}).get(
@@ -80,80 +181,84 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             from_page=from_page, to_page=to_page, callback=callback)
         if sections and len(sections[0]) < 3:
             sections = [(t, l, [[0] * 5]) for t, l in sections]
+        # set pivot using the most frequent type of title,
+        # then merge between 2 pivot
+        if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
+            max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
+            most_level = max(0, max_lvl - 1)
+            levels = []
+            for txt, _, _ in sections:
+                for t, lvl in pdf_parser.outlines:
+                    tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
+                    tks_ = set([txt[i] + txt[i + 1]
+                                for i in range(min(len(t), len(txt) - 1))])
+                    if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
+                        levels.append(lvl)
+                        break
+                else:
+                    levels.append(max_lvl + 1)
+        else:
+            bull = bullets_category([txt for txt, _, _ in sections])
+            most_level, levels = title_frequency(
+                bull, [(txt, l) for txt, l, poss in sections])
+
+        assert len(sections) == len(levels)
+        sec_ids = []
+        sid = 0
+        for i, lvl in enumerate(levels):
+            if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
+                sid += 1
+            sec_ids.append(sid)
+            # print(lvl, self.boxes[i]["text"], most_level, sid)
+
+        sections = [(txt, sec_ids[i], poss)
+                    for i, (txt, _, poss) in enumerate(sections)]
+        for (img, rows), poss in tbls:
+            if not rows: continue
+            sections.append((rows if isinstance(rows, str) else rows[0], -1,
+                             [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
+
+        def tag(pn, left, right, top, bottom):
+            if pn + left + right + top + bottom == 0:
+                return ""
+            return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
+                .format(pn, left, right, top, bottom)
+
+        chunks = []
+        last_sid = -2
+        tk_cnt = 0
+        for txt, sec_id, poss in sorted(sections, key=lambda x: (
+                x[-1][0][0], x[-1][0][3], x[-1][0][1])):
+            poss = "\t".join([tag(*pos) for pos in poss])
+            if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
+                if chunks:
+                    chunks[-1] += "\n" + txt + poss
+                    tk_cnt += num_tokens_from_string(txt)
+                    continue
+            chunks.append(txt + poss)
+            tk_cnt = num_tokens_from_string(txt)
+            if sec_id > -1:
+                last_sid = sec_id
+
+        res = tokenize_table(tbls, doc, eng)
+        res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
+        return res
+    if re.search(r"\.docx$", filename, re.IGNORECASE):
+        docx_parser = Docx()
+        ti_list, tbls = docx_parser(filename, binary,
+                                    from_page=0, to_page=10000, callback=callback)
+        res = tokenize_table(tbls, doc, eng)
+        for text, image in ti_list:
+            d = copy.deepcopy(doc)
+            d['image'] = image
+            tokenize(d, text, eng)
+            res.append(d)
+        return res
     else:
-        raise NotImplementedError("file type not supported yet(pdf supported)")
-    doc = {
-        "docnm_kwd": filename
-    }
-    doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
-    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
-    # is it English
-    eng = lang.lower() == "english" # pdf_parser.is_english
+        raise NotImplementedError("file type not supported yet(pdf and docx supported)")
+
 
-    # set pivot using the most frequent type of title,
-    # then merge between 2 pivot
-    if len(sections) > 0 and len(pdf_parser.outlines) / len(sections) > 0.1:
-        max_lvl = max([lvl for _, lvl in pdf_parser.outlines])
-        most_level = max(0, max_lvl - 1)
-        levels = []
-        for txt, _, _ in sections:
-            for t, lvl in pdf_parser.outlines:
-                tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
-                tks_ = set([txt[i] + txt[i + 1]
-                            for i in range(min(len(t), len(txt) - 1))])
-                if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
-                    levels.append(lvl)
-                    break
-            else:
-                levels.append(max_lvl + 1)
-
-    else:
-        bull = bullets_category([txt for txt, _, _ in sections])
-        most_level, levels = title_frequency(
-            bull, [(txt, l) for txt, l, poss in sections])
-
-    assert len(sections) == len(levels)
-    sec_ids = []
-    sid = 0
-    for i, lvl in enumerate(levels):
-        if lvl <= most_level and i > 0 and lvl != levels[i - 1]:
-            sid += 1
-        sec_ids.append(sid)
-        # print(lvl, self.boxes[i]["text"], most_level, sid)
-
-    sections = [(txt, sec_ids[i], poss)
-                for i, (txt, _, poss) in enumerate(sections)]
-    for (img, rows), poss in tbls:
-        if not rows: continue
-        sections.append((rows if isinstance(rows, str) else rows[0], -1,
-                         [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
-
-    def tag(pn, left, right, top, bottom):
-        if pn + left + right + top + bottom == 0:
-            return ""
-        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
-            .format(pn, left, right, top, bottom)
-
-    chunks = []
-    last_sid = -2
-    tk_cnt = 0
-    for txt, sec_id, poss in sorted(sections, key=lambda x: (
-            x[-1][0][0], x[-1][0][3], x[-1][0][1])):
-        poss = "\t".join([tag(*pos) for pos in poss])
-        if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
-            if chunks:
-                chunks[-1] += "\n" + txt + poss
-                tk_cnt += num_tokens_from_string(txt)
-                continue
-        chunks.append(txt + poss)
-        tk_cnt = num_tokens_from_string(txt)
-        if sec_id > -1:
-            last_sid = sec_id
-
-    res = tokenize_table(tbls, doc, eng)
-    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
-    return res
 
 
 if __name__ == "__main__":
@@ -164,4 +269,4 @@ if __name__ == "__main__":
 
     def dummy(prog=None, msg=""):
         pass
-    chunk(sys.argv[1], callback=dummy)
+    chunk(sys.argv[1], callback=dummy)
\ No newline at end of file
diff --git a/rag/app/qa.py b/rag/app/qa.py
index 397a045b8..bbc8c029d 100644
--- a/rag/app/qa.py
+++ b/rag/app/qa.py
@@ -16,7 +16,7 @@ from io import BytesIO
 from timeit import default_timer as timer
 from nltk import word_tokenize
 from openpyxl import load_workbook
-from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet
+from rag.nlp import is_english, random_choices, find_codec, qbullets_category, add_positions, has_qbullet, docx_question_level
 from rag.nlp import rag_tokenizer, tokenize_table
 from rag.settings import cron_logger
 from deepdoc.parser import PdfParser, ExcelParser, DocxParser
@@ -165,7 +165,7 @@ class Docx(DocxParser):
                 break
             question_level, p_text = 0, ''
             if from_page <= pn < to_page and p.text.strip():
-                question_level, p_text = docxQuestionLevel(p)
+                question_level, p_text = docx_question_level(p)
             if not question_level or question_level > 6: # not a question
                 last_answer = f'{last_answer}\n{p_text}'
                 current_image = self.get_picture(self.doc, p)
@@ -254,12 +254,6 @@ def mdQuestionLevel(s):
     match = re.match(r'#*', s)
     return (len(match.group(0)), s.lstrip('#').lstrip()) if match else (0, s)
 
-def docxQuestionLevel(p):
-    if p.style.name.startswith('Heading'):
-        return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
-    else:
-        return 0, re.sub(r"\u3000", " ", p.text).strip()
-
 def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
     """
         Excel and csv(txt) format files are supported.
@@ -405,4 +399,4 @@ if __name__ == "__main__":
 
     def dummy(prog=None, msg=""):
         pass
-    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
+    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
\ No newline at end of file
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index eff298686..2f404c404 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -497,3 +497,9 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
         add_chunk(sec[s: e], pos)
 
     return cks
+
+def docx_question_level(p):
+    if p.style.name.startswith('Heading'):
+        return int(p.style.name.split(' ')[-1]), re.sub(r"\u3000", " ", p.text).strip()
+    else:
+        return 0, re.sub(r"\u3000", " ", p.text).strip()
\ No newline at end of file
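
A minimal usage sketch of the new docx path in `rag/app/manual.py`, assuming the repository root is on `PYTHONPATH`; the `sample.docx` path and the no-op callback below are placeholders, not part of the patch:

```python
# Exercise the new docx branch of the manual parser.
from rag.app import manual


def dummy(prog=None, msg=""):
    # No-op progress callback, mirroring the one used in __main__.
    pass


# chunk() routes .docx files to the new Docx parser and returns a list of chunk dicts
# (tables are tokenized via tokenize_table, text/image pairs via tokenize).
chunks = manual.chunk("sample.docx", callback=dummy)
print(len(chunks))
```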