From b75bb1d8d35653c121a00313009a4b342de5774e Mon Sep 17 00:00:00 2001 From: Zhedong Cen Date: Mon, 24 Jun 2024 19:02:18 +0800 Subject: [PATCH] Support displaying tables in the chunks of pdf file when using QA parser (#1263) ### What problem does this PR solve? Support displaying tables in the chunks of pdf file when using QA parser ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- rag/app/qa.py | 11 +++++------ requirements.txt | 1 + requirements_arm.txt | 3 ++- requirements_dev.txt | 1 + 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/rag/app/qa.py b/rag/app/qa.py index 4e95cf518..ace5b4970 100644 --- a/rag/app/qa.py +++ b/rag/app/qa.py @@ -22,6 +22,7 @@ from rag.settings import cron_logger from deepdoc.parser import PdfParser, ExcelParser, DocxParser from docx import Document from PIL import Image +from markdown import markdown class Excel(ExcelParser): def __call__(self, fnm, binary=None, callback=None): if not binary: @@ -374,8 +375,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): code_block = False level_index = [-1] * 7 for index, l in enumerate(lines): - if not l.strip(): - continue if l.strip().startswith('```'): code_block = not code_block question_level, question = 0, '' @@ -385,10 +384,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): if not question_level or question_level > 6: # not a question last_answer = f'{last_answer}\n{l}' else: # is a question - if last_answer: + if last_answer.strip(): sum_question = '\n'.join(question_stack) if sum_question: - res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) + res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng)) last_answer = '' i = question_level @@ -397,10 +396,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs): level_stack.pop() question_stack.append(question) level_stack.append(question_level) - if last_answer: + if last_answer.strip(): sum_question = '\n'.join(question_stack) if sum_question: - res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) + res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng)) return res elif re.search(r"\.docx$", filename, re.IGNORECASE): docx_parser = Docx() diff --git a/requirements.txt b/requirements.txt index 2e2324fc6..2b4998371 100644 --- a/requirements.txt +++ b/requirements.txt @@ -143,3 +143,4 @@ webdriver-manager==4.0.1 cn2an==0.5.22 roman-numbers==1.0.2 word2number==1.1 +markdown==3.6 \ No newline at end of file diff --git a/requirements_arm.txt b/requirements_arm.txt index 4cb4d83e8..cc35c0521 100644 --- a/requirements_arm.txt +++ b/requirements_arm.txt @@ -143,4 +143,5 @@ selenium==4.21.0 webdriver-manager==4.0.1 cn2an==0.5.22 roman-numbers==1.0.2 -word2number==1.1 \ No newline at end of file +word2number==1.1 +markdown==3.6 diff --git a/requirements_dev.txt b/requirements_dev.txt index 7df7a3002..27db92dad 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -129,3 +129,4 @@ html_text==0.6.2 cn2an==0.5.22 roman-numbers==1.0.2 word2number==1.1 +markdown==3.6