Support displaying tables in chunks of PDF files when using the QA parser (#1263)

### What problem does this PR solve?

Support displaying tables in chunks of PDF files when using the QA parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Zhedong Cen 2024-06-24 19:02:18 +08:00 committed by GitHub
parent 6c6f5a3a47
commit b75bb1d8d3
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 9 additions and 7 deletions

View File

@ -22,6 +22,7 @@ from rag.settings import cron_logger
from deepdoc.parser import PdfParser, ExcelParser, DocxParser from deepdoc.parser import PdfParser, ExcelParser, DocxParser
from docx import Document from docx import Document
from PIL import Image from PIL import Image
from markdown import markdown
class Excel(ExcelParser): class Excel(ExcelParser):
def __call__(self, fnm, binary=None, callback=None): def __call__(self, fnm, binary=None, callback=None):
if not binary: if not binary:
@ -374,8 +375,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
code_block = False code_block = False
level_index = [-1] * 7 level_index = [-1] * 7
for index, l in enumerate(lines): for index, l in enumerate(lines):
if not l.strip():
continue
if l.strip().startswith('```'): if l.strip().startswith('```'):
code_block = not code_block code_block = not code_block
question_level, question = 0, '' question_level, question = 0, ''
@ -385,10 +384,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
if not question_level or question_level > 6: # not a question if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{l}' last_answer = f'{last_answer}\n{l}'
else: # is a question else: # is a question
if last_answer: if last_answer.strip():
sum_question = '\n'.join(question_stack) sum_question = '\n'.join(question_stack)
if sum_question: if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
last_answer = '' last_answer = ''
i = question_level i = question_level
@ -397,10 +396,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
level_stack.pop() level_stack.pop()
question_stack.append(question) question_stack.append(question)
level_stack.append(question_level) level_stack.append(question_level)
if last_answer: if last_answer.strip():
sum_question = '\n'.join(question_stack) sum_question = '\n'.join(question_stack)
if sum_question: if sum_question:
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng)) res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
return res return res
elif re.search(r"\.docx$", filename, re.IGNORECASE): elif re.search(r"\.docx$", filename, re.IGNORECASE):
docx_parser = Docx() docx_parser = Docx()

View File

@ -143,3 +143,4 @@ webdriver-manager==4.0.1
cn2an==0.5.22 cn2an==0.5.22
roman-numbers==1.0.2 roman-numbers==1.0.2
word2number==1.1 word2number==1.1
markdown==3.6

View File

@ -143,4 +143,5 @@ selenium==4.21.0
webdriver-manager==4.0.1 webdriver-manager==4.0.1
cn2an==0.5.22 cn2an==0.5.22
roman-numbers==1.0.2 roman-numbers==1.0.2
word2number==1.1 word2number==1.1
markdown==3.6

View File

@ -129,3 +129,4 @@ html_text==0.6.2
cn2an==0.5.22 cn2an==0.5.22
roman-numbers==1.0.2 roman-numbers==1.0.2
word2number==1.1 word2number==1.1
markdown==3.6