mirror of
https://git.mirrors.martin98.com/https://github.com/infiniflow/ragflow.git
synced 2025-08-12 12:29:02 +08:00
Support displaying tables in the chunks of PDF files when using the QA parser (#1263)
### What problem does this PR solve?

Support displaying tables in the chunks of PDF files when using the QA parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
parent
6c6f5a3a47
commit
b75bb1d8d3
@ -22,6 +22,7 @@ from rag.settings import cron_logger
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
from markdown import markdown
|
||||
class Excel(ExcelParser):
|
||||
def __call__(self, fnm, binary=None, callback=None):
|
||||
if not binary:
|
||||
@ -374,8 +375,6 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
code_block = False
|
||||
level_index = [-1] * 7
|
||||
for index, l in enumerate(lines):
|
||||
if not l.strip():
|
||||
continue
|
||||
if l.strip().startswith('```'):
|
||||
code_block = not code_block
|
||||
question_level, question = 0, ''
|
||||
@ -385,10 +384,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
if not question_level or question_level > 6: # not a question
|
||||
last_answer = f'{last_answer}\n{l}'
|
||||
else: # is a question
|
||||
if last_answer:
|
||||
if last_answer.strip():
|
||||
sum_question = '\n'.join(question_stack)
|
||||
if sum_question:
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
|
||||
last_answer = ''
|
||||
|
||||
i = question_level
|
||||
@ -397,10 +396,10 @@ def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
|
||||
level_stack.pop()
|
||||
question_stack.append(question)
|
||||
level_stack.append(question_level)
|
||||
if last_answer:
|
||||
if last_answer.strip():
|
||||
sum_question = '\n'.join(question_stack)
|
||||
if sum_question:
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, last_answer, eng))
|
||||
res.append(beAdoc(deepcopy(doc), sum_question, markdown(last_answer, extensions=['markdown.extensions.tables']), eng))
|
||||
return res
|
||||
elif re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
docx_parser = Docx()
|
||||
|
@ -143,3 +143,4 @@ webdriver-manager==4.0.1
|
||||
cn2an==0.5.22
|
||||
roman-numbers==1.0.2
|
||||
word2number==1.1
|
||||
markdown==3.6
|
@ -143,4 +143,5 @@ selenium==4.21.0
|
||||
webdriver-manager==4.0.1
|
||||
cn2an==0.5.22
|
||||
roman-numbers==1.0.2
|
||||
word2number==1.1
|
||||
word2number==1.1
|
||||
markdown==3.6
|
||||
|
@ -129,3 +129,4 @@ html_text==0.6.2
|
||||
cn2an==0.5.22
|
||||
roman-numbers==1.0.2
|
||||
word2number==1.1
|
||||
markdown==3.6
|
||||
|
Loading…
x
Reference in New Issue
Block a user