From 609236f5c1af270bcfb120646e6895df99392bf5 Mon Sep 17 00:00:00 2001 From: Kevin Hu Date: Mon, 25 Nov 2024 09:57:54 +0800 Subject: [PATCH] Let 'One' be applicable for tables in docx (#3619) ### What problem does this PR solve? #3598 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Performance Improvement --- rag/app/one.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rag/app/one.py b/rag/app/one.py index 9f24ccb95..76dc45893 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -16,7 +16,7 @@ from io import BytesIO import re from deepdoc.parser.utils import get_text -from rag.app import laws +from rag.app import naive from rag.nlp import rag_tokenizer, tokenize from deepdoc.parser import PdfParser, ExcelParser, PlainParser, HtmlParser @@ -67,7 +67,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if re.search(r"\.docx$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - sections = [txt for txt in laws.Docx()(filename, binary) if txt] + sections, tbls = naive.Docx()(filename, binary) + sections = [s for s, _ in sections if s] + for (_, html), _ in tbls: + sections.append(html) callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE):