Feat: add VLM-boosted DocX parser (#6307)

### What problem does this PR solve?

Add VLM-boosted DocX parser

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei 2025-03-20 11:24:44 +08:00 committed by GitHub
parent e4380843c4
commit 9611185eb4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 32 additions and 6 deletions

View File

@ -15,10 +15,19 @@
# #
from PIL import Image
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts import vision_llm_figure_describe_prompt from rag.prompts import vision_llm_figure_describe_prompt
def vision_figure_parser_figure_data_wraper(figures_data_without_positions):
    """Wrap position-less figure entries into the ``(payload, positions)`` shape.

    Each input entry is indexed as ``entry[0]`` and ``entry[1]``. Entries
    whose second element is not a ``PIL.Image.Image`` are dropped. Every
    kept entry becomes::

        ((entry[1], [entry[0]]), [(0, 0, 0, 0, 0)])

    i.e. the image paired with a one-element list holding the first
    element, followed by a single all-zero 5-tuple used as a placeholder
    position.

    NOTE(review): ``entry[0]`` is presumably the text/description that
    accompanies the image -- confirm against the caller that builds
    these entries.

    Args:
        figures_data_without_positions: Iterable of indexable entries
            with at least two elements.

    Returns:
        A list of ``((image, [entry[0]]), [(0, 0, 0, 0, 0)])`` tuples, in
        input order, one per entry that carries an image.
    """
    wrapped = []
    for entry in figures_data_without_positions:
        # Only entries that actually carry an image are forwarded.
        if not isinstance(entry[1], Image.Image):
            continue
        payload = (entry[1], [entry[0]])
        # A fresh placeholder list per entry, so results never share state.
        placeholder_positions = [(0, 0, 0, 0, 0)]
        wrapped.append((payload, placeholder_positions))
    return wrapped
class VisionFigureParser: class VisionFigureParser:
def __init__(self, vision_model, figures_data, *args, **kwargs): def __init__(self, vision_model, figures_data, *args, **kwargs):
self.vision_model = vision_model self.vision_model = vision_model
@ -33,14 +42,14 @@ class VisionFigureParser:
for item in figures_data: for item in figures_data:
# position # position
if len(item) == 2 and isinstance(item[1], list) and len(item[1]) == 1 and len(item[1][0]) == 5: if len(item) == 2 and isinstance(item[1], list) and len(item[1]) == 1 and isinstance(item[1][0], tuple) and len(item[1][0]) == 5:
img_desc = item[0] img_desc = item[0]
assert len(img_desc) == 2, "Should be (figure, [description])" assert len(img_desc) == 2 and isinstance(img_desc[0], Image.Image) and isinstance(img_desc[1], list), "Should be (figure, [description])"
self.figures.append(img_desc[0]) self.figures.append(img_desc[0])
self.descriptions.append(img_desc[1]) self.descriptions.append(img_desc[1])
self.positions.append(item[1]) self.positions.append(item[1])
else: else:
assert len(item) == 2 and isinstance(item, tuple), f"get {len(item)=}, {item=}" assert len(item) == 2 and isinstance(item, tuple) and isinstance(item[1], list), f"get {len(item)=}, {item=}"
self.figures.append(item[0]) self.figures.append(item[0])
self.descriptions.append(item[1]) self.descriptions.append(item[1])

View File

@ -29,7 +29,7 @@ from tika import parser
from api.db import LLMType from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
from rag.utils import num_tokens_from_string from rag.utils import num_tokens_from_string
@ -226,10 +226,27 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = None pdf_parser = None
if re.search(r"\.docx$", filename, re.IGNORECASE): if re.search(r"\.docx$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
sections, tables = Docx()(filename, binary)
res = tokenize_table(tables, doc, is_english) # just for table
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
sections, tables = Docx()(filename, binary)
if vision_model:
figures_data = vision_figure_parser_figure_data_wraper(sections)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
boosted_figures = docx_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
st = timer() st = timer()
chunks, images = naive_merge_docx( chunks, images = naive_merge_docx(