Feat: add fallback for PDF figure parser (#6305)

### What problem does this PR solve?

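Add a fallback for the PDF figure parser: if the vision model raises an error during figure enhancement, the error is reported through the progress callback and the unenhanced figures are added to the tables instead, so parsing continues. Progress callbacks are also added around the PDF parsing steps.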

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei 2025-03-20 10:48:38 +08:00 committed by GitHub
parent 046f0bba74
commit e4380843c4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -29,8 +29,8 @@ from tika import parser
from api.db import LLMType from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.figure_parser import VisionFigureParser from deepdoc.parser.figure_parser import VisionFigureParser
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
from rag.utils import num_tokens_from_string from rag.utils import num_tokens_from_string
@ -246,24 +246,32 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.pdf$", filename, re.IGNORECASE): elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
callback(0.1, "Start to parse.")
if layout_recognizer == "DeepDOC": if layout_recognizer == "DeepDOC":
pdf_parser = Pdf() pdf_parser = Pdf()
try: try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.15, "Visual model detected. Attempting to enhance figure extraction...")
except Exception: except Exception:
vision_model = None vision_model = None
if vision_model: if vision_model:
sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True) sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True)
pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs) callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...")
boosted_figures = pdf_vision_parser(callback=callback) try:
tables.extend(boosted_figures) pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs)
boosted_figures = pdf_vision_parser(callback=callback)
tables.extend(boosted_figures)
except Exception as e:
callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.")
tables.extend(figures)
else: else:
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
else: else:
if layout_recognizer == "Plain Text": if layout_recognizer == "Plain Text":
@ -275,6 +283,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
callback=callback) callback=callback)
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")