diff --git a/rag/app/naive.py b/rag/app/naive.py index 95d896d3e..a335659c9 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -29,8 +29,8 @@ from tika import parser from api.db import LLMType from api.db.services.llm_service import LLMBundle from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser -from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.figure_parser import VisionFigureParser +from deepdoc.parser.pdf_parser import PlainParser, VisionParser from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table from rag.utils import num_tokens_from_string @@ -246,24 +246,32 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.pdf$", filename, re.IGNORECASE): layout_recognizer = parser_config.get("layout_recognize", "DeepDOC") + callback(0.1, "Start to parse.") if layout_recognizer == "DeepDOC": pdf_parser = Pdf() try: vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) + callback(0.15, "Visual model detected. Attempting to enhance figure extraction...") except Exception: vision_model = None if vision_model: sections, tables, figures = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback, separate_tables_figures=True) - pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs) - boosted_figures = pdf_vision_parser(callback=callback) - tables.extend(boosted_figures) + callback(0.5, "Basic parsing complete. Proceeding with figure enhancement...") + try: + pdf_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures, **kwargs) + boosted_figures = pdf_vision_parser(callback=callback) + tables.extend(boosted_figures) + except Exception as e: + callback(0.6, f"Visual model error: {e}. Skipping figure parsing enhancement.") + tables.extend(figures) else: sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) res = tokenize_table(tables, doc, is_english) + callback(0.8, "Finish parsing.") else: if layout_recognizer == "Plain Text": @@ -275,6 +283,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) res = tokenize_table(tables, doc, is_english) + callback(0.8, "Finish parsing.") elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.")