From 1662c7eda397d07f33eb8928271a3eb3845cce4c Mon Sep 17 00:00:00 2001
From: Stephen Hu
Date: Fri, 25 Apr 2025 18:35:28 +0800
Subject: [PATCH] Feat: Markdown add image (#7124)

### What problem does this PR solve?

https://github.com/infiniflow/ragflow/issues/6984

1. The Markdown parser now supports extracting pictures referenced in the document.
2. The naive chunker handles those images when parsing Markdown.
3. Improve the merge logic.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu
---
 rag/app/naive.py                          | 87 ++++++++++++++++---
 rag/nlp/__init__.py                       | 43 ++++++++-
 web/src/interfaces/database/knowledge.ts  |  2 +-
 .../components/chunk-card/index.tsx       |  6 +-
 4 files changed, 120 insertions(+), 18 deletions(-)

diff --git a/rag/app/naive.py b/rag/app/naive.py
index a3c629e1e..dcaea240c 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -22,7 +22,7 @@
 from timeit import default_timer as timer

 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
-from markdown import markdown
+import markdown
 from PIL import Image
 from tika import parser
@@ -31,7 +31,7 @@
 from api.db.services.llm_service import LLMBundle
 from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wraper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
-from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_docx, tokenize_table
+from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
 from rag.utils import num_tokens_from_string
@@ -287,6 +287,41 @@ class Pdf(PdfParser):


 class Markdown(MarkdownParser):
+    def get_picture_urls(self, sections):
+        if not sections:
+            return []
+        if isinstance(sections, type("")):
+            text = sections
+        elif isinstance(sections[0], type("")):
+            text = sections[0]
+        else:
+            return []
+
+        from bs4 import BeautifulSoup
+        md = markdown.Markdown()
+        html_content = md.convert(text)
+        soup = BeautifulSoup(html_content, 'html.parser')
+        html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
+        return html_images
+
+    def get_pictures(self, text):
+        """Download and open all images from markdown text."""
+        import requests
+        image_urls = self.get_picture_urls(text)
+        images = []
+        # Find all image URLs in text
+        for url in image_urls:
+            try:
+                response = requests.get(url, stream=True, timeout=30)
+                if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
+                    img = Image.open(BytesIO(response.content)).convert('RGB')
+                    images.append(img)
+            except Exception as e:
+                logging.error(f"Failed to download/open image from {url}: {e}")
+                continue
+
+        return images if images else None
+
     def __call__(self, filename, binary=None):
         if binary:
             encoding = find_codec(binary)
@@ -335,6 +370,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
     res = []
     pdf_parser = None
+    section_images = None
     if re.search(r"\.docx$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
@@ -368,7 +404,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
         if kwargs.get("section_only", False):
             return chunks

-        res.extend(tokenize_chunks_docx(chunks, doc, is_english, images))
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
         logging.info("naive_merge({}): {}".format(filename, timer() - st))
         return res
@@ -432,7 +468,20 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
-        sections, tables = Markdown(int(parser_config.get("chunk_token_num", 128)))(filename, binary)
+        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
+        sections, tables = markdown_parser(filename, binary)
+
+        # Process images for each section
+        section_images = []
+        for section_text, _ in sections:
+            images = markdown_parser.get_pictures(section_text) if section_text else None
+            if images:
+                # If multiple images found, combine them using concat_img
+                combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
+                section_images.append(combined_image)
+            else:
+                section_images.append(None)
+
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
@@ -467,14 +516,30 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

     st = timer()
-    chunks = naive_merge(
-        sections, int(parser_config.get(
-            "chunk_token_num", 128)), parser_config.get(
-            "delimiter", "\n!?。;!?"))
-    if kwargs.get("section_only", False):
-        return chunks
+    if section_images:
+        # if all images are None, set section_images to None
+        if all(image is None for image in section_images):
+            section_images = None

-    res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+    if section_images:
+        chunks, images = naive_merge_with_images(sections, section_images,
+                                                 int(parser_config.get(
+                                                     "chunk_token_num", 128)), parser_config.get(
+                                                     "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+    else:
+        chunks = naive_merge(
+            sections, int(parser_config.get(
+                "chunk_token_num", 128)), parser_config.get(
+                "delimiter", "\n!?。;!?"))
+        if kwargs.get("section_only", False):
+            return chunks
+
+        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     return res
diff --git a/rag/nlp/__init__.py b/rag/nlp/__init__.py
index 171cb0911..a1edb3fe7 100644
--- a/rag/nlp/__init__.py
+++ b/rag/nlp/__init__.py
@@ -276,8 +276,7 @@ def tokenize_chunks(chunks, doc, eng, pdf_parser=None):
         res.append(d)
     return res

-
-def tokenize_chunks_docx(chunks, doc, eng, images):
+def tokenize_chunks_with_images(chunks, doc, eng, images):
     res = []
     # wrap up as es documents
     for ck, image in zip(chunks, images):
@@ -290,7 +289,6 @@ def tokenize_chunks_with_images(chunks, doc, eng, images):
         res.append(d)
     return res

-
 def tokenize_table(tbls, doc, eng, batch_size=10):
     res = []
     # add tables
@@ -539,7 +537,46 @@ def naive_merge(sections, chunk_token_num=128, delimiter="\n。;!?"):
         add_chunk(sec, pos)

     return cks

+
+def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。;!?"):
+    if not texts or len(texts) != len(images):
+        return [], []
+    # Ensure texts is str, not tuple; if it is a tuple, convert to str (take the first item)
+    if isinstance(texts[0], tuple):
+        texts = [t[0] for t in texts]
+    cks = [""]
+    result_images = [None]
+    tk_nums = [0]
+
+    def add_chunk(t, image, pos=""):
+        nonlocal cks, result_images, tk_nums, delimiter
+        tnum = num_tokens_from_string(t)
+        if not pos:
+            pos = ""
+        if tnum < 8:
+            pos = ""
+        # Ensure that the length of the merged chunk does not exceed chunk_token_num
+        if tk_nums[-1] > chunk_token_num:
+            if t.find(pos) < 0:
+                t += pos
+            cks.append(t)
+            result_images.append(image)
+            tk_nums.append(tnum)
+        else:
+            if cks[-1].find(pos) < 0:
+                t += pos
+            cks[-1] += t
+            if result_images[-1] is None:
+                result_images[-1] = image
+            else:
+                result_images[-1] = concat_img(result_images[-1], image)
+            tk_nums[-1] += tnum
+
+    for text, image in zip(texts, images):
+        add_chunk(text, image)
+
+    return cks, result_images

 def docx_question_level(p, bull=-1):
     txt = re.sub(r"\u3000", " ", p.text).strip()
diff --git a/web/src/interfaces/database/knowledge.ts b/web/src/interfaces/database/knowledge.ts
index e0f0b5a58..a479aa625 100644
--- a/web/src/interfaces/database/knowledge.ts
+++ b/web/src/interfaces/database/knowledge.ts
@@ -103,7 +103,7 @@ export interface IChunk {
   content_with_weight: string;
   doc_id: string;
   doc_name: string;
-  img_id: string;
+  image_id: string;
   important_kwd?: string[];
   question_kwd?: string[]; // keywords
   tag_kwd?: string[];
diff --git a/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx b/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx
index 5934eb309..b7e61e06a 100644
--- a/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx
+++ b/web/src/pages/add-knowledge/components/knowledge-chunk/components/chunk-card/index.tsx
@@ -64,14 +64,14 @@ const ChunkCard = ({
       >
-        {item.img_id && (
+        {item.image_id && (
+
           }
         >
-
+
         )}
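
A quick way to sanity-check the image-extraction flow introduced in rag/app/naive.py is to run the same markdown-to-HTML-to-`<img>` pipeline standalone. The sketch below mirrors the approach of `Markdown.get_picture_urls` / `get_pictures` but is not part of this patch; the function names, the sample document, and the example URL are illustrative only.

```python
# Standalone sketch of the extraction flow used in this PR (names are illustrative).
from io import BytesIO

import markdown
import requests
from bs4 import BeautifulSoup
from PIL import Image


def extract_image_urls(md_text: str) -> list[str]:
    """Render markdown to HTML and collect the src of every <img> tag."""
    html = markdown.Markdown().convert(md_text)
    soup = BeautifulSoup(html, "html.parser")
    return [img.get("src") for img in soup.find_all("img") if img.get("src")]


def download_images(urls: list[str]) -> list[Image.Image]:
    """Fetch each URL and keep only responses that declare an image Content-Type."""
    images = []
    for url in urls:
        try:
            resp = requests.get(url, stream=True, timeout=30)
            if resp.status_code == 200 and resp.headers.get("Content-Type", "").startswith("image/"):
                images.append(Image.open(BytesIO(resp.content)).convert("RGB"))
        except Exception:
            continue  # skip unreachable or non-image URLs
    return images


if __name__ == "__main__":
    sample = "Intro text\n\n![diagram](https://example.com/diagram.png)\n"
    urls = extract_image_urls(sample)
    print(urls)                      # ['https://example.com/diagram.png']
    print(len(download_images(urls)))  # 0 or 1 depending on whether the URL resolves
```

As in the patch, URLs that do not resolve to an `image/*` response are skipped rather than failing the whole chunking run.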